From 246b062bd6fa11ce40b4e4c195281acf064c261a Mon Sep 17 00:00:00 2001 From: Jakub Marcowski Date: Tue, 11 Mar 2025 10:51:10 +0100 Subject: [PATCH] basis_universal: Update to 1.60 --- modules/basis_universal/SCsub | 8 +- .../basis_universal/image_compress_basisu.cpp | 14 +- thirdparty/README.md | 5 +- .../encoder/3rdparty/android_astc_decomp.cpp | 8 + .../encoder/basisu_astc_hdr_6x6_enc.cpp | 7015 ++++++ .../encoder/basisu_astc_hdr_6x6_enc.h | 129 + .../encoder/basisu_astc_hdr_common.cpp | 5357 +++++ .../encoder/basisu_astc_hdr_common.h | 423 + .../encoder/basisu_astc_hdr_enc.cpp | 3310 --- .../basis_universal/encoder/basisu_comp.cpp | 1001 +- .../basis_universal/encoder/basisu_comp.h | 273 +- .../basis_universal/encoder/basisu_enc.cpp | 374 +- .../basis_universal/encoder/basisu_enc.h | 583 +- .../encoder/basisu_frontend.cpp | 110 +- .../encoder/basisu_gpu_texture.cpp | 79 +- .../basis_universal/encoder/basisu_math.h | 3146 +++ .../basis_universal/encoder/basisu_opencl.cpp | 22 +- .../basis_universal/encoder/basisu_opencl.h | 2 +- .../encoder/basisu_resample_filters.cpp | 26 +- .../encoder/basisu_resampler.cpp | 2 +- .../encoder/basisu_resampler.h | 2 + .../encoder/basisu_resampler_filters.h | 12 + .../basis_universal/encoder/basisu_ssim.cpp | 2 + .../encoder/basisu_uastc_enc.cpp | 33 +- .../encoder/basisu_uastc_hdr_4x4_enc.cpp | 1277 + ...c_hdr_enc.h => basisu_uastc_hdr_4x4_enc.h} | 75 +- .../patches/0001-external-zstd-pr344.patch | 6 +- .../patches/0002-external-jpgd.patch | 13 +- .../patches/0003-external-tinyexr.patch | 6 +- .../patches/0004-remove-tinydds-qoi.patch | 22 +- .../0005-windows-illegal-character.patch | 13 + .../patches/0006-ambiguous-calls.patch | 22 + .../basis_universal/transcoder/basisu.h | 217 +- .../transcoder/basisu_astc_hdr_core.h | 156 +- .../transcoder/basisu_astc_helpers.h | 288 +- .../transcoder/basisu_containers.h | 5993 +++-- .../transcoder/basisu_containers_impl.h | 1058 +- .../transcoder/basisu_file_headers.h | 11 +- 
.../transcoder/basisu_transcoder.cpp | 19481 ++++++++++------ .../transcoder/basisu_transcoder.h | 306 +- .../transcoder/basisu_transcoder_internal.h | 68 +- thirdparty/libktx/lib/basis_transcode.cpp | 2 +- .../libktx/patches/0003-basisu-1.60.patch | 13 + 43 files changed, 36977 insertions(+), 13986 deletions(-) create mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp create mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h create mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_common.cpp create mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_common.h delete mode 100644 thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp create mode 100644 thirdparty/basis_universal/encoder/basisu_math.h create mode 100644 thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.cpp rename thirdparty/basis_universal/encoder/{basisu_astc_hdr_enc.h => basisu_uastc_hdr_4x4_enc.h} (62%) create mode 100644 thirdparty/basis_universal/patches/0005-windows-illegal-character.patch create mode 100644 thirdparty/basis_universal/patches/0006-ambiguous-calls.patch create mode 100644 thirdparty/libktx/patches/0003-basisu-1.60.patch diff --git a/modules/basis_universal/SCsub b/modules/basis_universal/SCsub index b8cce67f897b..0f98b3108022 100644 --- a/modules/basis_universal/SCsub +++ b/modules/basis_universal/SCsub @@ -20,22 +20,24 @@ basisu_encoder = env.editor_build if basisu_encoder: encoder_sources = [ "3rdparty/android_astc_decomp.cpp", - "basisu_astc_hdr_enc.cpp", + "basisu_astc_hdr_6x6_enc.cpp", + "basisu_astc_hdr_common.cpp", "basisu_backend.cpp", "basisu_basis_file.cpp", "basisu_bc7enc.cpp", - "basisu_opencl.cpp", "basisu_comp.cpp", "basisu_enc.cpp", "basisu_etc.cpp", "basisu_frontend.cpp", "basisu_gpu_texture.cpp", "basisu_kernels_sse.cpp", + "basisu_opencl.cpp", "basisu_pvrtc1_4.cpp", - "basisu_resampler.cpp", "basisu_resample_filters.cpp", + "basisu_resampler.cpp", "basisu_ssim.cpp", 
"basisu_uastc_enc.cpp", + "basisu_uastc_hdr_4x4_enc.cpp", "pvpngreader.cpp", ] encoder_sources = [thirdparty_dir + "encoder/" + file for file in encoder_sources] diff --git a/modules/basis_universal/image_compress_basisu.cpp b/modules/basis_universal/image_compress_basisu.cpp index be28d8950822..2c06de7417bb 100644 --- a/modules/basis_universal/image_compress_basisu.cpp +++ b/modules/basis_universal/image_compress_basisu.cpp @@ -101,13 +101,13 @@ Vector basis_universal_packer(const Ref &p_image, Image::UsedCha basisu::basis_compressor_params params; params.m_uastc = true; - params.m_quality_level = basisu::BASISU_QUALITY_MIN; - params.m_pack_uastc_flags &= ~basisu::cPackUASTCLevelMask; - params.m_pack_uastc_flags |= basisu::cPackUASTCLevelFastest; + params.m_etc1s_quality_level = basisu::BASISU_QUALITY_MIN; + params.m_pack_uastc_ldr_4x4_flags &= ~basisu::cPackUASTCLevelMask; + params.m_pack_uastc_ldr_4x4_flags |= basisu::cPackUASTCLevelFastest; - params.m_rdo_uastc = 0.0f; - params.m_rdo_uastc_quality_scalar = 0.0f; - params.m_rdo_uastc_dict_size = 1024; + params.m_rdo_uastc_ldr_4x4 = 0.0f; + params.m_rdo_uastc_ldr_4x4_quality_scalar = 0.0f; + params.m_rdo_uastc_ldr_4x4_dict_size = 1024; params.m_mip_fast = true; params.m_multithreading = true; @@ -127,7 +127,7 @@ Vector basis_universal_packer(const Ref &p_image, Image::UsedCha if (is_hdr) { decompress_format = BASIS_DECOMPRESS_HDR_RGB; params.m_hdr = true; - params.m_uastc_hdr_options.set_quality_level(0); + params.m_uastc_hdr_4x4_options.set_quality_level(0); } else { switch (p_channels) { diff --git a/thirdparty/README.md b/thirdparty/README.md index b6a4a037498f..dddacc9d4d39 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -62,7 +62,7 @@ Files extracted from upstream source: ## basis_universal - Upstream: https://github.com/BinomialLLC/basis_universal -- Version: 1.50.0 (051ad6d8a64bb95a79e8601c317055fd1782ad3e, 2024) +- Version: 1.60 (323239a6a5ffa57d6570cfc403be99156e33a8b0, 2025) - License: 
Apache 2.0 Files extracted from upstream source: @@ -77,6 +77,8 @@ Patches: - `0002-external-jpgd.patch` (GH-88508) - `0003-external-tinyexr.patch` (GH-97582) - `0004-remove-tinydds-qoi.patch` (GH-97582) +- `0005-windows-illegal-character.patch` (GH-103968) +- `0006-ambiguous-calls.patch` (GH-103968) ## brotli @@ -512,6 +514,7 @@ Patches: - `0001-external-basisu.patch` (GH-76572) - `0002-disable-astc-block-ext.patch` (GH-76572) +- `0003-basisu-1.60.patch` (GH-103968) ## libogg diff --git a/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp index 5abfe2faf922..a667d0d63787 100644 --- a/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp +++ b/thirdparty/basis_universal/encoder/3rdparty/android_astc_decomp.cpp @@ -836,10 +836,12 @@ void decodeISETritBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& m[4] = data.getNext(numBits); deUint32 T7 = data.getNext(1); +#ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wimplicit-fallthrough=" #endif +#endif switch (numValues) { // \note Fall-throughs. @@ -851,9 +853,11 @@ void decodeISETritBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& default: DE_ASSERT(false); } +#ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic pop #endif +#endif const deUint32 T = (T7 << 7) | (T56 << 5) | (T4 << 4) | (T23 << 2) | (T01 << 0); @@ -898,10 +902,12 @@ void decodeISEQuintBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& m[2] = data.getNext(numBits); deUint32 Q56 = data.getNext(2); +#ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wimplicit-fallthrough=" #endif +#endif switch (numValues) { // \note Fall-throughs. 
@@ -911,9 +917,11 @@ void decodeISEQuintBlock (ISEDecodedResult* dst, int numValues, BitAccessStream& default: DE_ASSERT(false); } +#ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic pop #endif +#endif const deUint32 Q = (Q56 << 5) | (Q34 << 3) | (Q012 << 0); diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp b/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp new file mode 100644 index 000000000000..6d5863265a09 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.cpp @@ -0,0 +1,7015 @@ +// File: basisu_astc_hdr_6x6_enc.cpp +#include "basisu_astc_hdr_6x6_enc.h" +#include "basisu_enc.h" +#include "basisu_astc_hdr_common.h" +#include "basisu_math.h" +#include "basisu_resampler.h" +#include "basisu_resampler_filters.h" + +#define MINIZ_HEADER_FILE_ONLY +#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES +#include "basisu_miniz.h" + +#include "3rdparty/android_astc_decomp.h" + +#include + +using namespace basisu; +using namespace buminiz; +using namespace basist::astc_6x6_hdr; + +namespace astc_6x6_hdr +{ + +static void atomic_max(std::atomic& atomic_var, uint32_t new_value) +{ + uint32_t current = atomic_var.load(std::memory_order_relaxed); + for ( ; ; ) + { + uint32_t new_max = std::max(current, new_value); + if (atomic_var.compare_exchange_weak(current, new_max, std::memory_order_relaxed, std::memory_order_relaxed)) + break; + } +} + +void astc_hdr_6x6_global_config::set_user_level(int level) +{ + level = basisu::clamp(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL); + + m_master_comp_level = 0; + m_highest_comp_level = 0; + m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS; + m_extra_patterns_flag = false; + m_brute_force_partition_matching = false; + + switch (level) + { + case 0: + { + // Both reduce compression a lot when lambda>0 + m_favor_higher_compression = false; + m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2; + break; + } + case 1: + { + m_master_comp_level = 0; + m_highest_comp_level = 0; + break; + 
} + case 2: + { + m_master_comp_level = 0; + m_highest_comp_level = 1; + break; + } + case 3: + { + m_master_comp_level = 1; + m_highest_comp_level = 1; + break; + } + case 4: + { + m_master_comp_level = 1; + m_highest_comp_level = 2; + break; + } + case 5: + { + m_master_comp_level = 1; + m_highest_comp_level = 3; + break; + } + case 6: + { + m_master_comp_level = 1; + m_highest_comp_level = 4; + break; + } + case 7: + { + m_master_comp_level = 2; + m_highest_comp_level = 2; + break; + } + case 8: + { + m_master_comp_level = 2; + m_highest_comp_level = 3; + break; + } + case 9: + { + m_master_comp_level = 2; + m_highest_comp_level = 4; + break; + } + case 10: + { + m_master_comp_level = 3; + m_highest_comp_level = 3; + break; + } + case 11: + { + m_master_comp_level = 3; + m_highest_comp_level = 4; + break; + } + case 12: + default: + { + m_master_comp_level = 4; + m_highest_comp_level = 4; + m_extra_patterns_flag = true; + m_brute_force_partition_matching = true; + break; + } + } +} + +const float m1 = 0.1593017578125f; // (2610 / 2^14) * (1/100) +const float m2 = 78.84375f; // (2523 / 32) * (1/100) +const float c1 = 0.8359375f; // 3424 / (2^12) +const float c2 = 18.8515625f; // (2413 / 128) +const float c3 = 18.6875f; // (2392 / 128) + +static float forwardPQ(float Y) +{ + // 10,000 here is an absolute scale - it's in nits (cd per square meter) + float L = Y * (1.0f / 10000.0f); + + float num = powf(L, m1); + float N = powf((c1 + c2 * num) / (1 + c3 * num), m2); + + return N; +} + +#if 0 +static float inversePQ(float E) +{ + float N = powf(E, 1.0f / m2); + + float num = basisu::maximum((N - c1), 0.0f) / (c2 - c3 * N); + float L = powf(num, 1.0f / m1); + + return L * 10000.0f; +} +#endif + +// PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries. 
+// max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86 +// Highest error is for values less than SMALLEST_PQ_VAL_IN. +// +// Approximation is round trip lossless for 10-12 bits at [0,10000] nits: +// for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096): +// round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x +// +// bfloat16 has enough precision to handle 8-bit sRGB to linear conversions: +// round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless + +const int PQ_APPROX_MIN_EXP = -16, PQ_APPROX_MAX_EXP = 16; +const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1); + +const float SMALLEST_PQ_VAL_IN = 0.000015258829080f; +const float SMALLEST_PQ_VAL = 0.000551903737f; // forwardPQ(SMALLEST_PQ_VAL_IN) + +const float LARGEST_PQ_VAL = 1.251312f; + +float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128]; + +static void init_pq_tables() +{ + for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++) + { + for (int mant = 0; mant < 128; mant++) + { + bfloat16 b = bfloat16_init(1, exp, mant); + float bf = bfloat16_to_float(b); + + float pq = forwardPQ(bf); + + g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq; + } + } + + //fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0])); + //fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN)); +} + +static inline float forwardPQTab(float v) +{ + assert(g_pq_approx_tabs[0][0]); + + assert(v >= 0.0f); + if (v == 0.0f) + return 0.0f; + + bfloat16 bf = float_to_bfloat16(v, false); + assert(v >= bfloat16_to_float(bf)); + + int exp = bfloat16_get_exp(bf); + + if (exp < PQ_APPROX_MIN_EXP) + { + // not accurate but should be good enough for our uses + return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN)); + } + else if (exp > PQ_APPROX_MAX_EXP) + return LARGEST_PQ_VAL; + + int mant = bfloat16_get_mantissa(bf); + + float a = g_pq_approx_tabs[exp - 
PQ_APPROX_MIN_EXP][mant]; + float bf_f32 = bfloat16_to_float(bf); + + int next_mant = mant + 1; + int next_exp = exp; + if (next_mant == 128) + { + next_mant = 0; + next_exp++; + if (next_exp > PQ_APPROX_MAX_EXP) + return a; + } + + float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant]; + + bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant); + float next_bf_f32 = bfloat16_to_float(next_bf); + assert(v <= next_bf_f32); + + float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32); + assert((lerp_factor >= 0) && (lerp_factor <= 1.0f)); + + return lerp(a, b, lerp_factor); +} + +// 100 nits = ~.5 i +// This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2. +// To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true). +// Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true. +// +// ITP info: +// https://www.portrait.com/resource-center/ictcp-color-difference-metric/ +// https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's) +// This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP. 
+// +// Linear REC709 to REC2020/BT.2100 gamut conversion: +// rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f; +// rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f; +// rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f; +// const float S = 1.0f / 4096.0f; +// l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2]; +// m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2]; +// s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2]; +static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false) +{ + vec3F rgb_2100(rgb_in); + + float l, m, s; + if (!rec2020_bt2100_color_gamut) + { + // Assume REC 709 input color gamut + // (REC2020_to_LMS * REC709_to_2020) * input_color + l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f; + m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f; + s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f; + } + else + { + // Assumes REC2020/BT.2100 input color gamut (this is from the spec) + l = 0.412109375f * rgb_2100[0] + 0.52392578125f * rgb_2100[1] + 0.06396484375f * rgb_2100[2]; + m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2]; + s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f * rgb_2100[2]; + } + + float ld = forwardPQTab(l); + float md = forwardPQTab(m); + float sd = forwardPQTab(s); + + ictcp[0] = .5f * ld + .5f * md; + + // if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear) + if (itp_flag) + ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd; + else + ictcp[1] = 1.61376953125f * ld + -3.323486328125f 
* md + 1.709716796875f * sd; + + ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd; +} + +static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg) +{ + linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut); +} + +#if 0 +// Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut). +static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false) +{ + float ct = ictcp[1]; + + if (itp_flag) + ct *= 2.0f; + + float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f; + float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f; + float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f; + + float l = inversePQ(ld); + float m = inversePQ(md); + float s = inversePQ(sd); + + rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f; + rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f; + rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f; +} +#endif + +struct half_vec3 +{ + basist::half_float m_vals[3]; + + inline half_vec3() { } + + inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z) + { + m_vals[0] = x; + m_vals[1] = y; + m_vals[2] = z; + } + + inline half_vec3(const half_vec3& other) + { + *this = other; + } + + inline half_vec3& operator= (const half_vec3& rhs) + { + m_vals[0] = rhs.m_vals[0]; + m_vals[1] = rhs.m_vals[1]; + m_vals[2] = rhs.m_vals[2]; + return *this; + } + + inline void clear() + { + clear_obj(m_vals); + } + + inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z) + { + m_vals[0] = x; + m_vals[1] = y; + m_vals[2] = z; + return *this; + } + + inline half_vec3& set(float x, float y, float z) + { + m_vals[0] = basist::float_to_half(x); + m_vals[1] = 
basist::float_to_half(y); + m_vals[2] = basist::float_to_half(z); + return *this; + } + + template + inline half_vec3& set_vec(const T& vec) + { + m_vals[0] = basist::float_to_half(vec[0]); + m_vals[1] = basist::float_to_half(vec[1]); + m_vals[2] = basist::float_to_half(vec[2]); + return *this; + } + + template + inline T get_vec() const + { + return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2])); + } + + inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; } + inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; } + + float get_float_comp(uint32_t c) const + { + assert(c < 3); + return basist::half_to_float(m_vals[c]); + } + + half_vec3& set_float_comp(uint32_t c, float v) + { + assert(c < 3); + m_vals[c] = basist::float_to_half(v); + return *this; + } +}; + +struct half_vec4 +{ + basist::half_float m_vals[4]; + + inline half_vec4() { } + + inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w) + { + m_vals[0] = x; + m_vals[1] = y; + m_vals[2] = z; + m_vals[3] = w; + } + + inline half_vec4(const half_vec4& other) + { + *this = other; + } + + inline half_vec4& operator= (const half_vec4& rhs) + { + m_vals[0] = rhs.m_vals[0]; + m_vals[1] = rhs.m_vals[1]; + m_vals[2] = rhs.m_vals[2]; + m_vals[3] = rhs.m_vals[3]; + return *this; + } + + inline void clear() + { + clear_obj(m_vals); + } + + inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w) + { + m_vals[0] = x; + m_vals[1] = y; + m_vals[2] = z; + m_vals[3] = w; + return *this; + } + + inline half_vec4& set(float x, float y, float z, float w) + { + m_vals[0] = basist::float_to_half(x); + m_vals[1] = basist::float_to_half(y); + m_vals[2] = basist::float_to_half(z); + m_vals[3] = basist::float_to_half(w); + return *this; + } + + template + inline half_vec4& set_vec(const T& vec) + { + 
m_vals[0] = basist::float_to_half(vec[0]); + m_vals[1] = basist::float_to_half(vec[1]); + m_vals[2] = basist::float_to_half(vec[2]); + m_vals[3] = basist::float_to_half(vec[3]); + return *this; + } + + template + inline T get_vec() const + { + return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3])); + } + + inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; } + inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; } + + float get_float_comp(uint32_t c) const + { + assert(c < 4); + return basist::half_to_float(m_vals[c]); + } + + half_vec4& set_float_comp(uint32_t c, float v) + { + assert(c < 4); + m_vals[c] = basist::float_to_half(v); + return *this; + } +}; + +const uint32_t MAX_BLOCK_W = 6, MAX_BLOCK_H = 6; + +struct trial_result +{ + astc_helpers::log_astc_block m_log_blk; + double m_err; + bool m_valid; +}; + +//---------------------------------------------------------- + +const uint32_t NUM_PART3_MAPPINGS = 6; +static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] = +{ + { 0, 1, 2 }, + { 1, 2, 0 }, + { 2, 0, 1 }, + { 0, 2, 1 }, + { 1, 0, 2 }, + { 2, 1, 0 } +}; + +struct partition_pattern_vec +{ + uint8_t m_parts[6 * 6]; + + partition_pattern_vec() + { + clear(); + } + + partition_pattern_vec(const partition_pattern_vec& other) + { + *this = other; + } + + void clear() + { + memset(m_parts, 0, sizeof(m_parts)); + } + + partition_pattern_vec& operator= (const partition_pattern_vec& rhs) + { + if (this == &rhs) + return *this; + memcpy(m_parts, rhs.m_parts, 36); + return *this; + } + + uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; } + uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; } + + uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; } + uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 
6) && (y < 6)); return m_parts[x + y * 6]; } + + int get_squared_distance(const partition_pattern_vec& other) const + { + int total_dist = 0; + for (uint32_t i = 0; i < 36; i++) + total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]); + return total_dist; + } + + float get_distance(const partition_pattern_vec& other) const + { + return sqrtf((float)get_squared_distance(other)); + } + + partition_pattern_vec get_permuted2(uint32_t permute_index) const + { + assert(permute_index <= 1); + + partition_pattern_vec res; + for (uint32_t i = 0; i < 36; i++) + { + assert(m_parts[i] <= 1); + res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index); + } + + return res; + } + + partition_pattern_vec get_permuted3(uint32_t permute_index) const + { + assert(permute_index <= 5); + + partition_pattern_vec res; + for (uint32_t i = 0; i < 36; i++) + { + assert(m_parts[i] <= 2); + res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]]; + } + + return res; + } + + partition_pattern_vec get_canonicalized() const + { + partition_pattern_vec res; + + int new_labels[3] = { -1, -1, -1 }; + uint32_t next_index = 0; + for (uint32_t i = 0; i < 36; i++) + { + uint32_t p = m_parts[i]; + if (new_labels[p] == -1) + new_labels[p] = next_index++; + + res.m_parts[i] = (uint8_t)new_labels[p]; + } + + return res; + } + + bool operator== (const partition_pattern_vec& rhs) const + { + return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0; + } + + operator size_t() const + { + return basisu::hash_hsieh(m_parts, sizeof(m_parts)); + } +}; + +struct vp_tree_node +{ + partition_pattern_vec m_vantage_point; + uint32_t m_point_index; + float m_dist; + + int m_inner_node, m_outer_node; +}; + +#define BRUTE_FORCE_PART_SEARCH (0) + +class vp_tree +{ +public: + vp_tree() + { + } + + void clear() + { + m_nodes.clear(); + } + + // This requires no redundant patterns, i.e. all must be unique. 
+ bool init(uint32_t n, const partition_pattern_vec* pUnique_pats) + { + clear(); + + uint_vec pat_indices(n); + for (uint32_t i = 0; i < n; i++) + pat_indices[i] = i; + + std::pair root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices); + + if (root_idx.first == -1) + return false; + + m_nodes.resize(1); + m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first]; + m_nodes[0].m_point_index = root_idx.first; + m_nodes[0].m_dist = root_idx.second; + m_nodes[0].m_inner_node = -1; + m_nodes[0].m_outer_node = -1; + + uint_vec inner_list, outer_list; + + inner_list.reserve(n / 2); + outer_list.reserve(n / 2); + + for (uint32_t pat_index = 0; pat_index < n; pat_index++) + { + if ((int)pat_index == root_idx.first) + continue; + + const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]); + + if (dist <= root_idx.second) + inner_list.push_back(pat_index); + else + outer_list.push_back(pat_index); + } + + if (inner_list.size()) + { + m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list); + if (m_nodes[0].m_inner_node < 0) + return false; + } + + if (outer_list.size()) + { + m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list); + if (m_nodes[0].m_outer_node < 0) + return false; + } + + return true; + } + + struct result + { + uint32_t m_pat_index; + uint32_t m_mapping_index; + float m_dist; + + bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; } + bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; } + }; + + class result_queue + { + enum { MaxSupportedSize = 256 + 1 }; + + public: + result_queue() : + m_cur_size(0) + { + } + + size_t get_size() const + { + return m_cur_size; + } + + bool empty() const + { + return !m_cur_size; + } + + typedef std::array result_array_type; + + const result_array_type& get_elements() const { return m_elements; } + result_array_type& get_elements() { return m_elements; } + + void clear() + { + m_cur_size = 0; + } + + void reserve(uint32_t n) 
+ { + BASISU_NOTE_UNUSED(n); + } + + const result& top() const + { + assert(m_cur_size); + return m_elements[1]; + } + + bool insert(const result& val, uint32_t max_size) + { + assert(max_size < MaxSupportedSize); + + if (m_cur_size >= MaxSupportedSize) + return false; + + m_elements[++m_cur_size] = val; + up_heap(m_cur_size); + + if (m_cur_size > max_size) + pop(); + + return true; + } + + bool pop() + { + if (m_cur_size == 0) + return false; + + m_elements[1] = m_elements[m_cur_size--]; + down_heap(1); + return true; + } + + float get_highest_dist() const + { + if (!m_cur_size) + return 0.0f; + + return top().m_dist; + } + + private: + result_array_type m_elements; + size_t m_cur_size; + + void up_heap(size_t index) + { + while ((index > 1) && (m_elements[index] > m_elements[index >> 1])) + { + std::swap(m_elements[index], m_elements[index >> 1]); + index >>= 1; + } + } + + void down_heap(size_t index) + { + for ( ; ; ) + { + size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1; + + if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest])) + largest = left_child; + + if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest])) + largest = right_child; + + if (largest == index) + break; + + std::swap(m_elements[index], m_elements[largest]); + index = largest; + } + } + }; + + void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results) + { + assert((num_subsets >= 2) && (num_subsets <= 3)); + + results.clear(); + + if (!m_nodes.size()) + return; + + uint32_t num_desired_pats; + partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS]; + + if (num_subsets == 2) + { + num_desired_pats = 2; + for (uint32_t i = 0; i < 2; i++) + desired_pats[i] = desired_pat.get_permuted2(i); + } + else + { + num_desired_pats = NUM_PART3_MAPPINGS; + for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++) + desired_pats[i] = desired_pat.get_permuted3(i); + } + 
+#if 0 + find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results); +#else + find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results); +#endif + } + +private: + basisu::vector m_nodes; + + void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) + { + float best_dist_to_vantage = BIG_FLOAT_VAL; + uint32_t best_mapping = 0; + for (uint32_t i = 0; i < num_desired_pats; i++) + { + float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point); + if (dist < best_dist_to_vantage) + { + best_dist_to_vantage = dist; + best_mapping = i; + } + } + + result r; + r.m_dist = best_dist_to_vantage; + r.m_mapping_index = best_mapping; + r.m_pat_index = m_nodes[node_index].m_point_index; + + results.insert(r, max_results); + + if (best_dist_to_vantage <= m_nodes[node_index].m_dist) + { + // inner first + if (m_nodes[node_index].m_inner_node >= 0) + find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results); + + if (m_nodes[node_index].m_outer_node >= 0) + { + if ( (results.get_size() < max_results) || + ((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) + ) + { + find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results); + } + } + } + else + { + // outer first + if (m_nodes[node_index].m_outer_node >= 0) + find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results); + + if (m_nodes[node_index].m_inner_node >= 0) + { + if ( (results.get_size() < max_results) || + ((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) + ) + { + find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results); + } + } + } + } + + void find_nearest_at_node_non_recursive(int 
init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) + { + uint_vec node_stack; + node_stack.reserve(16); + node_stack.push_back(init_node_index); + + do + { + const uint32_t node_index = node_stack.back(); + node_stack.pop_back(); + + float best_dist_to_vantage = BIG_FLOAT_VAL; + uint32_t best_mapping = 0; + for (uint32_t i = 0; i < num_desired_pats; i++) + { + float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point); + if (dist < best_dist_to_vantage) + { + best_dist_to_vantage = dist; + best_mapping = i; + } + } + + result r; + r.m_dist = best_dist_to_vantage; + r.m_mapping_index = best_mapping; + r.m_pat_index = m_nodes[node_index].m_point_index; + + results.insert(r, max_results); + + if (best_dist_to_vantage <= m_nodes[node_index].m_dist) + { + if (m_nodes[node_index].m_outer_node >= 0) + { + if ((results.get_size() < max_results) || + ((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) + ) + { + node_stack.push_back(m_nodes[node_index].m_outer_node); + } + } + + // inner first + if (m_nodes[node_index].m_inner_node >= 0) + { + node_stack.push_back(m_nodes[node_index].m_inner_node); + } + } + else + { + if (m_nodes[node_index].m_inner_node >= 0) + { + if ((results.get_size() < max_results) || + ((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) + ) + { + node_stack.push_back(m_nodes[node_index].m_inner_node); + } + } + + // outer first + if (m_nodes[node_index].m_outer_node >= 0) + { + node_stack.push_back(m_nodes[node_index].m_outer_node); + } + } + + } while (!node_stack.empty()); + } + + // returns the index of the new node, or -1 on error + int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices) + { + std::pair root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices); + + if (root_idx.first < 0) + return -1; + + 
m_nodes.resize(m_nodes.size() + 1); + const uint32_t new_node_index = m_nodes.size_u32() - 1; + + m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first]; + m_nodes[new_node_index].m_point_index = root_idx.first; + m_nodes[new_node_index].m_dist = root_idx.second; + m_nodes[new_node_index].m_inner_node = -1; + m_nodes[new_node_index].m_outer_node = -1; + + uint_vec inner_list, outer_list; + + inner_list.reserve(pat_indices.size_u32() / 2); + outer_list.reserve(pat_indices.size_u32() / 2); + + for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++) + { + const uint32_t pat_index = pat_indices[pat_indices_iter]; + + if ((int)pat_index == root_idx.first) + continue; + + const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]); + + if (dist <= root_idx.second) + inner_list.push_back(pat_index); + else + outer_list.push_back(pat_index); + } + + if (inner_list.size()) + m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list); + + if (outer_list.size()) + m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list); + + return new_node_index; + } + + // returns the pattern index of the vantage point (-1 on error), and the optimal split distance + std::pair find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices) + { + BASISU_NOTE_UNUSED(num_unique_pats); + + const uint32_t n = pat_indices.size_u32(); + + assert(n); + if (n == 1) + return std::pair(pat_indices[0], 0.0f); + + float best_split_metric = -1.0f; + int best_split_pat = -1; + float best_split_dist = 0.0f; + float best_split_var = 0.0f; + + basisu::vector< std::pair > dists; + dists.reserve(n); + + float_vec float_dists; + float_dists.reserve(n); + + for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++) + { + const uint32_t split_pat_index = pat_indices[pat_indices_iter]; + 
assert(split_pat_index < num_unique_pats); + + const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index]; + + dists.resize(0); + float_dists.resize(0); + + for (uint32_t j = 0; j < n; j++) + { + const uint32_t pat_index = pat_indices[j]; + assert(pat_index < num_unique_pats); + + if (pat_index == split_pat_index) + continue; + + float dist = trial_vantage.get_distance(pUnique_pats[pat_index]); + dists.emplace_back(std::pair(dist, pat_index)); + + float_dists.push_back(dist); + } + + stats s; + s.calc(float_dists.size_u32(), float_dists.data()); + + std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) { + return a.first < b.first; + }); + + const uint32_t num_dists = dists.size_u32(); + float split_dist = dists[num_dists / 2].first; + if ((num_dists & 1) == 0) + split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f; + + uint32_t total_inner = 0, total_outer = 0; + + for (uint32_t j = 0; j < n; j++) + { + const uint32_t pat_index = pat_indices[j]; + if (pat_index == split_pat_index) + continue; + + float dist = trial_vantage.get_distance(pUnique_pats[pat_index]); + + if (dist <= split_dist) + total_inner++; + else + total_outer++; + } + + float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer); + + if ( (split_metric > best_split_metric) || + ((split_metric == best_split_metric) && (s.m_var > best_split_var)) ) + { + best_split_metric = split_metric; + best_split_dist = split_dist; + best_split_pat = split_pat_index; + best_split_var = (float)s.m_var; + } + } + + return std::pair(best_split_pat, best_split_dist); + } +}; + +struct partition +{ + uint64_t m_p; + + inline partition() : + m_p(0) + { + } + + inline partition(uint64_t p) : + m_p(p) + { + assert(p < (1ULL << 36)); + } + + inline partition& operator=(uint64_t p) + { + assert(p < (1ULL << 36)); + m_p = p; + return *this; + } + + inline bool operator< (const partition& p) const + { + return m_p < p.m_p; + } + + 
inline bool operator== (const partition& p) const + { + return m_p == p.m_p; + } + + inline operator size_t() const + { + return hash_hsieh((const uint8_t *)&m_p, sizeof(m_p)); + } +}; + +partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2]; +int g_part2_seed_to_unique_index[1024]; +vp_tree g_part2_vp_tree; + +static inline vec3F vec3F_norm_approx(vec3F axis) +{ + float l = axis.norm(); + axis = (fabs(l) >= SMALL_FLOAT_VAL) ? (axis * bu_math::inv_sqrt(l)) : vec3F(0.577350269f); + return axis; +} + +static void init_partitions2_6x6() +{ +#if 0 + // makes pattern bits to the 10-bit ASTC seed index + typedef basisu::hash_map partition2_hash_map; + partition2_hash_map phash; + phash.reserve(1024); + + for (uint32_t i = 0; i < 1024; i++) + { + uint64_t p_bits = 0; + uint64_t p_bits_inv = 0; + + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false); + assert(p < 2); + + p_bits |= (p << (x + y * 6)); + p_bits_inv |= ((1 - p) << (x + y * 6)); + } + } + + if (!p_bits) + continue; + if (p_bits == ((1ULL << 36) - 1)) + continue; + + assert(p_bits < (1ULL << 36)); + assert(p_bits_inv < (1ULL << 36)); + + if (phash.contains(p_bits)) + { + } + else if (phash.contains(p_bits_inv)) + { + } + else + { + auto res = phash.insert(p_bits, i); + assert(res.second); + BASISU_NOTE_UNUSED(res); + } + } + + uint32_t num_unique_partitions2 = 0; + + for (const auto& r : phash) + { + assert(r.second < 1024); + + const uint32_t unique_index = num_unique_partitions2; + assert(unique_index < NUM_UNIQUE_PARTITIONS2); + + partition_pattern_vec pat_vec; + for (uint32_t i = 0; i < 36; i++) + pat_vec[i] = (uint8_t)((r.first >> i) & 1); + + g_partitions2[unique_index] = pat_vec; + + assert(g_part2_unique_index_to_seed[unique_index] == r.second); + g_part2_seed_to_unique_index[r.second] = unique_index; + + num_unique_partitions2++; + } + assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2); +#else 
+ for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++) + { + const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index]; + assert(seed_index < 1024); + + assert(g_part2_seed_to_unique_index[seed_index] == 0); + g_part2_seed_to_unique_index[seed_index] = unique_index; + + partition_pattern_vec& pat_vec = g_partitions2[unique_index]; + + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false); + assert(p < 2); + + pat_vec[x + y * 6] = p; + } + } + } +#endif + + g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2); +} + +static bool estimate_partition2_6x6( + const basist::half_float pBlock_pixels_half[][3], + int* pBest_parts, uint32_t num_best_parts) +{ + const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H; + + vec3F training_vecs[BLOCK_T], mean(0.0f); + + for (uint32_t i = 0; i < BLOCK_T; i++) + { + vec3F& v = training_vecs[i]; + + v[0] = (float)pBlock_pixels_half[i][0]; + v[1] = (float)pBlock_pixels_half[i][1]; + v[2] = (float)pBlock_pixels_half[i][2]; + + mean += v; + } + mean *= (1.0f / (float)BLOCK_T); + + vec3F max_vals(-BIG_FLOAT_VAL); + + for (uint32_t i = 0; i < BLOCK_T; i++) + { + vec3F& v = training_vecs[i]; + max_vals = vec3F::component_max(max_vals, v); + } + + // Initialize principle axis approximation + vec3F axis(max_vals - mean); + + // Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x). 
+ for (uint32_t i = 0; i < BLOCK_T; i++) + { + axis = vec3F_norm_approx(axis); + + vec3F color(training_vecs[i] - mean); + + float d = color.dot(axis); + + axis += color * d; + } + + if (axis.norm() < SMALL_FLOAT_VAL) + axis.set(0.57735027f); + else + axis.normalize_in_place(); + +#if BRUTE_FORCE_PART_SEARCH + int desired_parts[BLOCK_H][BLOCK_W]; // [y][x] + for (uint32_t i = 0; i < BLOCK_T; i++) + { + float proj = (training_vecs[i] - mean).dot(axis); + + desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f; + } +#else + partition_pattern_vec desired_part; + + for (uint32_t i = 0; i < BLOCK_T; i++) + { + float proj = (training_vecs[i] - mean).dot(axis); + + desired_part.m_parts[i] = proj < 0.0f; + } +#endif + + //interval_timer tm; + //tm.start(); + +#if BRUTE_FORCE_PART_SEARCH + uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2]; + + for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++) + { + const partition_pattern_vec &pat_vec = g_partitions2[part_index]; + + int total_sim_non_inv = 0; + int total_sim_inv = 0; + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + int part = pat_vec[x + y * 6]; + + if (part == desired_parts[y][x]) + total_sim_non_inv++; + + if ((part ^ 1) == desired_parts[y][x]) + total_sim_inv++; + } + } + + int total_sim = maximum(total_sim_non_inv, total_sim_inv); + + part_similarity[part_index] = (total_sim << 16) | part_index; + + } // part_index; + + std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2); + + for (uint32_t i = 0; i < num_best_parts; i++) + pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF; +#else + vp_tree::result_queue results; + results.reserve(num_best_parts); + g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts); + + assert(results.get_size() == num_best_parts); + + const auto& elements = results.get_elements(); + + for (uint32_t i = 0; i < results.get_size(); i++) + pBest_parts[i] = elements[1 + 
i].m_pat_index; +#endif + + //fmt_printf("{} ", tm.get_elapsed_ms()); + + return true; +} + +const uint32_t MIN_REFINE_LEVEL = 0; + +static bool encode_block_2_subsets( + trial_result res[2], + uint32_t grid_w, uint32_t grid_h, + uint32_t cem, + uint32_t weights_ise_range, uint32_t endpoints_ise_range, + const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16, + astc_hdr_codec_base_options& coptions, + bool uber_mode_flag, + int unique_pat_index, + uint32_t comp_level, + opt_mode_t mode11_opt_mode, + bool refine_endpoints_flag) +{ + const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS; + + res[0].m_valid = false; + res[1].m_valid = false; + + const uint32_t BLOCK_W = 6, BLOCK_H = 6; + + astc_helpers::log_astc_block best_log_blk; + clear_obj(best_log_blk); + + best_log_blk.m_num_partitions = 2; + best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem; + best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem; + best_log_blk.m_grid_width = (uint8_t)grid_w; + best_log_blk.m_grid_height = (uint8_t)grid_h; + + best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range; + best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range; + + partition_pattern_vec* pPat = &g_partitions2[unique_pat_index]; + const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index]; + + vec4F part_pixels_q16[2][64]; + half_vec3 part_half_pixels[2][64]; + uint8_t part_pixel_index[2][64]; + uint32_t part_total_pixels[2] = { 0 }; + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + uint32_t part_index = (*pPat)[x + y * BLOCK_W]; + + uint32_t l = part_total_pixels[part_index]; + + part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W]; + part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W]; + part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W); + + part_total_pixels[part_index] = l + 1; + } // x + } // y + + uint8_t 
blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS]; + uint8_t blk_weights[2][BLOCK_W * BLOCK_H]; + uint32_t best_submode[2]; + + for (uint32_t part_iter = 0; part_iter < 2; part_iter++) + { + assert(part_total_pixels[part_iter]); + + double e; + if (cem == 7) + { + e = encode_astc_hdr_block_mode_7( + part_total_pixels[part_iter], + (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], + best_log_blk.m_weight_ise_range, + best_submode[part_iter], + BIG_FLOAT_VAL, + blk_endpoints[part_iter], + blk_weights[part_iter], + coptions, + best_log_blk.m_endpoint_ise_range); + } + else + { + assert(cem == 11); + + e = encode_astc_hdr_block_mode_11( + part_total_pixels[part_iter], + (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], + best_log_blk.m_weight_ise_range, + best_submode[part_iter], + BIG_FLOAT_VAL, + blk_endpoints[part_iter], + blk_weights[part_iter], + coptions, + false, + best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false, + mode11_opt_mode); + } + + if (e == BIG_FLOAT_VAL) + return false; + + } // part_iter + + uint8_t ise_weights[BLOCK_W * BLOCK_H]; + + uint32_t src_pixel_index[2] = { 0, 0 }; + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + uint32_t part_index = (*pPat)[x + y * BLOCK_W]; + ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; + src_pixel_index[part_index]++; + } // x + } // y + + if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H)) + { + best_log_blk.m_partition_id = (uint16_t)p_seed; + + memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals); + memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals); + memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H); + + res[0].m_valid = true; + res[0].m_log_blk = best_log_blk; + } + else + { + uint8_t desired_weights[BLOCK_H * BLOCK_W]; + + const auto& dequant_tab = 
astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val; + + for (uint32_t by = 0; by < BLOCK_H; by++) + for (uint32_t bx = 0; bx < BLOCK_W; bx++) + desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]]; + + uint8_t downsampled_weights[BLOCK_H * BLOCK_W]; + + const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h); + if (!pDownsample_matrix) + { + assert(0); + return false; + } + + downsample_weight_grid( + pDownsample_matrix, + BLOCK_W, BLOCK_H, // source/from dimension (block size) + grid_w, grid_h, // dest/to dimension (grid size) + desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + downsampled_weights); // [wy][wx] + + best_log_blk.m_partition_id = (uint16_t)p_seed; + memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals); + memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals); + + const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise; + + for (uint32_t gy = 0; gy < grid_h; gy++) + for (uint32_t gx = 0; gx < grid_w; gx++) + best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]]; + + res[0].m_valid = true; + res[0].m_log_blk = best_log_blk; + + if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6))) + { + bool any_refined = false; + + for (uint32_t part_iter = 0; part_iter < 2; part_iter++) + { + bool refine_status = refine_endpoints( + cem, + endpoints_ise_range, + best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize + BLOCK_W, BLOCK_H, // block dimensions + grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid + part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], + &part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets + coptions, 
mode11_opt_mode); + + if (refine_status) + any_refined = true; + } + + if (any_refined) + { + res[1].m_valid = true; + res[1].m_log_blk = best_log_blk; + } + } + } + + return true; +} + +typedef basisu::hash_map > partition3_hash_map; + +partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3]; +int g_part3_seed_to_unique_index[1024]; +vp_tree g_part3_vp_tree; + +static void init_partitions3_6x6() +{ + uint32_t t = 0; + + for (uint32_t i = 0; i < 1024; i++) + g_part3_seed_to_unique_index[i] = -1; + + partition3_hash_map part3_hash; + part3_hash.reserve(512); + + for (uint32_t seed_index = 0; seed_index < 1024; seed_index++) + { + partition_pattern_vec p3; + uint32_t part_hist[3] = { 0 }; + + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false); + assert(p < 3); + + p3.m_parts[x + y * 6] = (uint8_t)p; + part_hist[p]++; + } + } + + if (!part_hist[0] || !part_hist[1] || !part_hist[2]) + continue; + + uint32_t j; + for (j = 0; j < NUM_PART3_MAPPINGS; j++) + { + partition_pattern_vec temp_part3(p3.get_permuted3(j)); + + if (part3_hash.contains(temp_part3)) + break; + } + if (j < NUM_PART3_MAPPINGS) + continue; + + part3_hash.insert(p3, std::make_pair(seed_index, t) ); + + assert(g_part3_unique_index_to_seed[t] == seed_index); + g_part3_seed_to_unique_index[seed_index] = t; + g_partitions3[t] = p3; + + t++; + } + + g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3); +} + +static bool estimate_partition3_6x6( + const basist::half_float pBlock_pixels_half[][3], + int* pBest_parts, uint32_t num_best_parts) +{ + const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3; + + assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3)); + + vec3F training_vecs[BLOCK_T], mean(0.0f); + + float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL; + vec3F cluster_centroids[NUM_SUBSETS]; + + for (uint32_t i = 0; i < BLOCK_T; 
i++) + { + vec3F& v = training_vecs[i]; + + v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]); + + float inten = v.dot(vec3F(1.0f)); + if (inten < darkest_inten) + { + darkest_inten = inten; + cluster_centroids[0] = v; + } + + if (inten > brightest_inten) + { + brightest_inten = inten; + cluster_centroids[1] = v; + } + } + + if (cluster_centroids[0] == cluster_centroids[1]) + return false; + + float furthest_dist2 = 0.0f; + for (uint32_t i = 0; i < BLOCK_T; i++) + { + vec3F& v = training_vecs[i]; + + float dist_a = v.squared_distance(cluster_centroids[0]); + if (dist_a == 0.0f) + continue; + + float dist_b = v.squared_distance(cluster_centroids[1]); + if (dist_b == 0.0f) + continue; + + float dist2 = dist_a + dist_b; + if (dist2 > furthest_dist2) + { + furthest_dist2 = dist2; + cluster_centroids[2] = v; + } + } + + if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2])) + return false; + + uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T]; + uint32_t num_cluster_pixels[NUM_SUBSETS]; + vec3F new_cluster_means[NUM_SUBSETS]; + + const uint32_t NUM_ITERS = 4; + + for (uint32_t s = 0; s < NUM_ITERS; s++) + { + memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels)); + memset(new_cluster_means, 0, sizeof(new_cluster_means)); + + for (uint32_t i = 0; i < BLOCK_T; i++) + { + float d[NUM_SUBSETS] = { + training_vecs[i].squared_distance(cluster_centroids[0]), + training_vecs[i].squared_distance(cluster_centroids[1]), + training_vecs[i].squared_distance(cluster_centroids[2]) }; + + float min_d = d[0]; + uint32_t min_idx = 0; + for (uint32_t j = 1; j < NUM_SUBSETS; j++) + { + if (d[j] < min_d) + { + min_d = d[j]; + min_idx = j; + } + } + + cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i; + new_cluster_means[min_idx] += training_vecs[i]; + num_cluster_pixels[min_idx]++; + } // i + + for (uint32_t j = 0; j < NUM_SUBSETS; j++) + { + if (!num_cluster_pixels[j]) + return 
false; + + cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j]; + } + } // s + + partition_pattern_vec desired_part; + for (uint32_t p = 0; p < NUM_SUBSETS; p++) + { + for (uint32_t i = 0; i < num_cluster_pixels[p]; i++) + { + const uint32_t pix_index = cluster_pixels[p][i]; + desired_part[pix_index] = (uint8_t)p; + } + } + +#if BRUTE_FORCE_PART_SEARCH + partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS]; + for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++) + desired_parts[j] = desired_part.get_permuted3(j); + + uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3]; + + for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++) + { + const partition_pattern_vec& pat = g_partitions3[part_index]; + + uint32_t lowest_pat_dist = UINT32_MAX; + for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++) + { + uint32_t dist = pat.get_squared_distance(desired_parts[p]); + if (dist < lowest_pat_dist) + lowest_pat_dist = dist; + } + + part_similarity[part_index] = (lowest_pat_dist << 16) | part_index; + + } // part_index; + + std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3); + + for (uint32_t i = 0; i < num_best_parts; i++) + pBest_parts[i] = part_similarity[i] & 0xFFFF; +#else + vp_tree::result_queue results; + results.reserve(num_best_parts); + g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts); + + assert(results.get_size() == num_best_parts); + + const auto& elements = results.get_elements(); + + for (uint32_t i = 0; i < results.get_size(); i++) + pBest_parts[i] = elements[1 + i].m_pat_index; +#endif + + return true; +} + +static bool encode_block_3_subsets( + trial_result& res, + uint32_t cem, + uint32_t grid_w, uint32_t grid_h, + uint32_t weights_ise_range, uint32_t endpoints_ise_range, + const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16, + astc_hdr_codec_base_options& coptions, + bool uber_mode_flag, + const int* pEst_patterns, int num_est_patterns, + uint32_t comp_level, + 
opt_mode_t mode11_opt_mode) +{ + BASISU_NOTE_UNUSED(uber_mode_flag); + const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3; + const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem); + + res.m_valid = false; + + double best_e = BIG_FLOAT_VAL; + + astc_helpers::log_astc_block best_log_blk; + clear_obj(best_log_blk); + + best_log_blk.m_num_partitions = NUM_SUBSETS; + best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem; + best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem; + best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem; + best_log_blk.m_grid_width = (uint8_t)grid_w; + best_log_blk.m_grid_height = (uint8_t)grid_h; + + best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range; + best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range; + + const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3; + + for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++) + { + const uint32_t unique_part_index = num_est_patterns ? 
pEst_patterns[unique_p_iter] : unique_p_iter; + assert(unique_part_index < NUM_UNIQUE_PARTITIONS3); + const partition_pattern_vec*pPart = &g_partitions3[unique_part_index]; + + vec4F part_pixels_q16[NUM_SUBSETS][64]; + half_vec3 part_half_pixels[NUM_SUBSETS][64]; + uint8_t part_pixel_index[NUM_SUBSETS][64]; + uint32_t part_total_pixels[NUM_SUBSETS] = { 0 }; + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W]; + + uint32_t l = part_total_pixels[part_index]; + + part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W]; + part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W]; + part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W); + + part_total_pixels[part_index] = l + 1; + } // x + } // y + + uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS]; + uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H]; + uint32_t best_submode[NUM_SUBSETS]; + + double e = 0.0f; + for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++) + { + assert(part_total_pixels[part_iter]); + + if (cem == 7) + { + e += encode_astc_hdr_block_mode_7( + part_total_pixels[part_iter], + (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], + best_log_blk.m_weight_ise_range, + best_submode[part_iter], + BIG_FLOAT_VAL, + blk_endpoints[part_iter], + blk_weights[part_iter], + coptions, + best_log_blk.m_endpoint_ise_range); + } + else + { + assert(cem == 11); + + e += encode_astc_hdr_block_mode_11( + part_total_pixels[part_iter], + (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], + best_log_blk.m_weight_ise_range, + best_submode[part_iter], + BIG_FLOAT_VAL, + blk_endpoints[part_iter], + blk_weights[part_iter], + coptions, + false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, + FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode); + } + + 
} // part_iter + + uint8_t ise_weights[BLOCK_W * BLOCK_H]; + + uint32_t src_pixel_index[NUM_SUBSETS] = { 0 }; + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W]; + + ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; + src_pixel_index[part_index]++; + } // x + } // y + + if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H)) + { + if (e < best_e) + { + best_e = e; + best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index]; + + for (uint32_t p = 0; p < NUM_SUBSETS; p++) + memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals); + + memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H); + } + } + else + { + uint8_t desired_weights[BLOCK_H * BLOCK_W]; + + const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val; + + for (uint32_t by = 0; by < BLOCK_H; by++) + for (uint32_t bx = 0; bx < BLOCK_W; bx++) + desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]]; + + uint8_t downsampled_weights[BLOCK_H * BLOCK_W]; + + const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h); + if (!pDownsample_matrix) + { + assert(0); + return false; + } + + downsample_weight_grid( + pDownsample_matrix, + BLOCK_W, BLOCK_H, // source/from dimension (block size) + grid_w, grid_h, // dest/to dimension (grid size) + desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + downsampled_weights); // [wy][wx] + + astc_helpers::log_astc_block trial_blk(best_log_blk); + + trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index]; + + for (uint32_t p = 0; p < NUM_SUBSETS; p++) + memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals); + + const auto& weight_to_ise = 
astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise; + + for (uint32_t gy = 0; gy < grid_h; gy++) + for (uint32_t gx = 0; gx < grid_w; gx++) + trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]]; + + if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6))) + { + for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++) + { + bool refine_status = refine_endpoints( + cem, + endpoints_ise_range, + trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize + BLOCK_W, BLOCK_H, // block dimensions + grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid + part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], + &part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets + coptions, mode11_opt_mode); + + BASISU_NOTE_UNUSED(refine_status); + } + } + + half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x] + bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16); + assert(status); + if (!status) + return false; + + half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W]; + for (uint32_t y = 0; y < BLOCK_H; y++) + for (uint32_t x = 0; x < BLOCK_W; x++) + decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]); + + double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions); + if (trial_err < best_e) + { + best_e = trial_err; + best_log_blk = trial_blk; + } + } + + } // unique_p_iter + + if (best_e < BIG_FLOAT_VAL) + { + res.m_log_blk = best_log_blk; + res.m_valid = true; + res.m_err = best_e; + } + else + { + res.m_valid = false; + } + + return res.m_valid; +} + +static uint32_t encode_values(bitwise_coder &coder, uint32_t 
total_values, const uint8_t *pVals, uint32_t endpoint_range) +{ + const uint32_t MAX_VALS = 64; + uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3]; + uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1; + + assert((total_values) && (total_values <= MAX_VALS)); + + const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0]; + const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1]; + const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2]; + + for (uint32_t i = 0; i < total_values; i++) + { + uint32_t val = pVals[i]; + + uint32_t bits = val & ((1 << ep_bits) - 1); + uint32_t tq = val >> ep_bits; + + bit_values[i] = bits; + + if (ep_trits) + { + assert(tq < 3); + tq_accum += tq * tq_mul; + tq_mul *= 3; + if (tq_mul == 243) + { + assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values)); + tq_values[total_tq_values++] = tq_accum; + tq_accum = 0; + tq_mul = 1; + } + } + else if (ep_quints) + { + assert(tq < 5); + tq_accum += tq * tq_mul; + tq_mul *= 5; + if (tq_mul == 125) + { + assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values)); + tq_values[total_tq_values++] = tq_accum; + tq_accum = 0; + tq_mul = 1; + } + } + } + + uint32_t total_bits_output = 0; + + for (uint32_t i = 0; i < total_tq_values; i++) + { + const uint32_t num_bits = ep_trits ? 
8 : 7; + coder.put_bits(tq_values[i], num_bits); + total_bits_output += num_bits; + } + + if (tq_mul > 1) + { + uint32_t num_bits; + if (ep_trits) + { + if (tq_mul == 3) + num_bits = 2; + else if (tq_mul == 9) + num_bits = 4; + else if (tq_mul == 27) + num_bits = 5; + else //if (tq_mul == 81) + num_bits = 7; + } + else + { + if (tq_mul == 5) + num_bits = 3; + else //if (tq_mul == 25) + num_bits = 5; + } + coder.put_bits(tq_accum, num_bits); + total_bits_output += num_bits; + } + + for (uint32_t i = 0; i < total_values; i++) + { + coder.put_bits(bit_values[i], ep_bits); + total_bits_output += ep_bits; + } + + return total_bits_output; +} + +static inline uint32_t get_num_endpoint_vals(uint32_t cem) +{ + assert((cem == 7) || (cem == 11)); + return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS; +} + +static void code_block(bitwise_coder& coder, + const astc_helpers::log_astc_block& log_blk, + block_mode block_mode_index, + endpoint_mode em, const uint8_t *pEP_deltas) +{ + coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes); + coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal); + + const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]); + + if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta)) + { + assert(log_blk.m_num_partitions == 1); + + for (uint32_t i = 0; i < num_endpoint_vals; i++) + coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS); + } + else if (em == endpoint_mode::cRaw) + { + if (log_blk.m_num_partitions == 2) + { + const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id]; + assert(unique_partition_index != -1); + + coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2); + } + else if (log_blk.m_num_partitions == 3) + { + const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id]; + assert(unique_partition_index != -1); 
+ + coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3); + } + + encode_values(coder, num_endpoint_vals * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range); + } + + encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1), log_blk.m_weights, log_blk.m_weight_ise_range); +} + +struct smooth_map_params +{ + bool m_no_mse_scaling; + + float m_max_smooth_std_dev; + float m_smooth_max_mse_scale; + + float m_max_med_smooth_std_dev; + float m_med_smooth_max_mse_scale; + + float m_max_ultra_smooth_std_dev; + float m_ultra_smooth_max_mse_scale; + + bool m_debug_images; + + smooth_map_params() + { + clear(); + } + + void clear() + { + m_no_mse_scaling = false; + + // 3x3 region + m_max_smooth_std_dev = 100.0f; + m_smooth_max_mse_scale = 13000.0f; + + // 7x7 region + m_max_med_smooth_std_dev = 9.0f; + m_med_smooth_max_mse_scale = 15000.0f; + + // 11x11 region + m_max_ultra_smooth_std_dev = 4.0f; + //m_ultra_smooth_max_mse_scale = 4500.0f; + //m_ultra_smooth_max_mse_scale = 10000.0f; + //m_ultra_smooth_max_mse_scale = 50000.0f; + //m_ultra_smooth_max_mse_scale = 100000.0f; + //m_ultra_smooth_max_mse_scale = 400000.0f; + //m_ultra_smooth_max_mse_scale = 800000.0f; + m_ultra_smooth_max_mse_scale = 2000000.0f; + + m_debug_images = true; + } +}; + +Resampler::Contrib_List* g_contrib_lists[7]; // 1-6 + +static void init_contrib_lists() +{ + for (uint32_t dst_width = 1; dst_width <= 6; dst_width++) + //g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f); + g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f); +} + +#if 0 +static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16) +{ + 
vec3F temp_block[6][6]; // [y][x] + + // first filter rows to temp_block + if (grid_x == 6) + { + memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6); + } + else + { + Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x]; + + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + vec3F p(0.0f); + + for (uint32_t i = 0; i < pRow_lists[x].n; i++) + p += pSrc_block[y * 6 + pRow_lists[x].p[i].pixel] * pRow_lists[x].p[i].weight; + + p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL); + + temp_block[y][x] = p; + } // x + } // y + } + + // filter columns + if (grid_y == 6) + { + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + for (uint32_t c = 0; c < 3; c++) + { + const basist::half_float h = basist::float_to_half(temp_block[y][x][c]); + + pDst_block_half3[x + y * 6][c] = h; + pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h); + } + + pDst_block_q16[x + y * 6][3] = 0.0f; + } // x + } // y + } + else + { + Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y]; + + for (uint32_t x = 0; x < 6; x++) + { + for (uint32_t y = 0; y < 6; y++) + { + vec3F p(0.0f); + + for (uint32_t i = 0; i < pCol_lists[y].n; i++) + p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight; + + p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL); + + for (uint32_t c = 0; c < 3; c++) + { + const basist::half_float h = basist::float_to_half(p[c]); + + pDst_block_half3[x + y * 6][c] = h; + pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h); + } + + pDst_block_q16[x + y * 6][3] = 0.0f; + + } // x + } // y + } +} +#endif + +static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block) +{ + vec4F temp_block[6][6]; // [y][x] + + // first filter rows to temp_block + if (grid_x == 6) + { + memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6); + } + else + { + Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x]; + + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; 
x++) + { + vec3F p(0.0f); + + for (uint32_t i = 0; i < pRow_lists[x].n; i++) + p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight; + + p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL); + + temp_block[y][x] = p; + } // x + } // y + } + + // filter columns + if (grid_y == 6) + { + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + for (uint32_t c = 0; c < 3; c++) + pDst_block[x + y * 6][c] = temp_block[y][x][c]; + } // x + } // y + } + else + { + Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y]; + + for (uint32_t x = 0; x < 6; x++) + { + for (uint32_t y = 0; y < 6; y++) + { + vec3F p(0.0f); + + for (uint32_t i = 0; i < pCol_lists[y].n; i++) + p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight; + + p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL); + + pDst_block[x + y * 6] = p; + + } // x + } // y + } +} + +static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block) +{ + vec3F temp_block[6][6]; // [y][x] + + // first filter rows to temp_block + if (grid_x == 6) + { + memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6); + } + else + { + Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x]; + + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + vec3F p(0.0f); + + for (uint32_t i = 0; i < pRow_lists[x].n; i++) + p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight; + + temp_block[y][x] = p; + } // x + } // y + } + + // filter columns + if (grid_y == 6) + { + memcpy(pDst_block, temp_block, sizeof(vec3F) * 6 * 6); + } + else + { + Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y]; + + for (uint32_t x = 0; x < 6; x++) + { + for (uint32_t y = 0; y < 6; y++) + { + vec3F& p = pDst_block[x + y * 6]; + p.set(0.0f); + + for (uint32_t i = 0; i < pCol_lists[y].n; i++) + p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight; + } // x + } // y + } +} + +static float 
diff_blocks(const vec4F* pA, const vec4F* pB)
+{
+	// Sums the squared differences of components 0..2 over 36 entries of pA/pB
+	// and returns that sum divided by 36. Component 3 is not read.
+	const uint32_t BLOCK_T = 36;
+
+	float diff = 0.0f;
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+		diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]);
+
+	return diff * (1.0f / (float)BLOCK_T);
+}
+
+// Computes, per component, the standard deviation (divide-by-N form) of the
+// element-wise difference pA[i] - pB[i] over 36 entries, and returns the
+// largest of the three per-component values.
+static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB)
+{
+	const uint32_t BLOCK_T = 36;
+
+	// Pass 1: mean of the differences.
+	vec3F mean(0.0f);
+
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		vec3F diff(pA[i] - pB[i]);
+		mean += diff;
+	}
+
+	mean *= (1.0f / (float)BLOCK_T);
+
+	// Pass 2: sum of squared deviations from that mean.
+	vec3F diff_sum(0.0f);
+	for (uint32_t i = 0; i < BLOCK_T; i++)
+	{
+		vec3F diff(pA[i] - pB[i]);
+		diff -= mean;
+		diff_sum += vec3F::component_mul(diff, diff);
+	}
+
+	vec3F var(diff_sum * (1.0f / (float)BLOCK_T));
+
+	vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), sqrtf(var[2]));
+
+	return maximum(std_dev[0], std_dev[1], std_dev[2]);
+}
+
+// Fills smooth_block_mse_scales with one scale value per pixel of orig_img.
+// The map is always resized to the image dimensions. When
+// params.m_no_mse_scaling is set every entry is 1.0f and nothing else is done
+// (no debug images are written and *pUltra_smooth_img is left untouched).
+// Otherwise each entry is derived from local per-channel standard deviations
+// of the first three color components.
+static void create_smooth_maps2(
+	vector2D& smooth_block_mse_scales,
+	const image& orig_img,
+	smooth_map_params& params, image* pUltra_smooth_img = nullptr)
+{
+	const uint32_t width = orig_img.get_width();
+	const uint32_t height = orig_img.get_height();
+	//const uint32_t total_pixels = orig_img.get_total_pixels();
+	const uint32_t num_comps = 3;
+
+	// Size the output before the early-out below, so the "no scaling" path also
+	// hands back a width x height map instead of whatever size the caller passed in.
+	smooth_block_mse_scales.resize(width, height);
+
+	if (params.m_no_mse_scaling)
+	{
+		smooth_block_mse_scales.set_all(1.0f);
+		return;
+	}
+
+	image smooth_vis, med_smooth_vis, ultra_smooth_vis;
+
+	if (params.m_debug_images)
+	{
+		smooth_vis.resize(width, height);
+		med_smooth_vis.resize(width, height);
+		ultra_smooth_vis.resize(width, height);
+	}
+
+	for (uint32_t y = 0; y < height; y++)
+	{
+		for (uint32_t x = 0; x < width; x++)
+		{
+			{
+				// Gather per-channel statistics over the clamped 3x3 neighborhood of (x, y).
+				tracked_stat_dbl comp_stats[4];
+				for (int yd = -1; yd <= 1; yd++)
+				{
+					for (int xd = -1; xd <= 1; xd++)
+					{
+						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);
+
+						comp_stats[0].update((float)p[0]);
+						comp_stats[1].update((float)p[1]);
+						comp_stats[2].update((float)p[2]);
+
} + } + + float max_std_dev = 0.0f; + for (uint32_t i = 0; i < num_comps; i++) + max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev()); + + float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f); + //yl = powf(yl, 2.0f); + yl = powf(yl, 1.0f / 2.0f); // substantially less bits + + smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl); + + if (params.m_debug_images) + { + //smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255)); + // white=high local activity (edges/detail) + // black=low local activity (smooth - error is amplified) + smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255)); + } + } + + { + tracked_stat_dbl comp_stats[4]; + + const int S = 3; + for (int yd = -S; yd < S; yd++) + { + for (int xd = -S; xd < S; xd++) + { + const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd); + + comp_stats[0].update((float)p[0]); + comp_stats[1].update((float)p[1]); + comp_stats[2].update((float)p[2]); + } + } + + float max_std_dev = 0.0f; + for (uint32_t i = 0; i < num_comps; i++) + max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev()); + + float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f); + //yl = powf(yl, 2.0f); + + smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl); + + if (params.m_debug_images) + med_smooth_vis(x, y).set((int)std::round(yl * 255.0f)); + } + + { + tracked_stat_dbl comp_stats[4]; + + const int S = 5; + for (int yd = -S; yd < S; yd++) + { + for (int xd = -S; xd < S; xd++) + { + const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd); + + comp_stats[0].update((float)p[0]); + comp_stats[1].update((float)p[1]); + comp_stats[2].update((float)p[2]); + } + } + + float max_std_dev = 0.0f; + for (uint32_t i = 0; i < num_comps; i++) + max_std_dev = 
basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev()); + + float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f); + yl = powf(yl, 2.0f); + + smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl); + + if (params.m_debug_images) + ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f)); + } + + } + } + + if (params.m_debug_images) + { + save_png("dbg_smooth_vis.png", smooth_vis); + save_png("dbg_med_smooth_vis.png", med_smooth_vis); + save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis); + + image vis_img(width, height); + + float max_scale = 0.0f; + for (uint32_t y = 0; y < height; y++) + for (uint32_t x = 0; x < width; x++) + max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y)); + + for (uint32_t y = 0; y < height; y++) + for (uint32_t x = 0; x < width; x++) + vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale)); + + save_png("scale_vis.png", vis_img); + } + + if (pUltra_smooth_img) + *pUltra_smooth_img = ultra_smooth_vis; +} + +const float REALLY_DARK_I_THRESHOLD = 0.0625f; +const float REALLY_DARK_MSE_ERR_SCALE = 128.0f; +const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f; + +static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment) +{ + float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0]; + float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1]; + float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2]; + + float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p); + + if (delta_itp_dark_adjustment) + { + // We have to process a large range of inputs, including extremely dark inputs. + // Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas. + // This is to better handle very dark signals which could be explictly overexposed. 
+ float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]); + s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s); + err *= s; + } + + return err; +} + +static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment) +{ + float total_mse = 0.0f; + + for (uint32_t y = 0; y < block_h; y++) + { + for (uint32_t x = 0; x < block_w; x++) + { + total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment); + } // x + } // y + + return total_mse * (1.0f / (float)(block_w * block_h)); +} + +static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp) +{ + const uint32_t n = block_w * block_h; + assert(n <= 36); + + stats x_stats[3], y_stats[3]; + comparative_stats xy_cov[3]; + + for (uint32_t c = 0; c < 3; c++) + { + x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3); + y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3); + } + + for (uint32_t c = 0; c < 3; c++) + xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &x_stats[c], &y_stats[c]); + + float ssim[3]; + const double d = 1.0f, k1 = .01f, k2 = .03f; + + // weight mean error more highly to reduce blocking + float ap = 1.5f, bp = 1.0f, cp = 1.0f; + + const double s_c1 = square(k1 * d), s_c2 = square(k2 * d); + const double s_c3(s_c2 * .5f); + + for (uint32_t c = 0; c < 3; c++) + { + float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1)); + lum = saturate(lum); + + float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2)); + con = saturate(con); + + float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3)); + str = saturate(str); + + 
ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp); + } + +#if 0 + float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f); +#elif 1 + float final_ssim = ssim[0] * ssim[1] * ssim[2]; +#else + const float LP = .75f; + float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP); +#endif + + return final_ssim; +} + +// delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light +static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment) +{ + float delta_i = a[0] - b[0]; + float delta_t = a[1] - b[1]; + float delta_p = a[2] - b[2]; + + float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p)); + + float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]); + + if (delta_itp_dark_adjustment) + { + // This is to better handle very dark signals which could be explictly overexposed. + s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s); + err *= s; + } + + return err; +} + +struct candidate_encoding +{ + encoding_type m_encoding_type; + + basist::half_float m_solid_color[3]; + + uint32_t m_run_len; + + vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x] + vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x] + + endpoint_mode m_endpoint_mode; + block_mode m_block_mode; + + bitwise_coder m_coder; + + // The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC. + // Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type. + astc_helpers::log_astc_block m_coded_log_blk; + + // The block the decoder outputs. 
+	astc_helpers::log_astc_block m_decomp_log_blk;
+
+	int m_reuse_delta_index;
+
+	float m_t, m_d, m_bits;
+
+	// Default construction yields a fully cleared candidate (see clear()).
+	candidate_encoding()
+	{
+		clear();
+	}
+
+	// The copy/move constructors forward to the assignment operators without
+	// calling clear() first, so those operators must assign EVERY data member;
+	// any member they skip is left indeterminate in the new object.
+	candidate_encoding(const candidate_encoding &other)
+	{
+		*this = other;
+	}
+
+	candidate_encoding(candidate_encoding&& other)
+	{
+		*this = std::move(other);
+	}
+
+	// Copy assignment: member-wise copy of all fields, including the ITP pixel
+	// cache (m_comp_pixels_itp) and the m_t/m_d/m_bits values.
+	candidate_encoding& operator=(const candidate_encoding& rhs)
+	{
+		if (this == &rhs)
+			return *this;
+
+		m_encoding_type = rhs.m_encoding_type;
+		memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
+		m_run_len = rhs.m_run_len;
+		memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
+		memcpy(m_comp_pixels_itp, rhs.m_comp_pixels_itp, sizeof(m_comp_pixels_itp));
+		m_endpoint_mode = rhs.m_endpoint_mode;
+		m_block_mode = rhs.m_block_mode;
+		m_coder = rhs.m_coder;
+		m_coded_log_blk = rhs.m_coded_log_blk;
+		m_decomp_log_blk = rhs.m_decomp_log_blk;
+		m_reuse_delta_index = rhs.m_reuse_delta_index;
+		m_t = rhs.m_t;
+		m_d = rhs.m_d;
+		m_bits = rhs.m_bits;
+
+		return *this;
+	}
+
+	// Move assignment: identical to copy assignment except that m_coder is
+	// moved out of rhs; every other member is copied.
+	candidate_encoding& operator=(candidate_encoding&& rhs)
+	{
+		if (this == &rhs)
+			return *this;
+
+		m_encoding_type = rhs.m_encoding_type;
+		memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
+		m_run_len = rhs.m_run_len;
+		memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
+		memcpy(m_comp_pixels_itp, rhs.m_comp_pixels_itp, sizeof(m_comp_pixels_itp));
+		m_endpoint_mode = rhs.m_endpoint_mode;
+		m_block_mode = rhs.m_block_mode;
+		m_coder = std::move(rhs.m_coder);
+		m_coded_log_blk = rhs.m_coded_log_blk;
+		m_decomp_log_blk = rhs.m_decomp_log_blk;
+		m_reuse_delta_index = rhs.m_reuse_delta_index;
+		m_t = rhs.m_t;
+		m_d = rhs.m_d;
+		m_bits = rhs.m_bits;
+
+		return *this;
+	}
+
+	// Resets every member to its "invalid"/zero state and restarts the coder.
+	void clear()
+	{
+		m_encoding_type = encoding_type::cInvalid;
+
+		clear_obj(m_solid_color);
+
+		m_run_len = 0;
+
+		clear_obj(m_comp_pixels);
+		clear_obj(m_comp_pixels_itp);
+
+		m_endpoint_mode = endpoint_mode::cInvalid;
+		m_block_mode = block_mode::cInvalid;
+
+		m_coder.restart();
+
+		m_coded_log_blk.clear();
+		m_decomp_log_blk.clear();
+
+		m_t = 0;
+		m_d = 0;
+		m_bits = 0;
+
+		m_reuse_delta_index = 0;
+	}
+};
+
+bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels)
+{
+	assert((block_w <= 6) && (block_h 
<= 6)); + + half_vec4 decoded_pixels_half4[6 * 6]; // [y][x] + bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16); + assert(status); + + if (!status) + return false; + + for (uint32_t y = 0; y < block_h; y++) + { + for (uint32_t x = 0; x < block_w; x++) + { + pPixels[x + y * block_w].set( + basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]), + basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]), + basist::half_to_float(decoded_pixels_half4[x + y * block_w][2])); + } // x + } //y + + return true; +} + +static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk) +{ + astc_helpers::astc_block phys_blk; + return astc_helpers::pack_astc_block(phys_blk, decomp_blk); +} + +#define SYNC_MARKERS (0) + +static bool decode_file(const uint8_vec& comp_data, vector2D& decoded_blocks, uint32_t &width, uint32_t &height) +{ + interval_timer tm; + tm.start(); + + const uint32_t BLOCK_W = 6, BLOCK_H = 6; + + width = 0; + height = 0; + + if (comp_data.size() <= 2*3) + return false; + + basist::bitwise_decoder decoder; + if (!decoder.init(comp_data.data(), comp_data.size_u32())) + return false; + + if (decoder.get_bits(16) != 0xABCD) + return false; + + width = decoder.get_bits(16); + height = decoder.get_bits(16); + + if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM)) + return false; + + const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W; + const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H; + const uint32_t total_blocks = num_blocks_x * num_blocks_y; + + decoded_blocks.resize(num_blocks_x, num_blocks_y); + //memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes()); + + vector2D decoded_log_blocks(num_blocks_x, num_blocks_y); + //memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes()); + + uint32_t cur_bx = 0, cur_by = 0; + uint32_t step_counter = 0; + 
BASISU_NOTE_UNUSED(step_counter); + + while (cur_by < num_blocks_y) + { + step_counter++; + + //if ((cur_bx == 9) && (cur_by == 13)) + // printf("!"); + +#if SYNC_MARKERS + uint32_t mk = decoder.get_bits(16); + if (mk != 0xDEAD) + { + printf("!"); + assert(0); + return false; + } +#endif + if (decoder.get_bits_remaining() < 1) + return false; + + encoding_type et = encoding_type::cBlock; + + uint32_t b0 = decoder.get_bits(1); + if (!b0) + { + uint32_t b1 = decoder.get_bits(1); + if (b1) + et = encoding_type::cReuse; + else + { + uint32_t b2 = decoder.get_bits(1); + if (b2) + et = encoding_type::cSolid; + else + et = encoding_type::cRun; + } + } + + switch (et) + { + case encoding_type::cRun: + { + if (!cur_bx && !cur_by) + return false; + + const uint32_t run_len = decoder.decode_vlc(5) + 1; + + uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x); + if (run_len > num_blocks_remaining) + return false; + + uint32_t prev_bx = cur_bx, prev_by = cur_by; + + if (cur_bx) + prev_bx--; + else + { + prev_bx = num_blocks_x - 1; + prev_by--; + } + + const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by); + const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by); + + for (uint32_t i = 0; i < run_len; i++) + { + decoded_log_blocks(cur_bx, cur_by) = prev_log_blk; + decoded_blocks(cur_bx, cur_by) = prev_phys_blk; + + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + } + } + + break; + } + case encoding_type::cSolid: + { + const basist::half_float rh = (basist::half_float)decoder.get_bits(15); + const basist::half_float gh = (basist::half_float)decoder.get_bits(15); + const basist::half_float bh = (basist::half_float)decoder.get_bits(15); + + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); + + log_blk.clear(); + log_blk.m_solid_color_flag_hdr = true; + log_blk.m_solid_color[0] = rh; + log_blk.m_solid_color[1] = gh; + log_blk.m_solid_color[2] = bh; 
+ log_blk.m_solid_color[3] = basist::float_to_half(1.0f); + + bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk); + if (!status) + return false; + + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + } + + break; + } + case encoding_type::cReuse: + { + if (!cur_bx && !cur_by) + return false; + + const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS); + + const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x; + const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y; + + const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y; + if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x)) + return false; + if (prev_by < 0) + return false; + + const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by); + const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by); + + if (prev_log_blk.m_solid_color_flag_hdr) + return false; + + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); + astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); + + log_blk = prev_log_blk; + + const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 
2 : 1); + + bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights); + if (!status) + return false; + + astc_helpers::log_astc_block decomp_blk; + status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H); + if (!status) + return false; + + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range); + + copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk); + + status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); + if (!status) + return false; + + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + } + + break; + } + case encoding_type::cBlock: + { + const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes); + const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal); + + switch (em) + { + case endpoint_mode::cUseLeft: + case endpoint_mode::cUseUpper: + { + int neighbor_bx = cur_bx, neighbor_by = cur_by; + + if (em == endpoint_mode::cUseLeft) + neighbor_bx--; + else + neighbor_by--; + + if ((neighbor_bx < 0) || (neighbor_by < 0)) + return false; + + const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by); + if (!neighbor_blk.m_color_endpoint_modes[0]) + return false; + + const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; + const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); + + if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0]) + return false; + + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); + astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); + + log_blk.clear(); + log_blk.m_num_partitions = 1; + 
log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; + log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range; + log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; + log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; + log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; + log_blk.m_dual_plane = (uint8_t)bmd.m_dp; + log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + + memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values); + + const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1); + + bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); + if (!status) + return false; + + astc_helpers::log_astc_block decomp_blk; + decomp_blk.clear(); + + decomp_blk.m_num_partitions = 1; + decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; + decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; + decomp_blk.m_dual_plane = bmd.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + + basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); + + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); + + copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk); + + status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); + if (!status) + return false; + + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + } + + break; + } + case endpoint_mode::cUseLeftDelta: + case endpoint_mode::cUseUpperDelta: + { + int neighbor_bx = cur_bx, neighbor_by = cur_by; + + if (em == endpoint_mode::cUseLeftDelta) + 
neighbor_bx--; + else + neighbor_by--; + + if ((neighbor_bx < 0) || (neighbor_by < 0)) + return false; + + const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by); + if (!neighbor_blk.m_color_endpoint_modes[0]) + return false; + + const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; + const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); + + if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0]) + return false; + + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); + astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); + + log_blk.clear(); + log_blk.m_num_partitions = 1; + log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; + log_blk.m_dual_plane = bmd.m_dp; + log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + + log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range; + basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints); + + const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS; + const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1; + + const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank; + const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE; + const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range); + + for (uint32_t i = 0; i < num_endpoint_values; i++) + { + int cur_val = ise_to_rank[log_blk.m_endpoints[i]]; + + int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit; + + cur_val += delta; + if ((cur_val < 0) || (cur_val >= total_endpoint_levels)) + return false; + + log_blk.m_endpoints[i] = rank_to_ise[cur_val]; + } + + log_blk.m_weight_ise_range = 
(uint8_t)bmd.m_weight_ise_range; + log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; + log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; + + const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1); + + bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); + if (!status) + return false; + + astc_helpers::log_astc_block decomp_blk; + decomp_blk.clear(); + + decomp_blk.m_num_partitions = 1; + decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; + decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; + decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + + basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); + + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); + + copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk); + + status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); + if (!status) + return false; + + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + } + + break; + } + case endpoint_mode::cRaw: + { + const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; + + const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); + + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); + astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); + + log_blk.clear(); + log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; + + for (uint32_t p = 0; p < bmd.m_num_partitions; p++) + log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; + + 
log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range; + log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; + + log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; + log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; + log_blk.m_dual_plane = (uint8_t)bmd.m_dp; + log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + + if (bmd.m_num_partitions == 2) + { + const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2); + log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index]; + } + else if (bmd.m_num_partitions == 3) + { + const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3); + log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index]; + } + + bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints); + if (!status) + return false; + + const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 
2 : 1); + + status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); + if (!status) + return false; + + astc_helpers::log_astc_block decomp_blk; + decomp_blk.clear(); + + decomp_blk.m_dual_plane = bmd.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + decomp_blk.m_partition_id = log_blk.m_partition_id; + + decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; + + for (uint32_t p = 0; p < bmd.m_num_partitions; p++) + decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; + + decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; + + for (uint32_t p = 0; p < bmd.m_num_partitions; p++) + basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p); + + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); + + copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk); + + status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); + if (!status) + return false; + + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + } + + break; + } + default: + { + assert(0); + return false; + } + } + + break; + } + default: + { + assert(0); + return false; + } + } + } + + if (decoder.get_bits(16) != 0xA742) + { + fmt_error_printf("End marker not found!\n"); + return false; + } + + //fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs()); + + return true; +} + +static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels) +{ + astc_helpers::log_astc_block log_blk; + if 
(!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height)) + return false; + + basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4]; + if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16)) + return false; + + const uint32_t total_block_pixels = block_width * block_height; + for (uint32_t p = 0; p < total_block_pixels; p++) + { + pPixels[p][0] = basist::half_to_float(half_block[p][0]); + pPixels[p][1] = basist::half_to_float(half_block[p][1]); + pPixels[p][2] = basist::half_to_float(half_block[p][2]); + pPixels[p][3] = basist::half_to_float(half_block[p][3]); + } + + return true; +} + +static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels) +{ + return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height); +} + +static bool pack_bc6h_image(const imagef &src_img, vector2D &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params) +{ + const uint32_t width = src_img.get_width(); + const uint32_t height = src_img.get_height(); + + if (pPacked_bc6h_img) + pPacked_bc6h_img->resize(width, height); + + interval_timer tm; + double total_enc_time = 0.0f; + + const uint32_t num_blocks_x = src_img.get_block_width(4); + const uint32_t num_blocks_y = src_img.get_block_height(4); + + bc6h_blocks.resize(num_blocks_x, num_blocks_y); + + for (uint32_t by = 0; by < num_blocks_y; by++) + { + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + // Extract source image block + vec4F block_pixels[4][4]; // [y][x] + src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4); + + basist::half_float half_pixels[16 * 3]; // [y][x] + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + for (uint32_t c = 0; c < 3; c++) + { + float v = block_pixels[y][x][c]; + + basist::half_float h = basist::float_to_half(v); + + half_pixels[(x + y * 4) * 3 + c] 
= h; + + } // c + + } // x + } // y + + basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by); + + tm.start(); + + basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params); + + total_enc_time += tm.get_elapsed_secs(); + + if (pPacked_bc6h_img) + { + basist::half_float unpacked_blk[16 * 3]; + bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false); + assert(status); + if (!status) + { + fmt_error_printf("unpack_bc6h() failed\n"); + return false; + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + vec4F p; + + for (uint32_t c = 0; c < 3; c++) + { + float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]); + p[c] = v; + + } // c + + p[3] = 1.0f; + + pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p); + } // x + } // y + } + + } // bx + } // by + + //fmt_printf("Total BC6H encode time: {}\n", total_enc_time); + + return true; +} + +static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir) +{ + vec3F q(p - line_org); + vec3F v(q - q.dot(line_dir) * line_dir); + return v.dot(v); +} + +static void estimate_partitions_mode7_and_11( + uint32_t num_parts, // 2 or 3 partitions + uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns + uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine + const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats + const astc_hdr_codec_base_options& coptions, // options + uint32_t num_desired_pats, + int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices +{ + BASISU_NOTE_UNUSED(coptions); + BASISU_NOTE_UNUSED(num_unique_pats); + + const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 6 + assert(num_parts <= MAX_PARTS); + + struct candidate_res + { + float m_total_sq_dist; + uint32_t m_index; + bool operator< (const candidate_res& rhs) const { return 
m_total_sq_dist < rhs.m_total_sq_dist; } + }; + + const uint32_t MAX_CANDIDATES = 1024; + assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES)); + + candidate_res mode11_candidates[MAX_CANDIDATES]; + candidate_res mode7_candidates[MAX_CANDIDATES]; + + const vec3F grayscale_axis(0.5773502691f); + + for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++) + { + const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter]; + assert(unique_part_index < num_unique_pats); + + const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index]; + + vec3F part_means[MAX_PARTS]; + uint32_t part_total_texels[MAX_PARTS] = { 0 }; + + for (uint32_t i = 0; i < num_parts; i++) + part_means[i].clear(); + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = (*pPat)(x, y); + assert(part_index < num_parts); + + part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W]; + part_total_texels[part_index]++; + + } // x + } // y + + for (uint32_t i = 0; i < num_parts; i++) + { + assert(part_total_texels[i]); + part_means[i] /= (float)part_total_texels[i]; + } + + float part_cov[MAX_PARTS][6]; + memset(part_cov, 0, sizeof(part_cov)); + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = (*pPat)(x, y); + assert(part_index < num_parts); + + const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]); + + const float r = p[0], g = p[1], b = p[2]; + + part_cov[part_index][0] += r * r; + part_cov[part_index][1] += r * g; + part_cov[part_index][2] += r * b; + part_cov[part_index][3] += g * g; + part_cov[part_index][4] += g * b; + part_cov[part_index][5] += b * b; + + } // x + } // y + + // For each partition compute the total variance of all channels. 
+ float total_variance[MAX_PARTS]; + for (uint32_t part_index = 0; part_index < num_parts; part_index++) + total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5]; + + vec3F part_axis[MAX_PARTS]; + float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis + float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis + + for (uint32_t part_index = 0; part_index < num_parts; part_index++) + { + float* pCov = &part_cov[part_index][0]; + + float xr = .9f, xg = 1.0f, xb = .7f; + + const uint32_t NUM_POWER_ITERS = 4; + for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++) + { + float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2]; + float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4]; + float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5]; + + float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b)); + + if (m >= 1e-10f) + { + m = 1.0f / m; + + r *= m; + g *= m; + b *= m; + } + + xr = r; + xg = g; + xb = b; + } + + float len_sq = xr * xr + xg * xg + xb * xb; + + if (len_sq < 1e-10f) + { + xr = grayscale_axis[0]; + xg = grayscale_axis[0]; + xb = grayscale_axis[0]; + } + else + { + len_sq = 1.0f / sqrtf(len_sq); + + xr *= len_sq; + xg *= len_sq; + xb *= len_sq; + } + + { + // Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis). + float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2]; + float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4]; + float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5]; + + // Estimate the principle eigenvalue by computing the magnitude of the transformed vector. + // The result is the variance along the principle axis. 
+ //float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis + //float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb + + mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb; + } + + { + const float yrgb = grayscale_axis[0]; + + // Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector). + float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2]; + float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4]; + float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5]; + + mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb; + } + + } // part_index + + // Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis. + // TODO: Could also compute the ratio of the principle axis's variance vs. the total variance. + float mode11_total_sq_dist_to_line_alt = 0.0f; + for (uint32_t part_index = 0; part_index < num_parts; part_index++) + { + float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]); + mode11_total_sq_dist_to_line_alt += d; + } + + { +#if 0 + // TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix), + // then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances. 
+ float total_sq_dist_to_line = 0.0f; + for (uint32_t i = 0; i < BLOCK_T; i++) + { + const uint32_t part_index = (*pPat)[i]; + assert(part_index < num_parts); + + total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]); + } + + mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line; +#else + mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt; +#endif + mode11_candidates[examine_iter].m_index = unique_part_index; + } + + { + float mode7_total_sq_dist_to_line_alt = 0.0f; + for (uint32_t part_index = 0; part_index < num_parts; part_index++) + { + float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]); + mode7_total_sq_dist_to_line_alt += d; + } + + mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt; + mode7_candidates[examine_iter].m_index = unique_part_index; + } + + } // examine_iter + + std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]); + std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]); + + for (uint32_t i = 0; i < num_desired_pats; i++) + pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index; + + for (uint32_t i = 0; i < num_desired_pats; i++) + pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index; +} + +static void estimate_partitions_mode7( + uint32_t num_parts, // 2 or 3 partitions + uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns + uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine + const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats + const astc_hdr_codec_base_options& coptions, // options + uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices +{ + BASISU_NOTE_UNUSED(coptions); + BASISU_NOTE_UNUSED(num_unique_pats); + + const uint32_t BLOCK_W = 6, 
BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3; + assert(num_parts <= MAX_PARTS); + + struct candidate_res + { + float m_total_sq_dist; + uint32_t m_index; + bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; } + }; + + const uint32_t MAX_CANDIDATES = 1024; + assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES)); + + candidate_res candidates[MAX_CANDIDATES]; + + for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++) + { + const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter]; + assert(unique_part_index < num_unique_pats); + + const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index]; + + vec3F part_means[MAX_PARTS]; + uint32_t part_total_texels[MAX_PARTS] = { 0 }; + + for (uint32_t i = 0; i < num_parts; i++) + part_means[i].clear(); + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = (*pPat)(x, y); + assert(part_index < num_parts); + + part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W]; + part_total_texels[part_index]++; + + } // x + } // y + + for (uint32_t i = 0; i < num_parts; i++) + { + assert(part_total_texels[i]); + part_means[i] /= (float)part_total_texels[i]; + } + + vec3F part_axis(0.5773502691f); + + // TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix), + // then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances. 
+ float total_sq_dist_to_line = 0.0f; + for (uint32_t i = 0; i < BLOCK_T; i++) + { + const uint32_t part_index = (*pPat)[i]; + assert(part_index < num_parts); + + total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis); + } + + candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line; + + candidates[examine_iter].m_index = unique_part_index; + + } // examine_iter + + std::sort(&candidates[0], &candidates[num_pats_to_examine]); + + for (uint32_t i = 0; i < num_desired_pats; i++) + pDesired_pat_indices[i] = candidates[i].m_index; +} + +static float calc_deblocking_penalty_itp( + uint32_t bx, uint32_t by, uint32_t width, uint32_t height, + const imagef& pass_src_img_itp, const candidate_encoding& candidate) +{ + float total_deblock_penalty = 0.0f; + + float total_orig_mse = 0.0f, total_comp_mse = 0.0f; + uint32_t total_c = 0; + + for (uint32_t b = 0; b < 4; b++) + { + for (uint32_t i = 0; i < 6; i++) + { + int ox = 0, oy = 0, qx = 0, qy = 0; + + switch (b) + { + case 0: + ox = bx * 6 + i; oy = (by - 1) * 6 + 5; + qx = bx * 6 + i; qy = by * 6; + break; + case 1: + ox = bx * 6 + i; oy = (by + 1) * 6; + qx = bx * 6 + i; qy = by * 6 + 5; + break; + case 2: + ox = (bx - 1) * 6 + 5; oy = by * 6 + i; + qx = bx * 6; qy = by * 6 + i; + break; + case 3: + ox = (bx + 1) * 6; oy = by * 6 + i; + qx = bx * 6 + 5; qy = by * 6 + i; + break; + } + + if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height)) + continue; + + const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy); + const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy); + + const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block + + vec3F orig_delta_v(o_pixel_itp - q_pixel_itp); + total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + square(orig_delta_v[2]); + + vec3F d_delta_v(o_pixel_itp - d_pixel_itp); + total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]); + 
+ total_c++; + } + } + + if (total_c) + { + total_orig_mse /= (float)total_c; + total_comp_mse /= (float)total_c; + + if (total_orig_mse) + { + total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse); + } + } + + return total_deblock_penalty; +} + +static bool calc_strip_size( + float lambda, + uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip, + uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg) +{ + uint32_t total_strips = 1; + + if (lambda == 0.0f) + { + if (!force_one_strip) + { + total_strips = total_threads; + } + } + else + { + const uint32_t MIN_DESIRED_STRIPS = 8; + const uint32_t MAX_TARGET_STRIPS = 32; + const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12; + + if (!force_one_strip) + { + total_strips = maximum(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP); + + if (num_blocks_y >= MIN_DESIRED_STRIPS * 2) + total_strips = maximum(total_strips, MIN_DESIRED_STRIPS); + } + + total_strips = minimum(total_strips, MAX_TARGET_STRIPS); + } + + uint32_t rows_per_strip = 0; + if (total_strips <= 1) + { + rows_per_strip = num_blocks_y; + } + else + { + rows_per_strip = (num_blocks_y / total_strips) & ~1; + + if (rows_per_strip < 2) + rows_per_strip = 2;// num_blocks_y; + } + + assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0)); + + total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip; + + if (global_cfg.m_debug_output) + { + fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips); + fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip); + fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip); + } + + uint32_t total_rows = 0; + for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++) + { + uint32_t strip_first_by = strip_index * rows_per_strip; + uint32_t strip_last_by = minimum(strip_first_by + 
rows_per_strip - 1, num_blocks_y); + + if (strip_index == (total_strips - 1)) + strip_last_by = num_blocks_y - 1; + + uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1; + total_rows += num_strip_block_rows; + + if (global_cfg.m_debug_output) + fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows); + } + + if (total_rows != num_blocks_y) + { + fmt_error_printf("Strip calc failed\n"); + return false; + } + + res_total_strips = total_strips; + res_rows_per_strip = rows_per_strip; + + return true; +} + +static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg) +{ + const uint32_t width = src_img.get_width(), height = src_img.get_height(); + + dst_img.resize(width, height); + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + vec3F src_rgb(src_img(x, y)); + + vec3F src_itp; + linear_rgb_to_itp(src_rgb, src_itp, cfg); + + dst_img(x, y) = src_itp; + } + } +} + +const uint32_t BLOCK_W = 6, BLOCK_H = 6; +const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H; + +const float SOLID_PENALTY = 4.0f; +const float REUSE_PENALTY = 1.0f; +const float RUN_PENALTY = 10.0f; + +const float MSE_WEIGHT = 300000.0f; +const float SSIM_WEIGHT = 200.0f; +const float TWO_LEVEL_PENALTY = 1.425f; +const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = .04f; +const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = .04f; +const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f; +const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f; +const float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f; + +struct uastc_hdr_6x6_debug_state +{ + uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 }; + uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 }; + uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 }; + uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 }; + + 
basisu::vector< basisu::stats > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3]; + basisu::vector< basisu::comparative_stats > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3]; + + std::atomic m_total_gaussian1_blocks; + std::atomic m_total_gaussian2_blocks; + std::atomic m_total_filter_horizontal; + std::atomic m_detail_stats[5]; + std::atomic m_total_mode7_skips; + + std::atomic m_total_blocks_compressed; + + std::atomic m_total_candidates_considered; + std::atomic m_max_candidates_considered; + + std::atomic m_total_part2_stats[4]; + std::atomic m_dp_stats[5]; + + std::atomic m_reuse_num_parts[4]; + std::atomic m_reuse_total_dp; + + imagef m_stat_vis; + std::mutex m_stat_vis_mutex; + + image m_part_vis; + image m_mode_vis; + image m_mode_vis2; + image m_grid_vis; + image m_enc_vis; + std::mutex m_vis_image_mutex; + + std::atomic m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1]; + + std::atomic m_total_jnd_replacements; + + std::mutex m_stats_mutex; + + uastc_hdr_6x6_debug_state() + { + for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++) + { + for (uint32_t j = 0; j < 3; j++) + { + m_block_mode_comp_stats[i][j].reserve(512); + m_block_mode_comparative_stats[i][j].reserve(512); + } + } + } + + void init(uint32_t width, uint32_t height) + { + m_stat_vis.resize(width, height); + m_part_vis.resize(width, height); + m_mode_vis.resize(width, height); + m_mode_vis2.resize(width, height); + m_grid_vis.resize(width, height); + m_enc_vis.resize(width, height); + + basisu::clear_obj(m_encoding_type_hist); + basisu::clear_obj(m_endpoint_mode_hist); + basisu::clear_obj(m_block_mode_hist); + basisu::clear_obj(m_block_mode_total_bits); + + for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++) + { + for (uint32_t j = 0; j < 3; j++) + { + m_block_mode_comp_stats[i][j].clear(); + m_block_mode_comparative_stats[i][j].clear(); + } + } + + m_total_gaussian1_blocks.store(0); + m_total_gaussian2_blocks.store(0); + 
m_total_filter_horizontal.store(0); + for (uint32_t i = 0; i < std::size(m_detail_stats); i++) + m_detail_stats[i].store(0); + m_total_mode7_skips.store(0); + + for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++) + m_comp_level_hist[i].store(0); + + m_total_blocks_compressed.store(0); + + m_total_candidates_considered.store(0); + m_max_candidates_considered.store(0); + + for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++) + m_total_part2_stats[i].store(0); + + for (uint32_t i = 0; i < std::size(m_dp_stats); i++) + m_dp_stats[i].store(0); + + for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++) + m_reuse_num_parts[i] .store(0); + + m_reuse_total_dp.store(0); + + m_total_jnd_replacements.store(0); + } + + void print(uint32_t total_blocks) const + { + fmt_printf("Total blocks: {}\n", total_blocks); + fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks); + fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]); + fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks); + fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks); + fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks); + fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]); + fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips); + + fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / 
(float)total_blocks); + fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered); + + fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]); + fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]); + fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp); + fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]); + + fmt_printf("\nEncoding type histogram:\n"); + for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++) + fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]); + + fmt_printf("\nEndpoint mode histogram:\n"); + for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++) + fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]); + + fmt_printf("\nBlock mode histogram:\n"); + + uint32_t total_dp = 0, total_sp = 0; + uint32_t total_mode11 = 0, total_mode7 = 0; + uint32_t part_hist[3] = { 0 }; + uint32_t part2_mode7_total = 0, part2_mode11_total = 0; + uint32_t total_used_modes = 0; + for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++) + { + const auto& bm_desc = g_block_mode_descs[i]; + + const uint32_t total_uses = m_block_mode_hist[i]; + + if (bm_desc.m_dp) + total_dp += total_uses; + else + total_sp += total_uses; + + if (bm_desc.m_cem == 7) + total_mode7 += total_uses; + else + total_mode11 += total_uses; + + part_hist[bm_desc.m_num_partitions - 1] += total_uses; + + if (bm_desc.m_num_partitions == 2) + { + if (bm_desc.m_cem == 7) + part2_mode7_total += total_uses; + else + { + assert(bm_desc.m_cem == 11); + part2_mode11_total += total_uses; + } + } + + float avg_std_dev = 0.0f; + float avg_cross_correlations[3] = { 0 }; + + if (m_block_mode_comp_stats[i][0].size()) + { + const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32(); + + for (uint32_t j = 0; j < num_uses; j++) + avg_std_dev += 
(float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev); + avg_std_dev /= (float)num_uses; + + for (uint32_t j = 0; j < num_uses; j++) + { + avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson); + avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson); + avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson); + } + + avg_cross_correlations[0] /= (float)num_uses; + avg_cross_correlations[1] /= (float)num_uses; + avg_cross_correlations[2] /= (float)num_uses; + } + + fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses, + bm_desc.m_cem, + bm_desc.m_dp, bm_desc.m_dp_channel, + bm_desc.m_num_partitions, + bm_desc.m_grid_x, bm_desc.m_grid_y, + astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range), + astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range), + total_uses ? 
((double)m_block_mode_total_bits[i] / total_uses) : 0.0f, + avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]); + + if (total_uses) + total_used_modes++; + } + + fmt_printf("Total used modes: {}\n", total_used_modes); + + fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp); + fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7); + fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]); + fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total); + } +}; + +struct uastc_hdr_6x6_encode_state +{ + astc_hdr_codec_base_options master_coptions; + + imagef src_img; + + imagef src_img_filtered1; + imagef src_img_filtered2; + + imagef src_img_itp; + imagef src_img_filtered1_itp; + imagef src_img_filtered2_itp; + + vector2D smooth_block_mse_scales; + + imagef packed_img; + + basisu::vector strip_bits; + + basisu::vector2D final_astc_blocks; + + vector2D coded_blocks; +}; + +static bool compress_strip_task( + uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height, + astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state) +{ + BASISU_NOTE_UNUSED(num_blocks_y); + BASISU_NOTE_UNUSED(total_strips); + + vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x] + basisu::clear_obj(prev_comp_pixels); + + uint32_t prev_run_len = 0; + + bitwise_coder prev_encoding; + candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension + candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written + + bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index]; + + const uint32_t CANDIDATES_TO_RESERVE = 1536; + + basisu::vector candidates; + 
candidates.reserve(CANDIDATES_TO_RESERVE); + + for (uint32_t by = strip_first_by; by <= strip_last_by; by++) + { + const bool has_upper_neighbor = by > strip_first_by; + + for (uint32_t bx = 0; bx < num_blocks_x; bx++) + { + //if ((bx == 1) && (by == 2)) + // basisu::fmt_printf("!"); + + for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++) + { + const bool has_left_neighbor = bx > 0; + //const bool has_prev = has_left_neighbor || has_upper_neighbor; + + // Select either the original source image, or the Gaussian filtered version. + // From here the encoder *must* use these 2 sources. + const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 : + ((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img); + + const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp : + ((outer_pass == 1) ? enc_state.src_img_filtered1_itp : enc_state.src_img_itp); + + // Extract source image block + vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x] + pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); + + vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x] + pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); + + half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values + vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats + vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding + vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations + + bool is_grayscale = true; + + candidates.resize(0); + + float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f; + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + vec3F rgb_input; + + for (uint32_t c = 0; c < 3; c++) + { + float v = block_pixels[y][x][c]; + + rgb_input[c] 
= v; + + const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v); + assert(h == basist::float_to_half(v)); + + half_pixels[y][x][c] = h; + + block_pixels_q16[y][x][c] = (float)half_to_qlog16(h); + + half_pixels_as_floats[y][x][c] = (float)h; + + } // c + + float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B)); + if (py < block_ly) + block_ly = py; + if (py > block_hy) + block_hy = py; + block_avg_y += py; + + //linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]); + + block_pixels_as_itp[y][x] = block_pixels_itp[y][x]; + + block_pixels_q16[y][x][3] = 0.0f; + + if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2])) + is_grayscale = false; + + } // x + } // y + + block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS); + + encode_astc_block_stats enc_block_stats; + enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]); + + vec4F x_filtered[6][6], y_filtered[6][6]; + + filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal) + filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically) + + const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered); + const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered); + const bool filter_horizontally = filtered_x_err < filtered_y_err; + + //const float block_mag_gradient_mag = block_max_gradient_mag(bx, by); + + if (filter_horizontally) + debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed); + + vec3F lowpass_filtered[6][6]; + filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]); + float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]); + + const bool very_detailed_block = lowpass_std_dev > 350.0f; + const bool very_blurry_block = lowpass_std_dev < 30.0f; + const bool super_blurry_block = lowpass_std_dev < 15.0f; + + basisu::stats 
half_comp_stats[3]; + for (uint32_t c = 0; c < 3; c++) + half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3); + + const float SINGLE_PART_HALF_THRESH = 256.0f; + const float COMPLEX_HALF_THRESH = 1024.0f; + // HACK HACK + const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f; + + const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev); + + const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH); + const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH); + const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH); + + // Dynamically choose a comp_level for this block. + astc_hdr_codec_base_options coptions(enc_state.master_coptions); + uint32_t comp_level = global_cfg.m_master_comp_level; + + if (very_complex_block) + comp_level = global_cfg.m_highest_comp_level; + else if (complex_block) + comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2; + + debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed); + + bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false; + BASISU_NOTE_UNUSED(any_2subset_mode11_enabled); + + for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++) + { + if (comp_level == 0) + { + if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0) + continue; + } + else if (comp_level == 1) + { + if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0) + continue; + } + else if (comp_level == 2) + { + if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0) + continue; + } + + if (g_block_mode_descs[i].m_num_partitions == 2) + { + any_2subset_enabled = true; + + if (g_block_mode_descs[i].m_cem == 7) + { + any_2subset_mode7_enabled = true; + } + else + { + assert(g_block_mode_descs[i].m_cem == 11); + any_2subset_mode11_enabled = true; + } + } 
+ else if (g_block_mode_descs[i].m_num_partitions == 3) + any_3subset_enabled = true; + } + + coptions.m_mode7_full_s_optimization = (comp_level >= 2); + + const bool uber_mode_flag = (comp_level >= 3); + coptions.m_allow_uber_mode = uber_mode_flag; + + coptions.m_ultra_quant = (comp_level >= 4); + + coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2); + coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2); + + coptions.m_disable_weight_plane_optimization = (comp_level >= 2); + + // ------------------- + + uint32_t total_used_block_chans = 0; + for (uint32_t i = 0; i < 3; i++) + total_used_block_chans += (half_comp_stats[i].m_range > 0.0f); + + const bool is_solid_block = (total_used_block_chans == 0); + + basisu::comparative_stats half_cross_chan_stats[3]; + + // R vs. G + half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS, + &half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1], + 3, 3, + &half_comp_stats[0], &half_comp_stats[1]); + + // R vs. B + half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS, + &half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2], + 3, 3, + &half_comp_stats[0], &half_comp_stats[2]); + + // G vs. 
B + half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS, + &half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2], + 3, 3, + &half_comp_stats[1], &half_comp_stats[2]); + + const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson); + const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson); + const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson); + + float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL; + for (uint32_t i = 0; i < 3; i++) + { + if (half_comp_stats[i].m_range > 0.0f) + { + const float c = fabsf((float)half_cross_chan_stats[i].m_pearson); + min_corr = minimum(min_corr, c); + max_corr = maximum(max_corr, c); + } + } + + bool use_single_subset_mode7 = true; + if (comp_level <= 1) + { + // TODO: could also compute angle between principle axis and the grayscale axis. + // TODO: Transform grayscale axis by covar matrix, compute variance vs. total variance + const float MODE7_MIN_CHAN_CORR = .5f; + const float MODE7_PCA_ANGLE_THRESH = .9f; + use_single_subset_mode7 = is_grayscale || is_solid_block || (min_corr >= MODE7_MIN_CHAN_CORR); + + if (use_single_subset_mode7) + { + float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f))); + if (cos_ang < MODE7_PCA_ANGLE_THRESH) + use_single_subset_mode7 = false; + } + } + + const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? 
.75f : .9f); + + int desired_dp_chan = -1; + if (total_used_block_chans <= 1) + { + // no need for dual plane (except possibly 2x2 weight grids for RDO) + } + else + { + if (min_corr >= STRONG_CORR_THRESH) + { + // all channel pairs strongly correlated, no need for dual plane + debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed); + } + else + { + if (total_used_block_chans == 2) + { + if (half_comp_stats[0].m_range == 0.0f) + { + // r unused, check for strong gb correlation + if (gb_corr < STRONG_CORR_THRESH) + desired_dp_chan = 1; + } + else if (half_comp_stats[1].m_range == 0.0f) + { + // g unused, check for strong rb correlation + if (rb_corr < STRONG_CORR_THRESH) + desired_dp_chan = 0; + } + else + { + // b unused, check for strong rg correlation + if (rg_corr < STRONG_CORR_THRESH) + desired_dp_chan = 0; + } + } + else + { + assert(total_used_block_chans == 3); + + // see if rg/rb is weakly correlated vs. gb + if ((rg_corr < gb_corr) && (rb_corr < gb_corr)) + desired_dp_chan = 0; + // see if gr/gb is weakly correlated vs. rb + else if ((rg_corr < rb_corr) && (gb_corr < rb_corr)) + desired_dp_chan = 1; + // assume b is weakest + else + desired_dp_chan = 2; + } + + if (desired_dp_chan == -1) + debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed); + else + debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed); + } + } + + // 2x2 is special for RDO at higher lambdas - always pick a preferred channel. + int desired_dp_chan_2x2 = 0; + if (total_used_block_chans == 2) + { + if (half_comp_stats[0].m_range == 0.0f) + desired_dp_chan_2x2 = 1; + } + else if (total_used_block_chans == 3) + { + // see if rg/rb is weakly correlated vs. gb + if ((rg_corr < gb_corr) && (rb_corr < gb_corr)) + desired_dp_chan_2x2 = 0; + // see if gr/gb is weakly correlated vs. 
rb + else if ((rg_corr < rb_corr) && (gb_corr < rb_corr)) + desired_dp_chan_2x2 = 1; + // assume b is weakest + else + desired_dp_chan_2x2 = 2; + } + + // Gather all candidate encodings + bool status = false; + + // ---- Run candidate + if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor)) + { + candidate_encoding candidate; + candidate.m_coder.reserve(24); + + candidate.m_encoding_type = encoding_type::cRun; + + candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk; + candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk; + + memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels)); + + if (!prev_run_len) + { + candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN); + candidate.m_coder.put_vlc(0, 5); + } + else + { + // extend current run - compute the # of new bits needed for the extension. + + uint32_t prev_run_bits = prev_encoding.get_total_bits_u32(); + assert(prev_run_bits > 0); + + // We're not actually going to code this, because the previously emitted run code will be extended. 
+ bitwise_coder temp_coder; + temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN); + temp_coder.put_vlc((prev_run_len + 1) - 1, 5); + + uint32_t cur_run_bits = temp_coder.get_total_bits_u32(); + assert(cur_run_bits >= prev_run_bits); + + uint32_t total_new_bits = cur_run_bits - prev_run_bits; + if (total_new_bits > 0) + candidate.m_coder.put_bits(0, total_new_bits); // dummy bits + } + + candidate.m_run_len = prev_run_len + 1; + + candidates.emplace_back(std::move(candidate)); + } + + // ---- Reuse candidate + if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f)) + { + for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++) + { + const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x; + const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y; + + const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y; + if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x)) + continue; + if (reuse_by < (int)strip_first_by) + break; + + const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by); + + // TODO - support this. 
+ if (prev_candidate.m_encoding_type == encoding_type::cSolid) + continue; + assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse)); + + candidate_encoding candidate; + candidate.m_coder.reserve(24); + astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; + astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk; + + const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk; + + const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height; + const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane; + const uint32_t num_grid_samples = grid_x * grid_y; + const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]); + + coded_log_blk = prev_candidate.m_coded_log_blk; + decomp_log_blk = prev_candidate.m_decomp_log_blk; + + if (prev_coded_log_blk.m_num_partitions == 1) + { + // Now encode the block using the transcoded endpoints + basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; + + if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7) + { + status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr, + astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); + } + else + { + status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr, + astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); + } + assert(status); + + uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H]; + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + + if (dual_plane) + { + eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector, + BLOCK_W * BLOCK_H, trial_weights0, 
trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); + + downsample_ise_weights_dual_plane( + coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + trial_weights0, trial_weights1, coded_log_blk.m_weights); + + basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); + } + else + { + eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); + + downsample_ise_weights( + coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + trial_weights0, coded_log_blk.m_weights); + + basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); + } + + // Create the block the decoder would transcode into. 
+ copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk); + } + else if (prev_coded_log_blk.m_num_partitions == 2) + { + assert(!dual_plane); + + const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id]; + assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2)); + + const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index]; + + vec4F part_pixels_q16[2][64]; + half_vec3 part_half_pixels[2][64]; + uint32_t part_total_pixels[2] = { 0 }; + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = pat_vec[x + y * 6]; + + uint32_t l = part_total_pixels[part_index]; + + part_pixels_q16[part_index][l] = block_pixels_q16[y][x]; + part_half_pixels[part_index][l] = half_pixels[y][x]; + + part_total_pixels[part_index] = l + 1; + } // x + } // y + + uint8_t blk_weights[2][BLOCK_W * BLOCK_H]; + + for (uint32_t part_index = 0; part_index < 2; part_index++) + { + basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; + + if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7) + { + status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, + astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); + } + else + { + status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, + astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); + } + assert(status); + + eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range, + (basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, 
UINT32_MAX); + + } // part_index + + uint8_t ise_weights[BLOCK_W * BLOCK_H]; + + uint32_t src_pixel_index[2] = { 0, 0 }; + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = pat_vec[x + y * 6]; + + ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; + src_pixel_index[part_index]++; + } // x + } // y + + downsample_ise_weights( + coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + ise_weights, coded_log_blk.m_weights); + + // Transcode these codable weights to ASTC weights. + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H]; + basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); + + // Create the block the decoder would transcode into. + copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk); + } + else if (prev_coded_log_blk.m_num_partitions == 3) + { + assert(!dual_plane); + + const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id]; + assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3)); + + const partition_pattern_vec& pat = g_partitions3[unique_pat_index]; + + vec4F part_pixels_q16[3][64]; + half_vec3 part_half_pixels[3][64]; + uint32_t part_total_pixels[3] = { 0 }; + + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = pat.m_parts[x + y * BLOCK_W]; + + uint32_t l = part_total_pixels[part_index]; + + part_pixels_q16[part_index][l] = block_pixels_q16[y][x]; + part_half_pixels[part_index][l] = half_pixels[y][x]; + + part_total_pixels[part_index] = l + 1; + } // x + } // y + + uint8_t blk_weights[3][BLOCK_W * BLOCK_H]; + + for (uint32_t part_index = 0; part_index < 3; part_index++) + { + basist::half_float 
decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; + + status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, + astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); + assert(status); + + eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range, + (basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); + + } // part_index + + uint8_t ise_weights[BLOCK_W * BLOCK_H]; + + uint32_t src_pixel_index[3] = { 0 }; + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + const uint32_t part_index = pat.m_parts[x + y * BLOCK_W]; + + ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; + src_pixel_index[part_index]++; + } // x + } // y + + downsample_ise_weights( + coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + ise_weights, coded_log_blk.m_weights); + + // Transcode these codable weights to ASTC weights. + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H]; + basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); + + // Create the block the decoder would transcode into. 
+ copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk); + } + + if (!validate_log_blk(decomp_log_blk)) + { + fmt_error_printf("pack_astc_block() failed\n"); + return false; + } + + status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]); + if (!status) + { + fmt_error_printf("decode_astc_block() failed\n"); + return false; + } + + candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN); + candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS); + encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range); + + candidate.m_encoding_type = encoding_type::cReuse; + candidate.m_block_mode = prev_candidate.m_block_mode; + candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode; + candidate.m_reuse_delta_index = reuse_delta_index; + + candidates.emplace_back(std::move(candidate)); + + } // reuse_delta_index + } + + // ---- Solid candidate + if (global_cfg.m_use_solid_blocks) + { + candidate_encoding candidate; + candidate.m_coder.reserve(24); + + // solid + candidate.m_encoding_type = encoding_type::cSolid; + + float r = 0.0f, g = 0.0f, b = 0.0f; + const float LOG_BIAS = .125f; + bool solid_block = true; + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + if ((block_pixels[0][0][0] != block_pixels[y][x][0]) || + (block_pixels[0][0][1] != block_pixels[y][x][1]) || + (block_pixels[0][0][2] != block_pixels[y][x][2])) + { + solid_block = false; + } + + r += log2f(block_pixels[y][x][0] + LOG_BIAS); + g += log2f(block_pixels[y][x][1] + LOG_BIAS); + b += log2f(block_pixels[y][x][2] + LOG_BIAS); + } + } + + if (solid_block) + { + r = block_pixels[0][0][0]; + g = block_pixels[0][0][1]; + b = block_pixels[0][0][2]; + } + else + { + r = maximum(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); + g = maximum(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); + b 
= maximum(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); + + r = minimum(r, basist::MAX_HALF_FLOAT); + g = minimum(g, basist::MAX_HALF_FLOAT); + b = minimum(b, basist::MAX_HALF_FLOAT); + } + + basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b); + + candidate.m_solid_color[0] = rh; + candidate.m_solid_color[1] = gh; + candidate.m_solid_color[2] = bh; + + candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN); + + candidate.m_coder.put_bits(rh, 15); + candidate.m_coder.put_bits(gh, 15); + candidate.m_coder.put_bits(bh, 15); + + vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh)); + + for (uint32_t y = 0; y < BLOCK_H; y++) + for (uint32_t x = 0; x < BLOCK_W; x++) + candidate.m_comp_pixels[y][x] = cp; + + astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk; + + log_blk.clear(); + log_blk.m_solid_color_flag_hdr = true; + log_blk.m_solid_color[0] = rh; + log_blk.m_solid_color[1] = gh; + log_blk.m_solid_color[2] = bh; + log_blk.m_solid_color[3] = basist::float_to_half(1.0f); + + candidate.m_decomp_log_blk = log_blk; + + candidates.emplace_back(std::move(candidate)); + } + + if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks)) + { + static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 }; + static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 }; + + static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 }; + static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 }; + + static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 }; + static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 }; + + uint32_t total_parts2 = 0, total_parts3 = 0; + + assert(comp_level < 5); + if ((very_simple_block) && (comp_level <= 3)) + { + // Block's std dev is so low that 2-3 subsets are unlikely to help much + total_parts2 = 0; + total_parts3 = 0; + + debug_state.m_total_part2_stats[0].fetch_add(1, 
std::memory_order_relaxed); + } + else if (very_complex_block) + { + total_parts2 = s_parts2_very_complex[comp_level]; + total_parts3 = s_parts3_very_complex[comp_level]; + + if (global_cfg.m_extra_patterns_flag) + { + total_parts2 += (comp_level == 4) ? 30 : 20; + total_parts3 += (comp_level == 4) ? 30 : 20; + } + + debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed); + } + else if (complex_block) + { + total_parts2 = s_parts2_complex[comp_level]; + total_parts3 = s_parts3_complex[comp_level]; + + if (global_cfg.m_extra_patterns_flag) + { + total_parts2 += (comp_level == 4) ? 15 : 10; + total_parts3 += (comp_level == 4) ? 15 : 10; + } + + debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed); + } + else + { + // moderate complexity - use defaults + total_parts2 = s_parts2_normal[comp_level]; + total_parts3 = s_parts3_normal[comp_level]; + + if (global_cfg.m_extra_patterns_flag) + { + total_parts2 += 5; + total_parts3 += 5; + } + + debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed); + } + + if (!any_2subset_enabled) + total_parts2 = 0; + + if (!any_3subset_enabled) + total_parts3 = 0; + + int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2]; + bool has_estimated_parts2 = false; + + if (total_parts2) + { + if (global_cfg.m_brute_force_partition_matching) + { + int candidate_pats2[NUM_UNIQUE_PARTITIONS2]; + for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++) + candidate_pats2[i] = i; + + if (any_2subset_enabled) + { + estimate_partitions_mode7_and_11( + 2, + NUM_UNIQUE_PARTITIONS2, g_partitions2, + NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2, + &half_pixels_as_floats[0][0], + coptions, + total_parts2, best_parts2_mode11, best_parts2_mode7); + } + + has_estimated_parts2 = true; + } + else + { + if (comp_level >= 1) + { + const uint32_t MAX_CANDIDATES2 = 48; + int candidate_pats2[MAX_CANDIDATES2 * 2]; + + uint32_t num_candidate_pats2 = maximum((total_parts2 
* 3) / 2, very_complex_block ? MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2)); + num_candidate_pats2 = minimum(num_candidate_pats2, (uint32_t)std::size(candidate_pats2)); + + has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2); + + if (has_estimated_parts2) + { + estimate_partitions_mode7_and_11( + 2, + NUM_UNIQUE_PARTITIONS2, g_partitions2, + num_candidate_pats2, (uint32_t*)candidate_pats2, + &half_pixels_as_floats[0][0], + coptions, + total_parts2, best_parts2_mode11, best_parts2_mode7); + } + } + else + { + has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2); + + if ((has_estimated_parts2) && (any_2subset_mode7_enabled)) + memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0])); + } + } + } + + int best_parts3[NUM_UNIQUE_PARTITIONS3]; + bool has_estimated_parts3 = false; + + if (total_parts3) + { +#if 0 + has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3); +#elif 1 + if (global_cfg.m_brute_force_partition_matching) + { + int candidate_pats3[NUM_UNIQUE_PARTITIONS3]; + for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++) + candidate_pats3[i] = i; + + estimate_partitions_mode7( + 3, + NUM_UNIQUE_PARTITIONS3, g_partitions3, + NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3, + &half_pixels_as_floats[0][0], + coptions, + total_parts3, (uint32_t*)best_parts3); + + has_estimated_parts3 = true; + } + else + { + const uint32_t MAX_CANDIDATES3 = 48; + int candidate_pats3[MAX_CANDIDATES3 * 2]; + + uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? 
MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2)); + num_candidate_pats3 = minimum(num_candidate_pats3, (uint32_t)std::size(candidate_pats3)); + + has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3); + + if (has_estimated_parts3) + { + estimate_partitions_mode7( + 3, + NUM_UNIQUE_PARTITIONS3, g_partitions3, + num_candidate_pats3, (uint32_t*)candidate_pats3, + &half_pixels_as_floats[0][0], + coptions, + total_parts3, (uint32_t*)best_parts3); + } + } +#endif + } + + const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares; + + // ---- Encoded block candidate + for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++) + { + const block_mode bm = (block_mode)block_mode_iter; + + if (comp_level == 0) + { + if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0) + continue; + } + else if (comp_level == 1) + { + if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0) + continue; + } + else if (comp_level == 2) + { + if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0) + continue; + } + + if (global_cfg.m_block_stat_optimizations_flag) + { + if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp)) + { + if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2)) + { + if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2) + continue; + } + else + { + if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan) + continue; + } + } + + if (comp_level <= 3) + { + const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x; + const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y; + + if (!g_block_mode_descs[block_mode_iter].m_dp) + { + // Minor gain (.5-1% less canidates) + if 
(very_detailed_block) + { + if (grid_x * grid_y <= 12) + { + debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed); + continue; + } + } + + // Major gains (10-25% less candidates) + if (very_blurry_block) + { + if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1)) + { + debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed); + continue; + } + } + if (super_blurry_block) + { + if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1)) + { + debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed); + continue; + } + } + } + + if (grid_x != grid_y) + { + if (grid_x < grid_y) + { + if (!filter_horizontally) + { + debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed); + continue; + } + } + else + { + if (filter_horizontally) + { + debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed); + continue; + } + } + } + } + + if (global_cfg.m_lambda == 0.0f) + { + // Rarely useful if lambda=0 + if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2)) + continue; + } + } // block_stat_optimizations_flag + + if ((!use_single_subset_mode7) && + (g_block_mode_descs[block_mode_iter].m_cem == 7) && + (g_block_mode_descs[block_mode_iter].m_num_partitions == 1)) + { + debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed); + continue; + } + + for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++) + { + if (global_cfg.m_lambda == 0.0f) + { + // No use trying anything else + if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw) + continue; + } + + if (global_cfg.m_disable_delta_endpoint_usage) + { + if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta)) + continue; + } + + if (!global_cfg.m_favor_higher_compression) + { + if 
(comp_level == 0) + { + if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) + continue; + } + + if (comp_level <= 1) + { + if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper)) + continue; + } + } + + const endpoint_mode em = (endpoint_mode)endpoint_mode_iter; + + switch (em) + { + case endpoint_mode::cUseLeft: + case endpoint_mode::cUseUpper: + { + const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter]; + const uint32_t cem = local_md.m_cem; + + if (local_md.m_num_partitions > 1) + break; + + if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor)) + break; + else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor)) + break; + + candidate_encoding candidate; + candidate.m_coder.reserve(24); + astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; + + int nx = bx, ny = by; + if (em == endpoint_mode::cUseLeft) + nx--; + else + ny--; + + const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny); + if (neighbor_blk.m_encoding_type == encoding_type::cSolid) + break; + assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse)); + + const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode]; + + if (neighbor_md.m_cem != cem) + break; + + assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem); + + const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y; + const bool dual_plane = local_md.m_dp; + const uint32_t num_grid_samples = grid_x * grid_y; + const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem); + + coded_log_blk.m_grid_width = (uint8_t)grid_x; + coded_log_blk.m_grid_height = (uint8_t)grid_y; + coded_log_blk.m_dual_plane = (uint8_t)dual_plane; + coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; + coded_log_blk.m_num_partitions = 1; + 
coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem; + coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range; + + // We're not explictly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss). + coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range; + memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals); + + uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS]; + + // Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding. + basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, + neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints, + local_md.m_transcode_endpoint_ise_range, transcode_endpoints); + + // Now encode the block using the transcoded endpoints + basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; + + if (cem == 7) + { + status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr, + astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range); + } + else + { + status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr, + astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range); + } + if (!status) + break; + + uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H]; + if (dual_plane) + { + eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); + + downsample_ise_weights_dual_plane( + local_md.m_weight_ise_range, local_md.m_weight_ise_range, + BLOCK_W, BLOCK_H, + 
grid_x, grid_y, + trial_weights0, trial_weights1, coded_log_blk.m_weights); + } + else + { + eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); + + downsample_ise_weights( + local_md.m_weight_ise_range, local_md.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + trial_weights0, coded_log_blk.m_weights); + } + + // Transcode these codable weights to ASTC weights. + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range); + + // Create the block the decoder would transcode into. + astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; + decomp_blk.clear(); + + decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; + decomp_blk.m_dual_plane = local_md.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; + decomp_blk.m_num_partitions = 1; + decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range; + + memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals); + + copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk); + + if (!validate_log_blk(decomp_blk)) + { + fmt_error_printf("pack_astc_block() failed\n"); + return false; + } + + status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); + if (!status) + { + fmt_error_printf("decode_astc_block() failed\n"); + return false; + } + + candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); + code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr); + + candidate.m_encoding_type 
= encoding_type::cBlock; + candidate.m_endpoint_mode = em; + candidate.m_block_mode = bm; + + candidates.emplace_back(std::move(candidate)); + + break; + } + case endpoint_mode::cUseLeftDelta: + case endpoint_mode::cUseUpperDelta: + { + const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter]; + const uint32_t cem = local_md.m_cem; + + if (local_md.m_num_partitions > 1) + break; + + if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor)) + break; + else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor)) + break; + + candidate_encoding candidate; + candidate.m_coder.reserve(24); + astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; + + int nx = bx, ny = by; + if (em == endpoint_mode::cUseLeftDelta) + nx--; + else + ny--; + + const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny); + if (neighbor_blk.m_encoding_type == encoding_type::cSolid) + break; + assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse)); + + const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode]; + + if (neighbor_md.m_cem != cem) + break; + + assert(neighbor_md.m_cem == local_md.m_cem); + + const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y; + const bool dual_plane = local_md.m_dp; + const uint32_t num_grid_samples = grid_x * grid_y; + const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem); + + // Dequantize neighbor's endpoints to ISE 20 + uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS]; + basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, + neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints, + astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20); + + // Requantize neighbor's endpoints to our local desired coding ISE range + uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS]; + 
basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local); + + uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS]; + uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS]; + + // Now try to encode the current block using the neighbor's endpoints submode. + double err = 0.0f; + uint32_t best_submode = 0; + + if (cem == 7) + { + int maj_index, submode_index; + decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index); + + int first_submode = submode_index, last_submode = submode_index; + + err = encode_astc_hdr_block_mode_7( + NUM_BLOCK_PIXELS, + (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, + local_md.m_weight_ise_range, + best_submode, + BIG_FLOAT_VAL, + blk_endpoints, blk_weights0, + coptions, + local_md.m_endpoint_ise_range, + first_submode, last_submode, + &enc_block_stats); + } + else + { + int maj_index, submode_index; + decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index); + + int first_submode = -1, last_submode = -1; + if (maj_index == 3) + { + // direct + } + else + { + first_submode = submode_index; + last_submode = submode_index; + } + + if (dual_plane) + { + err = encode_astc_hdr_block_mode_11_dual_plane( + NUM_BLOCK_PIXELS, + (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, + local_md.m_dp_channel, + local_md.m_weight_ise_range, + best_submode, + BIG_FLOAT_VAL, + blk_endpoints, blk_weights0, blk_weights1, + coptions, + false, + local_md.m_endpoint_ise_range, + false, //uber_mode_flag, + false, + first_submode, last_submode, true); + } + else + { + err = encode_astc_hdr_block_mode_11( + NUM_BLOCK_PIXELS, + (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, + local_md.m_weight_ise_range, + best_submode, + BIG_FLOAT_VAL, + blk_endpoints, blk_weights0, + coptions, + false, + local_md.m_endpoint_ise_range, + false, 
//uber_mode_flag, + false, + first_submode, last_submode, true, + mode11_opt_mode, + &enc_block_stats); + } + } + + if (err == BIG_FLOAT_VAL) + break; + + uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS]; + + // TODO: For now, just try 5 bits for each endpoint. Can tune later. + // This isn't right, it's computing the deltas in ISE space. + //const uint32_t NUM_ENDPOINT_DELTA_BITS = 5; + const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS; + const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1; + + const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank; + + bool all_deltas_in_limits = true; + for (uint32_t i = 0; i < num_endpoint_vals; i++) + { + int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]]; + + if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit)) + all_deltas_in_limits = false; + + endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit); + } + + if (all_deltas_in_limits) + { + coded_log_blk.m_grid_width = (uint8_t)grid_x; + coded_log_blk.m_grid_height = (uint8_t)grid_y; + coded_log_blk.m_dual_plane = (uint8_t)dual_plane; + coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; + coded_log_blk.m_num_partitions = 1; + coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; + coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range; + coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range; + + memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals); + + uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS]; + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + + basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, 
transcode_endpoints); + + if (dual_plane) + { + downsample_ise_weights_dual_plane( + local_md.m_weight_ise_range, local_md.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + blk_weights0, blk_weights1, + coded_log_blk.m_weights); + } + else + { + downsample_ise_weights( + local_md.m_weight_ise_range, local_md.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + blk_weights0, coded_log_blk.m_weights); + } + + basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range); + + // Create the block the decoder would transcode into. + + astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; + decomp_blk.clear(); + + decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; + decomp_blk.m_dual_plane = local_md.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; + decomp_blk.m_num_partitions = 1; + decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range; + + memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals); + + copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk); + + if (!validate_log_blk(decomp_blk)) + { + fmt_error_printf("pack_astc_block() failed\n"); + return false; + } + + status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); + if (!status) + { + fmt_error_printf("decode_astc_block() failed\n"); + return false; + } + + candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); + code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas); + + candidate.m_encoding_type = encoding_type::cBlock; + candidate.m_endpoint_mode = em; + candidate.m_block_mode = bm; + + candidates.emplace_back(std::move(candidate)); + } + + break; + } + case endpoint_mode::cRaw: + { + //if 
(candidates.size() == 339) + // fmt_printf("!"); + + const auto& mode_desc = g_block_mode_descs[(uint32_t)bm]; + const uint32_t cem = mode_desc.m_cem; + //const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem); + const bool dual_plane = mode_desc.m_dp; + + if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2)) + break; + + if (mode_desc.m_num_partitions == 3) + { + assert(!dual_plane); + + if (!has_estimated_parts3) + break; + + assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range); + assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range); + + trial_result res; + + status = encode_block_3_subsets( + res, + cem, + mode_desc.m_grid_x, mode_desc.m_grid_y, + mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, + &half_pixels[0][0], (vec4F*)block_pixels_q16, + coptions, + uber_mode_flag, + best_parts3, total_parts3, comp_level, mode11_opt_mode); + + if (!status) + break; + + assert(res.m_valid); + + candidate_encoding candidate; + candidate.m_coder.reserve(24); + astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; + + coded_log_blk = res.m_log_blk; + + astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; + decomp_blk = res.m_log_blk; + + if (!validate_log_blk(decomp_blk)) + { + fmt_error_printf("pack_astc_block() failed\n"); + return false; + } + + status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); + if (!status) + { + fmt_error_printf("decode_astc_block() failed\n"); + return false; + } + + candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); + code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); + + candidate.m_encoding_type = encoding_type::cBlock; + candidate.m_endpoint_mode = em; + candidate.m_block_mode = bm; + + candidates.emplace_back(std::move(candidate)); + } + else if (mode_desc.m_num_partitions == 2) + { + assert(!dual_plane); + + if (!has_estimated_parts2) + 
break; + + assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range); + assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range); + + for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++) + { + trial_result results[2]; + + assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled)); + + status = encode_block_2_subsets( + results, + mode_desc.m_grid_x, mode_desc.m_grid_y, + mode_desc.m_cem, + mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, + &half_pixels[0][0], (vec4F*)block_pixels_q16, + coptions, + uber_mode_flag, + (cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter], + comp_level, + mode11_opt_mode, + true); + + if (!status) + continue; + + for (uint32_t r_iter = 0; r_iter < 2; r_iter++) + { + const trial_result& res = results[r_iter]; + + if (!res.m_valid) + continue; + + candidate_encoding candidate; + candidate.m_coder.reserve(24); + astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; + + coded_log_blk = res.m_log_blk; + + astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; + decomp_blk = res.m_log_blk; + + if (!validate_log_blk(decomp_blk)) + { + fmt_error_printf("pack_astc_block() failed\n"); + return false; + } + + status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); + if (!status) + { + fmt_error_printf("decode_astc_block() failed\n"); + return false; + } + + candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); + code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); + + candidate.m_encoding_type = encoding_type::cBlock; + candidate.m_endpoint_mode = em; + candidate.m_block_mode = bm; + + candidates.emplace_back(std::move(candidate)); + + } // r_iter + } + } + else + { + // 1 subset + uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H]; + uint32_t best_submode = 0; + + 
candidate_encoding candidate; + candidate.m_coder.reserve(24); + astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; + + const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y; + const uint32_t num_grid_samples = grid_x * grid_y; + + const half_vec3* pBlock_pixels_half = &half_pixels[0][0]; + const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0]; + + const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1); + + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + + coded_log_blk.m_grid_width = (uint8_t)grid_x; + coded_log_blk.m_grid_height = (uint8_t)grid_y; + coded_log_blk.m_dual_plane = (uint8_t)dual_plane; + coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel; + coded_log_blk.m_num_partitions = 1; + coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem; + coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range; + coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range; + + if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H))) + { + double e = encode_astc_hdr_block_downsampled_mode_11( + BLOCK_W, BLOCK_H, grid_x, grid_y, + mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, + NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, + BIG_FLOAT_VAL, + FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode, + coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode, + coptions, + &enc_block_stats); + + if (e == BIG_FLOAT_VAL) + break; + } + else + { + if (cem == 7) + { + assert(!dual_plane); + + double e = encode_astc_hdr_block_mode_7( + NUM_BLOCK_PIXELS, + (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, + mode_desc.m_weight_ise_range, + best_submode, + BIG_FLOAT_VAL, + coded_log_blk.m_endpoints, + blk_weights0, + coptions, + mode_desc.m_endpoint_ise_range, + 0, MAX_MODE7_SUBMODE_INDEX, + &enc_block_stats); + 
BASISU_NOTE_UNUSED(e); + } + else + { + double e; + + if (dual_plane) + { + e = encode_astc_hdr_block_mode_11_dual_plane( + NUM_BLOCK_PIXELS, + (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, + mode_desc.m_dp_channel, + mode_desc.m_weight_ise_range, + best_submode, + BIG_FLOAT_VAL, + coded_log_blk.m_endpoints, + blk_weights0, blk_weights1, + coptions, + false, + mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false); + } + else + { + e = encode_astc_hdr_block_mode_11( + NUM_BLOCK_PIXELS, + (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, + mode_desc.m_weight_ise_range, + best_submode, + BIG_FLOAT_VAL, + coded_log_blk.m_endpoints, + blk_weights0, + coptions, + false, + mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false, + mode11_opt_mode, + &enc_block_stats); + } + + if (e == BIG_FLOAT_VAL) + break; + } + + if (dual_plane) + { + downsample_ise_weights_dual_plane( + mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + blk_weights0, blk_weights1, + coded_log_blk.m_weights); + } + else + { + downsample_ise_weights( + mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range, + BLOCK_W, BLOCK_H, + grid_x, grid_y, + blk_weights0, coded_log_blk.m_weights); + + if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H))) + { + bool refine_status = refine_endpoints(cem, + mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, + 6, 6, mode_desc.m_grid_x, mode_desc.m_grid_y, + coded_log_blk.m_weights, mode_desc.m_weight_ise_range, + BLOCK_W * BLOCK_H, + (basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16, + nullptr, + coptions, mode11_opt_mode); + BASISU_NOTE_UNUSED(refine_status); + } + } + } + + basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range); + + // Create the block the decoder would 
transcode into. + astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; + decomp_blk.clear(); + + decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem; + decomp_blk.m_dual_plane = mode_desc.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel; + decomp_blk.m_num_partitions = 1; + decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range; + + basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); + + copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk); + + if (!validate_log_blk(decomp_blk)) + { + fmt_error_printf("pack_astc_block() failed\n"); + return false; + } + + status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); + if (!status) + { + fmt_error_printf("decode_astc_block() failed\n"); + return false; + } + + candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); + code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); + + candidate.m_encoding_type = encoding_type::cBlock; + candidate.m_endpoint_mode = em; + candidate.m_block_mode = bm; + + candidates.emplace_back(std::move(candidate)); + } + + break; + } + default: + assert(0); + fmt_debug_printf("Invalid endpoint mode\n"); + return false; + + } // switch (em) + + } // endpoint_mode_iter + + } // block_mode_iter + + } // is_solid_block + + //------------------------------------------------ + + debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed); + atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32()); + + for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) + { + auto& candidate = candidates[candidate_iter]; + + for (uint32_t y = 
0; y < BLOCK_H; y++) + for (uint32_t x = 0; x < BLOCK_W; x++) + linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg); + } + + // Find best overall candidate + double best_t = BIG_FLOAT_VAL; + int best_candidate_index = -1; + + float best_d_ssim = BIG_FLOAT_VAL; + + if (global_cfg.m_lambda == 0.0f) + { + for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) + { + const auto& candidate = candidates[candidate_iter]; + + float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]); + + if (candidate_d_ssim < best_d_ssim) + best_d_ssim = candidate_d_ssim; + + candidate_d_ssim *= SSIM_WEIGHT; + + float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment); + + candidate_mse += candidate_d_ssim; + + float total_deblock_penalty = 0.0f; + if (global_cfg.m_deblocking_flag) + { + total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight; + } + candidate_mse += total_deblock_penalty * SSIM_WEIGHT; + + if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse)) + { + // Bias the encoder away from 2 level blocks on complex blocks + // TODO: Perhaps only do this on large or non-interpolated grids + if (complex_block) + { + if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS) + { + candidate_mse *= TWO_LEVEL_PENALTY; + } + } + + // Bias the encoder away from smaller weight grids if the block is very complex + // TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling. 
+ if (complex_block) + { + if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2)) + candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY; + else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3) + candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY; + else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4) + candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY; + } + } + + float candidate_t = candidate_mse; + + if (candidate_t < best_t) + { + best_t = candidate_t; + best_candidate_index = candidate_iter; + } + + } // candidate_iter + + if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM)) + { + debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed); + continue; + } + + const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f); + + if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) && + (block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) && + (block_avg_y >= 1.5f)) + { + debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed); + continue; + } + } + else + { + assert(enc_state.smooth_block_mse_scales.get_width() > 0); + + // Compute block's perceptual weighting + float perceptual_scale = 0.0f; + for (uint32_t y = 0; y < BLOCK_H; y++) + for (uint32_t x = 0; x < BLOCK_W; x++) + perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y)); + + // Very roughly normalize the computed distortion vs. bits. 
+ perceptual_scale *= 10.0f; + + for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) + { + auto& candidate = candidates[candidate_iter]; + + float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]); + + if (d_ssim < best_d_ssim) + best_d_ssim = (float)d_ssim; + + d_ssim *= SSIM_WEIGHT; + + float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment); + + candidate_mse += d_ssim; + + float total_deblock_penalty = 0.0f; + if (global_cfg.m_deblocking_flag) + { + total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight; + } + candidate_mse += total_deblock_penalty * SSIM_WEIGHT; + + if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse)) + { + // Bias the encoder away from 2 level blocks on complex blocks + if (complex_block) + { + if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS) + { + candidate_mse *= TWO_LEVEL_PENALTY; + } + } + + // Bias the encoder away from smaller weight grids if the block is very complex + if (complex_block) + { + if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2)) + candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY; + else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3) + candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY; + else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4) + candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY; + } + } + + float mode_penalty = 1.0f; + if (candidate.m_encoding_type == encoding_type::cSolid) + mode_penalty *= SOLID_PENALTY; + else if 
(candidate.m_encoding_type == encoding_type::cReuse) + mode_penalty *= REUSE_PENALTY; + else if (candidate.m_encoding_type == encoding_type::cRun) + mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY); + + float candidate_bits = (float)candidate.m_coder.get_total_bits(); + float candidate_d = candidate_mse * mode_penalty; + + const float D_POWER = 2.0f; + float candidate_t = perceptual_scale * powf(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f); + + candidate.m_t = candidate_t; + candidate.m_d = candidate_d; + candidate.m_bits = candidate_bits; + + if (candidate_t < best_t) + { + best_t = candidate_t; + best_candidate_index = candidate_iter; + } + + } // candidate_iter + + if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM)) + { + debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed); + continue; + } + + const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f); + + if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) && + (block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) && + (block_avg_y >= 1.5f)) + { + debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed); + continue; + } + + if (global_cfg.m_rdo_candidate_diversity_boost) + { + // candidate diversity boosting - consider candidates along/near the Pareto front + const candidate_encoding& comp_candidate = candidates[best_candidate_index]; + + float best_d = BIG_FLOAT_VAL; + + for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) + { + const auto& candidate = candidates[candidate_iter]; + + if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight) + { + if (candidate.m_d < best_d) + { + best_d = candidate.m_d; + 
best_candidate_index = candidate_iter; + } + } + } + } + + // candidate JND optimization - if there's a cheaper to code candidate that is nearly equivalent visually to the best candidate chose, choose that + if (global_cfg.m_jnd_optimization) + { + const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index]; + + float new_best_candidate_bits = BIG_FLOAT_VAL; + int new_best_candidate_index = -1; + + for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) + { + if ((int)candidate_iter == best_candidate_index) + continue; + + const auto& candidate = candidates[candidate_iter]; + + if (candidate.m_bits >= cur_comp_candidate.m_bits) + continue; + + float max_delta_itp = 0.0f; + for (uint32_t y = 0; y < BLOCK_H; y++) + { + for (uint32_t x = 0; x < BLOCK_W; x++) + { + float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment); + max_delta_itp = maximum(max_delta_itp, delta_itp); + + if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh) + goto skip; + } + } + + skip: + if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh) + continue; + + if (candidate.m_bits < new_best_candidate_bits) + { + new_best_candidate_bits = candidate.m_bits; + new_best_candidate_index = candidate_iter; + } + } + + if (new_best_candidate_index != -1) + { + best_candidate_index = new_best_candidate_index; + debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed); + } + } + + } // if (lambda == 0.0f) + + if (global_cfg.m_debug_images) + { + std::lock_guard lck(debug_state.m_stat_vis_mutex); + debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f)); + } + + if (best_candidate_index < 0) + { + assert(best_candidate_index >= 0); + fmt_error_printf("No candidates!\n"); + return false; + } + + const auto& best_candidate = 
candidates[best_candidate_index]; + + assert(best_candidate.m_encoding_type != encoding_type::cInvalid); + + if (best_candidate.m_encoding_type == encoding_type::cRun) + { + if (!prev_run_len) + { + if (prev_encoding.get_total_bits()) + { +#if SYNC_MARKERS + strip_coded_bits.put_bits(0xDEAD, 16); +#endif + + strip_coded_bits.append(prev_encoding); + } + + assert(best_candidate.m_coder.get_total_bits()); + + prev_encoding = best_candidate.m_coder; + + prev_run_len = 1; + } + else + { + prev_run_len++; + + const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32(); + assert(prev_run_bits); + BASISU_NOTE_UNUSED(prev_run_bits); + + const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32(); + BASISU_NOTE_UNUSED(num_dummy_bits); + + // Rewrite the previous encoding to extend the run length. + prev_encoding.restart(); + prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN); + prev_encoding.put_vlc(prev_run_len - 1, 5); + + assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits); + } + } + else + { + if (prev_encoding.get_total_bits()) + { +#if SYNC_MARKERS + strip_coded_bits.put_bits(0xDEAD, 16); +#endif + + strip_coded_bits.append(prev_encoding); + } + + prev_encoding = best_candidate.m_coder; + prev_run_len = 0; + } + + memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H); + + prev_candidate_encoding = best_candidate; + + if (best_candidate.m_encoding_type != encoding_type::cRun) + prev_non_run_candidate_encoding = best_candidate; + + { + std::lock_guard lck(debug_state.m_stats_mutex); + + debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++; + + if (best_candidate.m_encoding_type == encoding_type::cBlock) + { + debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++; + } + + if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock)) + { + const uint32_t bm_index = 
(uint32_t)best_candidate.m_block_mode; + assert(bm_index < (uint32_t)block_mode::cBMTotalModes); + + debug_state.m_block_mode_hist[bm_index]++; + debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits(); + + for (uint32_t i = 0; i < 3; i++) + { + debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]); + debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]); + } + } + + if (best_candidate.m_encoding_type == encoding_type::cReuse) + { + debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed); + + if (best_candidate.m_coded_log_blk.m_dual_plane) + debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed); + } + } + + enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding; + + // Update decoded image + vec4F decoded_float_pixels[BLOCK_H][BLOCK_W]; + for (uint32_t y = 0; y < BLOCK_H; y++) + for (uint32_t x = 0; x < BLOCK_W; x++) + decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x]; + + enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); + + status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr); + if (!status) + { + fmt_error_printf("Failed packing block\n"); + return false; + } + + const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed); + if ((r & 2047) == 2047) + { + if (global_cfg.m_status_output) + { + basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks); + } + } + + if ((global_cfg.m_debug_images) && + ((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid))) + { + std::lock_guard lck(debug_state.m_vis_image_mutex); + + if (best_candidate.m_decomp_log_blk.m_num_partitions == 2) + { + 
const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id]; + assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2)); + + const partition_pattern_vec& pat = g_partitions2[part2_unique_index]; + + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + const uint32_t p = pat[x + y * 6]; + debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 100 : 0, 255)); + } // x + } // y + } + else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3) + { + //part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255)); + + const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id]; + assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3)); + + const partition_pattern_vec& pat = g_partitions3[part3_unique_index]; + + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + const uint32_t p = pat[x + y * 6]; + color_rgba c(0, 0, 150, 255); + if (p == 1) + c.set(100, 0, 150, 255); + else if (p == 2) + c.set(0, 100, 150, 255); + debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c); + } // x + } // y + } + else if (best_candidate.m_decomp_log_blk.m_dual_plane) + { + debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255)); + } + else + { + debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255)); + } + + color_rgba c; + c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36); + debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c); + + c.set(0, 0, 0, 255); + if (complex_block) + c[0] = 255; + + if (very_complex_block) + c[1] = 255; + + if (outer_pass == 2) + c[2] = 255; + else if (outer_pass == 1) + c[2] = 128; + + debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c); + + c.set(0, 255, 0, 255); + if 
(best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7) + c.set(255, 0, 0, 255); + debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c); + + switch (best_candidate.m_encoding_type) + { + case encoding_type::cRun: + c.set(0, 0, 0, 255); + break; + case encoding_type::cSolid: + c.set(128, 128, 128, 255); // dark grey + break; + case encoding_type::cReuse: + c.set(255, 255, 0, 255); // yellow + break; + case encoding_type::cBlock: + { + switch (best_candidate.m_endpoint_mode) + { + case endpoint_mode::cRaw: + c.set(255, 0, 0, 255); // red + break; + case endpoint_mode::cUseLeft: + c.set(0, 0, 255, 255); // blue + break; + case endpoint_mode::cUseUpper: + c.set(0, 0, 192, 255); // darker blue + break; + case endpoint_mode::cUseLeftDelta: + c.set(0, 255, 0, 255); // green + break; + case endpoint_mode::cUseUpperDelta: + c.set(0, 192, 0, 255); // darker green + break; + default: + break; + } + + break; + } + default: + break; + } + + if (filtered_x_err < filtered_y_err) + c[3] = 0; + else + c[3] = 255; + + debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c); + } + + break; + + } // outer_pass + + } // bx + + } // by + + if (prev_encoding.get_total_bits()) + { +#if SYNC_MARKERS + strip_coded_bits.put_bits(0xDEAD, 16); +#endif + + strip_coded_bits.append(prev_encoding); + } + + return true; +} + +bool g_initialized = false; + +void global_init() +{ + if (g_initialized) + return; + + interval_timer tm; + tm.start(); + + init_pq_tables(); + + init_partitions2_6x6(); + init_partitions3_6x6(); + + init_contrib_lists(); + + g_initialized = true; + + //fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs()); +} + +bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool, + basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics) +{ + assert(g_initialized); + if (!g_initialized) + return false; + + assert(pJob_pool); + + if 
(orig_global_cfg.m_debug_output) + { + fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n"); + fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height()); + fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads()); + orig_global_cfg.print(); + } + + if (!orig_src_img.get_width() || !orig_src_img.get_height()) + { + assert(false); + fmt_error_printf("compress_photo: Invalid source image\n"); + return false; + } + + astc_hdr_6x6_global_config global_cfg(orig_global_cfg); + + uastc_hdr_6x6_encode_state enc_state; + enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6; + enc_state.src_img = orig_src_img; + + //src_img.crop(256, 256); + + const uint32_t width = enc_state.src_img.get_width(); + const uint32_t height = enc_state.src_img.get_height(); + const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W); + const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H); + const uint32_t total_blocks = num_blocks_x * num_blocks_y; + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + for (uint32_t c = 0; c < 3; c++) + { + float f = enc_state.src_img(x, y)[c]; + + if (std::isinf(f) || std::isnan(f) || (f < 0.0f)) + f = 0; + else if (f > basist::ASTC_HDR_MAX_VAL) + f = basist::ASTC_HDR_MAX_VAL; + + enc_state.src_img(x, y)[c] = f; + + } // c + + } // x + } // y + + if (global_cfg.m_debug_images) + { + write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0); + } + + image src_img_compressed; + tonemap_image_compressive2(src_img_compressed, enc_state.src_img); + + if (global_cfg.m_debug_images) + { + save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed); + } + + smooth_map_params rp; + rp.m_debug_images = global_cfg.m_debug_images; + + if (global_cfg.m_lambda != 0.0f) + { + if (global_cfg.m_status_output) + fmt_printf("Creating RDO perceptual weighting 
maps\n"); + + create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp); + } + + if (global_cfg.m_status_output) + fmt_printf("Blurring image\n"); + + enc_state.src_img_filtered1.resize(width, height); + image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f); + + enc_state.src_img_filtered2.resize(width, height); + image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f); + + if (global_cfg.m_debug_images) + { + write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0); + write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0); + } + + if (global_cfg.m_status_output) + fmt_printf("Transforming to ITP\n"); + + enc_state.src_img_itp.resize(width, height); + convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg); + + enc_state.src_img_filtered1_itp.resize(width, height); + convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg); + + enc_state.src_img_filtered2_itp.resize(width, height); + convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg); + + if (global_cfg.m_lambda == 0.0f) + global_cfg.m_favor_higher_compression = false; + + uint32_t total_strips = 0, rows_per_strip = 0; + if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg)) + { + fmt_error_printf("compress_photo: Failed computing strip sizes\n"); + return false; + } + + if (global_cfg.m_debug_output) + fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag); + + 
enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y); + + bitwise_coder coded_bits; + + coded_bits.put_bits(0xABCD, 16); + coded_bits.put_bits(width, 16); + coded_bits.put_bits(height, 16); + + enc_state.packed_img.resize(width, height); + + enc_state.strip_bits.resize(total_strips); + + enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y); + + uastc_hdr_6x6_debug_state debug_state; + + if (global_cfg.m_debug_images) + debug_state.init(width, height); + else + debug_state.init(0, 0); + + interval_timer tm; + tm.start(); + + std::atomic_bool any_failed_flag; + any_failed_flag.store(false); + + for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++) + { + const uint32_t strip_first_by = strip_index * rows_per_strip; + + uint32_t strip_last_by = minimum(strip_first_by + rows_per_strip - 1, num_blocks_y); + if (strip_index == (total_strips - 1)) + strip_last_by = num_blocks_y - 1; + + pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state, + strip_index, total_strips, strip_first_by, strip_last_by, + num_blocks_x, num_blocks_y, total_blocks, width, height] + { + if (!any_failed_flag) + { + bool status = compress_strip_task( + strip_index, total_strips, strip_first_by, strip_last_by, + num_blocks_x, num_blocks_y, total_blocks, width, height, + global_cfg, debug_state, enc_state); + + if (!status) + { + fmt_error_printf("compress_photo: compress_strip_task() failed\n"); + any_failed_flag.store(true, std::memory_order_relaxed); + } + } + } ); + + if (any_failed_flag) + break; + + } // strip_index + + pJob_pool->wait_for_all(); + + if (any_failed_flag) + { + fmt_error_printf("One or more strips failed during compression\n"); + return false; + } + + if (global_cfg.m_debug_output) + fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs()); + + if (global_cfg.m_debug_output) + debug_state.print(total_blocks); + + if (global_cfg.m_debug_images) + { + save_png(global_cfg.m_debug_image_prefix + "part_vis.png", 
debug_state.m_part_vis); + save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis); + save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis); + save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2); + save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis); + write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0); + } + + for (uint32_t i = 0; i < total_strips; i++) + coded_bits.append(enc_state.strip_bits[i]); + + coded_bits.put_bits(0xA742, 16); + + coded_bits.flush(); + + if (global_cfg.m_output_images) + { + write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0); + } + + if (global_cfg.m_debug_output) + fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height)); + + vector2D decoded_blocks1; + vector2D decoded_blocks2; + + if (global_cfg.m_debug_output) + fmt_printf("decode_file\n"); + + uint32_t unpacked_width = 0, unpacked_height = 0; + bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height); + if (!status) + { + fmt_error_printf("decode_file() failed\n"); + return false; + } + + if (global_cfg.m_debug_output) + fmt_printf("decode_6x6_hdr\n"); + + status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height); + if (!status) + { + fmt_error_printf("decode_6x6_hdr_file() failed\n"); + return false; + } + + if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) || + (enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height())) + { + fmt_error_printf("Decode size mismatch with decode_file\n"); + return false; + } + + if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) || + (enc_state.final_astc_blocks.get_height() != 
decoded_blocks2.get_height())) + { + fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n"); + return false; + } + + if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0) + { + fmt_error_printf("Decoded ASTC blocks verification failed\n"); + return false; + } + + if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0) + { + fmt_error_printf("Decoded ASTC blocks verification failed\n"); + return false; + } + + if (global_cfg.m_debug_output) + basisu::fmt_printf("Decoded ASTC verification checks succeeded\n"); + + if (global_cfg.m_output_images) + { + if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height)) + { + basisu::platform_sleep(20); + + uint8_vec astc_file_data; + if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data)) + { + if (astc_file_data.size() > 16) + { + astc_file_data.erase(0, 16); + + size_t comp_size = 0; + void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK); + mz_free(pComp_data); + + if (global_cfg.m_debug_output) + { + fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n", + (uint64_t)astc_file_data.size(), + (float)astc_file_data.size() * 8.0f / (float)(width * height), + (float)comp_size * 8.0f / (float)(width * height)); + } + } + } + } + } + + // Must decode all the blocks (even padded rows/cols) to match what the transcoder does. 
+ imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6); + imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6); + + for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++) + { + for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++) + { + const auto& phys_blk = decoded_blocks1(x, y); + + vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H]; + status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels); + if (!status) + { + fmt_error_printf("unpack_physical_astc_block() failed\n"); + return false; + } + + unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H); + + vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H]; + status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google); + if (!status) + { + fmt_error_printf("unpack_physical_astc_block_google() failed\n"); + return false; + } + + unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H); + + for (uint32_t i = 0; i < 36; i++) + { + if (pixels[i] != pixels_google[i]) + { + fmt_error_printf("pixel unpack mismatch\n"); + return false; + } + } + } + } + + if (global_cfg.m_debug_output) + fmt_printf("\nUnpack succeeded\n"); + + imagef unpacked_bc6h_img; + + { + vector2D bc6h_blocks; + + fast_bc6h_params enc_params; + + bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params); + if (!pack_status) + { + fmt_error_printf("pack_bc6h_image() failed!"); + return false; + } + + unpacked_bc6h_img.crop(width, height); + + if (global_cfg.m_output_images) + { + write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0); + } + } + + unpacked_astc_img.crop(width, height); + unpacked_astc_google_img.crop(width, height); + + if (global_cfg.m_output_images) + { + write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0); + write_exr((global_cfg.m_output_image_prefix + 
"unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0); + } + + // ASTC metrics + if (global_cfg.m_image_stats) + { + image_metrics im; + + if (global_cfg.m_debug_output) + printf("\nASTC log2 float error metrics:\n"); + + for (uint32_t i = 0; i < 3; i++) + { + im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true); + + if (global_cfg.m_debug_output) + { + printf("%c: ", "RGBA"[i]); + im.print_hp(); + } + } + + metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true); + + if (global_cfg.m_debug_output) + { + printf("RGB: "); + metrics.m_im_astc_log2.print_hp(); + + printf("\n"); + } + } + + if (global_cfg.m_image_stats) + { + image_metrics im; + + if (global_cfg.m_debug_output) + printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n"); + + for (uint32_t i = 0; i < 3; i++) + { + im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true); + + if (global_cfg.m_debug_output) + { + printf("%c: ", "RGBA"[i]); + im.print_hp(); + } + } + + metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true); + + if (global_cfg.m_debug_output) + { + printf("RGB: "); + metrics.m_im_astc_half.print_hp(); + } + } + + // BC6H metrics + if (global_cfg.m_image_stats) + { + image_metrics im; + + if (global_cfg.m_debug_output) + printf("\nBC6H log2 float error metrics:\n"); + + for (uint32_t i = 0; i < 3; i++) + { + im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true); + + if (global_cfg.m_debug_output) + { + printf("%c: ", "RGBA"[i]); + im.print_hp(); + } + } + + metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true); + + if (global_cfg.m_debug_output) + { + printf("RGB: "); + metrics.m_im_bc6h_log2.print_hp(); + + printf("\n"); + } + } + + if (global_cfg.m_image_stats) + { + image_metrics im; + + if (global_cfg.m_debug_output) + printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n"); + + 
for (uint32_t i = 0; i < 3; i++) + { + im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true); + + if (global_cfg.m_debug_output) + { + printf("%c: ", "RGBA"[i]); + im.print_hp(); + } + } + + metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true); + + if (global_cfg.m_debug_output) + { + printf("RGB: "); + metrics.m_im_bc6h_half.print_hp(); + + printf("\n"); + } + } + + intermediate_tex_data.swap(coded_bits.get_bytes()); + + astc_tex_data.resize(decoded_blocks1.size_in_bytes()); + memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes()); + + return true; +} + +} // namespace astc_6x6_hdr diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h b/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h new file mode 100644 index 000000000000..8b82ad8c2914 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h @@ -0,0 +1,129 @@ +// File: basisu_astc_hdr_6x6_enc.h +#pragma once +#include "basisu_enc.h" +#include "../transcoder/basisu_astc_hdr_core.h" + +namespace astc_6x6_hdr +{ + const uint32_t ASTC_HDR_6X6_MAX_USER_COMP_LEVEL = 12; + + const uint32_t ASTC_HDR_6X6_MAX_COMP_LEVEL = 4; + + const float LDR_BLACK_BIAS = 0.0f;// .49f; + + // Note: This struct is copied several times, so do not place any heavyweight objects in here. + struct astc_hdr_6x6_global_config + { + // Important: The Delta ITP colorspace error metric we use internally makes several assumptions about the nature of the HDR RGB inputs supplied to the encoder. + // This encoder computes colorspace error in the ICtCp (or more accurately the delta ITP, where CT is scaled by .5 vs. ICtCp to become T) colorspace, so getting this correct is important. + // By default the encoder assumes the input is in absolute luminance (in nits or candela per square meter, cd/m^2), specified as positive-only linear light RGB, using the REC 709 colorspace gamut (but NOT the sRGB transfer function, i.e. 
linear light). + // If the m_rec2020_bt2100_color_gamut flag is true, the input colorspace is treated as REC 2020/BT.2100 (which is wider than 709). + // For SDR/LDR->HDR upconversion, the REC 709 sRGB input should be converted to linear light (sRGB->linear) and the resulting normalized linear RGB values scaled by either 80 or 100 nits (the luminance of a typical SDR monitor). + // SDR upconversion to normalized [0,1] (i.e. non-absolute) luminances may work but is not supported because ITP errors will not be predicted correctly. + bool m_rec2020_bt2100_color_gamut = false; + + // levels 0-3 normal levels, 4=exhaustive + uint32_t m_master_comp_level = 0; + uint32_t m_highest_comp_level = 1; + + float m_lambda = 0.0f; + + bool m_extra_patterns_flag = false; // def to false, works in comp levels [1,4] + bool m_brute_force_partition_matching = false; // def to false + + bool m_jnd_optimization = false; // defaults to false for HDR inputs, on SDR upconverted images this can default to enabled + float m_jnd_delta_itp_thresh = .75f; + + bool m_force_one_strip = false; + + bool m_gaussian1_fallback = true; // def to true, if this is disabled m_gaussian2_fallback should be disabled too + float m_gaussian1_strength = 1.45f; + + bool m_gaussian2_fallback = true; // def to true, hopefully rarely kicks in + float m_gaussian2_strength = 1.83f; + + // m_disable_delta_endpoint_usage may give a slight increase in RDO ASTC encoding efficiency. It's also faster. + bool m_disable_delta_endpoint_usage = false; + + // Scale up Delta ITP errors for very dark pixels, assuming they will be brightly exposed > 1.0x. + // We don't know if the output will be exposed, or not. If heavily exposed, our JND calculations will not be conservative enough. 
+ bool m_delta_itp_dark_adjustment = true; + + bool m_debug_images = false; + std::string m_debug_image_prefix = "dbg_astc_hdr_6x6_devel_"; + + bool m_output_images = false; + std::string m_output_image_prefix = "dbg_astc_hdr_6x6_output_"; + + bool m_debug_output = false; + bool m_image_stats = false; + bool m_status_output = false; + + //------------------------------------------------------------------------------------- + // Very low level/devel parameters - intended for development. Best not to change them. + //------------------------------------------------------------------------------------- + bool m_deblocking_flag = true; + float m_deblock_penalty_weight = .03f; + bool m_disable_twothree_subsets = false; // def to false + bool m_use_solid_blocks = true; // def to true + bool m_use_runs = true; // def to true + bool m_block_stat_optimizations_flag = true; // def to true + + bool m_rdo_candidate_diversity_boost = true; // def to true + float m_rdo_candidate_diversity_boost_bit_window_weight = 1.2f; + + bool m_favor_higher_compression = true; // utilize all modes + uint32_t m_num_reuse_xy_deltas = basist::astc_6x6_hdr::NUM_REUSE_XY_DELTAS; + + void print() const + { + basisu::fmt_debug_printf("m_master_comp_level: {}, m_highest_comp_level: {}\n", m_master_comp_level, m_highest_comp_level); + basisu::fmt_debug_printf("m_lambda: {}\n", m_lambda); + basisu::fmt_debug_printf("m_rec2020_bt2100_color_gamut: {}\n", m_rec2020_bt2100_color_gamut); + basisu::fmt_debug_printf("m_extra_patterns_flag: {}, m_brute_force_partition_matching: {}\n", m_extra_patterns_flag, m_brute_force_partition_matching); + basisu::fmt_debug_printf("m_jnd_optimization: {}, m_jnd_delta_itp_thresh: {}\n", m_jnd_optimization, m_jnd_delta_itp_thresh); + basisu::fmt_debug_printf("m_force_one_strip: {}\n", m_force_one_strip); + basisu::fmt_debug_printf("m_gaussian1_fallback: {}, m_gaussian1_strength: {}\n", m_gaussian1_fallback, m_gaussian1_strength); + 
basisu::fmt_debug_printf("m_gaussian2_fallback: {}, m_gaussian2_strength: {}\n", m_gaussian2_fallback, m_gaussian2_strength); + basisu::fmt_debug_printf("m_disable_delta_endpoint_usage: {}\n", m_disable_delta_endpoint_usage); + basisu::fmt_debug_printf("m_delta_itp_dark_adjustment: {}\n", m_delta_itp_dark_adjustment); + basisu::fmt_debug_printf("m_debug_images: {}, m_debug_image_prefix: {}\n", m_debug_images, m_debug_image_prefix); + basisu::fmt_debug_printf("m_output_images: {}, m_output_image_prefix: {}\n", m_output_images, m_output_image_prefix); + basisu::fmt_debug_printf("m_image_stats: {}, m_status_output: {}\n", m_image_stats, m_status_output); + basisu::fmt_debug_printf("m_deblocking_flag: {}, m_deblock_penalty_weight: {}\n", m_deblocking_flag, m_deblock_penalty_weight); + basisu::fmt_debug_printf("m_disable_twothree_subsets: {}, m_use_solid_blocks: {}\n", m_disable_twothree_subsets, m_use_solid_blocks); + basisu::fmt_debug_printf("m_use_runs: {}, m_block_stat_optimizations_flag: {}\n", m_use_runs, m_block_stat_optimizations_flag); + basisu::fmt_debug_printf("m_rdo_candidate_diversity_boost: {}, m_rdo_candidate_diversity_boost_bit_window_weight: {}\n", m_rdo_candidate_diversity_boost, m_rdo_candidate_diversity_boost_bit_window_weight); + basisu::fmt_debug_printf("m_favor_higher_compression: {}, m_num_reuse_xy_deltas: {}\n", m_favor_higher_compression, m_num_reuse_xy_deltas); + } + + astc_hdr_6x6_global_config() + { + } + + void clear() + { + astc_hdr_6x6_global_config def; + std::swap(*this, def); + } + + // Max level is ASTC_HDR_6X6_MAX_USER_COMP_LEVEL + void set_user_level(int level); + }; + + void global_init(); + + struct result_metrics + { + basisu::image_metrics m_im_astc_log2; + basisu::image_metrics m_im_astc_half; + + basisu::image_metrics m_im_bc6h_log2; + basisu::image_metrics m_im_bc6h_half; + }; + + // The input image should be unpadded to 6x6 boundaries, i.e. the original unexpanded image. 
+ bool compress_photo(const basisu::imagef& orig_src_img, const astc_hdr_6x6_global_config& global_cfg, basisu::job_pool* pJob_pool, + basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics); + +} // namespace uastc_6x6_hdr diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_common.cpp b/thirdparty/basis_universal/encoder/basisu_astc_hdr_common.cpp new file mode 100644 index 000000000000..b720e2690ed1 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_common.cpp @@ -0,0 +1,5357 @@ +// File: basisu_astc_hdr_common.cpp +#include "basisu_enc.h" +#include "basisu_gpu_texture.h" +#include "../transcoder/basisu_astc_helpers.h" +#include "../transcoder/basisu_astc_hdr_core.h" +#include "basisu_astc_hdr_common.h" + +using namespace basist; + +#ifndef __EMSCRIPTEN__ + #define BASISU_MULTITHREADED_INIT (0) +#endif + +namespace basisu +{ + +const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][33] = +{ + { 2, 0, 64 }, // 0, note ise range=0 is invalid for 4x4 block sizes (<24 weight bits in the block) + { 3, 0, 32, 64 }, // 1 + { 4, 0, 21, 43, 64 }, // 2 + { 5, 0, 16, 32, 48, 64 }, // 3 + { 6, 0, 64, 12, 52, 25, 39 }, // 4 + { 8, 0, 9, 18, 27, 37, 46, 55, 64 }, // 5 + { 10, 0, 64, 7, 57, 14, 50, 21, 43, 28, 36 }, // 6 + { 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7 + { 16, 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }, // 8 + { 20, 0,64,16,48,3,61,19,45,6,58,23,41,9,55,26,38,13,51,29,35}, // 9 + { 24, 0,64,8,56,16,48,24,40,2,62,11,53,19,45,27,37,5,59,13,51,22,42,30,34}, // 10 + { 32, 0,2,4,6,8,10,12,14,16,18,20,22,24,26,28,30,34,36,38,40,42,44,46,48,50,52,54,56,58,60,62,64}, // 11 +}; + +//-------------------------------------------------------------------------------------------------------------------------- + +const float DEF_R_ERROR_SCALE = 2.0f; +const float DEF_G_ERROR_SCALE = 3.0f; + +void astc_hdr_codec_base_options::init() +{ + m_r_err_scale = 
DEF_R_ERROR_SCALE; + m_g_err_scale = DEF_G_ERROR_SCALE; + m_q_log_bias = Q_LOG_BIAS_4x4; + + m_ultra_quant = false; + + // Disabling by default to avoid transcoding outliers (try kodim26). The quality lost is very low. TODO: Could include the uber result in the output. + m_allow_uber_mode = false; + + m_mode7_full_s_optimization = true; + + m_take_first_non_clamping_mode11_submode = false; + m_take_first_non_clamping_mode7_submode = false; + + m_disable_weight_plane_optimization = true; +} + +//-------------------------------------------------------------------------------------------------------------------------- +// max usable qlog8 value is 247, 248=inf, >=249 is nan +// max usable qlog7 value is 123, 124=inf, >=125 is nan + +//const uint32_t TOTAL_USABLE_QLOG8 = 248; // 0-247 are usable, 0=0, 247=60416.0, 246=55296.0 + +// nearest values given a positive half float value (only) +static uint16_t g_half_to_qlog7[32768], g_half_to_qlog8[32768]; + +const uint32_t HALF_TO_QLOG_TABS_MIN_BITS = 7; +const uint32_t HALF_TO_QLOG_TABS_MAX_BITS = 8; +static uint16_t* g_pHalf_to_qlog_tabs[2] = +{ + g_half_to_qlog7, + g_half_to_qlog8, +}; + +#if 0 +static inline uint32_t half_to_qlog7_8(half_float h, uint32_t bits) +{ + assert((bits >= HALF_TO_QLOG_TABS_MIN_BITS) && (bits <= HALF_TO_QLOG_TABS_MAX_BITS)); + assert(h < 32768); + + return g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_MIN_BITS][h]; +} +#endif + +// TODO: Tune this +static inline uint32_t quant_qlog16(uint32_t q16, uint32_t desired_bits) +{ + assert((desired_bits >= 7) && (desired_bits <= 12)); + assert(q16 <= 65535); + + const uint32_t shift = 16 - desired_bits; + uint32_t e = (q16 + (1U << (shift - 1U)) - 1U) >> shift; + + uint32_t max_val = (1U << desired_bits) - 1U; + e = minimum(e, max_val); + + return e; +} + +static void compute_half_to_qlog_table(uint32_t bits, uint16_t* pTable, const basisu::vector& qlog16_to_float) +{ + assert(bits >= 5 && bits <= 12); + const uint32_t max_val = (1 << bits) - 1; + + 
const uint32_t FIRST_INVALID_QLOG16_INDEX = 63488; // first inf, rest are inf/nan's + assert(std::isinf(qlog16_to_float[FIRST_INVALID_QLOG16_INDEX])); + assert(std::isinf(qlog16_to_float[FIRST_INVALID_QLOG16_INDEX + 1])); + assert(!std::isnan(qlog16_to_float[FIRST_INVALID_QLOG16_INDEX - 1])); + assert(!std::isinf(qlog16_to_float[FIRST_INVALID_QLOG16_INDEX - 1])); + + // For all positive half-floats + for (uint32_t h = 0; h < 32768; h++) + { + // Skip invalid values + if (is_half_inf_or_nan((half_float)h)) + continue; + const float desired_val = half_to_float((half_float)h); + + float best_err = BIG_FLOAT_VAL; + uint32_t best_qlog = 0; + + double prev_err = BIG_FLOAT_VAL; + + // For all possible qlog's + for (uint32_t i = 0; i <= max_val; i++) + { + // Skip invalid values + uint32_t idx = i << (16 - bits); + if (idx >= FIRST_INVALID_QLOG16_INDEX) + break; + + float v = qlog16_to_float[idx]; + //assert(!std::isinf(v) && !std::isnan(v)); // too clostly in debug + + // Compute error + float err = fabsf(v - desired_val); + + if (err > prev_err) + { + // Every remaining entry will have guaranteed higher error + break; + } + + prev_err = err; + + // Find best + if (err < best_err) + { + best_err = err; + best_qlog = i; + + if (best_err == 0.0f) + break; + } + } + + pTable[h] = (uint16_t)best_qlog; + } +} + +static void init_qlog_tables() +{ + basisu::vector qlog16_to_float(65536); + + // for all possible qlog16, compute the corresponding half float + for (uint32_t i = 0; i <= 65535; i++) + { + half_float h = astc_helpers::qlog16_to_half(i); + + qlog16_to_float[i] = half_to_float(h); + } + +#if BASISU_MULTITHREADED_INIT + job_pool jp(3); + + for (uint32_t bits = HALF_TO_QLOG_TABS_MIN_BITS; bits <= HALF_TO_QLOG_TABS_MAX_BITS; bits++) + { + jp.add_job( [bits, &qlog16_to_float]() { compute_half_to_qlog_table(bits, g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_MIN_BITS], qlog16_to_float); }); + } + + jp.wait_for_all(); +#else + // for all possible half floats, find the nearest 
qlog5-12 float + for (uint32_t bits = HALF_TO_QLOG_TABS_MIN_BITS; bits <= HALF_TO_QLOG_TABS_MAX_BITS; bits++) + { + compute_half_to_qlog_table(bits, g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_MIN_BITS], qlog16_to_float); + +#if 0 + std::vector check_tab(32768); + compute_half_to_qlog_table_orig(bits, check_tab.data(), qlog16_to_float); + for (uint32_t i = 0; i < (1 << bits); i++) + { + assert(check_tab[i] == g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_MIN_BITS][i]); + } +#endif + } +#endif // BASISU_MULTITHREADED_INIT +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static vec3F calc_mean(uint32_t num_pixels, const vec4F* pPixels) +{ + vec3F mean(0.0f); + + for (uint32_t i = 0; i < num_pixels; i++) + { + const vec4F& p = pPixels[i]; + + mean[0] += p[0]; + mean[1] += p[1]; + mean[2] += p[2]; + } + + return mean / static_cast(num_pixels); +} + +static vec3F calc_rgb_pca(uint32_t num_pixels, const vec4F* pPixels, const vec3F& mean_color) +{ + float cov[6] = { 0, 0, 0, 0, 0, 0 }; + + for (uint32_t i = 0; i < num_pixels; i++) + { + const vec4F& v = pPixels[i]; + + float r = v[0] - mean_color[0]; + float g = v[1] - mean_color[1]; + float b = v[2] - mean_color[2]; + + cov[0] += r * r; + cov[1] += r * g; + cov[2] += r * b; + cov[3] += g * g; + cov[4] += g * b; + cov[5] += b * b; + } + + float xr = .9f, xg = 1.0f, xb = .7f; + for (uint32_t iter = 0; iter < 3; iter++) + { + float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; + float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; + float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; + + float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b)); + + if (m > 1e-10f) + { + m = 1.0f / m; + + r *= m; + g *= m; + b *= m; + } + + xr = r; + xg = g; + xb = b; + } + + float len = xr * xr + xg * xg + xb * xb; + + vec3F axis(0.5773502691f); + + if (len >= 1e-10f) + { + len = 1.0f / sqrtf(len); + + xr *= len; + xg *= len; + xb *= len; + + axis.set(xr, 
xg, xb); + } + + return axis; +} + +void encode_astc_block_stats::init(uint32_t num_pixels, const vec4F pBlock_pixels_q16[]) +{ + m_num_pixels = num_pixels; + m_mean_q16 = calc_mean(num_pixels, pBlock_pixels_q16); + m_axis_q16 = calc_rgb_pca(num_pixels, pBlock_pixels_q16, m_mean_q16); +} + +static vec3F interp_color(const vec3F& mean, const vec3F& dir, float df, const aabb3F& colorspace_box, const aabb3F& input_box, bool* pInside = nullptr) +{ +#if 0 + assert(mean[0] >= input_box[0][0]); + assert(mean[1] >= input_box[0][1]); + assert(mean[2] >= input_box[0][2]); + assert(mean[0] <= input_box[1][0]); + assert(mean[1] <= input_box[1][1]); + assert(mean[2] <= input_box[1][2]); +#endif + + if (pInside) + *pInside = false; + + vec3F k(mean + dir * df); + if (colorspace_box.contains(k)) + { + if (pInside) + *pInside = true; + + return k; + } + + // starts inside + vec3F s(mean); + + // ends outside + vec3F e(mean + dir * df); + + // a ray guaranteed to go from the outside to inside + ray3F r(e, (s - e).normalize_in_place()); + vec3F c; + float t = 0.0f; + + intersection::result res = intersection::ray_aabb(c, t, r, input_box); + if (res != intersection::cSuccess) + c = k; + + return c; +} + +// all in Q16 space, 0-65535 +static bool compute_least_squares_endpoints_rgb( + uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights, + vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box) +{ + // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 
+ float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; + float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; + float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; + float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; + + for (uint32_t i = 0; i < N; i++) + { + const uint32_t sel = pSelectors[i]; + + z00 += pSelector_weights[sel][0]; + z10 += pSelector_weights[sel][1]; + z11 += pSelector_weights[sel][2]; + + float w = pSelector_weights[sel][3]; + + q00_r += w * pColors[i][0]; + t_r += pColors[i][0]; + + q00_g += w * pColors[i][1]; + t_g += pColors[i][1]; + + q00_b += w * pColors[i][2]; + t_b += pColors[i][2]; + } + + q10_r = t_r - q00_r; + q10_g = t_g - q00_g; + q10_b = t_b - q00_b; + + z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (det == 0.0f) + return false; + + det = 1.0f / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + (*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r); + (*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r); + + (*pXl)[1] = (float)(iz00 * q00_g + iz01 * q10_g); + (*pXh)[1] = (float)(iz10 * q00_g + iz11 * q10_g); + + (*pXl)[2] = (float)(iz00 * q00_b + iz01 * q10_b); + (*pXh)[2] = (float)(iz10 * q00_b + iz11 * q10_b); + + for (uint32_t c = 0; c < 3; c++) + { + float l = (*pXl)[c], h = (*pXh)[c]; + + if (input_box.get_dim(c) < .0000125f) + { + l = input_box[0][c]; + h = input_box[1][c]; + } + + (*pXl)[c] = l; + (*pXh)[c] = h; + } + + vec3F mean((*pXl + *pXh) * .5f); + vec3F dir(*pXh - *pXl); + + float ln = dir.length(); + if (ln) + { + dir /= ln; + + float ld = (*pXl - mean).dot(dir); + float hd = (*pXh - mean).dot(dir); + + aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL)); + + bool was_inside1 = false; + + vec3F l = interp_color(mean, dir, ld, colorspace_box, input_box, &was_inside1); + if (!was_inside1) + *pXl = l; + + bool was_inside2 = false; + vec3F h = interp_color(mean, dir, hd, colorspace_box, input_box, &was_inside2); + if (!was_inside2) + *pXh = h; + } + + 
pXl->clamp(0.0f, MAX_QLOG16_VAL); + pXh->clamp(0.0f, MAX_QLOG16_VAL); + + return true; +} + +static bool compute_least_squares_endpoints_rgb_raw_weights( + uint32_t N, const uint8_t* pRaw_weights, + vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box) +{ + // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // I did this in matrix form first, expanded out all the ops, then optimized it a bit. + float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; + float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; + float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; + float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; + + for (uint32_t i = 0; i < N; i++) + { + const float wt = (float)pRaw_weights[i] * (1.0f / 64.0f); + assert(wt <= 1.0f); + + const float w0 = wt * wt; + const float w1 = (1.0f - wt) * wt; + const float w2 = (1.0f - wt) * (1.0f - wt); + const float w3 = wt; + + z00 += w0; + z10 += w1; + z11 += w2; + + float w = w3; + q00_r += w * pColors[i][0]; + t_r += pColors[i][0]; + + q00_g += w * pColors[i][1]; + t_g += pColors[i][1]; + + q00_b += w * pColors[i][2]; + t_b += pColors[i][2]; + } + + q10_r = t_r - q00_r; + q10_g = t_g - q00_g; + q10_b = t_b - q00_b; + + z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (det == 0.0f) + return false; + + det = 1.0f / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + (*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r); + (*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r); + + (*pXl)[1] = (float)(iz00 * q00_g + iz01 * q10_g); + (*pXh)[1] = (float)(iz10 * q00_g + iz11 * q10_g); + + (*pXl)[2] = (float)(iz00 * q00_b + iz01 * q10_b); + (*pXh)[2] = (float)(iz10 * q00_b + iz11 * q10_b); + + for (uint32_t c = 0; c < 3; c++) + { + float l = (*pXl)[c], h = (*pXh)[c]; + + if (input_box.get_dim(c) 
< .0000125f) + { + l = input_box[0][c]; + h = input_box[1][c]; + } + + (*pXl)[c] = l; + (*pXh)[c] = h; + } + + vec3F mean((*pXl + *pXh) * .5f); + vec3F dir(*pXh - *pXl); + + float ln = dir.length(); + if (ln) + { + dir /= ln; + + float ld = (*pXl - mean).dot(dir); + float hd = (*pXh - mean).dot(dir); + + aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL)); + + bool was_inside1 = false; + + vec3F l = interp_color(mean, dir, ld, colorspace_box, input_box, &was_inside1); + if (!was_inside1) + *pXl = l; + + bool was_inside2 = false; + vec3F h = interp_color(mean, dir, hd, colorspace_box, input_box, &was_inside2); + if (!was_inside2) + *pXh = h; + } + + pXl->clamp(0.0f, MAX_QLOG16_VAL); + pXh->clamp(0.0f, MAX_QLOG16_VAL); + + return true; +} + +static bool compute_least_squares_endpoints_2D( + uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights, + vec2F* pXl, vec2F* pXh, const vec2F* pColors, const aabb2F& input_box) +{ + // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 
+ float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; + float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; + float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; + + for (uint32_t i = 0; i < N; i++) + { + const uint32_t sel = pSelectors[i]; + z00 += pSelector_weights[sel][0]; + z10 += pSelector_weights[sel][1]; + z11 += pSelector_weights[sel][2]; + + float w = pSelector_weights[sel][3]; + q00_r += w * pColors[i][0]; + t_r += pColors[i][0]; + + q00_g += w * pColors[i][1]; + t_g += pColors[i][1]; + } + + q10_r = t_r - q00_r; + q10_g = t_g - q00_g; + + z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (det == 0.0f) + return false; + + det = 1.0f / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + (*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r); + (*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r); + + (*pXl)[1] = (float)(iz00 * q00_g + iz01 * q10_g); + (*pXh)[1] = (float)(iz10 * q00_g + iz11 * q10_g); + + for (uint32_t c = 0; c < 2; c++) + { + float l = (*pXl)[c], h = (*pXh)[c]; + + if (input_box.get_dim(c) < .0000125f) + { + l = input_box[0][c]; + h = input_box[1][c]; + } + + (*pXl)[c] = l; + (*pXh)[c] = h; + } + + pXl->clamp(0.0f, MAX_QLOG16_VAL); + pXh->clamp(0.0f, MAX_QLOG16_VAL); + + return true; +} + +static bool compute_least_squares_endpoints_1D( + uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights, + vec1F* pXl, vec1F* pXh, const vec1F* pColors, const aabb1F& input_box) +{ + // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // I did this in matrix form first, expanded out all the ops, then optimized it a bit. 
+ float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; + float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; + + for (uint32_t i = 0; i < N; i++) + { + const uint32_t sel = pSelectors[i]; + z00 += pSelector_weights[sel][0]; + z10 += pSelector_weights[sel][1]; + z11 += pSelector_weights[sel][2]; + + float w = pSelector_weights[sel][3]; + q00_r += w * pColors[i][0]; + t_r += pColors[i][0]; + } + + q10_r = t_r - q00_r; + + z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (det == 0.0f) + return false; + + det = 1.0f / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + (*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r); + (*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r); + + for (uint32_t c = 0; c < 1; c++) + { + float l = (*pXl)[c], h = (*pXh)[c]; + + if (input_box.get_dim(c) < .0000125f) + { + l = input_box[0][c]; + h = input_box[1][c]; + } + + (*pXl)[c] = l; + (*pXh)[c] = h; + } + + pXl->clamp(0.0f, MAX_QLOG16_VAL); + pXh->clamp(0.0f, MAX_QLOG16_VAL); + + return true; +} + +static bool compute_weighted_least_squares_endpoints_rgb( + uint32_t N, + const uint8_t* pSelectors, const vec4F* pSelector_weights, const float* pRaw_weights, /* ti */ + const float* pEmphasis_weights /* wi */, + vec3F* pXl, vec3F* pXh, + const vec4F* pColors, /* pi */ + const aabb3F& input_box) +{ + (void)input_box; + + assert(N); + assert((pSelectors && pSelector_weights) || pRaw_weights); + assert(pEmphasis_weights); + + // Pi = pixel colors + // Ti = project weights, [0,1] + // Wi = emphasis weights + + float total_wi = 0.0f; + for (uint32_t i = 0; i < N; i++) + total_wi += pEmphasis_weights[i]; + + if (total_wi == 0.0f) + return false; + + float weighted_mean_tw = 0.0f; + float weighted_mean_pw[3] = { 0.0f }; + + for (uint32_t i = 0; i < N; i++) + { + const float wi = pEmphasis_weights[i]; + const float ti = pSelectors ? 
pSelector_weights[pSelectors[i]][3] : pRaw_weights[i]; + const float pi_r = pColors[i][0], pi_g = pColors[i][1], pi_b = pColors[i][2]; + + weighted_mean_tw += wi * ti; + + weighted_mean_pw[0] += wi * pi_r; + weighted_mean_pw[1] += wi * pi_g; + weighted_mean_pw[2] += wi * pi_b; + } + + weighted_mean_tw /= total_wi; + + weighted_mean_pw[0] /= total_wi; + weighted_mean_pw[1] /= total_wi; + weighted_mean_pw[2] /= total_wi; + + float spt[3] = { 0.0f }; + float stt = 0.0f; + + for (uint32_t i = 0; i < N; i++) + { + const float wi = pEmphasis_weights[i]; + const float ti = pSelectors ? pSelector_weights[pSelectors[i]][3] : pRaw_weights[i]; + const float pi_r = pColors[i][0], pi_g = pColors[i][1], pi_b = pColors[i][2]; + + spt[0] += wi * (pi_r - weighted_mean_pw[0]) * (ti - weighted_mean_tw); + spt[1] += wi * (pi_g - weighted_mean_pw[1]) * (ti - weighted_mean_tw); + spt[2] += wi * (pi_b - weighted_mean_pw[2]) * (ti - weighted_mean_tw); + + stt += wi * square(ti - weighted_mean_tw); + } + + if (stt == 0.0f) + return false; + + for (uint32_t i = 0; i < 3; i++) + { + float h = weighted_mean_pw[i] + (spt[i] / stt) * (1.0f - weighted_mean_tw); + float l = weighted_mean_pw[i] - (spt[i] / stt) * weighted_mean_tw; + + (*pXh)[i] = h; + (*pXl)[i] = l; + } + + pXl->clamp(0.0f, MAX_QLOG16_VAL); + pXh->clamp(0.0f, MAX_QLOG16_VAL); + + return true; +} + +static vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; + +static uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][astc_index] -> linear index +static uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][MAX_SUPPORTED_WEIGHT_LEVELS]; // [ise_range][linear_index] -> astc_index + +static void encode_astc_hdr_init() +{ + // Precomputed weight constants used during least fit determination. 
For each entry: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w + for (uint32_t range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; range++) + { + const uint32_t num_levels = g_ise_weight_lerps[range][0]; + assert(num_levels == astc_helpers::get_ise_levels(range)); + assert((num_levels >= MIN_SUPPORTED_WEIGHT_LEVELS) && (num_levels <= MAX_SUPPORTED_WEIGHT_LEVELS)); + + for (uint32_t i = 0; i < num_levels; i++) + { + float w = g_ise_weight_lerps[range][1 + i] * (1.0f / 64.0f); + + g_astc_ls_weights_ise[range][i].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w); + } + } + + for (uint32_t ise_range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; ise_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; ise_range++) + { + const uint32_t num_levels = g_ise_weight_lerps[ise_range][0]; + assert((num_levels >= MIN_SUPPORTED_WEIGHT_LEVELS) && (num_levels <= MAX_SUPPORTED_WEIGHT_LEVELS)); + + uint32_t s[MAX_SUPPORTED_WEIGHT_LEVELS]; + for (uint32_t i = 0; i < num_levels; i++) + s[i] = (g_ise_weight_lerps[ise_range][1 + i] << 8) + i; + + std::sort(s, s + num_levels); + + for (uint32_t i = 0; i < num_levels; i++) + g_map_linear_to_astc_order[ise_range][i] = (uint8_t)(s[i] & 0xFF); + + for (uint32_t i = 0; i < num_levels; i++) + g_map_astc_to_linear_order[ise_range][g_map_linear_to_astc_order[ise_range][i]] = (uint8_t)i; + } + + //init_quantize_tables(); +} + +bool g_astc_hdr_enc_initialized; + +void astc_hdr_enc_init() +{ + if (g_astc_hdr_enc_initialized) + return; + + astc_hdr_core_init(); + + astc_helpers::init_tables(true); + + init_qlog_tables(); + + encode_astc_hdr_init(); + + g_astc_hdr_enc_initialized = true; +} + +void interpolate_qlog12_colors( + const int e[2][3], + half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range) +{ + assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + + for (uint32_t i = 0; i < 2; i++) + { + for (uint32_t j = 0; j < 3; j++) + { 
+ assert(in_range(e[i][j], 0, 0xFFF)); + } + } + + for (uint32_t i = 0; i < n; i++) + { + const int c = g_ise_weight_lerps[ise_weight_range][1 + i]; + assert(c == (int)astc_helpers::dequant_bise_weight(i, ise_weight_range)); + + half_float rf, gf, bf; + + { + uint32_t r0 = e[0][0] << 4; + uint32_t r1 = e[1][0] << 4; + int ri = (r0 * (64 - c) + r1 * c + 32) / 64; + rf = astc_helpers::qlog16_to_half(ri); + } + + { + uint32_t g0 = e[0][1] << 4; + uint32_t g1 = e[1][1] << 4; + int gi = (g0 * (64 - c) + g1 * c + 32) / 64; + gf = astc_helpers::qlog16_to_half(gi); + } + + { + uint32_t b0 = e[0][2] << 4; + uint32_t b1 = e[1][2] << 4; + int bi = (b0 * (64 - c) + b1 * c + 32) / 64; + bf = astc_helpers::qlog16_to_half(bi); + } + + if (pDecoded_half) + { + pDecoded_half[i * 3 + 0] = rf; + pDecoded_half[i * 3 + 1] = gf; + pDecoded_half[i * 3 + 2] = bf; + } + + if (pDecoded_float) + { + pDecoded_float[i][0] = half_to_float(rf); + pDecoded_float[i][1] = half_to_float(gf); + pDecoded_float[i][2] = half_to_float(bf); + } + } +} + +// decoded in ASTC order, not linear order +// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded +bool get_astc_hdr_mode_11_block_colors( + const uint8_t* pEndpoints, + half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range) +{ + assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + + int e[2][3]; + if (!decode_mode11_to_qlog12(pEndpoints, e, ise_endpoint_range)) + return false; + + interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range); + + return true; +} + +// decoded in ASTC order, not linear order +// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded +bool get_astc_hdr_mode_7_block_colors( + const uint8_t* pEndpoints, + half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range, 
uint32_t ise_endpoint_range) +{ + assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + + int e[2][3]; + if (!decode_mode7_to_qlog12(pEndpoints, e, nullptr, ise_endpoint_range)) + return false; + + interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range); + + return true; +} + +double eval_selectors_f( + uint32_t num_pixels, + uint8_t* pWeights, + const half_float* pBlock_pixels_half, + uint32_t num_weight_levels, + const half_float* pDecoded_half, + const astc_hdr_codec_base_options& coptions, + uint32_t usable_selector_bitmask) +{ + assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS)); + assert(usable_selector_bitmask); + + const float R_WEIGHT = coptions.m_r_err_scale; + const float G_WEIGHT = coptions.m_g_err_scale; + + double total_error = 0; + +#ifdef _DEBUG + for (uint32_t i = 0; i < num_weight_levels; i++) + { + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0])); + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1])); + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2])); + } +#endif + + double decoded_half_q[MAX_SUPPORTED_WEIGHT_LEVELS][3]; + + for (uint32_t i = 0; i < num_weight_levels; i++) + { + const half_float* p = &pDecoded_half[i * 3]; + + decoded_half_q[i][0] = q(p[0], coptions.m_q_log_bias); + decoded_half_q[i][1] = q(p[1], coptions.m_q_log_bias); + decoded_half_q[i][2] = q(p[2], coptions.m_q_log_bias); + } + + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; + + const double desired_half_r_q = q(pDesired_half[0], coptions.m_q_log_bias); + const double desired_half_g_q = q(pDesired_half[1], coptions.m_q_log_bias); + const double desired_half_b_q = q(pDesired_half[2], coptions.m_q_log_bias); + + double lowest_e = BIG_FLOAT_VAL; + + //double dists[MAX_SUPPORTED_WEIGHT_LEVELS]; + + // this is an approximation of MSLE + for (uint32_t i = 0; i < num_weight_levels; i++) + { + 
if (((1 << i) & usable_selector_bitmask) == 0) + continue; + + // compute piecewise linear approximation of log2(a+eps)-log2(b+eps), for each component, then MSLE + double rd = decoded_half_q[i][0] - desired_half_r_q; + double gd = decoded_half_q[i][1] - desired_half_g_q; + double bd = decoded_half_q[i][2] - desired_half_b_q; + + double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + + //dists[i] = e; + + if (e < lowest_e) + { + lowest_e = e; + pWeights[p] = (uint8_t)i; + } + } + + total_error += lowest_e; + + } // p + + return total_error; +} + +double eval_selectors( + uint32_t num_pixels, + uint8_t* pWeights, + uint32_t ise_weight_range, + const half_float* pBlock_pixels_half, + uint32_t num_weight_levels, + const half_float* pDecoded_half, + const astc_hdr_codec_base_options& coptions, + uint32_t usable_selector_bitmask) +{ + if ((coptions.m_r_err_scale != 2.0f) || (coptions.m_g_err_scale != 3.0f)) + { + return eval_selectors_f( + num_pixels, + pWeights, + pBlock_pixels_half, + num_weight_levels, + pDecoded_half, + coptions, + usable_selector_bitmask); + } + + assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS)); + assert(usable_selector_bitmask); + + uint64_t total_error = 0; + +#ifdef _DEBUG + for (uint32_t i = 0; i < num_weight_levels; i++) + { + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0])); + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1])); + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2])); + } +#endif + + int64_t decoded_half_q[MAX_SUPPORTED_WEIGHT_LEVELS][3]; + + for (uint32_t i = 0; i < num_weight_levels; i++) + { + const half_float* p = &pDecoded_half[i * 3]; + + decoded_half_q[i][0] = q2(p[0], coptions.m_q_log_bias); + decoded_half_q[i][1] = q2(p[1], coptions.m_q_log_bias); + decoded_half_q[i][2] = q2(p[2], coptions.m_q_log_bias); + } + + if (usable_selector_bitmask != UINT32_MAX) + { + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; + 
+ const int64_t desired_half_r_q = q2(pDesired_half[0], coptions.m_q_log_bias); + const int64_t desired_half_g_q = q2(pDesired_half[1], coptions.m_q_log_bias); + const int64_t desired_half_b_q = q2(pDesired_half[2], coptions.m_q_log_bias); + + int64_t lowest_e = INT64_MAX; + + for (uint32_t i = 0; i < num_weight_levels; i++) + { + if (((1 << i) & usable_selector_bitmask) == 0) + continue; + + int64_t rd = decoded_half_q[i][0] - desired_half_r_q; + int64_t gd = decoded_half_q[i][1] - desired_half_g_q; + int64_t bd = decoded_half_q[i][2] - desired_half_b_q; + + int64_t e = 2 * (rd * rd) + 3 * (gd * gd) + bd * bd; + + if (e < lowest_e) + { + lowest_e = e; + pWeights[p] = (uint8_t)i; + } + } + + total_error += lowest_e; + + } // p + } + else + { + if ((num_weight_levels <= 4) || (coptions.m_disable_weight_plane_optimization)) + { + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; + + const half_float desired_r = pDesired_half[0], desired_g = pDesired_half[1], desired_b = pDesired_half[2]; + + const int64_t desired_half_r_q = q2(desired_r, coptions.m_q_log_bias); + const int64_t desired_half_g_q = q2(desired_g, coptions.m_q_log_bias); + const int64_t desired_half_b_q = q2(desired_b, coptions.m_q_log_bias); + + int64_t lowest_e = INT64_MAX; + + uint32_t i; + for (i = 0; (i + 1) < num_weight_levels; i += 2) + { + int64_t e0, e1; + + { + int64_t rd0 = decoded_half_q[i][0] - desired_half_r_q; // 27 bits maximum with half float inputs + int64_t gd0 = decoded_half_q[i][1] - desired_half_g_q; + int64_t bd0 = decoded_half_q[i][2] - desired_half_b_q; + e0 = ((2 * (rd0 * rd0) + 3 * (gd0 * gd0) + bd0 * bd0) << 5) | i; // max 62 bits (27*2+3+5) + } + + { + int64_t rd1 = decoded_half_q[i + 1][0] - desired_half_r_q; + int64_t gd1 = decoded_half_q[i + 1][1] - desired_half_g_q; + int64_t bd1 = decoded_half_q[i + 1][2] - desired_half_b_q; + e1 = ((2 * (rd1 * rd1) + 3 * (gd1 * gd1) + bd1 * bd1) << 5) | (i + 1); + } + + lowest_e 
= minimum(lowest_e, e0, e1); + } + + if (i != num_weight_levels) + { + int64_t rd0 = decoded_half_q[i][0] - desired_half_r_q; + int64_t gd0 = decoded_half_q[i][1] - desired_half_g_q; + int64_t bd0 = decoded_half_q[i][2] - desired_half_b_q; + int64_t e0 = ((2 * (rd0 * rd0) + 3 * (gd0 * gd0) + bd0 * bd0) << 5) | i; + + lowest_e = minimum(lowest_e, e0); + } + + pWeights[p] = (uint8_t)(lowest_e & 31); + + total_error += (lowest_e >> 5); + + } // p + } + else + { + const auto& weight_val_to_ise_tab = astc_helpers::g_dequant_tables.get_weight_tab(ise_weight_range).m_val_to_ise; + const int lo_index = weight_val_to_ise_tab[0], hi_index = weight_val_to_ise_tab[64], mid_index = weight_val_to_ise_tab[32]; + + const vec3F low_color((float)pDecoded_half[lo_index * 3 + 0], (float)pDecoded_half[lo_index * 3 + 1], (float)pDecoded_half[lo_index * 3 + 2]); + const vec3F high_color((float)pDecoded_half[hi_index * 3 + 0], (float)pDecoded_half[hi_index * 3 + 1], (float)pDecoded_half[hi_index * 3 + 2]); + const vec3F mid_color((float)pDecoded_half[mid_index * 3 + 0], (float)pDecoded_half[mid_index * 3 + 1], (float)pDecoded_half[mid_index * 3 + 2]); + + const vec3F block_dir(high_color - low_color); + + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; + + const half_float desired_r = pDesired_half[0], desired_g = pDesired_half[1], desired_b = pDesired_half[2]; + + const int64_t desired_half_r_q = q2(desired_r, coptions.m_q_log_bias); + const int64_t desired_half_g_q = q2(desired_g, coptions.m_q_log_bias); + const int64_t desired_half_b_q = q2(desired_b, coptions.m_q_log_bias); + + // Determine which side of the middle plane the point is for a modest gain + vec3F c((float)desired_r - mid_color[0], (float)desired_g - mid_color[1], (float)desired_b - mid_color[2]); + float d = c.dot(block_dir); + + int i = 0, high_index = (num_weight_levels / 2) + 1; + if (d >= 0.0f) + { + i = num_weight_levels / 2; + high_index = 
num_weight_levels; + } + + int64_t lowest_e = INT64_MAX; + + for (; (i + 1) < high_index; i += 2) + { + int64_t e0, e1; + + { + int64_t rd0 = decoded_half_q[i][0] - desired_half_r_q; // 27 bits maximum with half float inputs + int64_t gd0 = decoded_half_q[i][1] - desired_half_g_q; + int64_t bd0 = decoded_half_q[i][2] - desired_half_b_q; + e0 = ((2 * (rd0 * rd0) + 3 * (gd0 * gd0) + bd0 * bd0) << 5) | i; // max 62 bits (27*2+3+5) + } + + { + int64_t rd1 = decoded_half_q[i + 1][0] - desired_half_r_q; + int64_t gd1 = decoded_half_q[i + 1][1] - desired_half_g_q; + int64_t bd1 = decoded_half_q[i + 1][2] - desired_half_b_q; + e1 = ((2 * (rd1 * rd1) + 3 * (gd1 * gd1) + bd1 * bd1) << 5) | (i + 1); + } + + lowest_e = minimum(lowest_e, e0, e1); + } + + if (i != high_index) + { + int64_t rd0 = decoded_half_q[i][0] - desired_half_r_q; + int64_t gd0 = decoded_half_q[i][1] - desired_half_g_q; + int64_t bd0 = decoded_half_q[i][2] - desired_half_b_q; + int64_t e0 = ((2 * (rd0 * rd0) + 3 * (gd0 * gd0) + bd0 * bd0) << 5) | i; + + lowest_e = minimum(lowest_e, e0); + } + + pWeights[p] = (uint8_t)(lowest_e & 31); + + total_error += (lowest_e >> 5); + + } // p + } + } + + return (double)total_error; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +double eval_selectors_dual_plane( + uint32_t channel_index, + uint32_t num_pixels, + uint8_t* pWeights0, uint8_t* pWeights1, + const half_float* pBlock_pixels_half, + uint32_t num_weight_levels, + const half_float* pDecoded_half, + const astc_hdr_codec_base_options& coptions, + uint32_t usable_selector_bitmask) +{ + assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS)); + assert(usable_selector_bitmask); + + const float R_WEIGHT = coptions.m_r_err_scale; + const float G_WEIGHT = coptions.m_g_err_scale; + + double total_error = 0; + +#ifdef _DEBUG + for (uint32_t i = 0; i < num_weight_levels; i++) + { + 
assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0])); + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1])); + assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2])); + } +#endif + + double decoded_half_q[MAX_SUPPORTED_WEIGHT_LEVELS][3]; + + for (uint32_t i = 0; i < num_weight_levels; i++) + { + const half_float* p = &pDecoded_half[i * 3]; + + decoded_half_q[i][0] = q(p[0], coptions.m_q_log_bias); + decoded_half_q[i][1] = q(p[1], coptions.m_q_log_bias); + decoded_half_q[i][2] = q(p[2], coptions.m_q_log_bias); + } + + const double channel_weights[3] = { R_WEIGHT, G_WEIGHT, 1.0f }; + + const uint32_t first_channel = (channel_index + 1) % 3; + const uint32_t second_channel = (channel_index + 2) % 3; + + // First plane + const double first_channel_weight = channel_weights[first_channel]; + const double second_channel_weight = channel_weights[second_channel]; + + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; + + const double desired_half_x_q = q(pDesired_half[first_channel], coptions.m_q_log_bias); + const double desired_half_y_q = q(pDesired_half[second_channel], coptions.m_q_log_bias); + + double lowest_e = BIG_FLOAT_VAL; + + // this is an approximation of MSLE + for (uint32_t i = 0; i < num_weight_levels; i++) + { + if (((1 << i) & usable_selector_bitmask) == 0) + continue; + + double xd = decoded_half_q[i][first_channel] - desired_half_x_q; + double yd = decoded_half_q[i][second_channel] - desired_half_y_q; + + double e = first_channel_weight * (xd * xd) + second_channel_weight * (yd * yd); + + if (e < lowest_e) + { + lowest_e = e; + pWeights0[p] = (uint8_t)i; + } + } + + total_error += lowest_e; + + } // p + + // Second plane + const double alt_channel_weight = channel_weights[channel_index]; + + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; + + const double desired_half_a_q = q(pDesired_half[channel_index], coptions.m_q_log_bias); + + 
double lowest_e = BIG_FLOAT_VAL; + + // this is an approximation of MSLE + for (uint32_t i = 0; i < num_weight_levels; i++) + { + if (((1 << i) & usable_selector_bitmask) == 0) + continue; + + double ad = decoded_half_q[i][channel_index] - desired_half_a_q; + + double e = alt_channel_weight * (ad * ad); + + if (e < lowest_e) + { + lowest_e = e; + pWeights1[p] = (uint8_t)i; + } + } + + total_error += lowest_e; + + } // p + + return total_error; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +double compute_block_error(uint32_t num_pixels, const half_float* pOrig_block, const half_float* pPacked_block, const astc_hdr_codec_base_options& coptions) +{ + const float R_WEIGHT = coptions.m_r_err_scale; + const float G_WEIGHT = coptions.m_g_err_scale; + + double total_error = 0; + + for (uint32_t p = 0; p < num_pixels; p++) + { + double rd = q(pOrig_block[p * 3 + 0], coptions.m_q_log_bias) - q(pPacked_block[p * 3 + 0], coptions.m_q_log_bias); + double gd = q(pOrig_block[p * 3 + 1], coptions.m_q_log_bias) - q(pPacked_block[p * 3 + 1], coptions.m_q_log_bias); + double bd = q(pOrig_block[p * 3 + 2], coptions.m_q_log_bias) - q(pPacked_block[p * 3 + 2], coptions.m_q_log_bias); + + double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + + total_error += e; + } + + return total_error; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +double compute_block_error_from_raw_weights( + uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], + const uint8_t* pRaw_weights, + int endpoints_qlog12[2][3], + const astc_hdr_codec_base_options& coptions) +{ + // qlog12->qlog16 + int trial_e[2][3]; + for (uint32_t i = 0; i < 3; i++) + { + assert(endpoints_qlog12[0][i] <= (int)basist::MAX_QLOG12); + assert(endpoints_qlog12[1][i] <= (int)basist::MAX_QLOG12); + + trial_e[0][i] = endpoints_qlog12[0][i] << 
4; + trial_e[1][i] = endpoints_qlog12[1][i] << 4; + } + + const float R_WEIGHT = coptions.m_r_err_scale, G_WEIGHT = coptions.m_g_err_scale; + + double trial_error = 0; + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p][0]; + + const double desired_half_r_q = q(pDesired_half[0], coptions.m_q_log_bias), desired_half_g_q = q(pDesired_half[1], coptions.m_q_log_bias), desired_half_b_q = q(pDesired_half[2], coptions.m_q_log_bias); + + const uint32_t c = pRaw_weights[p]; + assert(c <= 64); + + { + half_float rf, gf, bf; + { + uint32_t r0 = trial_e[0][0], r1 = trial_e[1][0]; + int ri = (r0 * (64 - c) + r1 * c + 32) / 64; + rf = astc_helpers::qlog16_to_half(ri); + } + { + uint32_t g0 = trial_e[0][1], g1 = trial_e[1][1]; + int gi = (g0 * (64 - c) + g1 * c + 32) / 64; + gf = astc_helpers::qlog16_to_half(gi); + } + { + uint32_t b0 = trial_e[0][2], b1 = trial_e[1][2]; + int bi = (b0 * (64 - c) + b1 * c + 32) / 64; + bf = astc_helpers::qlog16_to_half(bi); + } + + const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias); + const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q; + trial_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + } + } + + return trial_error; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static inline int compute_clamped_val(int v, int l, int h, bool& did_clamp, int& max_clamp_mag) +{ + assert(l < h); + + if (v < l) + { + max_clamp_mag = basisu::maximum(max_clamp_mag, l - v); + + v = l; + did_clamp = true; + } + else if (v > h) + { + max_clamp_mag = basisu::maximum(max_clamp_mag, v - h); + + v = h; + did_clamp = true; + } + + return v; +} + 
+//-------------------------------------------------------------------------------------------------------------------------- + +const uint8_t s_b_bits[8] = { 7, 8, 6, 7, 8, 6, 7, 6 }; +const uint8_t s_c_bits[8] = { 6, 6, 7, 7, 6, 7, 7, 7 }; +const uint8_t s_d_bits[8] = { 7, 6, 7, 6, 5, 6, 5, 6 }; + +// val_q[] must be already packed to qlog9-qlog12. +bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, int val_q[2][3], int& max_clamp_mag, bool early_out_if_clamped, int max_clamp_mag_accept_thresh) +{ + assert(submode <= 7); + + const uint32_t a_bits = 9 + (submode >> 1); + const uint32_t b_bits = s_b_bits[submode]; + const uint32_t c_bits = s_c_bits[submode]; + const uint32_t d_bits = s_d_bits[submode]; + + const int max_a_val = (1 << a_bits) - 1; + const int max_b_val = (1 << b_bits) - 1; + const int max_c_val = (1 << c_bits) - 1; + + // The maximum usable value before it turns to NaN/Inf + const int max_a_qlog = get_max_qlog(a_bits); + BASISU_NOTE_UNUSED(max_a_qlog); + + const int min_d_val = -(1 << (d_bits - 1)); + const int max_d_val = -min_d_val - 1; + assert((max_d_val - min_d_val + 1) == (1 << d_bits)); + + int highest_q = -1, highest_val = 0, highest_comp = 0; + + for (uint32_t c = 0; c < 3; c++) + { + assert(val_q[0][c] <= max_a_qlog); + assert(val_q[1][c] <= max_a_qlog); + } + + for (uint32_t v = 0; v < 2; v++) + { + for (uint32_t c = 0; c < 3; c++) + { + assert(val_q[v][c] >= 0 && val_q[v][c] <= max_a_val); + + if (val_q[v][c] > highest_q) + { + highest_q = val_q[v][c]; + highest_val = v; + highest_comp = c; + } + } + } + + const bool had_tie = (val_q[highest_val ^ 1][highest_comp] == highest_q); + + if (highest_val != 1) + { + for (uint32_t c = 0; c < 3; c++) + { + std::swap(val_q[0][c], val_q[1][c]); + } + } + + if (highest_comp) + { + std::swap(val_q[0][0], val_q[0][highest_comp]); + std::swap(val_q[1][0], val_q[1][highest_comp]); + } + + int orig_q[2][3]; + memcpy(orig_q, val_q, sizeof(int) * 6); + + // val[1][0] is now guaranteed to 
be highest + int best_va = 0, best_vb0 = 0, best_vb1 = 0, best_vc = 0, best_vd0 = 0, best_vd1 = 0; + int best_max_clamp_mag = 0; + bool best_did_clamp = false; + int best_q[2][3] = { { 0, 0, 0}, { 0, 0, 0 } }; + BASISU_NOTE_UNUSED(best_q); + uint32_t best_dist = UINT_MAX; + + for (uint32_t pass = 0; pass < 2; pass++) + { + int trial_va = val_q[1][0]; + + assert(trial_va <= max_a_val); + assert(trial_va >= val_q[1][1]); + assert(trial_va >= val_q[1][2]); + + assert(trial_va >= val_q[0][0]); + assert(trial_va >= val_q[0][1]); + assert(trial_va >= val_q[0][2]); + + bool did_clamp = false; + int trial_max_clamp_mag = 0; + + int trial_vb0 = compute_clamped_val(trial_va - val_q[1][1], 0, max_b_val, did_clamp, trial_max_clamp_mag); + int trial_vb1 = compute_clamped_val(trial_va - val_q[1][2], 0, max_b_val, did_clamp, trial_max_clamp_mag); + int trial_vc = compute_clamped_val(trial_va - val_q[0][0], 0, max_c_val, did_clamp, trial_max_clamp_mag); + int trial_vd0 = compute_clamped_val((trial_va - trial_vb0 - trial_vc) - val_q[0][1], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag); + int trial_vd1 = compute_clamped_val((trial_va - trial_vb1 - trial_vc) - val_q[0][2], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag); + + if ((early_out_if_clamped) && (did_clamp) && (trial_max_clamp_mag > max_clamp_mag_accept_thresh)) + { + if ((!had_tie) || (pass == 1)) + { + max_clamp_mag = trial_max_clamp_mag; + return true; + } + } + + if (!did_clamp) + { + // Make sure decoder gets the expected values + assert(trial_va == val_q[1][0]); + assert(trial_va - trial_vb0 == val_q[1][1]); + assert(trial_va - trial_vb1 == val_q[1][2]); + + assert((trial_va - trial_vc) == val_q[0][0]); + assert((trial_va - trial_vb0 - trial_vc - trial_vd0) == val_q[0][1]); + assert((trial_va - trial_vb1 - trial_vc - trial_vd1) == val_q[0][2]); + } + + const int r_e0 = clamp(trial_va, 0, max_a_val); + const int r_e1 = clamp(trial_va - trial_vb0, 0, max_a_val); + const int r_e2 = clamp(trial_va - 
trial_vb1, 0, max_a_val); + + const int r_f0 = clamp(trial_va - trial_vc, 0, max_a_val); + const int r_f1 = clamp(trial_va - trial_vb0 - trial_vc - trial_vd0, 0, max_a_val); + const int r_f2 = clamp(trial_va - trial_vb1 - trial_vc - trial_vd1, 0, max_a_val); + + assert(r_e0 <= max_a_qlog); + assert(r_e1 <= max_a_qlog); + assert(r_e2 <= max_a_qlog); + + assert(r_f0 <= max_a_qlog); + assert(r_f1 <= max_a_qlog); + assert(r_f2 <= max_a_qlog); + + if ((!did_clamp) || (!had_tie)) + { + best_va = trial_va; + best_vb0 = trial_vb0; + best_vb1 = trial_vb1; + best_vc = trial_vc; + best_vd0 = trial_vd0; + best_vd1 = trial_vd1; + best_max_clamp_mag = trial_max_clamp_mag; + best_did_clamp = did_clamp; + + best_q[1][0] = r_e0; + best_q[1][1] = r_e1; + best_q[1][2] = r_e2; + best_q[0][0] = r_f0; + best_q[0][1] = r_f1; + best_q[0][2] = r_f2; + break; + } + + // we had a tie and it did clamp, try swapping L/H for a potential slight gain + + const uint32_t r_dist1 = basisu::square(r_e0 - val_q[1][0]) + basisu::square(r_e1 - val_q[1][1]) + basisu::square(r_e2 - val_q[1][2]); + const uint32_t r_dist0 = basisu::square(r_f0 - val_q[0][0]) + basisu::square(r_f1 - val_q[0][1]) + basisu::square(r_f2 - val_q[0][2]); + + const uint32_t total_dist = r_dist1 + r_dist0; + + if (total_dist < best_dist) + { + best_dist = total_dist; + + best_va = trial_va; + best_vb0 = trial_vb0; + best_vb1 = trial_vb1; + best_vc = trial_vc; + best_vd0 = trial_vd0; + best_vd1 = trial_vd1; + best_did_clamp = did_clamp; + + best_q[1][0] = r_e0; + best_q[1][1] = r_e1; + best_q[1][2] = r_e2; + best_q[0][0] = r_f0; + best_q[0][1] = r_f1; + best_q[0][2] = r_f2; + } + + for (uint32_t c = 0; c < 3; c++) + std::swap(val_q[0][c], val_q[1][c]); + } + + // pack bits now + int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0; + + int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0; + switch (submode) + { + case 0: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = 
get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 1: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 2: + x0 = get_bit(best_va, 9); x1 = get_bit(best_vc, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 3: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 9); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 4: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10); + break; + case 5: + x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_vc, 7); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + case 6: + x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10); + break; + case 7: + x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); + break; + default: + break; + } + + // write mode + pack_bit(v1, 7, submode, 0); + pack_bit(v2, 7, submode, 1); + pack_bit(v3, 7, submode, 2); + + // highest component + pack_bit(v4, 7, highest_comp, 0); + pack_bit(v5, 7, highest_comp, 1); + + // write bit 8 of va + pack_bit(v1, 6, best_va, 8); + + // extra bits + pack_bit(v2, 6, x0); + pack_bit(v3, 6, x1); + pack_bit(v4, 6, x2); + pack_bit(v5, 6, x3); + pack_bit(v4, 5, x4); + pack_bit(v5, 5, x5); + + v0 = best_va & 0xFF; + v1 |= (best_vc & 63); + v2 |= (best_vb0 & 63); + v3 |= (best_vb1 & 63); + v4 |= (best_vd0 & 31); + v5 |= (best_vd1 & 31); + + assert(in_range(v0, 0, 255) && in_range(v1, 0, 255) && in_range(v2, 0, 255) 
&& in_range(v3, 0, 255) && in_range(v4, 0, 255) && in_range(v5, 0, 255)); + + pEndpoints[0] = (uint8_t)v0; + pEndpoints[1] = (uint8_t)v1; + pEndpoints[2] = (uint8_t)v2; + pEndpoints[3] = (uint8_t)v3; + pEndpoints[4] = (uint8_t)v4; + pEndpoints[5] = (uint8_t)v5; + +#ifdef _DEBUG + // Test for valid pack by unpacking + { + if (highest_comp) + { + std::swap(best_q[0][0], best_q[0][highest_comp]); + std::swap(best_q[1][0], best_q[1][highest_comp]); + + std::swap(orig_q[0][0], orig_q[0][highest_comp]); + std::swap(orig_q[1][0], orig_q[1][highest_comp]); + } + + int test_e[2][3]; + decode_mode11_to_qlog12(pEndpoints, test_e, astc_helpers::BISE_256_LEVELS); + for (uint32_t i = 0; i < 2; i++) + { + for (uint32_t j = 0; j < 3; j++) + { + assert(best_q[i][j] == test_e[i][j] >> (12 - a_bits)); + + if (!best_did_clamp) + { + assert((orig_q[i][j] == test_e[i][j] >> (12 - a_bits)) || + (orig_q[1 - i][j] == test_e[i][j] >> (12 - a_bits))); + } + } + } + } +#endif + + max_clamp_mag = best_max_clamp_mag; + + return best_did_clamp; +} + +bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag, bool early_out_if_clamped, int max_clamp_mag_accept_thresh) +{ + assert(submode <= 7); + + const uint32_t a_bits = 9 + (submode >> 1); + const int max_a_val = (1 << a_bits) - 1; + + // The maximum usable value before it turns to NaN/Inf + const int max_a_qlog = get_max_qlog(a_bits); + + int val_q[2][3]; + + for (uint32_t c = 0; c < 3; c++) + { +#if 0 + // This is very slightly better, but ~8% slower likely due to the table lookups. + const half_float l = astc_helpers::qlog16_to_half((uint32_t)std::round(low_q16[c])); + val_q[0][c] = half_to_qlog7_12(l, a_bits); + + const half_float h = astc_helpers::qlog16_to_half((uint32_t)std::round(high_q16[c])); + val_q[1][c] = half_to_qlog7_12(h, a_bits); +#else + // TODO: Tune quant_qlog16() for higher precision. 
+ val_q[0][c] = quant_qlog16((uint32_t)std::round(low_q16[c]), a_bits); + val_q[1][c] = quant_qlog16((uint32_t)std::round(high_q16[c]), a_bits); +#endif + +#if 1 + if (val_q[0][c] == val_q[1][c]) + { +#if 0 + if (l <= h) +#else + if (low_q16[c] < high_q16[c]) +#endif + { + if (val_q[0][c]) + val_q[0][c]--; + + if (val_q[1][c] != max_a_val) + val_q[1][c]++; + } + else + { + if (val_q[0][c] != max_a_val) + val_q[0][c]++; + + if (val_q[1][c]) + val_q[1][c]--; + } + } +#endif + + val_q[0][c] = minimum(val_q[0][c], max_a_qlog); + val_q[1][c] = minimum(val_q[1][c], max_a_qlog); + } + + return pack_astc_mode11_submode(submode, pEndpoints, val_q, max_clamp_mag, early_out_if_clamped, max_clamp_mag_accept_thresh); +} + +//-------------------------------------------------------------------------------------------------------------------------- + +void pack_astc_mode11_direct(uint8_t* pEndpoints, vec3F l_q16, vec3F h_q16) +{ + float lg = l_q16.dot(vec3F(1.0f)), hg = h_q16.dot(vec3F(1.0f)); + if (lg > hg) + { + // Ensure low endpoint is generally less bright than high in direct mode. + std::swap(l_q16, h_q16); + } + + for (uint32_t i = 0; i < 3; i++) + { + // TODO: This goes from QLOG16->HALF->QLOG8/7 + half_float l_half = astc_helpers::qlog16_to_half(clamp((int)std::round(l_q16[i]), 0, 65535)); + half_float h_half = astc_helpers::qlog16_to_half(clamp((int)std::round(h_q16[i]), 0, 65535)); + + int l_q, h_q; + + if (i == 2) + { + l_q = g_half_to_qlog7[bounds_check((uint32_t)l_half, 0U, 32768U)]; + h_q = g_half_to_qlog7[bounds_check((uint32_t)h_half, 0U, 32768U)]; + + l_q = minimum(l_q, MAX_QLOG7); + h_q = minimum(h_q, MAX_QLOG7); + } + else + { + l_q = g_half_to_qlog8[bounds_check((uint32_t)l_half, 0U, 32768U)]; + h_q = g_half_to_qlog8[bounds_check((uint32_t)h_half, 0U, 32768U)]; + + // this quantizes R and G as 7 bits vs. 8, for grayscale. 
+ //l_q = g_half_to_qlog7[bounds_check((uint32_t)l_half, 0U, 32768U)] << 1; + //h_q = g_half_to_qlog7[bounds_check((uint32_t)h_half, 0U, 32768U)] << 1; + + l_q = minimum(l_q, MAX_QLOG8); + h_q = minimum(h_q, MAX_QLOG8); + } + +#if 1 + if (l_q == h_q) + { + const int m = (i == 2) ? MAX_QLOG7 : MAX_QLOG8; + + if (l_q16[i] <= h_q16[i]) + { + if (l_q) + l_q--; + + if (h_q != m) + h_q++; + } + else + { + if (h_q) + h_q--; + + if (l_q != m) + l_q++; + } + } +#endif + + if (i == 2) + { + assert(l_q <= (int)MAX_QLOG7 && h_q <= (int)MAX_QLOG7); + l_q |= 128; + h_q |= 128; + } + else + { + assert(l_q <= (int)MAX_QLOG8 && h_q <= (int)MAX_QLOG8); + } + + pEndpoints[2 * i + 0] = (uint8_t)l_q; + pEndpoints[2 * i + 1] = (uint8_t)h_q; + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range, bool early_out_if_clamped, int max_clamp_mag_accept_thresh) +{ + assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); + + assert(submode <= 5); + max_clamp_mag = 0; + + static const uint8_t s_r_bits[6] = { 11, 11, 10, 9, 8, 7 }; + static const uint8_t s_g_b_bits[6] = { 5, 6, 5, 6, 7, 7 }; + static const uint8_t s_s_bits[6] = { 7, 5, 8, 7, 6, 7 }; + + // The precision of the components + const uint32_t prec_bits = s_r_bits[submode]; + + int qlog[4], pack_bits[4]; + + for (uint32_t i = 0; i < 4; i++) + { + const float f = (i == 3) ? s_q16 : rgb_q16[i]; + + // The # of bits the component is packed into + if (i == 0) + pack_bits[i] = s_r_bits[submode]; + else if (i == 3) + pack_bits[i] = s_s_bits[submode]; + else + pack_bits[i] = s_g_b_bits[submode]; + +#if 0 + // this is slightly worse + // TODO: going from qlog16 to half loses some precision. Then going from half to qlog 7-12 will have extra error. 
+ half_float h = qlog_to_half(clamp((int)std::round(f), 0, MAX_QLOG16), 16); + qlog[i] = half_to_qlog7_12((half_float)bounds_check((uint32_t)h, 0U, 32768U), prec_bits); +#else + qlog[i] = quant_qlog16(clamp((int)std::round(f), 0, MAX_QLOG16), prec_bits); + + // Only bias if there are enough texel weights, 4=6 weights + if (ise_weight_range >= 4) + { + // Explictly bias the high color, and the scale up, to better exploit the weights. + // The quantized range also then encompases the complete input range. + const uint32_t max_val = (1 << prec_bits) - 1; + const uint32_t K = 3; + if (i == 3) + { + qlog[i] = minimum(qlog[i] + K * 2, max_val); + } + else + { + qlog[i] = minimum(qlog[i] + K, max_val); + } + } +#endif + + if (i != 3) + qlog[i] = minimum(qlog[i], get_max_qlog(prec_bits)); + + // If S=0, we lose freedom for the texel weights to add any value. + if ((i == 3) && (qlog[i] == 0)) + qlog[i] = 1; + } + + uint32_t maj_index = 0; + + bool did_clamp = false; + + if (submode != 5) + { + int largest_qlog = 0; + for (uint32_t i = 0; i < 3; i++) + { + if (qlog[i] > largest_qlog) + { + largest_qlog = qlog[i]; + maj_index = i; + } + } + + if (maj_index) + { + std::swap(qlog[0], qlog[maj_index]); + } + + assert(qlog[0] >= qlog[1]); + assert(qlog[0] >= qlog[2]); + + qlog[1] = qlog[0] - qlog[1]; + qlog[2] = qlog[0] - qlog[2]; + + for (uint32_t i = 1; i < 4; i++) + { + const int max_val = (1 << pack_bits[i]) - 1; + + if (qlog[i] > max_val) + { + max_clamp_mag = maximum(max_clamp_mag, qlog[i] - max_val); + qlog[i] = max_val; + did_clamp = true; + + if ((early_out_if_clamped) && (max_clamp_mag > max_clamp_mag_accept_thresh)) + return true; + } + } + } + + for (uint32_t i = 0; i < 4; i++) + { + const int max_val = (1 << pack_bits[i]) - 1; (void)max_val; + + assert(qlog[i] <= max_val); + } + + int mode = 0; + + int r = qlog[0] & 63; // 6-bits + int g = qlog[1] & 31; // 5-bits + int b = qlog[2] & 31; // 5-bits + int s = qlog[3] & 31; // 5-bits + + int x0 = 0, x1 = 0, x2 = 0, x3 = 
0, x4 = 0, x5 = 0, x6 = 0; + + switch (submode) + { + case 0: + { + mode = (maj_index << 2) | 0; + assert((mode & 0xC) != 0xC); + + x0 = get_bit(qlog[0], 9); // R9 + x1 = get_bit(qlog[0], 8); // R8 + x2 = get_bit(qlog[0], 7); // R7 + x3 = get_bit(qlog[0], 10); // R10 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[3], 6); // S6 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + case 1: + { + mode = (maj_index << 2) | 1; + assert((mode & 0xC) != 0xC); + + x0 = get_bit(qlog[0], 8); // R8 + x1 = get_bit(qlog[1], 5); // G5 + x2 = get_bit(qlog[0], 7); // R7 + x3 = get_bit(qlog[2], 5); // B5 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[0], 10); // R10 + x6 = get_bit(qlog[0], 9); // R9 + break; + } + case 2: + { + mode = (maj_index << 2) | 2; + assert((mode & 0xC) != 0xC); + + x0 = get_bit(qlog[0], 9); // R9 + x1 = get_bit(qlog[0], 8); // R8 + x2 = get_bit(qlog[0], 7); // R7 + x3 = get_bit(qlog[0], 6); // R6 + x4 = get_bit(qlog[3], 7); // S7 + x5 = get_bit(qlog[3], 6); // S6 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + case 3: + { + mode = (maj_index << 2) | 3; + assert((mode & 0xC) != 0xC); + + x0 = get_bit(qlog[0], 8); // R8 + x1 = get_bit(qlog[1], 5); // G5 + x2 = get_bit(qlog[0], 7); // R7 + x3 = get_bit(qlog[2], 5); // B5 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[3], 6); // S6 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + case 4: + { + mode = maj_index | 0xC; // 0b1100 + assert((mode & 0xC) == 0xC); + assert(mode != 0xF); + + x0 = get_bit(qlog[1], 6); // G6 + x1 = get_bit(qlog[1], 5); // G5 + x2 = get_bit(qlog[2], 6); // B6 + x3 = get_bit(qlog[2], 5); // B5 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[0], 7); // R7 + x6 = get_bit(qlog[3], 5); // S5 + break; + } + case 5: + { + mode = 0xF; + + x0 = get_bit(qlog[1], 6); // G6 + x1 = get_bit(qlog[1], 5); // G5 + x2 = get_bit(qlog[2], 6); // B6 + x3 = get_bit(qlog[2], 5); // B5 + x4 = get_bit(qlog[0], 6); // R6 + x5 = get_bit(qlog[3], 6); // S6 + x6 = get_bit(qlog[3], 5); // S5 + 
break; + } + default: + { + assert(0); + break; + } + } + + pEndpoints[0] = (uint8_t)((get_bit(mode, 1) << 7) | (get_bit(mode, 0) << 6) | r); + pEndpoints[1] = (uint8_t)((get_bit(mode, 2) << 7) | (x0 << 6) | (x1 << 5) | g); + pEndpoints[2] = (uint8_t)((get_bit(mode, 3) << 7) | (x2 << 6) | (x3 << 5) | b); + pEndpoints[3] = (uint8_t)((x4 << 7) | (x5 << 6) | (x6 << 5) | s); + +#ifdef _DEBUG + // Test for valid pack by unpacking + { + const int inv_shift = 12 - prec_bits; + + int unpacked_e[2][3]; + if (submode != 5) + { + unpacked_e[1][0] = left_shift32(qlog[0], inv_shift); + unpacked_e[1][1] = clamp(left_shift32((qlog[0] - qlog[1]), inv_shift), 0, 0xFFF); + unpacked_e[1][2] = clamp(left_shift32((qlog[0] - qlog[2]), inv_shift), 0, 0xFFF); + + unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF); + unpacked_e[0][1] = clamp(left_shift32(((qlog[0] - qlog[1]) - qlog[3]), inv_shift), 0, 0xFFF); + unpacked_e[0][2] = clamp(left_shift32(((qlog[0] - qlog[2]) - qlog[3]), inv_shift), 0, 0xFFF); + } + else + { + unpacked_e[1][0] = left_shift32(qlog[0], inv_shift); + unpacked_e[1][1] = left_shift32(qlog[1], inv_shift); + unpacked_e[1][2] = left_shift32(qlog[2], inv_shift); + + unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF); + unpacked_e[0][1] = clamp(left_shift32((qlog[1] - qlog[3]), inv_shift), 0, 0xFFF); + unpacked_e[0][2] = clamp(left_shift32((qlog[2] - qlog[3]), inv_shift), 0, 0xFFF); + } + + if (maj_index) + { + std::swap(unpacked_e[0][0], unpacked_e[0][maj_index]); + std::swap(unpacked_e[1][0], unpacked_e[1][maj_index]); + } + + int e[2][3]; + decode_mode7_to_qlog12_ise20(pEndpoints, e, nullptr); + + for (uint32_t i = 0; i < 3; i++) + { + assert(unpacked_e[0][i] == e[0][i]); + assert(unpacked_e[1][i] == e[1][i]); + } + } +#endif + + return did_clamp; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +bool 
pack_mode11(mode11_log_desc& desc, uint8_t* pEndpoints) +{ + memset(pEndpoints, 0, NUM_MODE11_ENDPOINTS); + + if (desc.is_direct()) + { + if ((desc.m_a < 0) || (desc.m_c < 0) || (desc.m_b0 < 0)) + return false; + + if (!((desc.m_a <= 255) && (desc.m_c <= 255) && (desc.m_b0 <= 127))) + return false; + + pEndpoints[0] = (uint8_t)desc.m_a; + pEndpoints[2] = (uint8_t)desc.m_c; + pEndpoints[4] = (uint8_t)desc.m_b0 | 128; + + if ((desc.m_b1 < 0) || (desc.m_d0 < 0) || (desc.m_d1 < 0)) + return false; + + if (!((desc.m_b1 <= 255) && (desc.m_d0 <= 255) && (desc.m_d1 <= 127))) + return false; + + pEndpoints[1] = (uint8_t)desc.m_b1; + pEndpoints[3] = (uint8_t)desc.m_d0; + pEndpoints[5] = (uint8_t)desc.m_d1 | 128; + + return true; + } + + if (!((desc.m_a >= 0) && (desc.m_a <= desc.m_max_a_val))) + return false; + if (!(((desc.m_c >= 0) && (desc.m_c <= desc.m_max_c_val)))) + return false; + if (!((desc.m_b0 >= 0) && (desc.m_b0 <= desc.m_max_b_val))) + return false; + if (!((desc.m_b1 >= 0) && (desc.m_b1 <= desc.m_max_b_val))) + return false; + if (!((desc.m_d0 >= desc.m_min_d_val) && (desc.m_d0 <= desc.m_max_d_val))) + return false; + if (!((desc.m_d1 >= desc.m_min_d_val) && (desc.m_d1 <= desc.m_max_d_val))) + return false; + + const int va = desc.m_a, vb0 = desc.m_b0, vb1 = desc.m_b1, vc = desc.m_c, vd0 = desc.m_d0, vd1 = desc.m_d1; + + int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0; + + int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0; + switch (desc.m_submode) + { + case 0: + x0 = get_bit(vb0, 6); x1 = get_bit(vb1, 6); x2 = get_bit(vd0, 6); x3 = get_bit(vd1, 6); x4 = get_bit(vd0, 5); x5 = get_bit(vd1, 5); + break; + case 1: + x0 = get_bit(vb0, 6); x1 = get_bit(vb1, 6); x2 = get_bit(vb0, 7); x3 = get_bit(vb1, 7); x4 = get_bit(vd0, 5); x5 = get_bit(vd1, 5); + break; + case 2: + x0 = get_bit(va, 9); x1 = get_bit(vc, 6); x2 = get_bit(vd0, 6); x3 = get_bit(vd1, 6); x4 = get_bit(vd0, 5); x5 = get_bit(vd1, 5); + break; + case 3: + x0 = get_bit(vb0, 6); x1 = get_bit(vb1, 6); 
x2 = get_bit(va, 9); x3 = get_bit(vc, 6); x4 = get_bit(vd0, 5); x5 = get_bit(vd1, 5); + break; + case 4: + x0 = get_bit(vb0, 6); x1 = get_bit(vb1, 6); x2 = get_bit(vb0, 7); x3 = get_bit(vb1, 7); x4 = get_bit(va, 9); x5 = get_bit(va, 10); + break; + case 5: + x0 = get_bit(va, 9); x1 = get_bit(va, 10); x2 = get_bit(vc, 7); x3 = get_bit(vc, 6); x4 = get_bit(vd0, 5); x5 = get_bit(vd1, 5); + break; + case 6: + x0 = get_bit(vb0, 6); x1 = get_bit(vb1, 6); x2 = get_bit(va, 11); x3 = get_bit(vc, 6); x4 = get_bit(va, 9); x5 = get_bit(va, 10); + break; + case 7: + x0 = get_bit(va, 9); x1 = get_bit(va, 10); x2 = get_bit(va, 11); x3 = get_bit(vc, 6); x4 = get_bit(vd0, 5); x5 = get_bit(vd1, 5); + break; + default: + break; + } + + // write mode + pack_bit(v1, 7, desc.m_submode, 0); + pack_bit(v2, 7, desc.m_submode, 1); + pack_bit(v3, 7, desc.m_submode, 2); + + // highest component + pack_bit(v4, 7, desc.m_maj_comp, 0); + pack_bit(v5, 7, desc.m_maj_comp, 1); + + // write bit 8 of va + pack_bit(v1, 6, va, 8); + + // extra bits + pack_bit(v2, 6, x0); + pack_bit(v3, 6, x1); + pack_bit(v4, 6, x2); + pack_bit(v5, 6, x3); + pack_bit(v4, 5, x4); + pack_bit(v5, 5, x5); + + v0 = va & 0xFF; + v1 |= (vc & 63); + v2 |= (vb0 & 63); + v3 |= (vb1 & 63); + v4 |= (vd0 & 31); + v5 |= (vd1 & 31); + + assert(in_range(v0, 0, 255) && in_range(v1, 0, 255) && in_range(v2, 0, 255) && in_range(v3, 0, 255) && in_range(v4, 0, 255) && in_range(v5, 0, 255)); + + pEndpoints[0] = (uint8_t)v0; + pEndpoints[1] = (uint8_t)v1; + pEndpoints[2] = (uint8_t)v2; + pEndpoints[3] = (uint8_t)v3; + pEndpoints[4] = (uint8_t)v4; + pEndpoints[5] = (uint8_t)v5; + + return true; +} + +static inline int astc_hdr_sign_extend(int src, int num_src_bits) +{ + assert(basisu::in_range(num_src_bits, 2, 31)); + + const bool negative = (src & (1 << (num_src_bits - 1))) != 0; + if (negative) + return src | ~((1 << num_src_bits) - 1); + else + return src & ((1 << num_src_bits) - 1); +} + +void unpack_mode11(const uint8_t* pEndpoints, 
mode11_log_desc& desc) +{ + clear_obj(desc); + + pack_bit(desc.m_maj_comp, 0, pEndpoints[4], 7); + pack_bit(desc.m_maj_comp, 1, pEndpoints[5], 7); + + if (desc.m_maj_comp == 3) + { + desc.m_a = pEndpoints[0]; + desc.m_c = pEndpoints[2]; + desc.m_b0 = pEndpoints[4] & 0x7F; + + desc.m_b1 = pEndpoints[1]; + desc.m_d0 = pEndpoints[3]; + desc.m_d1 = pEndpoints[5] & 0x7F; + + return; + } + + pack_bit(desc.m_submode, 0, pEndpoints[1], 7); + pack_bit(desc.m_submode, 1, pEndpoints[2], 7); + pack_bit(desc.m_submode, 2, pEndpoints[3], 7); + + desc.m_a = pEndpoints[0]; // 8 bits + pack_bit(desc.m_a, 8, pEndpoints[1], 6); + + desc.m_c = pEndpoints[1] & 63; // 6 bits + desc.m_b0 = pEndpoints[2] & 63; // 6 bits + desc.m_b1 = pEndpoints[3] & 63; // 6 bits + desc.m_d0 = pEndpoints[4] & 31; // 5 bits + desc.m_d1 = pEndpoints[5] & 31; // 5 bits + + const int x0 = get_bit(pEndpoints[2], 6); + const int x1 = get_bit(pEndpoints[3], 6); + const int x2 = get_bit(pEndpoints[4], 6); + const int x3 = get_bit(pEndpoints[5], 6); + const int x4 = get_bit(pEndpoints[4], 5); + const int x5 = get_bit(pEndpoints[5], 5); + + switch (desc.m_submode) + { + case 0: + pack_bit(desc.m_b0, 6, x0, 0); pack_bit(desc.m_b1, 6, x1, 0); pack_bit(desc.m_d0, 6, x2, 0); pack_bit(desc.m_d1, 6, x3, 0); pack_bit(desc.m_d0, 5, x4, 0); pack_bit(desc.m_d1, 5, x5, 0); + break; + case 1: + pack_bit(desc.m_b0, 6, x0, 0); pack_bit(desc.m_b1, 6, x1, 0); pack_bit(desc.m_b0, 7, x2, 0); pack_bit(desc.m_b1, 7, x3, 0); pack_bit(desc.m_d0, 5, x4, 0); pack_bit(desc.m_d1, 5, x5, 0); + break; + case 2: + pack_bit(desc.m_a, 9, x0, 0); pack_bit(desc.m_c, 6, x1, 0); pack_bit(desc.m_d0, 6, x2, 0); pack_bit(desc.m_d1, 6, x3, 0); pack_bit(desc.m_d0, 5, x4, 0); pack_bit(desc.m_d1, 5, x5, 0); + break; + case 3: + pack_bit(desc.m_b0, 6, x0, 0); pack_bit(desc.m_b1, 6, x1, 0); pack_bit(desc.m_a, 9, x2, 0); pack_bit(desc.m_c, 6, x3, 0); pack_bit(desc.m_d0, 5, x4, 0); pack_bit(desc.m_d1, 5, x5, 0); + break; + case 4: + pack_bit(desc.m_b0, 6, x0, 
0); pack_bit(desc.m_b1, 6, x1, 0); pack_bit(desc.m_b0, 7, x2, 0); pack_bit(desc.m_b1, 7, x3, 0); pack_bit(desc.m_a, 9, x4, 0); pack_bit(desc.m_a, 10, x5, 0); + break; + case 5: + pack_bit(desc.m_a, 9, x0, 0); pack_bit(desc.m_a, 10, x1, 0); pack_bit(desc.m_c, 7, x2, 0); pack_bit(desc.m_c, 6, x3, 0); pack_bit(desc.m_d0, 5, x4, 0); pack_bit(desc.m_d1, 5, x5, 0); + break; + case 6: + pack_bit(desc.m_b0, 6, x0, 0); pack_bit(desc.m_b1, 6, x1, 0); pack_bit(desc.m_a, 11, x2, 0); pack_bit(desc.m_c, 6, x3, 0); pack_bit(desc.m_a, 9, x4, 0); pack_bit(desc.m_a, 10, x5, 0); + break; + case 7: + default: + pack_bit(desc.m_a, 9, x0, 0); pack_bit(desc.m_a, 10, x1, 0); pack_bit(desc.m_a, 11, x2, 0); pack_bit(desc.m_c, 6, x3, 0); pack_bit(desc.m_d0, 5, x4, 0); pack_bit(desc.m_d1, 5, x5, 0); + break; + } + + desc.m_a_bits = 9 + (desc.m_submode >> 1); + desc.m_b_bits = s_b_bits[desc.m_submode]; + desc.m_c_bits = s_c_bits[desc.m_submode]; + desc.m_d_bits = s_d_bits[desc.m_submode]; + + desc.m_max_a_val = (1 << desc.m_a_bits) - 1; + desc.m_max_b_val = (1 << desc.m_b_bits) - 1; + desc.m_max_c_val = (1 << desc.m_c_bits) - 1; + + desc.m_min_d_val = -(1 << (desc.m_d_bits - 1)); + desc.m_max_d_val = -desc.m_min_d_val - 1; + + desc.m_d0 = astc_hdr_sign_extend(desc.m_d0, desc.m_d_bits); + desc.m_d1 = astc_hdr_sign_extend(desc.m_d1, desc.m_d_bits); + + assert((desc.m_a >= 0) && (desc.m_a <= desc.m_max_a_val)); + assert((desc.m_c >= 0) && (desc.m_c <= desc.m_max_c_val)); + assert((desc.m_b0 >= 0) && (desc.m_b0 <= desc.m_max_b_val)); + assert((desc.m_b1 >= 0) && (desc.m_b1 <= desc.m_max_b_val)); + assert((desc.m_d0 >= desc.m_min_d_val) && (desc.m_d0 <= desc.m_max_d_val)); + assert((desc.m_d1 >= desc.m_min_d_val) && (desc.m_d1 <= desc.m_max_d_val)); +} + +//-------------------------------------------------------------------------------------------------------------------------- + +void decode_cem_11_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index) +{ + submode_index = 0; + 
maj_index = 0; + + pack_bit(submode_index, 0, pEndpoints[1], 7); + pack_bit(submode_index, 1, pEndpoints[2], 7); + pack_bit(submode_index, 2, pEndpoints[3], 7); + + pack_bit(maj_index, 0, pEndpoints[4], 7); + pack_bit(maj_index, 1, pEndpoints[5], 7); +} + +//-------------------------------------------------------------------------------------------------------------------------- + +void decode_cem_7_config(const uint8_t* pEndpoints, int& submode_index, int &maj_index) +{ + const int v0 = pEndpoints[0], v1 = pEndpoints[1], v2 = pEndpoints[2], v3 = pEndpoints[3]; + (void)v3; + + // Extract mode bits and unpack to major component and mode. + const int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4); + + if ((modeval & 0xC) != 0xC) + { + maj_index = modeval >> 2; + submode_index = modeval & 3; + } + else if (modeval != 0xF) + { + maj_index = modeval & 3; + submode_index = 4; + } + else + { + maj_index = 0; + submode_index = 5; + } +} + +//-------------------------------------------------------------------------------------------------------------------------- +// TODO: Use pack_mode11() as a shared function. 
+
+// Quantizes a low/high endpoint color pair (Q16 scale) to CEM 11 endpoint bytes at the requested ISE endpoint range.
+//
+// Candidates are generated from last_submode down to first_submode; submode -1 means the direct encoding
+// (pack_astc_mode11_direct). Each candidate is packed at 256 levels, requantized to ise_endpoint_range, decoded
+// again, and scored by the squared distance between the decoded endpoints (shifted left by 4 to Q16) and the
+// requested colors, taking the better of the two endpoint orderings. The lowest-distance candidate wins.
+//
+//   low_color_q16, high_color_q16 - requested endpoint colors.
+//   ise_endpoint_range            - target ISE endpoint range.
+//   pEndpoints                    - out: NUM_MODE11_ENDPOINTS bytes; zeroed first, so all zero if nothing was found.
+//   coptions                      - m_take_first_non_clamping_mode11_submode stops the search at the first candidate
+//                                   that didn't clamp; m_ultra_quant enables the +-1 rank refinement below.
+//   direct_only                   - forces first_submode = last_submode = -1.
+//   first_submode, last_submode   - inclusive submode range, each in [-1, 7].
+//   ignore_clamping               - if false, submodes that clamped by more than the accept threshold are rejected.
+//   submode_used                  - out: best submode + 1 (so the direct encoding reports 0).
+//
+// Returns true if at least one candidate decoded successfully.
+bool pack_mode11(
+	const vec3F& low_color_q16, const vec3F& high_color_q16,
+	uint32_t ise_endpoint_range, uint8_t* pEndpoints,
+	const astc_hdr_codec_base_options& coptions,
+	bool direct_only, int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used)
+{
+	uint8_t orig_trial_endpoints[NUM_MODE11_ENDPOINTS];
+
+	if (direct_only)
+	{
+		first_submode = -1;
+		last_submode = -1;
+	}
+
+	assert(first_submode <= last_submode);
+	assert((first_submode >= -1) && (first_submode <= 7));
+	assert((last_submode >= -1) && (last_submode <= 7));
+
+	memset(pEndpoints, 0, NUM_MODE11_ENDPOINTS);
+
+	double best_trial_dist = BIG_FLOAT_VAL;
+	int best_submode = 0;
+
+	for (int submode = last_submode; submode >= first_submode; submode--)
+	{
+		bool did_clamp = false;
+		int max_clamp_mag = 0;
+		if (submode == -1)
+		{
+			// If it had to clamp with one of the submodes, try direct which can't clamp, but has low precision.
+			pack_astc_mode11_direct(orig_trial_endpoints, low_color_q16, high_color_q16);
+		}
+		else
+		{
+			const int MAX_CLAMP_MAG_ACCEPT_THRESH = 32;
+			did_clamp = pack_astc_mode11_submode(submode, orig_trial_endpoints, low_color_q16, high_color_q16, max_clamp_mag, !ignore_clamping, MAX_CLAMP_MAG_ACCEPT_THRESH);
+
+			if (!ignore_clamping)
+			{
+				// If it had to clamp and the clamp was too high, it'll distort the endpoint colors too much, which could lead to noticeable artifacts.
+				if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
+					continue;
+			}
+		}
+
+		uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS];
+
+		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
+		// It could massively distort the endpoints, but still result in a valid encoding.
+		basist::astc_6x6_hdr::requantize_ise_endpoints(11, astc_helpers::BISE_256_LEVELS, orig_trial_endpoints, ise_endpoint_range, trial_endpoints);
+
+		int e[2][3];
+		if (!decode_mode11_to_qlog12(trial_endpoints, e, ise_endpoint_range))
+			continue;
+
+		// Decoded endpoints are shifted left by 4 to put them on the same scale as the Q16 inputs.
+		vec3F e0(
+			(float)(e[0][0] << 4),
+			(float)(e[0][1] << 4),
+			(float)(e[0][2] << 4)
+		);
+
+		vec3F e1(
+			(float)(e[1][0] << 4),
+			(float)(e[1][1] << 4),
+			(float)(e[1][2] << 4)
+		);
+
+		// The decoded pair may come back in either order, so score both assignments and keep the better one.
+		double dist0 = e0.squared_distance_d(low_color_q16) + e1.squared_distance_d(high_color_q16);
+		double dist1 = e1.squared_distance_d(low_color_q16) + e0.squared_distance_d(high_color_q16);
+		double dist = helpers::minimum(dist0, dist1);
+
+		if (dist < best_trial_dist)
+		{
+			best_trial_dist = dist;
+			best_submode = submode;
+			memcpy(pEndpoints, trial_endpoints, NUM_MODE11_ENDPOINTS);
+		}
+
+		if (coptions.m_take_first_non_clamping_mode11_submode)
+		{
+			if (!did_clamp)
+				break;
+		}
+
+	} // submode
+
+	// Optional refinement: nudge each endpoint byte of the best candidate by -1 and +1 rank (one byte at a time,
+	// always starting from the unrefined best) and keep any variant that decodes closer to the requested colors.
+	if ((coptions.m_ultra_quant) &&
+		(ise_endpoint_range < astc_helpers::BISE_256_LEVELS) &&
+		(best_trial_dist != BIG_FLOAT_VAL))
+	{
+		uint8_t orig_best_trial_endpoints[NUM_MODE11_ENDPOINTS];
+		memcpy(orig_best_trial_endpoints, pEndpoints, NUM_MODE11_ENDPOINTS);
+
+		for (uint32_t c = 0; c < NUM_MODE11_ENDPOINTS; c++)
+		{
+			for (int dt = 0; dt <= 1; dt++)
+			{
+				const int d = dt ? 1 : -1;
+
+				uint8_t varied_endpoints[NUM_MODE11_ENDPOINTS];
+				memcpy(varied_endpoints, orig_best_trial_endpoints, NUM_MODE11_ENDPOINTS);
+
+				int ise = varied_endpoints[c];
+
+				int rank = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_rank[ise];
+				rank = clamp(rank + d, 0, astc_helpers::get_ise_levels(ise_endpoint_range) - 1);
+
+				ise = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_rank_to_ISE[rank];
+
+				varied_endpoints[c] = (uint8_t)ise;
+
+				int e[2][3];
+				if (!decode_mode11_to_qlog12(varied_endpoints, e, ise_endpoint_range))
+					continue;
+
+				vec3F e0(
+					(float)(e[0][0] << 4),
+					(float)(e[0][1] << 4),
+					(float)(e[0][2] << 4)
+				);
+
+				vec3F e1(
+					(float)(e[1][0] << 4),
+					(float)(e[1][1] << 4),
+					(float)(e[1][2] << 4)
+				);
+
+				double dist0 = e0.squared_distance_d(low_color_q16) + e1.squared_distance_d(high_color_q16);
+				double dist1 = e1.squared_distance_d(low_color_q16) + e0.squared_distance_d(high_color_q16);
+				double dist = helpers::minimum(dist0, dist1);
+
+				if (dist < best_trial_dist)
+				{
+					best_trial_dist = dist;
+					memcpy(pEndpoints, varied_endpoints, NUM_MODE11_ENDPOINTS);
+				}
+			} // d
+		} // c
+	} // if (coptions.m_ultra_quant)
+
+	submode_used = best_submode + 1;
+
+	return (best_trial_dist != BIG_FLOAT_VAL);
+}
+
+// Tries to encode a block with CEM 11 (single plane) using the given endpoint colors, and accepts the result only
+// if it beats cur_block_error.
+//
+// The endpoint search (submode loop plus optional ultra-quant refinement) is delegated to pack_mode11(), which
+// performs the same steps this function used to carry as a private copy. The chosen endpoints are then expanded
+// to num_weight_levels block colors and eval_selectors() picks per-pixel weights and returns the block error.
+//
+//   pEndpoints, pWeights, cur_block_error, submode_used - in/out: only written when the new error is strictly
+//                                                         lower than cur_block_error.
+//   constrain_ise_weight_selectors - restricts the usable selectors for the 16 and 12 level weight ranges.
+//   direct_only, first_submode, last_submode, ignore_clamping - forwarded to pack_mode11().
+//
+// Returns true if the block encoding was improved.
+bool try_mode11(uint32_t num_pixels,
+	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+	const vec3F& low_color_q16, const vec3F& high_color_q16,
+	const basist::half_float block_pixels_half[][3],
+	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
+	bool constrain_ise_weight_selectors,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping) // -1, 7
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((num_weight_levels >= MIN_SUPPORTED_WEIGHT_LEVELS) && (num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert(num_weight_levels == astc_helpers::get_ise_levels(ise_weight_range));
+
+	// pack_mode11() applies the direct_only override, validates the submode range and zeroes the endpoint buffer.
+	uint8_t best_trial_endpoints[NUM_MODE11_ENDPOINTS];
+	uint32_t best_submode_used = 0;
+	if (!pack_mode11(low_color_q16, high_color_q16, ise_endpoint_range, best_trial_endpoints, coptions,
+		direct_only, first_submode, last_submode, ignore_clamping, best_submode_used))
+	{
+		// No candidate survived endpoint quantization.
+		return false;
+	}
+
+	half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+	if (!get_astc_hdr_mode_11_block_colors(best_trial_endpoints, &decoded_half[0][0], nullptr, num_weight_levels, ise_weight_range, ise_endpoint_range))
+		return false;
+
+	uint32_t usable_selector_bitmask = UINT32_MAX;
+	if ((constrain_ise_weight_selectors) && (ise_weight_range == astc_helpers::BISE_16_LEVELS))
+		usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15);
+	else if ((constrain_ise_weight_selectors) && (ise_weight_range == astc_helpers::BISE_12_LEVELS))
+		usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3);
+
+	uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	double trial_blk_error = eval_selectors(num_pixels, trial_weights, ise_weight_range, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions, usable_selector_bitmask);
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(pEndpoints, best_trial_endpoints, NUM_MODE11_ENDPOINTS);
+		memcpy(pWeights, trial_weights, num_pixels);
+		submode_used = best_submode_used;
+		return true;
+	}
+
+	return false;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Dual-plane variant of try_mode11(): same endpoint search (delegated to pack_mode11()), but the weights are
+// chosen by eval_selectors_dual_plane(), which produces two weight arrays and is told which channel
+// (channel_index, 0..2) is handled separately.
+//
+//   pEndpoints, pWeights0, pWeights1, cur_block_error, submode_used - in/out: only written when the new error is
+//                                                                     strictly lower than cur_block_error.
+//
+// Returns true if the block encoding was improved.
+bool try_mode11_dual_plane(uint32_t channel_index, uint32_t num_pixels,
+	uint8_t* pEndpoints, uint8_t* pWeights0, uint8_t* pWeights1, double& cur_block_error, uint32_t& submode_used,
+	const vec3F& low_color_q16, const vec3F& high_color_q16,
+	const basist::half_float block_pixels_half[][3],
+	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
+	bool constrain_ise_weight_selectors,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping) // -1, 7
+{
+	assert(channel_index <= 2);
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((num_weight_levels >= MIN_SUPPORTED_WEIGHT_LEVELS) && (num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert(num_weight_levels == astc_helpers::get_ise_levels(ise_weight_range));
+
+	// pack_mode11() applies the direct_only override, validates the submode range and zeroes the endpoint buffer.
+	uint8_t best_trial_endpoints[NUM_MODE11_ENDPOINTS];
+	uint32_t best_submode_used = 0;
+	if (!pack_mode11(low_color_q16, high_color_q16, ise_endpoint_range, best_trial_endpoints, coptions,
+		direct_only, first_submode, last_submode, ignore_clamping, best_submode_used))
+	{
+		// No candidate survived endpoint quantization.
+		return false;
+	}
+
+	half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+	if (!get_astc_hdr_mode_11_block_colors(best_trial_endpoints, &decoded_half[0][0], nullptr, num_weight_levels, ise_weight_range, ise_endpoint_range))
+		return false;
+
+	uint32_t usable_selector_bitmask = UINT32_MAX;
+	if ((constrain_ise_weight_selectors) && (ise_weight_range == astc_helpers::BISE_16_LEVELS))
+		usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15);
+	else if ((constrain_ise_weight_selectors) && (ise_weight_range == astc_helpers::BISE_12_LEVELS))
+		usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 2) | (1 << 3);
+
+	uint8_t trial_weights0[MAX_ASTC_HDR_ENC_BLOCK_PIXELS], trial_weights1[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	double trial_blk_error = eval_selectors_dual_plane(channel_index, num_pixels, trial_weights0, trial_weights1, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions, usable_selector_bitmask);
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(pEndpoints, best_trial_endpoints, NUM_MODE11_ENDPOINTS);
+		memcpy(pWeights0, trial_weights0, num_pixels);
+		memcpy(pWeights1, trial_weights1, num_pixels);
+		submode_used = best_submode_used;
+		return true;
+	}
+
+	return false;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Quantizes a high color plus scale value (both Q16 scale) to CEM 7 endpoint bytes at the requested ISE endpoint range.
+//
+// Submodes are tried from first_submode up to last_submode. Each candidate is packed at 256 levels, requantized
+// to ise_endpoint_range, decoded again, and scored as the squared distance of the decoded high color to
+// high_color_q16 plus 3x the squared scale error (decoded values are shifted left by 4 to Q16 first).
+//
+//   pEndpoints      - out: NUM_MODE7_ENDPOINTS bytes; zeroed first, so all zero if nothing was found.
+//   ignore_clamping - if false, submodes below 5 that clamped by more than the accept threshold are rejected.
+//   coptions        - m_take_first_non_clamping_mode7_submode stops the search at the first candidate that didn't
+//                     clamp; m_ultra_quant enables the +-1 rank refinement below.
+//   submode_used    - out: the winning submode index (0 if nothing was found).
+//
+// Returns true if at least one candidate decoded successfully.
+bool pack_mode7(
+	const vec3F& high_color_q16, const float s_q16,
+	uint32_t ise_endpoint_range, uint8_t* pEndpoints,
+	uint32_t ise_weight_range, // only used for determining biasing during packing
+	const astc_hdr_codec_base_options& coptions,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used)
+{
+	assert(first_submode <= last_submode);
+	assert((first_submode >= 0) && (first_submode <= (int)MAX_MODE7_SUBMODE_INDEX));
+	assert(last_submode <= (int)MAX_MODE7_SUBMODE_INDEX);
+
+	uint8_t unquant_trial_endpoints[NUM_MODE7_ENDPOINTS];
+
+	memset(pEndpoints, 0, NUM_MODE7_ENDPOINTS);
+
+	double best_trial_dist = BIG_FLOAT_VAL;
+	int best_trial_submode = 0;
+
+	for (int submode = first_submode; submode <= last_submode; submode++)
+	{
+		const int MAX_CLAMP_MAG_ACCEPT_THRESH = 16;
+
+		int max_clamp_mag = 0;
+		const bool did_clamp = pack_astc_mode7_submode(submode, unquant_trial_endpoints, high_color_q16, s_q16, max_clamp_mag, ise_weight_range, !ignore_clamping, MAX_CLAMP_MAG_ACCEPT_THRESH);
+
+		// Submode 5 is never rejected for clamping.
+		if (submode < 5)
+		{
+			if (!ignore_clamping)
+			{
+				if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH))
+					continue;
+			}
+		}
+
+		uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS];
+
+		// This will distort the endpoints if the ISE endpoint range isn't 256 levels (20).
+		// It could massively distort the endpoints, but still result in a valid encoding.
+		basist::astc_6x6_hdr::requantize_ise_endpoints(7, astc_helpers::BISE_256_LEVELS, unquant_trial_endpoints, ise_endpoint_range, trial_endpoints);
+
+		int e[2][3];
+		int decoded_s = 0;
+		if (!decode_mode7_to_qlog12(trial_endpoints, e, &decoded_s, ise_endpoint_range))
+			continue;
+
+		// e1 is always the high color
+		vec3F e1(
+			(float)(e[1][0] << 4),
+			(float)(e[1][1] << 4),
+			(float)(e[1][2] << 4)
+		);
+
+		decoded_s <<= 4;
+
+		double dist = e1.squared_distance_d(high_color_q16) + squared((double)decoded_s - s_q16) * 3;
+
+		if (dist < best_trial_dist)
+		{
+			best_trial_dist = dist;
+			best_trial_submode = submode;
+			memcpy(pEndpoints, trial_endpoints, NUM_MODE7_ENDPOINTS);
+		}
+
+		if (coptions.m_take_first_non_clamping_mode7_submode)
+		{
+			if (!did_clamp)
+				break;
+		}
+
+	} // submode
+
+	// Optional refinement: nudge each endpoint byte of the best candidate by -1 and +1 rank (one byte at a time,
+	// always starting from the unrefined best) and keep any variant that scores better.
+	if ((coptions.m_ultra_quant) &&
+		(ise_endpoint_range < astc_helpers::BISE_256_LEVELS) &&
+		(best_trial_dist != BIG_FLOAT_VAL))
+	{
+		uint8_t orig_best_trial_endpoints[NUM_MODE7_ENDPOINTS];
+		memcpy(orig_best_trial_endpoints, pEndpoints, NUM_MODE7_ENDPOINTS);
+
+		for (uint32_t c = 0; c < NUM_MODE7_ENDPOINTS; c++)
+		{
+			for (int dt = 0; dt <= 1; dt++)
+			{
+				const int d = dt ? 1 : -1;
+
+				uint8_t varied_endpoints[NUM_MODE7_ENDPOINTS];
+				memcpy(varied_endpoints, orig_best_trial_endpoints, NUM_MODE7_ENDPOINTS);
+
+				int ise = varied_endpoints[c];
+
+				int rank = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_rank[ise];
+				rank = clamp(rank + d, 0, astc_helpers::get_ise_levels(ise_endpoint_range) - 1);
+
+				ise = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_rank_to_ISE[rank];
+
+				varied_endpoints[c] = (uint8_t)ise;
+
+				int e[2][3];
+				int decoded_s = 0;
+				if (!decode_mode7_to_qlog12(varied_endpoints, e, &decoded_s, ise_endpoint_range))
+					continue;
+
+				// e1 is always the high color
+				vec3F e1(
+					(float)(e[1][0] << 4),
+					(float)(e[1][1] << 4),
+					(float)(e[1][2] << 4)
+				);
+
+				decoded_s <<= 4;
+
+				double dist = e1.squared_distance_d(high_color_q16) + squared((double)decoded_s - s_q16) * 3;
+
+				if (dist < best_trial_dist)
+				{
+					best_trial_dist = dist;
+					memcpy(pEndpoints, varied_endpoints, NUM_MODE7_ENDPOINTS);
+				}
+
+			} // d
+		} // c
+	}
+
+	submode_used = best_trial_submode;
+
+	return (best_trial_dist != BIG_FLOAT_VAL);
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries to encode a block with CEM 7 using the given high color and scale, and accepts the result only if it
+// beats cur_block_error.
+//
+// The endpoint search is delegated to pack_mode7() with ignore_clamping = false, which matches the clamping
+// rejection this function always applied. The chosen endpoints are then expanded to num_weight_levels block
+// colors and eval_selectors() picks per-pixel weights and returns the block error.
+//
+//   pEndpoints, pWeights, cur_block_error, submode_used - in/out: only written when the new error is strictly
+//                                                         lower than cur_block_error.
+//
+// Returns true if the block encoding was improved.
+bool try_mode7(
+	uint32_t num_pixels,
+	uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+	const vec3F& high_color_q16, const float s_q16,
+	const half_float block_pixels_half[][3],
+	uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions,
+	uint32_t ise_endpoint_range,
+	int32_t first_submode, int32_t last_submode)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert(num_weight_levels == astc_helpers::get_ise_levels(ise_weight_range));
+
+	// pack_mode7() validates the submode range and zeroes the endpoint buffer.
+	uint8_t best_trial_endpoints[NUM_MODE7_ENDPOINTS];
+	uint32_t best_trial_submode = 0;
+	if (!pack_mode7(high_color_q16, s_q16, ise_endpoint_range, best_trial_endpoints, ise_weight_range, coptions,
+		first_submode, last_submode, false, best_trial_submode))
+	{
+		// No candidate survived endpoint quantization.
+		return false;
+	}
+
+	half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];
+	if (!get_astc_hdr_mode_7_block_colors(best_trial_endpoints, &decoded_half[0][0], nullptr, num_weight_levels, ise_weight_range, ise_endpoint_range))
+		return false;
+
+	uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	double trial_blk_error = eval_selectors(num_pixels, trial_weights, ise_weight_range, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions);
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(pEndpoints, best_trial_endpoints, NUM_MODE7_ENDPOINTS);
+		memcpy(pWeights, trial_weights, num_pixels);
+		submode_used = best_trial_submode;
+		return true;
+	}
+
+	return false;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+const float LOW_EMPHASIS_WEIGHT = 1.0f, MIDDLE_EMPHASIS_WEIGHT = 1.25f, HIGH_EMPHASIS_WEIGHT = 1.0f;
+const float LOW_EMPHASIS_WEIGHT_HEAVY = 1.0f, MIDDLE_EMPHASIS_WEIGHT_HEAVY = 4.0f, HIGH_EMPHASIS_WEIGHT_HEAVY = 1.0f;
+
+double encode_astc_hdr_block_mode_11(
+	uint32_t num_pixels,
+	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	uint32_t ise_weight_range,
+	uint32_t& best_submode,
+	double cur_block_error,
+	uint8_t* blk_endpoints, uint8_t* blk_weights,
+	const astc_hdr_codec_base_options& coptions,
+	bool direct_only,
+	uint32_t ise_endpoint_range,
+	bool uber_mode,
+	bool constrain_ise_weight_selectors,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping, opt_mode_t opt_mode,
+	const encode_astc_block_stats* pBlock_stats)
+{
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+
+	assert((first_submode >= FIRST_MODE11_SUBMODE_INDEX) && (first_submode <= last_submode));
+	assert(last_submode <= MAX_MODE11_SUBMODE_INDEX);
+
+	best_submode = 0;
+
+	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
+	assert(num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS);
+
+	vec3F block_mean_color_q16, block_axis_q16;
+	if (!pBlock_stats)
+	{
+		block_mean_color_q16 = calc_mean(num_pixels, pBlock_pixels_q16);
+		block_axis_q16 = calc_rgb_pca(num_pixels, pBlock_pixels_q16, block_mean_color_q16);
+	}
+	else
+	{
+		assert(num_pixels == 
pBlock_stats->m_num_pixels); + block_mean_color_q16 = pBlock_stats->m_mean_q16; + block_axis_q16 = pBlock_stats->m_axis_q16; + } + + aabb3F color_box_q16(cInitExpand); + + float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL; + vec3F low_color_q16, high_color_q16; + + for (uint32_t i = 0; i < num_pixels; i++) + { + color_box_q16.expand(pBlock_pixels_q16[i]); + + vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16); + float kd = k.dot(block_axis_q16); + + if (kd < l) + { + l = kd; + low_color_q16 = pBlock_pixels_q16[i]; + } + + if (kd > h) + { + h = kd; + high_color_q16 = pBlock_pixels_q16[i]; + } + } + + vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16); + + for (uint32_t i = 0; i < 3; i++) + { + low_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f); + high_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f); + } + + uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS]; + uint8_t trial_blk_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + uint32_t trial_best_submode = 0; + + clear_obj(trial_blk_endpoints); + clear_obj(trial_blk_weights); + + double trial_blk_error = BIG_FLOAT_VAL; + + bool did_improve = try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode, + low_color_q16, high_color_q16, + pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors, + first_submode, last_submode, ignore_clamping); + + // If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do. + if (!did_improve) + return cur_block_error; + + // Did the solution improve? 
+ if (trial_blk_error < cur_block_error) + { + cur_block_error = trial_blk_error; + memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS); + memcpy(blk_weights, trial_blk_weights, num_pixels); + best_submode = trial_best_submode; + } + + if (opt_mode == cNoOpt) + return cur_block_error; + + // least squares on the most promising trial weight indices found + const uint32_t NUM_LS_PASSES = 3; + + float emphasis_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + + if (opt_mode == cWeightedAverage) + { + const uint32_t NUM_OPT_PASSES = 3; + for (uint32_t pass = 0; pass < NUM_OPT_PASSES; pass++) + { + vec3F low_p(0.0f); + float total_low = 0.0f; + + vec3F high_p(0.0f); + float total_high = 0.0f; + + for (uint32_t i = 0; i < num_pixels; i++) + { + vec3F p(pBlock_pixels_q16[i]); + float lerp = g_ise_weight_lerps[ise_weight_range][trial_blk_weights[i] + 1] * (1.0f / 64.0f); + + low_p += p * (1.0f - lerp); + total_low += (1.0f - lerp); + + high_p += p * lerp; + total_high += lerp; + } + + if (total_low != 0.0f) + low_p *= (1.0f / total_low); + + if (total_high != 0.0f) + high_p *= (1.0f / total_high); + + vec3F low, high; + + bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + low_p, high_p, + pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors, + first_submode, last_submode, ignore_clamping); + + if (!was_improved) + break; + + memcpy(trial_blk_weights, blk_weights, num_pixels); + } + } + else if (opt_mode == cOrdinaryLeastSquares) + { + for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++) + { + vec3F l_q16, h_q16; + + if (!compute_least_squares_endpoints_rgb(num_pixels, trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16)) + break; + + bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + l_q16, h_q16, + pBlock_pixels_half, 
num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors, + first_submode, last_submode, ignore_clamping); + + if (!was_improved) + break; + + // It's improved, so let's take the new weight indices. + memcpy(trial_blk_weights, blk_weights, num_pixels); + + } // pass + } + else + { + if (h == l) + { + for (uint32_t i = 0; i < num_pixels; i++) + emphasis_weights[i] = 1.0f; + } + else + { + float mid = (0.0f - l) / (h - l); + mid = clamp(mid, .01f, .99f); + + float lw = LOW_EMPHASIS_WEIGHT, mw = MIDDLE_EMPHASIS_WEIGHT, hw = HIGH_EMPHASIS_WEIGHT; + if (opt_mode == cWeightedLeastSquaresHeavy) + lw = LOW_EMPHASIS_WEIGHT_HEAVY, mw = MIDDLE_EMPHASIS_WEIGHT_HEAVY, hw = HIGH_EMPHASIS_WEIGHT_HEAVY; + + for (uint32_t i = 0; i < num_pixels; i++) + { + vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16); + float kd = k.dot(block_axis_q16); + + assert((kd >= l) && (kd <= h)); + + float v = (kd - l) / (h - l); + + if (v < mid) + v = lerp(lw, mw, v / mid); + else + v = lerp(mw, hw, (v - mid) * (1.0f - mid)); + + emphasis_weights[i] = v; + } + +#if 0 + if (num_pixels == 6 * 6) + { + const float EDGE_WEIGHT = .1f; + for (uint32_t i = 0; i < 6; i++) + { + emphasis_weights[i] += EDGE_WEIGHT; + emphasis_weights[i + 5 * 6] += EDGE_WEIGHT; + emphasis_weights[i * 6] += EDGE_WEIGHT; + emphasis_weights[5 + i * 6] += EDGE_WEIGHT; + } + } +#endif + } + + for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++) + { + vec3F l_q16, h_q16; + + if (!compute_weighted_least_squares_endpoints_rgb( + num_pixels, + trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], nullptr, + emphasis_weights, + &l_q16, &h_q16, + pBlock_pixels_q16, + color_box_q16)) + break; + + bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + l_q16, h_q16, + pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors, + first_submode, 
last_submode, ignore_clamping); + + if (!was_improved) + break; + + // It's improved, so let's take the new weight indices. + memcpy(trial_blk_weights, blk_weights, num_pixels); + + } // pass + } + + if ( (uber_mode) && (ise_weight_range >= astc_helpers::BISE_3_LEVELS) && + ((opt_mode == cOrdinaryLeastSquares) || (opt_mode == cWeightedLeastSquares) || (opt_mode == cWeightedLeastSquaresHeavy)) ) + { + // Try varying the current best weight indices. This can be expanded/improved, but at potentially great cost. + + uint8_t temp_astc_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + memcpy(temp_astc_weights, trial_blk_weights, num_pixels); + + uint32_t min_lin_sel = 256, max_lin_sel = 0; + for (uint32_t i = 0; i < num_pixels; i++) + { + const uint32_t astc_sel = temp_astc_weights[i]; + + const uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; + assert(lin_sel < num_weight_levels); + + min_lin_sel = minimumu(min_lin_sel, lin_sel); + max_lin_sel = maximumu(max_lin_sel, lin_sel); + } + + bool was_improved = false; + (void)was_improved; + + { + bool weights_changed = false; + uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + for (uint32_t i = 0; i < num_pixels; i++) + { + uint32_t astc_sel = temp_astc_weights[i]; + uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; + + if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1))) + { + lin_sel++; + weights_changed = true; + } + + trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel]; + } + + if (weights_changed) + { + vec3F l_q16, h_q16; + + bool succeeded; + if (opt_mode == cOrdinaryLeastSquares) + succeeded = compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16); + else + succeeded = compute_weighted_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], nullptr, emphasis_weights, &l_q16, &h_q16, 
pBlock_pixels_q16, color_box_q16); + + if (succeeded) + { + if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + l_q16, h_q16, + pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors, + first_submode, last_submode, ignore_clamping)) + { + was_improved = true; + } + } + } + } + + { + bool weights_changed = false; + uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + + for (uint32_t i = 0; i < num_pixels; i++) + { + uint32_t astc_sel = temp_astc_weights[i]; + uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; + + if ((lin_sel == max_lin_sel) && (lin_sel > 0)) + { + lin_sel--; + weights_changed = true; + } + + trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel]; + } + + if (weights_changed) + { + vec3F l_q16, h_q16; + + bool succeeded; + if (opt_mode == cOrdinaryLeastSquares) + succeeded = compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16); + else + succeeded = compute_weighted_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], nullptr, emphasis_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16); + + if (succeeded) + { + if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, + l_q16, h_q16, + pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors, + first_submode, last_submode, ignore_clamping)) + { + was_improved = true; + } + } + } + } + + { + bool weights_changed = false; + uint8_t trial_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + for (uint32_t i = 0; i < num_pixels; i++) + { + uint32_t astc_sel = temp_astc_weights[i]; + uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; + + if ((lin_sel == max_lin_sel) && (lin_sel > 
0))
+				{
+					lin_sel--;
+					weights_changed = true;
+				}
+				else if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1)))
+				{
+					lin_sel++;
+					weights_changed = true;
+				}
+
+				trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel];
+			}
+
+			if (weights_changed)
+			{
+				vec3F l_q16, h_q16;
+				bool succeeded;
+				if (opt_mode == cOrdinaryLeastSquares)
+					succeeded = compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16);
+				else
+					succeeded = compute_weighted_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], nullptr, emphasis_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16);
+
+				if (succeeded)
+				{
+					if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+						l_q16, h_q16,
+						pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+						first_submode, last_submode, ignore_clamping))
+					{
+						was_improved = true;
+					}
+				}
+			}
+		}
+
+	} // uber_mode
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries to encode a block in ASTC HDR mode 11 using a weight grid of grid_x by grid_y samples
+// instead of one weight per pixel.
+//
+// Each pass (at most NUM_PASSES):
+//  1. try_mode11() picks endpoints and per-pixel weights (32 weight levels) from the current low/high colors.
+//  2. Those weights are downsampled to the grid at ise_weight_range, dequantized to raw weights and
+//     upsampled back to block resolution.
+//  3. The block error of the decoded endpoints combined with the upsampled weights is measured; the outputs
+//     are only overwritten when it beats cur_block_error.
+//  4. The low/high colors are re-fit to the upsampled weights by least squares (emphasis weighted for
+//     cWeightedLeastSquares/cWeightedLeastSquaresHeavy), re-packed with pack_mode11() and measured again.
+//
+// On improvement pBlk_endpoints receives NUM_MODE11_ENDPOINTS bytes and pBlk_weights receives
+// grid_x * grid_y ISE weights. best_submode is reset to 0 on entry and only set again on improvement.
+// pBlock_stats is optional; when it is null the mean color and principal axis are computed here.
+//
+// Returns the best block error found, or cur_block_error unchanged if nothing better was found.
+double encode_astc_hdr_block_downsampled_mode_11(
+	uint32_t block_x, uint32_t block_y, uint32_t grid_x, uint32_t grid_y,
+	uint32_t ise_weight_range, uint32_t ise_endpoint_range,
+	uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	double cur_block_error,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping, opt_mode_t opt_mode,
+	uint8_t* pBlk_endpoints, uint8_t* pBlk_weights, uint32_t& best_submode,
+	const astc_hdr_codec_base_options& coptions,
+	const encode_astc_block_stats* pBlock_stats)
+{
+	assert((block_x >= 4) && (block_y >= 4) && (block_x <= MAX_ASTC_HDR_BLOCK_W) && (block_y <= MAX_ASTC_HDR_BLOCK_H));
+	assert((grid_x >= 2) && (grid_y >= 2) && (grid_x <= block_x) && (grid_y <= block_y));
+
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+
+	assert((first_submode >= FIRST_MODE11_SUBMODE_INDEX) && (first_submode <= last_submode));
+	assert(last_submode <= MAX_MODE11_SUBMODE_INDEX);
+
+	best_submode = 0;
+
+	assert(astc_helpers::get_ise_levels(ise_weight_range) <= MAX_SUPPORTED_WEIGHT_LEVELS);
+
+	const uint32_t num_weights = grid_x * grid_y;
+
+	vec3F block_mean_color_q16, block_axis_q16;
+	if (!pBlock_stats)
+	{
+		block_mean_color_q16 = calc_mean(num_pixels, pBlock_pixels_q16);
+		block_axis_q16 = calc_rgb_pca(num_pixels, pBlock_pixels_q16, block_mean_color_q16);
+	}
+	else
+	{
+		assert(num_pixels == pBlock_stats->m_num_pixels);
+		block_mean_color_q16 = pBlock_stats->m_mean_q16;
+		block_axis_q16 = pBlock_stats->m_axis_q16;
+	}
+
+	aabb3F color_box_q16(cInitExpand);
+
+	// Project every pixel onto the block axis; l/h are the extreme projections and the pixels that
+	// produced them become the initial low/high colors.
+	float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL;
+	vec3F low_color_q16, high_color_q16;
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(pBlock_pixels_q16[i]);
+
+		vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		if (kd < l)
+		{
+			l = kd;
+			low_color_q16 = pBlock_pixels_q16[i];
+		}
+
+		if (kd > h)
+		{
+			h = kd;
+			high_color_q16 = pBlock_pixels_q16[i];
+		}
+	}
+
+	vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16);
+
+	// Pull both endpoints 1/64th of the way towards each other.
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		low_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f);
+		high_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f);
+	}
+
+	const uint32_t NUM_PASSES = 3;
+	for (uint32_t pass = 0; pass < NUM_PASSES; pass++)
+	{
+		uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS];
+		uint8_t trial_blk_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // at block resolution, not grid res
+		uint32_t trial_best_submode = 0;
+
+		clear_obj(trial_blk_endpoints);
+		clear_obj(trial_blk_weights);
+
+		double trial_blk_error = BIG_FLOAT_VAL;
+
+		// Full-resolution solve at 32 weight levels; its error (trial_blk_error) is not used directly,
+		// the solution is re-measured below after the weights went through the grid.
+		bool could_pack = try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
+			low_color_q16, high_color_q16,
+			pBlock_pixels_half, 32, astc_helpers::BISE_32_LEVELS, coptions, false, ise_endpoint_range, false,
+			first_submode, last_submode, ignore_clamping);
+
+		if (!could_pack)
+			break;
+
+		uint8_t trial_downsampled_ise_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+
+		downsample_ise_weights(
+			astc_helpers::BISE_32_LEVELS, ise_weight_range,
+			block_x, block_y, grid_x, grid_y,
+			trial_blk_weights, trial_downsampled_ise_weights);
+
+		uint8_t trial_downsampled_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+		dequantize_astc_weights(num_weights, trial_downsampled_ise_weights, ise_weight_range, trial_downsampled_raw_weights);
+
+		uint8_t trial_upsampled_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE
+		astc_helpers::upsample_weight_grid(block_x, block_y, grid_x, grid_y, trial_downsampled_raw_weights, trial_upsampled_raw_weights);
+
+		//------
+
+		int trial_e[2][3];
+		if (!decode_mode11_to_qlog12(trial_blk_endpoints, trial_e, ise_endpoint_range))
+			return cur_block_error;
+
+		double trial_error = compute_block_error_from_raw_weights(num_pixels, pBlock_pixels_half, trial_upsampled_raw_weights, trial_e, coptions);
+
+		if (trial_error < cur_block_error)
+		{
+			cur_block_error = trial_error;
+			memcpy(pBlk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
+			memcpy(pBlk_weights, trial_downsampled_ise_weights, num_weights);
+			best_submode = trial_best_submode;
+		}
+		else if (pass)
+			break;
+
+		if ((opt_mode == cWeightedLeastSquares) || (opt_mode == cWeightedLeastSquaresHeavy))
+		{
+			float emphasis_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+			if (h == l)
+			{
+				for (uint32_t i = 0; i < num_pixels; i++)
+					emphasis_weights[i] = 1.0f;
+			}
+			else
+			{
+				// mid is where the block mean (projection 0) falls within [l, h], normalized to [0, 1].
+				float mid = (0.0f - l) / (h - l);
+				mid = clamp(mid, .01f, .99f);
+
+				float lw = LOW_EMPHASIS_WEIGHT, mw = MIDDLE_EMPHASIS_WEIGHT, hw = HIGH_EMPHASIS_WEIGHT;
+				if (opt_mode == cWeightedLeastSquaresHeavy)
+					lw = LOW_EMPHASIS_WEIGHT_HEAVY, mw = MIDDLE_EMPHASIS_WEIGHT_HEAVY, hw = HIGH_EMPHASIS_WEIGHT_HEAVY;
+
+				for (uint32_t i = 0; i < num_pixels; i++)
+				{
+					vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+					float kd = k.dot(block_axis_q16);
+
+					assert((kd >= l) && (kd <= h));
+
+					float v = (kd - l) / (h - l);
+
+					// NOTE(review): the upper half multiplies by (1 - mid) where a division looks intended, so the
+					// interpolant spans [0, (1 - mid)^2] instead of [0, 1] and never reaches hw. Left unchanged
+					// because correcting it would change encoder output - confirm with upstream.
+					if (v < mid)
+						v = lerp(lw, mw, v / mid);
+					else
+						v = lerp(mw, hw, (v - mid) * (1.0f - mid));
+
+					emphasis_weights[i] = v;
+				}
+			}
+
+			float trial_upsampled_raw_weightsf[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+			for (uint32_t i = 0; i < num_pixels; i++)
+				trial_upsampled_raw_weightsf[i] = (float)trial_upsampled_raw_weights[i] * (1.0f / 64.0f);
+
+			// Stop refining but keep the best result found so far. This used to be "return false", which converts
+			// to a block error of 0.0 in this double-returning function, i.e. it reported a perfect encode.
+			if (!compute_weighted_least_squares_endpoints_rgb(num_pixels, nullptr, nullptr, trial_upsampled_raw_weightsf, emphasis_weights, &low_color_q16, &high_color_q16, pBlock_pixels_q16, color_box_q16))
+				break;
+		}
+		else
+		{
+			if (!compute_least_squares_endpoints_rgb_raw_weights(num_pixels, trial_upsampled_raw_weights, &low_color_q16, &high_color_q16, pBlock_pixels_q16, color_box_q16))
+				break;
+		}
+
+		// Re-pack the refined endpoints and re-measure them against the same upsampled weights.
+		bool pack_succeeded = pack_mode11(low_color_q16, high_color_q16, ise_endpoint_range, trial_blk_endpoints, coptions, false, first_submode, last_submode, false, trial_best_submode);
+		if (!pack_succeeded)
+			break;
+
+		if (!decode_mode11_to_qlog12(trial_blk_endpoints, trial_e, ise_endpoint_range))
+			break;
+
+		trial_error = compute_block_error_from_raw_weights(num_pixels, pBlock_pixels_half, trial_upsampled_raw_weights, trial_e, coptions);
+
+		if (trial_error < cur_block_error)
+		{
+			cur_block_error = trial_error;
+			memcpy(pBlk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
+			memcpy(pBlk_weights, trial_downsampled_ise_weights, num_weights);
+			best_submode = trial_best_submode;
+		}
+		else
+		{
+			break;
+		}
+
+	} // pass
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries to encode a block in ASTC HDR mode 11 with two weight planes: channel_index is fit with its own
+// weights (blk_weights1) while the other two channels share blk_weights0.
+//
+// Initial endpoints: the pixels, with channel_index zeroed, are projected onto their PCA axis and the two
+// extreme pixels become the low/high colors. channel_index of those colors is taken from the block's
+// bounding box, ordered to follow whichever endpoint has the smaller component sum. Both endpoints are then
+// pulled 1/64th towards each other and handed to try_mode11_dual_plane().
+// Afterwards up to NUM_LS_PASSES least-squares refinements are run, one 2D fit for the shared plane and one
+// 1D fit for channel_index, stopping as soon as a pass does not improve the error.
+//
+// On improvement blk_endpoints receives NUM_MODE11_ENDPOINTS bytes and blk_weights0/blk_weights1 receive
+// num_pixels weights each. best_submode is reset to 0 on entry. uber_mode is currently unused.
+//
+// Returns the best block error found, or cur_block_error unchanged if nothing better was found.
+double encode_astc_hdr_block_mode_11_dual_plane(
+	uint32_t num_pixels,
+	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	uint32_t channel_index, // 0-2
+	uint32_t ise_weight_range,
+	uint32_t& best_submode,
+	double cur_block_error,
+	uint8_t* blk_endpoints, uint8_t* blk_weights0, uint8_t* blk_weights1,
+	const astc_hdr_codec_base_options& coptions,
+	bool direct_only,
+	uint32_t ise_endpoint_range,
+	bool uber_mode,
+	bool constrain_ise_weight_selectors,
+	int32_t first_submode, int32_t last_submode, bool ignore_clamping)
+{
+	(void)uber_mode;
+
+	assert(channel_index <= 2);
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+
+	assert((first_submode >= FIRST_MODE11_SUBMODE_INDEX) && (first_submode <= last_submode));
+	assert(last_submode <= MAX_MODE11_SUBMODE_INDEX);
+
+	assert(num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS);
+
+	best_submode = 0;
+
+	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
+	assert(num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS);
+
+	// Copy of the block with the second-plane channel zeroed, so the mean/PCA only see the shared plane.
+	vec4F temp_block_pixels_q16[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		temp_block_pixels_q16[i] = pBlock_pixels_q16[i];
+		temp_block_pixels_q16[i][channel_index] = 0.0f;
+	}
+
+	vec3F block_mean_color_q16(calc_mean(num_pixels, temp_block_pixels_q16));
+	vec3F block_axis_q16(calc_rgb_pca(num_pixels, temp_block_pixels_q16, block_mean_color_q16));
+
+	float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL;
+	vec3F low_color_q16, high_color_q16;
+
+	aabb3F color_box_q16(cInitExpand);
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		// The bounding box is built from the unmodified pixels (all three channels).
+		color_box_q16.expand(pBlock_pixels_q16[i]);
+
+		vec3F k(vec3F(temp_block_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		if (kd < l)
+		{
+			l = kd;
+			low_color_q16 = pBlock_pixels_q16[i];
+		}
+
+		if (kd > h)
+		{
+			h = kd;
+			high_color_q16 = pBlock_pixels_q16[i];
+		}
+	}
+
+	low_color_q16[channel_index] = 0.0f;
+	high_color_q16[channel_index] = 0.0f;
+
+	// a/b are the component sums of the two endpoints with channel_index zeroed; they decide which endpoint
+	// receives the low end of channel_index's range.
+	float a = low_color_q16.dot(vec3F(1.0f)), b = high_color_q16.dot(vec3F(1.0f));
+	if (a <= b)
+	{
+		low_color_q16[channel_index] = color_box_q16.get_low()[channel_index];
+		high_color_q16[channel_index] = color_box_q16.get_high()[channel_index];
+	}
+	else
+	{
+		high_color_q16[channel_index] = color_box_q16.get_low()[channel_index];
+		low_color_q16[channel_index] = color_box_q16.get_high()[channel_index];
+	}
+
+	vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16);
+	for (uint32_t i = 0; i < 3; i++)
+	{
+		low_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f);
+		high_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f);
+	}
+
+	uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS];
+	uint8_t trial_blk_weights0[MAX_ASTC_HDR_ENC_BLOCK_PIXELS], trial_blk_weights1[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	uint32_t trial_best_submode = 0;
+
+	clear_obj(trial_blk_endpoints);
+	clear_obj(trial_blk_weights0);
+	clear_obj(trial_blk_weights1);
+
+	double trial_blk_error = BIG_FLOAT_VAL;
+
+	bool did_improve = try_mode11_dual_plane(channel_index, num_pixels, trial_blk_endpoints, trial_blk_weights0, trial_blk_weights1, trial_blk_error, trial_best_submode,
+		low_color_q16, high_color_q16,
+		pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+		first_submode, last_submode, ignore_clamping);
+
+	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
+	if (!did_improve)
+		return cur_block_error;
+
+	// Did the solution improve?
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS);
+		memcpy(blk_weights0, trial_blk_weights0, num_pixels);
+		memcpy(blk_weights1, trial_blk_weights1, num_pixels);
+		best_submode = trial_best_submode;
+	}
+
+	// chan0/chan1 are the two channels sharing the first weight plane.
+	const uint32_t chan0 = (channel_index + 1) % 3, chan1 = (channel_index + 2) % 3;
+
+	vec2F plane0_q16[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	aabb2F plane0_bounds;
+	plane0_bounds[0].set(color_box_q16.get_low()[chan0], color_box_q16.get_low()[chan1]);
+	plane0_bounds[1].set(color_box_q16.get_high()[chan0], color_box_q16.get_high()[chan1]);
+
+	vec1F plane1_q16[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	aabb1F plane1_bounds;
+	plane1_bounds[0].set(color_box_q16.get_low()[channel_index]);
+	plane1_bounds[1].set(color_box_q16.get_high()[channel_index]);
+
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		plane0_q16[i][0] = pBlock_pixels_q16[i][chan0];
+		plane0_q16[i][1] = pBlock_pixels_q16[i][chan1];
+
+		plane1_q16[i][0] = pBlock_pixels_q16[i][channel_index];
+	}
+
+	const uint32_t NUM_LS_PASSES = 3;
+
+	for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++)
+	{
+		vec2F l0_q16, h0_q16;
+		if (!compute_least_squares_endpoints_2D(num_pixels, trial_blk_weights0, &g_astc_ls_weights_ise[ise_weight_range][0], &l0_q16, &h0_q16, plane0_q16, plane0_bounds))
+			break;
+
+		vec1F l1_q16, h1_q16;
+		if (!compute_least_squares_endpoints_1D(num_pixels, trial_blk_weights1, &g_astc_ls_weights_ise[ise_weight_range][0], &l1_q16, &h1_q16, plane1_q16, plane1_bounds))
+			break;
+
+		// Reassemble the two per-plane fits into full RGB endpoints.
+		vec3F l_q16, h_q16;
+
+		l_q16[channel_index] = l1_q16[0];
+		h_q16[channel_index] = h1_q16[0];
+
+		l_q16[chan0] = l0_q16[0];
+		h_q16[chan0] = h0_q16[0];
+
+		l_q16[chan1] = l0_q16[1];
+		h_q16[chan1] = h0_q16[1];
+
+		bool was_improved = try_mode11_dual_plane(channel_index, num_pixels, blk_endpoints, blk_weights0, blk_weights1, cur_block_error, best_submode,
+			l_q16, h_q16,
+			pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight_selectors,
+			first_submode, last_submode, ignore_clamping);
+
+		if (!was_improved)
+			break;
+
+		// It's improved, so let's take the new weight indices.
+		memcpy(trial_blk_weights0, blk_weights0, num_pixels);
+		memcpy(trial_blk_weights1, blk_weights1, num_pixels);
+
+	} // pass
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Tries to encode a block in ASTC HDR mode 7; try_mode7() is driven with a high color and a single scale value S.
+//
+// Pixels are projected onto a fixed gray axis (0.577350259 per component) through the block mean.
+//  1. Initial attempt: low/high colors from interp_color() at the extreme projections, S from their difference.
+//  2. Alternative attempt (the "#if 1" section): the extreme pixels themselves are used as low/high colors.
+//  3. Up to NUM_TRIALS refinement rounds that alternately re-estimate the high color (keeping S) and S
+//     (keeping the high color), stopping when a round brings no improvement.
+//
+// On improvement blk_endpoints receives NUM_MODE7_ENDPOINTS bytes and blk_weights receives num_pixels weights.
+// best_submode is reset to 0 on entry. pBlock_stats is optional; when null the mean color is computed here.
+//
+// Returns the best block error found, or cur_block_error unchanged if nothing better was found.
+double encode_astc_hdr_block_mode_7(
+	uint32_t num_pixels,
+	const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+	uint32_t ise_weight_range,
+	uint32_t& best_submode,
+	double cur_block_error,
+	uint8_t* blk_endpoints, //[4]
+	uint8_t* blk_weights, // [num_pixels]
+	const astc_hdr_codec_base_options& coptions,
+	uint32_t ise_endpoint_range,
+	int first_submode, int last_submode,
+	const encode_astc_block_stats* pBlock_stats)
+{
+	assert((num_pixels >= 1) && (num_pixels <= MAX_ASTC_HDR_ENC_BLOCK_PIXELS));
+	assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX));
+	assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE));
+
+	const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range);
+	assert(num_weight_levels <= MAX_SUPPORTED_WEIGHT_LEVELS);
+
+	best_submode = 0;
+
+	vec3F block_mean_color_q16;
+	if (!pBlock_stats)
+		block_mean_color_q16 = calc_mean(num_pixels, pBlock_pixels_q16);
+	else
+	{
+		assert(num_pixels == pBlock_stats->m_num_pixels);
+		block_mean_color_q16 = pBlock_stats->m_mean_q16;
+	}
+
+	vec3F block_axis_q16(0.577350259f);
+
+	aabb3F color_box_q16(cInitExpand);
+
+	float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL;
+	for (uint32_t i = 0; i < num_pixels; i++)
+	{
+		color_box_q16.expand(pBlock_pixels_q16[i]);
+
+		vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+		float kd = k.dot(block_axis_q16);
+
+		l = basisu::minimum(l, kd);
+		h = basisu::maximum(h, kd);
+	}
+
+	vec3F low_color_q16(interp_color(block_mean_color_q16, block_axis_q16, l, color_box_q16, color_box_q16));
+	vec3F high_color_q16(interp_color(block_mean_color_q16, block_axis_q16, h, color_box_q16, color_box_q16));
+
+	low_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
+	high_color_q16.clamp(0.0f, MAX_QLOG16_VAL);
+
+	vec3F diff(high_color_q16 - low_color_q16);
+
+	// The mul here (* block_axis_q16[0]) is because the "S" or scale value is subtracted from the high color with a scale of 1.0,
+	// i.e. it's equivalent to a vector of (1,1,1) multiplied by scale before the sub. We want to actually move along the grayscale axis, or (0.577350259, 0.577350259, 0.577350259).
+	float s_q16 = diff.dot(block_axis_q16) * block_axis_q16[0];
+
+	uint8_t trial_blk_endpoints[NUM_MODE7_ENDPOINTS];
+	uint8_t trial_blk_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS];
+	uint32_t trial_best_submode = 0;
+
+	clear_obj(trial_blk_endpoints);
+	clear_obj(trial_blk_weights);
+
+	double trial_blk_error = BIG_FLOAT_VAL;
+
+	bool did_improve = try_mode7(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode,
+		high_color_q16, ceilf(s_q16),
+		pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode);
+
+	// If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do.
+	if (!did_improve)
+	{
+		return cur_block_error;
+	}
+
+	// Did the solution improve?
+	if (trial_blk_error < cur_block_error)
+	{
+		cur_block_error = trial_blk_error;
+		memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE7_ENDPOINTS);
+		memcpy(blk_weights, trial_blk_weights, num_pixels);
+		best_submode = trial_best_submode;
+	}
+
+#if 1
+	{
+		//const float TL = 8830.0f;// (float)half_to_qlog16(float_to_half(0.00061f));
+		//const float TH = 41600.0f;// (float)half_to_qlog16(float_to_half(40.0f));
+		//float zl = minimum(color_box_q16[0][0], color_box_q16[0][1], color_box_q16[0][2]);
+		//float zh = minimum(color_box_q16[1][0], color_box_q16[1][1], color_box_q16[1][2]);
+
+		//if ((zl <= TL) && (zh >= TH))
+		{
+			// Try a simpler technique for artifact reduction
+			l = BIG_FLOAT_VAL;
+			h = -BIG_FLOAT_VAL;
+
+			vec3F alt_low_color_q16(0.0f), alt_high_color_q16(0.0f);
+			for (uint32_t i = 0; i < num_pixels; i++)
+			{
+				color_box_q16.expand(pBlock_pixels_q16[i]);
+
+				vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16);
+				float kd = k.dot(block_axis_q16);
+
+				if (kd < l)
+				{
+					alt_low_color_q16 = pBlock_pixels_q16[i];
+					l = kd;
+				}
+
+				if (kd > h)
+				{
+					alt_high_color_q16 = pBlock_pixels_q16[i];
+					h = kd;
+				}
+			}
+
+			vec3F old_alt_low_color_q16(alt_low_color_q16);
+
+			for (uint32_t i = 0; i < 3; i++)
+				alt_low_color_q16[i] = lerp(old_alt_low_color_q16[i], alt_high_color_q16[i], 1.0f / 64.0f);
+
+			vec3F alt_diff(alt_high_color_q16 - alt_low_color_q16);
+
+			// The mul here (* block_axis_q16[0]) is because the "S" or scale value is subtracted from the high color with a scale of 1.0,
+			// i.e. it's equivalent to a vector of (1,1,1) multiplied by scale before the sub. We want to actually move along the grayscale axis, or (0.577350259, 0.577350259, 0.577350259).
+			float alt_s_q16 = alt_diff.dot(block_axis_q16) * block_axis_q16[0];
+
+			// Writes straight into the caller's outputs; the return value is intentionally ignored here.
+			try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				alt_high_color_q16, ceilf(alt_s_q16),
+				pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode);
+		}
+	}
+#endif
+
+	const float one_over_num_pixels = 1.0f / (float)num_pixels;
+
+	// NOTE(review): the first refinement round starts from the trial_* state of the first try_mode7() call,
+	// even if that call did not beat cur_block_error or the alternative attempt above replaced the
+	// caller's outputs - presumably intentional, confirm.
+	const uint32_t NUM_TRIALS = 2;
+	for (uint32_t trial = 0; trial < NUM_TRIALS; trial++)
+	{
+		// Given a set of selectors and S, try to compute a better high color
+		vec3F new_high_color_q16(block_mean_color_q16);
+
+		int e[2][3];
+		int cur_s = 0;
+		if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, &cur_s, ise_endpoint_range))
+			break;
+
+		cur_s <<= 4;
+
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			uint32_t astc_sel = trial_blk_weights[i];
+			float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
+
+			float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels;
+			new_high_color_q16[0] += k;
+			new_high_color_q16[1] += k;
+			new_high_color_q16[2] += k;
+		}
+
+		bool improved = try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+			new_high_color_q16, (float)cur_s,
+			pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode);
+
+		if (improved)
+		{
+			memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
+			memcpy(trial_blk_weights, blk_weights, num_pixels);
+		}
+
+		// Given a set of selectors and a high color, try to compute a better S.
+		float t = 0.0f;
+
+		for (uint32_t i = 0; i < num_pixels; i++)
+		{
+			uint32_t astc_sel = trial_blk_weights[i];
+			float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f);
+
+			t += (1.0f) - lerp;
+		}
+
+		t *= one_over_num_pixels;
+
+		//int e[2][3];
+		if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, nullptr, ise_endpoint_range))
+			break;
+
+		vec3F cur_h_q16((float)(e[1][0] << 4), (float)(e[1][1] << 4), (float)(e[1][2] << 4));
+
+		// Skip the S candidates when t is too close to zero to divide by.
+		if (fabs(t) > .0000125f)
+		{
+			float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t;
+			float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t;
+			float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t;
+
+			// TODO: gather statistics on these
+			if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+				cur_h_q16, ceilf(s_r),
+				pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+			{
+				improved = true;
+			}
+
+			if (coptions.m_mode7_full_s_optimization)
+			{
+				if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+					cur_h_q16, ceilf(s_g),
+					pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+				{
+					improved = true;
+				}
+
+				if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+					cur_h_q16, ceilf(s_b),
+					pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+				{
+					improved = true;
+				}
+
+				if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+					cur_h_q16, ceilf((s_r + s_g + s_b) / 3.0f),
+					pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+				{
+					improved = true;
+				}
+
+				// Added this - quite strong.
+				if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode,
+					cur_h_q16, minimum(maximum(s_r, s_g, s_b) * 1.1f, 65535.0f),
+					pBlock_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range, first_submode, last_submode))
+				{
+					improved = true;
+				}
+			} // if (coptions.m_mode7_full_s_optimization)
+
+		} // if (fabs(t) > .0000125f)
+
+		if (!improved)
+			break;
+
+		memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS);
+		memcpy(trial_blk_weights, blk_weights, num_pixels);
+
+	} // trial
+
+	return cur_block_error;
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// Converts n ISE-encoded weights to raw weight values using the dequantization table for from_ise_range.
+// pSrc_ise_vals and pDst_raw_weights must each hold at least n entries.
+void dequantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights)
+{
+	const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val;
+
+	for (uint32_t i = 0; i < n; i++)
+		pDst_raw_weights[i] = dequant_tab[pSrc_ise_vals[i]];
+}
+
+//--------------------------------------------------------------------------------------------------------------------------
+
+// For each output (2x2) sample, the weight of each input (6x6) sample.
+// Layout, as inferred from the non-zero pattern (confirm against the code that generated this table):
+// the 4 rows are the output samples in row-major order (top-left, top-right, bottom-left, bottom-right),
+// the 36 columns are the input texels indexed as y * 6 + x. The weights of each row sum to approximately 1.
+static const float g_weight_downsample_6x6_to_2x2[4][36] = {
+{0.165438f, 0.132609f, 0.092681f, 0.028953f, 0.000000f, 0.000000f, 0.133716f, 0.111240f, 0.065133f, 0.022236f, 0.000000f, 0.000000f, 0.092623f, 0.063898f, 0.039120f, 0.000000f, 0.000000f, 0.000000f, 0.028168f, 0.024184f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.027262f, 0.091051f, 0.132446f, 0.164791f, 0.000000f, 0.000000f, 0.026038f, 0.066511f, 0.111644f, 0.133197f, 0.000000f, 0.000000f, 0.000000f, 0.040053f, 0.064757f, 0.091196f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.024265f, 0.026789f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028282f, 0.024804f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.092871f, 0.066580f, 0.042024f, 0.000000f, 0.000000f, 0.000000f, 0.132115f, 0.107586f, 0.061943f, 0.025551f, 0.000000f, 0.000000f, 0.166111f, 0.132946f, 0.089043f, 0.030145f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.024535f, 0.028835f, 0.000000f, 0.000000f, 0.000000f, 0.044465f, 0.063652f, 0.093251f, 0.000000f, 0.000000f, 0.025961f, 0.063339f, 0.107329f, 0.132240f, 0.000000f, 0.000000f, 0.029844f, 0.089249f, 0.132200f, 0.165099f},
+};
+
+// For each output (3x2) sample, the weight of each input (6x6) sample.
+// Layout, as inferred from the non-zero pattern (confirm against the code that generated this table):
+// the 6 rows are the output samples in row-major order (3 across, 2 down),
+// the 36 columns are the input texels indexed as y * 6 + x. The weights of each row sum to approximately 1.
+static const float g_weight_downsample_6x6_to_3x2[6][36] = {
+{0.257933f, 0.144768f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.213754f, 0.109376f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.140969f, 0.064128f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.041270f, 0.027803f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.046066f, 0.153691f, 0.153395f, 0.042845f, 0.000000f, 0.000000f, 0.038497f, 0.131674f, 0.126804f, 0.041513f, 0.000000f, 0.000000f, 0.028434f, 0.081152f, 0.075499f, 0.025372f, 0.000000f, 0.000000f, 0.000000f, 0.030067f, 0.024989f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.147088f, 0.258980f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.105549f, 0.211746f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.066714f, 0.144015f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.027755f, 0.038152f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.044268f, 0.030990f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.141642f, 0.069930f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.207393f, 0.105354f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.255911f, 0.144511f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.026658f, 0.032535f, 0.000000f, 0.000000f, 0.000000f, 0.024618f, 0.079487f, 0.080415f, 0.026311f, 0.000000f, 0.000000f, 0.038382f, 0.133569f, 0.133162f, 0.033451f, 0.000000f, 0.000000f, 0.043697f, 0.152483f, 0.154345f, 0.040885f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.026401f, 0.040228f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.066688f, 0.142350f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.108504f, 0.210286f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.149666f, 0.255876f},
+};
+
+// For each output (4x2) sample, the weight of each input (6x6) sample.
+// Layout, as inferred from the non-zero pattern (confirm against the code that generated this table):
+// the 8 rows are the output samples in row-major order (4 across, 2 down),
+// the 36 columns are the input texels indexed as y * 6 + x. The weights of each row sum to approximately 1.
+static const float g_weight_downsample_6x6_to_4x2[8][36] = {
+{0.318857f, 0.081413f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.262816f, 0.064811f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.175211f, 0.046152f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.050740f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.163830f, 0.223661f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.128904f, 0.194332f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.080369f, 0.121162f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.041941f, 0.045801f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.230801f, 0.166220f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.193495f, 0.136548f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.113816f, 0.085890f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.043771f, 0.029459f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.087528f, 0.318213f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.059739f, 0.262039f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.046515f, 0.175973f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.049993f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.054078f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.173243f, 0.055145f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.254561f, 0.059695f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.319463f, 0.083816f, 0.000000f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.038171f, 0.037447f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.076263f, 0.117360f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.134218f, 0.202503f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.163759f, 0.230278f, 0.000000f, 0.000000f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.044607f, 0.035170f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.114466f, 0.088407f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.201026f, 0.127983f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.224148f, 0.164194f, 0.000000f},
+{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.052817f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.043531f, 0.174390f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.060164f, 0.262636f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.089340f, 0.317122f},
+};
+
+// For each output (5x2) sample, the weight of each input (6x6) sample.
+static const float g_weight_downsample_6x6_to_5x2[10][36] = { +{0.393855f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.327491f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.216089f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062565f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.303101f, 0.078223f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.261199f, 0.068761f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.160056f, 0.054634f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.074026f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.202529f, 0.207447f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.151013f, 0.157673f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.100074f, 0.095239f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.043623f, 0.042402f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.083336f, 0.309647f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.061432f, 0.269582f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.046328f, 0.166035f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063640f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.397684f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.326178f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.217856f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.058282f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.065541f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.215996f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.321124f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.397338f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.069030f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.159434f, 0.051902f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.266327f, 0.065732f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.305627f, 0.081948f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.038550f, 0.046259f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.092606f, 0.100038f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.162523f, 0.163345f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.199767f, 0.196912f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.066709f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.050841f, 0.169003f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.061591f, 0.265094f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.081426f, 0.305335f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063517f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.210896f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.316133f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.027674f, 0.381781f}, +}; + +// For each output (6x2) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_6x2[12][36] = { +{0.395563f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.328397f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.214936f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.061104f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.395041f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.323513f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.208086f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.073360f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.393200f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.317339f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.218679f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.070782f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.399071f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.321356f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.214689f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.064883f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.399159f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.326009f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.212426f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062406f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398973f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.326510f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.217446f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.057071f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.065386f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.215039f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.321113f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398462f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.072234f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.211515f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.319185f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.397066f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.053184f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.213286f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.332634f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.400895f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063501f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.207210f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.334096f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.395193f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.074315f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.216723f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.320827f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.388135f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063571f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.215814f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.325843f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.394772f}, +}; + +// For each output (2x3) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_2x3[6][36] = { +{0.253933f, 0.211745f, 0.142964f, 0.043509f, 0.000000f, 0.000000f, 0.146094f, 0.108119f, 0.068727f, 0.024908f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.043336f, 0.140540f, 0.208745f, 0.253069f, 0.000000f, 0.000000f, 0.031333f, 0.069242f, 0.108596f, 0.145138f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.044780f, 0.036916f, 0.026808f, 0.000000f, 0.000000f, 0.000000f, 0.151455f, 0.129189f, 0.076266f, 0.030885f, 0.000000f, 0.000000f, 0.151915f, 0.131628f, 0.081598f, 0.031903f, 0.000000f, 0.000000f, 0.043838f, 0.032645f, 0.030173f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028998f, 0.038454f, 0.046460f, 0.000000f, 0.000000f, 0.033717f, 0.076274f, 0.130140f, 0.153377f, 0.000000f, 0.000000f, 0.025762f, 0.077843f, 0.130195f, 0.150217f, 0.000000f, 0.000000f, 0.000000f, 0.029422f, 0.034493f, 0.044648f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.145243f, 0.107655f, 0.062280f, 0.033041f, 0.000000f, 0.000000f, 
0.257369f, 0.210260f, 0.139667f, 0.044485f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.037604f, 0.064104f, 0.105759f, 0.144848f, 0.000000f, 0.000000f, 0.042699f, 0.141511f, 0.207704f, 0.255772f}, +}; + +// For each output (3x3) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_3x3[9][36] = { +{0.412913f, 0.237773f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.237370f, 0.111944f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.066531f, 0.251421f, 0.245639f, 0.065785f, 0.000000f, 0.000000f, 0.047059f, 0.143642f, 0.128760f, 0.051164f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.234587f, 0.419421f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.110765f, 0.235227f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.067391f, 0.044131f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.248992f, 0.133218f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.247568f, 0.139987f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.072238f, 0.046475f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.040674f, 0.048555f, 0.000000f, 0.000000f, 0.000000f, 0.049640f, 0.158199f, 0.158521f, 0.046044f, 0.000000f, 0.000000f, 0.043591f, 0.153956f, 0.155258f, 0.049378f, 0.000000f, 0.000000f, 0.000000f, 0.046674f, 0.049509f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.049528f, 0.063611f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.137662f, 0.252612f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.134924f, 0.246668f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.042655f, 0.072341f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.237403f, 0.114850f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.418506f, 0.229241f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.049009f, 0.142093f, 0.136891f, 0.036294f, 0.000000f, 0.000000f, 0.074433f, 0.244437f, 0.251631f, 0.065212f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.121166f, 0.231108f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.236230f, 0.411495f}, +}; + +// For each output (4x3) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_4x3[12][36] = { +{0.508292f, 0.132529f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.285382f, 0.073798f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.266624f, 0.378457f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.144380f, 0.210539f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.380292f, 0.270590f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200825f, 0.148293f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.130560f, 0.507542f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.071578f, 0.290320f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.094051f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.322294f, 0.082665f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.316365f, 0.092271f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.092353f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.046081f, 0.061377f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.158151f, 0.235006f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.152896f, 0.232594f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.052844f, 0.061053f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.061619f, 0.046867f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.227763f, 0.158202f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.222620f, 0.155545f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.073398f, 0.053986f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.082287f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.084098f, 0.330283f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.085224f, 0.323658f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.094451f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.286413f, 0.077046f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.512915f, 0.123625f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.140389f, 0.213324f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.267125f, 0.379163f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.208464f, 0.139969f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.382876f, 0.268691f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.080416f, 0.285653f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.131803f, 0.502128f}, +}; + +// For each output (5x3) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_5x3[15][36] = { +{0.618662f, 0.032137f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.349200f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.497060f, 0.129255f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.281642f, 0.092043f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.333166f, 0.338337f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.164333f, 0.164165f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.129409f, 0.504176f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.085525f, 0.280890f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.636943f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.363057f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.113467f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.394204f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.386741f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.105588f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.086925f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.317750f, 0.095763f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.321008f, 0.086368f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.092185f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.057696f, 0.061462f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.184995f, 0.197656f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.186342f, 0.186715f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.059712f, 0.065422f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.091939f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.079906f, 0.328876f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.085955f, 0.320229f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.093096f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.099585f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398489f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.388782f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.113144f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.360655f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.639345f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.285578f, 0.088663f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.495946f, 0.129812f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.177513f, 0.166195f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.329950f, 0.326342f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.082692f, 0.279744f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.134353f, 0.503211f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.361178f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.638822f}, +}; + +// For each output (6x3) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_6x3[18][36] = { +{0.640623f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359377f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.638697f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.361303f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.640672f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359328f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.637721f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.362279f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.647342f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.352658f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.638418f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.361582f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111041f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.395972f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.387932f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.105054f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.101949f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.395728f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.401263f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.101060f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.098132f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.388180f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.402030f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111659f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.096173f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.393865f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.386312f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.123650f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.104357f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398062f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.393265f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.104316f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.097666f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.400772f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.390396f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111166f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359466f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.640534f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.360569f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.639431f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.355750f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.644250f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.353865f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.646135f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.357727f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.642273f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359539f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.640461f}, +}; + +// For each output (2x4) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_2x4[8][36] = { +{0.312206f, 0.261492f, 0.177496f, 0.055798f, 0.000000f, 0.000000f, 0.081944f, 0.062361f, 0.048703f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.054679f, 0.172805f, 0.260561f, 0.314742f, 0.000000f, 0.000000f, 0.000000f, 0.049040f, 0.065652f, 0.082520f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.164115f, 0.129589f, 0.083879f, 0.029309f, 0.000000f, 0.000000f, 0.231202f, 0.198851f, 0.118719f, 0.044334f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.035855f, 0.083276f, 0.127764f, 0.166965f, 0.000000f, 0.000000f, 0.045347f, 0.116503f, 0.193645f, 0.230645f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.223790f, 0.194804f, 0.115855f, 0.047371f, 0.000000f, 0.000000f, 0.164616f, 0.125798f, 0.087268f, 0.040497f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.044738f, 0.118365f, 0.198854f, 0.230745f, 0.000000f, 0.000000f, 0.029646f, 0.078141f, 0.131405f, 0.168106f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.080206f, 0.060505f, 0.041197f, 0.000000f, 0.000000f, 0.000000f, 0.320486f, 0.265233f, 0.174992f, 0.057380f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.051057f, 0.058139f, 0.082120f, 0.000000f, 0.000000f, 0.056168f, 0.174118f, 0.260525f, 0.317873f}, +}; + +// For each output (3x4) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_3x4[12][36] = { +{0.503381f, 0.288537f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.130806f, 0.077275f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.088808f, 0.319226f, 0.312498f, 0.086797f, 0.000000f, 0.000000f, 0.000000f, 0.092065f, 0.079421f, 0.021185f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.286250f, 0.514036f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.072999f, 0.126714f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.261935f, 0.133191f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.376226f, 0.207118f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021529f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.059585f, 0.153016f, 0.152552f, 0.043373f, 0.000000f, 0.000000f, 0.063990f, 0.231504f, 0.235283f, 0.060696f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.146403f, 0.262394f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.208547f, 0.382656f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.374676f, 0.209306f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.270440f, 0.145577f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.059636f, 0.233975f, 0.235944f, 0.069029f, 0.000000f, 0.000000f, 0.048950f, 0.150198f, 0.154340f, 0.047929f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200921f, 0.380881f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.146928f, 0.271271f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.128883f, 0.075468f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.509859f, 0.285791f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.095842f, 0.086878f, 0.000000f, 0.000000f, 0.000000f, 0.092942f, 0.314169f, 0.319263f, 0.090906f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.079652f, 0.124852f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.289868f, 0.505628f}, +}; + +// For each output (4x4) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_4x4[16][36] = { +{0.665277f, 0.167914f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.166809f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.325854f, 0.449938f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.094690f, 0.129518f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.455174f, 0.326025f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.109174f, 0.109627f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.166733f, 0.664155f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.169112f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.320619f, 0.090788f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.462066f, 0.126527f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.165890f, 0.235855f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.233931f, 0.364324f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.239319f, 0.151533f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.363629f, 0.245519f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.106763f, 0.311932f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.119451f, 
0.461853f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.451893f, 0.124086f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.326160f, 0.097861f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.239712f, 0.365585f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.164178f, 0.230525f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.360274f, 0.237862f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.246139f, 0.155726f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.121863f, 0.457051f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.097828f, 0.323258f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.163634f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.667648f, 0.168718f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.094870f, 0.132660f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.316878f, 0.455591f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.116917f, 0.098433f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.458816f, 0.325834f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.168403f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.172019f, 0.659578f}, +}; + +// For each output (5x4) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_5x4[20][36] = { +{0.773702f, 0.033711f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.192588f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.633422f, 0.166577f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.170080f, 0.029921f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.388335f, 0.403694f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.100996f, 0.106975f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.161122f, 0.655288f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.183590f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.801705f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.198295f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.400989f, 0.025097f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.573915f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.309345f, 0.085396f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.478694f, 0.126565f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.194664f, 0.187267f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.292735f, 0.308960f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.016375f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.098049f, 0.295983f, 0.000000f, 0.000000f, 0.017892f, 0.000000f, 0.111938f, 0.476138f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.043545f, 0.386448f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.570007f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.566407f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.402307f, 0.031286f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.463145f, 0.120696f, 0.000000f, 0.019497f, 0.000000f, 0.000000f, 0.311721f, 0.084942f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.296730f, 0.300781f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.204639f, 0.197849f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.122117f, 0.469302f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.102545f, 0.306036f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.562064f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.041534f, 0.396403f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.190134f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.773971f, 0.035896f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.169927f, 0.035812f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.630284f, 0.163977f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.112667f, 0.106813f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.393502f, 0.387018f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.177024f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.170482f, 0.652494f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.192274f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033039f, 0.774687f}, +}; + +// For each output (6x4) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_6x4[24][36] = { +{0.804254f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195746f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.804177f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195823f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.799585f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200415f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.803604f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.196396f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.807256f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.192744f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.805135f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.194865f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.410532f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.589468f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.408690f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.591310f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.416225f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.583775f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.414279f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.585721f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.406723f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.593277f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.402510f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.597490f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.584784f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.415216f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.590427f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.409573f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.590073f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.409927f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.580348f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.419652f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.588321f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.411679f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.587022f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.412978f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.193281f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.806719f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.189163f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.810837f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195108f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.804892f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.188290f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.811710f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.192914f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.807086f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195292f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.804708f}, +}; + +// For each output (2x5) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_2x5[10][36] = { +{0.387593f, 0.325123f, 0.221104f, 0.066180f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.065940f, 0.214659f, 0.326737f, 0.392664f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.309603f, 0.265953f, 0.168780f, 0.060600f, 0.000000f, 0.000000f, 0.084707f, 0.063017f, 0.047341f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062836f, 0.170767f, 0.261053f, 0.307978f, 0.000000f, 0.000000f, 0.000000f, 0.049286f, 0.064361f, 0.083719f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.195787f, 0.153943f, 0.095706f, 0.042417f, 0.000000f, 0.000000f, 0.190695f, 0.154435f, 0.097288f, 0.040258f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.029471f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.017536f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.039307f, 0.094677f, 0.158696f, 0.199136f, 0.000000f, 0.000000f, 0.040959f, 0.093353f, 0.155294f, 0.201042f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.079432f, 0.065739f, 0.044876f, 0.000000f, 0.000000f, 0.000000f, 0.309205f, 0.264700f, 0.167247f, 0.068801f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.052112f, 0.064829f, 0.081363f, 0.000000f, 0.000000f, 0.064024f, 0.161136f, 0.263743f, 0.312793f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.393277f, 0.324792f, 0.213188f, 0.068743f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.066964f, 0.215440f, 0.323005f, 0.394591f}, +}; + +// For each output (3x5) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_3x5[15][36] = { +{0.620557f, 0.350797f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028646f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.110170f, 0.397489f, 0.386326f, 0.106015f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.357348f, 0.642652f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.503934f, 0.275289f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.128280f, 0.092497f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.102294f, 0.316223f, 0.313576f, 0.092518f, 0.000000f, 0.000000f, 0.000000f, 0.081158f, 0.094231f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.279079f, 0.502163f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.086083f, 0.132675f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.325483f, 0.157739f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.322567f, 0.172225f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021986f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.063342f, 0.192228f, 0.186950f, 0.057021f, 0.000000f, 0.000000f, 0.054779f, 0.186114f, 0.185666f, 0.073901f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.172195f, 0.331802f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.148212f, 0.322038f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.025751f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.123726f, 0.081188f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.507339f, 0.287746f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.093924f, 0.094021f, 0.000000f, 0.000000f, 0.000000f, 0.097070f, 0.315697f, 0.314560f, 0.084728f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.082560f, 0.129771f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.277014f, 0.486817f, 0.023837f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.644191f, 0.355809f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.107771f, 0.387615f, 0.393454f, 0.111159f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.360886f, 0.639114f}, +}; + +// For each output (4x5) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_4x5[20][36] = { +{0.778254f, 0.190730f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.031016f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.401147f, 0.570243f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028610f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.563768f, 0.394241f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.041992f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.196238f, 0.767548f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.036214f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.637514f, 0.166734f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.167634f, 0.028118f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.322778f, 0.473312f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.085399f, 0.118511f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.471429f, 0.308185f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.118025f, 0.102361f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.176592f, 0.643933f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.179475f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.391609f, 0.100882f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.390531f, 0.116978f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.017259f, 0.000000f, 0.201618f, 0.301555f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.197600f, 0.281968f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.016735f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.293309f, 0.192842f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.268674f, 0.208109f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.020330f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.118514f, 0.380746f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.097621f, 0.381305f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021814f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.157977f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.657533f, 0.184490f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.097522f, 0.128585f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.309864f, 0.464029f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.128900f, 0.090864f, 0.000000f, 0.025393f, 0.000000f, 0.000000f, 0.464029f, 0.290814f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.024593f, 0.172268f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.173412f, 0.629727f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.029582f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.778816f, 0.191602f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.036297f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.394454f, 0.569249f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.039685f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.561207f, 0.399108f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.034683f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.193744f, 0.771574f}, +}; + +// For each output (5x5) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_5x5[25][36] = { +{1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.794727f, 0.205273f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.465125f, 0.484079f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028881f, 0.000000f, 0.000000f, 0.021914f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.192446f, 0.772941f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.034613f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033123f, 0.930510f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.036367f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.800234f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.199766f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.629079f, 0.165939f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.166390f, 0.019675f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.018918f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.378734f, 0.373861f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111597f, 0.135808f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.177492f, 0.641195f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.181313f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028722f, 0.761781f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.209497f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.475763f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.471882f, 0.029551f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.022804f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.382714f, 0.116167f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.383377f, 0.117742f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.254151f, 0.249987f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.241972f, 0.253891f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.017950f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.122722f, 0.376847f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.095099f, 0.369986f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.017396f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.029442f, 0.472507f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.471751f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.026300f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.190299f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.776924f, 0.032778f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.171498f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.666385f, 0.162117f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.125713f, 0.117624f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.387084f, 0.369579f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.028493f, 0.169318f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.173770f, 0.628419f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.198951f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.035634f, 0.765415f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.963102f, 0.036898f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.030322f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.771054f, 0.198624f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021816f, 0.020944f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.481761f, 0.475479f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.032816f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.198418f, 0.768766f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033338f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966662f}, +}; + +// For each output (6x5) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_6x5[30][36] = { +{0.966284f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033716f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.966287f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033713f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.966287f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033713f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.966290f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033710f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966125f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033875f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966273f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033727f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.800857f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.199143f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.773463f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.201165f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.025372f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.805735f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.194265f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.788791f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.211209f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.785975f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.214025f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.787286f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.212714f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.490845f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.487242f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.021913f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.490663f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.486878f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.022459f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.505452f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.494548f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.495383f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.482180f, 0.000000f, 0.022437f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.022727f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.496545f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.480728f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.486261f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.486387f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.027352f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.196272f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.803728f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.210059f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.789941f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.212947f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.787053f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.215261f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.784739f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.209116f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.790884f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.205881f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.794119f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033710f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966290f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033711f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966289f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033713f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966287f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033719f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966281f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033712f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966288f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033712f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966288f}, +}; + +// For each output (2x6) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_2x6[12][36] = { +{0.388815f, 0.325435f, 0.220189f, 0.065562f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.064515f, 0.214042f, 0.327700f, 0.393742f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398821f, 0.326200f, 0.217851f, 0.057128f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062546f, 0.216408f, 0.322269f, 0.398777f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.396575f, 0.330631f, 0.212857f, 0.059936f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.070253f, 0.215326f, 0.317576f, 0.396845f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.398130f, 0.324745f, 0.213572f, 0.063553f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.062009f, 0.216253f, 0.324683f, 0.397055f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.397646f, 0.321346f, 0.212334f, 0.068675f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.067073f, 0.210768f, 0.318165f, 0.403993f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.395756f, 0.325048f, 0.211862f, 0.067334f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.065475f, 0.214113f, 0.324009f, 0.396403f}, +}; + +// For each output (3x6) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_3x6[18][36] = { +{0.640136f, 0.359864f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.108112f, 0.399968f, 0.388087f, 0.103833f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.356122f, 0.643878f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.646308f, 0.353692f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.122937f, 0.390166f, 0.380558f, 0.106339f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.355015f, 0.644985f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.642874f, 0.357126f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.111570f, 0.398638f, 0.387639f, 0.102153f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359134f, 0.640866f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.640159f, 0.359841f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.098908f, 0.393303f, 0.400421f, 0.107369f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.357119f, 0.642881f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.640541f, 0.359459f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.116318f, 0.397635f, 0.395084f, 0.090964f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.361948f, 0.638052f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.645448f, 0.354552f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.106981f, 0.389214f, 0.395056f, 0.108749f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.359592f, 0.640408f}, +}; + +// For each output (4x6) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_4x6[24][36] = { +{0.806928f, 0.193072f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.412216f, 0.587784f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.590075f, 0.409925f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200682f, 0.799318f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.809822f, 0.190178f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.423474f, 0.576526f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.580816f, 0.419184f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.190240f, 0.809760f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.800320f, 0.199680f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.408625f, 0.591375f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.583392f, 0.416608f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.200372f, 0.799628f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.798914f, 0.201086f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.411243f, 0.588757f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.586520f, 0.413480f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.203588f, 0.796412f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.802040f, 0.197960f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.411175f, 0.588825f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.599873f, 0.400127f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.193060f, 0.806940f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.806073f, 0.193927f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.408705f, 0.591295f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.585711f, 0.414289f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.197672f, 0.802328f}, +}; + +// For each output (5x6) sample, the weight of each input (6x6) sample. 
+static const float g_weight_downsample_6x6_to_5x6[30][36] = { +{0.966289f, 0.033711f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.794848f, 0.205152f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.473272f, 0.496525f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.030202f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.196955f, 0.803045f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033711f, 0.966289f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966284f, 0.033716f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.795787f, 0.204213f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.500928f, 0.499072f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.198603f, 0.801397f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033716f, 0.966284f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966283f, 0.033717f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.788424f, 0.211576f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.029276f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.484227f, 0.486497f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.201499f, 0.798501f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033724f, 0.966276f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966283f, 0.033717f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.791336f, 0.208664f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.490188f, 0.509812f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.204835f, 0.795165f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033703f, 0.966297f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.966276f, 0.033724f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.799276f, 0.200724f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.022501f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.494443f, 0.483055f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.205967f, 0.794033f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033726f, 0.966274f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.965971f, 0.034029f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.798640f, 0.201360f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.502577f, 0.497423f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.203927f, 0.796073f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.033706f, 0.966294f}, +}; + +// For each output (6x6) sample, the weight of each input (6x6) sample. +static const float g_weight_downsample_6x6_to_6x6[36][36] = { +{1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 
0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f, 0.000000f}, +{0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 0.000000f, 1.000000f}, +}; + +//-------------------------------------------------------------------------------------------------------------------------- + +const struct downsample_matrix_6x6 +{ + uint32_t m_grid_width, m_grid_height; + const float* m_p; +} g_downsample_matrices_6x6[] = { + { 2, 2, (const float*)g_weight_downsample_6x6_to_2x2 }, + { 3, 2, (const float*)g_weight_downsample_6x6_to_3x2 }, + { 4, 2, (const float*)g_weight_downsample_6x6_to_4x2 }, + { 5, 2, (const float*)g_weight_downsample_6x6_to_5x2 }, + { 6, 2, (const float*)g_weight_downsample_6x6_to_6x2 }, + { 2, 3, (const float*)g_weight_downsample_6x6_to_2x3 }, + { 3, 3, (const float*)g_weight_downsample_6x6_to_3x3 }, + { 4, 3, (const float*)g_weight_downsample_6x6_to_4x3 }, + { 5, 3, (const float*)g_weight_downsample_6x6_to_5x3 }, + { 6, 3, (const float*)g_weight_downsample_6x6_to_6x3 }, + { 2, 4, (const float*)g_weight_downsample_6x6_to_2x4 }, + { 3, 4, (const float*)g_weight_downsample_6x6_to_3x4 }, + { 4, 4, (const float*)g_weight_downsample_6x6_to_4x4 }, + { 5, 4, (const float*)g_weight_downsample_6x6_to_5x4 }, + { 6, 4, (const float*)g_weight_downsample_6x6_to_6x4 }, + { 2, 5, (const float*)g_weight_downsample_6x6_to_2x5 }, + { 3, 5, (const float*)g_weight_downsample_6x6_to_3x5 }, + { 4, 5, (const float*)g_weight_downsample_6x6_to_4x5 }, + { 5, 5, (const float*)g_weight_downsample_6x6_to_5x5 }, + { 6, 5, (const 
float*)g_weight_downsample_6x6_to_6x5 }, + { 2, 6, (const float*)g_weight_downsample_6x6_to_2x6 }, + { 3, 6, (const float*)g_weight_downsample_6x6_to_3x6 }, + { 4, 6, (const float*)g_weight_downsample_6x6_to_4x6 }, + { 5, 6, (const float*)g_weight_downsample_6x6_to_5x6 }, + { 6, 6, (const float*)g_weight_downsample_6x6_to_6x6 } +}; +//const uint32_t NUM_DOWNSAMPLE_MATRICES_6x6 = sizeof(g_downsample_matrices_6x6) / sizeof(g_downsample_matrices_6x6[0]); + +//-------------------------------------------------------------------------------------------------------------------------- + +const float* get_6x6_downsample_matrix(uint32_t grid_width, uint32_t grid_height) +{ + // TODO: Use hash or map lookup. + for (const auto& m : g_downsample_matrices_6x6) + if ((m.m_grid_width == grid_width) && (m.m_grid_height == grid_height)) + return m.m_p; + + assert(0); + return nullptr; +} + +void downsample_weight_grid( + const float* pMatrix_weights, + uint32_t bx, uint32_t by, // source/from dimension (block size) + uint32_t wx, uint32_t wy, // dest/to dimension (grid size) + const uint8_t* pSrc_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + uint8_t* pDst_weights) // [wy][wx] +{ + const uint32_t total_block_samples = bx * by; + + for (uint32_t y = 0; y < wy; y++) + { + for (uint32_t x = 0; x < wx; x++) + { + float total = 0.5f; + + for (uint32_t i = 0; i < total_block_samples; i++) + if (pMatrix_weights[i]) + total += pMatrix_weights[i] * (float)pSrc_weights[i]; + + pDst_weights[x + y * wx] = (uint8_t)clamp((int)total, 0, 64); + + pMatrix_weights += total_block_samples; + } + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +void downsample_ise_weights( + uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range, + uint32_t block_w, uint32_t block_h, + uint32_t grid_w, uint32_t grid_h, + const uint8_t* pSrc_weights, uint8_t* pDst_weights) +{ + assert((block_w <= 
MAX_ASTC_HDR_BLOCK_W) && (block_h <= MAX_ASTC_HDR_BLOCK_H)); + assert((grid_w >= 2) && (grid_w <= MAX_ASTC_HDR_BLOCK_W)); + assert((grid_h >= 2) && (grid_h <= MAX_ASTC_HDR_BLOCK_H)); + + assert(dequant_weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE); + assert(dequant_weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE); + + assert(quant_weight_ise_range >= astc_helpers::FIRST_VALID_WEIGHT_ISE_RANGE); + assert(quant_weight_ise_range <= astc_helpers::LAST_VALID_WEIGHT_ISE_RANGE); + + if ((block_w == grid_w) && (block_h == grid_h)) + { + if (dequant_weight_ise_range != quant_weight_ise_range) + { + basist::astc_6x6_hdr::requantize_astc_weights(block_w * block_h, pSrc_weights, dequant_weight_ise_range, pDst_weights, quant_weight_ise_range); + } + else + { + if (pDst_weights != pSrc_weights) + memcpy(pDst_weights, pSrc_weights, block_w * block_h); + } + + return; + } + + uint8_t desired_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + + const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(dequant_weight_ise_range).m_ISE_to_val; + + for (uint32_t by = 0; by < block_h; by++) + for (uint32_t bx = 0; bx < block_w; bx++) + desired_weights[bx + by * block_w] = dequant_tab[pSrc_weights[bx + by * block_w]]; + + uint8_t downsampled_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + + const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h); + assert(pDownsample_matrix); + + downsample_weight_grid( + pDownsample_matrix, + block_w, block_h, // source/from dimension (block size) + grid_w, grid_h, // dest/to dimension (grid size) + desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] + downsampled_weights); // [wy][wx] + + const auto& weight_quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(quant_weight_ise_range).m_val_to_ise; + + for (uint32_t gy = 0; gy < grid_h; gy++) + for (uint32_t gx = 0; gx < grid_w; gx++) + pDst_weights[gx + gy * grid_w] = weight_quant_tab[downsampled_weights[gx + gy * grid_w]]; 
+} + +void downsample_ise_weights_dual_plane( + uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range, + uint32_t block_w, uint32_t block_h, + uint32_t grid_w, uint32_t grid_h, + const uint8_t* pSrc_weights0, const uint8_t* pSrc_weights1, + uint8_t* pDst_weights) +{ + uint8_t downsampled_weights0[MAX_ASTC_HDR_BLOCK_W * MAX_ASTC_HDR_BLOCK_H], downsampled_weights1[MAX_ASTC_HDR_BLOCK_W * MAX_ASTC_HDR_BLOCK_H]; + + downsample_ise_weights( + dequant_weight_ise_range, quant_weight_ise_range, + block_w, block_h, + grid_w, grid_h, + pSrc_weights0, downsampled_weights0); + + downsample_ise_weights( + dequant_weight_ise_range, quant_weight_ise_range, + block_w, block_h, + grid_w, grid_h, + pSrc_weights1, downsampled_weights1); + + const uint32_t num_grid_samples = grid_w * grid_h; + for (uint32_t i = 0; i < num_grid_samples; i++) + { + pDst_weights[i * 2 + 0] = downsampled_weights0[i]; + pDst_weights[i * 2 + 1] = downsampled_weights1[i]; + } +} + +static bool refine_endpoints_mode11( + uint32_t endpoint_ise_range, + uint8_t* pEndpoint_vals, // the endpoints to optimize + uint32_t block_w, uint32_t block_h, // block dimensions + uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid + uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[], + const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets + astc_hdr_codec_base_options& coptions, + bool direct_only, int first_submode, int last_submode, + opt_mode_t opt_mode) +{ + if (opt_mode == cNoOpt) + return false; + + const uint32_t num_block_pixels = block_w * block_h; + + uint8_t def_pixel_block_ofs[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + if (!pPixel_block_ofs) + { + for (uint32_t i = 0; i < num_block_pixels; i++) + def_pixel_block_ofs[i] = (uint8_t)i; + + pPixel_block_ofs = def_pixel_block_ofs; + } + + const uint32_t num_weights = grid_w * grid_h; + + uint8_t 
dequantized_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + for (uint32_t i = 0; i < num_weights; i++) + dequantized_raw_weights[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[pWeights[i]]; + + uint8_t upsampled_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE + astc_helpers::upsample_weight_grid(block_w, block_h, grid_w, grid_h, dequantized_raw_weights, upsampled_weights); + + aabb3F color_box_q16(cInitExpand); + + uint8_t trial_blk_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE + float trial_blk_raw_weightsf[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + for (uint32_t i = 0; i < num_pixels; i++) + { + color_box_q16.expand(pBlock_pixels_q16[i]); + + assert(pPixel_block_ofs[i] < num_block_pixels); + + trial_blk_raw_weights[i] = upsampled_weights[pPixel_block_ofs[i]]; + trial_blk_raw_weightsf[i] = (float)trial_blk_raw_weights[i] * (1.0f / 64.0f); + } + + vec3F l_q16, h_q16; + if (opt_mode == cOrdinaryLeastSquares) + { + if (!compute_least_squares_endpoints_rgb_raw_weights(num_pixels, trial_blk_raw_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16)) + return false; + } + else if ((opt_mode == cWeightedLeastSquares) || (opt_mode == cWeightedLeastSquaresHeavy)) + { + vec3F block_mean_color_q16(calc_mean(num_pixels, pBlock_pixels_q16)); + vec3F block_axis_q16(calc_rgb_pca(num_pixels, pBlock_pixels_q16, block_mean_color_q16)); + float l = BIG_FLOAT_VAL, h = -BIG_FLOAT_VAL; + for (uint32_t i = 0; i < num_pixels; i++) + { + vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16); + float kd = k.dot(block_axis_q16); + if (kd < l) + l = kd; + if (kd > h) + h = kd; + } + float emphasis_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + if (h == l) + { + for (uint32_t i = 0; i < num_pixels; i++) + emphasis_weights[i] = 1.0f; + } + else + { + float mid = (0.0f - l) / (h - l); + mid = clamp(mid, .01f, .99f); + + float lw = LOW_EMPHASIS_WEIGHT, mw = MIDDLE_EMPHASIS_WEIGHT, hw = HIGH_EMPHASIS_WEIGHT; + if (opt_mode == 
cWeightedLeastSquaresHeavy) + lw = LOW_EMPHASIS_WEIGHT_HEAVY, mw = MIDDLE_EMPHASIS_WEIGHT_HEAVY, hw = HIGH_EMPHASIS_WEIGHT_HEAVY; + + for (uint32_t i = 0; i < num_pixels; i++) + { + vec3F k(vec3F(pBlock_pixels_q16[i]) - block_mean_color_q16); + float kd = k.dot(block_axis_q16); + + assert((kd >= l) && (kd <= h)); + + float v = (kd - l) / (h - l); + + if (v < mid) + v = lerp(lw, mw, v / mid); + else + v = lerp(mw, hw, (v - mid) * (1.0f - mid)); + + emphasis_weights[i] = v; + } + } + + if (!compute_weighted_least_squares_endpoints_rgb(num_pixels, nullptr, nullptr, trial_blk_raw_weightsf, emphasis_weights, &l_q16, &h_q16, pBlock_pixels_q16, color_box_q16)) + return false; + } + else + { + assert(opt_mode == cWeightedAverage); + + l_q16.set(0.0f); + float total_low = 0.0f; + + h_q16.set(0.0f); + float total_high = 0.0f; + + for (uint32_t i = 0; i < num_pixels; i++) + { + vec3F p(pBlock_pixels_q16[i]); + float lerp = (float)trial_blk_raw_weights[i] * (1.0f / 64.0f); + + l_q16 += p * (1.0f - lerp); + total_low += (1.0f - lerp); + + h_q16 += p * lerp; + total_high += lerp; + } + + if (total_low != 0.0f) + l_q16 *= (1.0f / total_low); + else + return false; + + if (total_high != 0.0f) + h_q16 *= (1.0f / total_high); + else + return false; + } + + uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS]; + + uint32_t submode_used; + + bool pack_succeeded = pack_mode11(l_q16, h_q16, endpoint_ise_range, trial_endpoints, coptions, direct_only, first_submode, last_submode, false, submode_used); + if (!pack_succeeded) + return false; + + int cur_e[2][3]; + if (!decode_mode11_to_qlog12(pEndpoint_vals, cur_e, endpoint_ise_range)) + return false; + + int trial_e[2][3]; + if (!decode_mode11_to_qlog12(trial_endpoints, trial_e, endpoint_ise_range)) + return false; + + for (uint32_t i = 0; i < 3; i++) + { + cur_e[0][i] <<= 4; + cur_e[1][i] <<= 4; + + trial_e[0][i] <<= 4; + trial_e[1][i] <<= 4; + } + + const float R_WEIGHT = coptions.m_r_err_scale, G_WEIGHT = coptions.m_g_err_scale; + + double 
cur_error = 0, trial_error = 0; + + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p][0]; + + const double desired_half_r_q = q(pDesired_half[0], coptions.m_q_log_bias), desired_half_g_q = q(pDesired_half[1], coptions.m_q_log_bias), desired_half_b_q = q(pDesired_half[2], coptions.m_q_log_bias); + + const uint32_t c = trial_blk_raw_weights[p]; + assert(c <= 64); + + { + half_float rf, gf, bf; + + { + uint32_t r0 = cur_e[0][0], r1 = cur_e[1][0]; + int ri = (r0 * (64 - c) + r1 * c + 32) / 64; + rf = astc_helpers::qlog16_to_half(ri); + } + + { + uint32_t g0 = cur_e[0][1], g1 = cur_e[1][1]; + int gi = (g0 * (64 - c) + g1 * c + 32) / 64; + gf = astc_helpers::qlog16_to_half(gi); + } + + { + uint32_t b0 = cur_e[0][2], b1 = cur_e[1][2]; + int bi = (b0 * (64 - c) + b1 * c + 32) / 64; + bf = astc_helpers::qlog16_to_half(bi); + } + + const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias); + + const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q; + + cur_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + } + + { + half_float rf, gf, bf; + + { + uint32_t r0 = trial_e[0][0], r1 = trial_e[1][0]; + int ri = (r0 * (64 - c) + r1 * c + 32) / 64; + rf = astc_helpers::qlog16_to_half(ri); + } + + { + uint32_t g0 = trial_e[0][1], g1 = trial_e[1][1]; + int gi = (g0 * (64 - c) + g1 * c + 32) / 64; + gf = astc_helpers::qlog16_to_half(gi); + } + + { + uint32_t b0 = trial_e[0][2], b1 = trial_e[1][2]; + int bi = (b0 * (64 - c) + b1 * c + 32) / 64; + bf = astc_helpers::qlog16_to_half(bi); + } + + const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias); + + const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, 
bd = decoded_half_q2 - desired_half_b_q; + + trial_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + } + + } // p + + if (trial_error < cur_error) + { + memcpy(pEndpoint_vals, trial_endpoints, NUM_MODE11_ENDPOINTS); + return true; + } + + return false; +} + +static bool refine_endpoints_mode7( + uint32_t endpoint_ise_range, + uint8_t* pEndpoint_vals, // the endpoints to optimize + uint32_t block_w, uint32_t block_h, // block dimensions + uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid + uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[], + const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets + astc_hdr_codec_base_options& coptions, + int first_submode, int last_submode) +{ + const uint32_t num_block_pixels = block_w * block_h; + + uint8_t def_pixel_block_ofs[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + if (!pPixel_block_ofs) + { + for (uint32_t i = 0; i < num_block_pixels; i++) + def_pixel_block_ofs[i] = (uint8_t)i; + + pPixel_block_ofs = def_pixel_block_ofs; + } + + const uint32_t num_weights = grid_w * grid_h; + + uint8_t dequantized_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; + for (uint32_t i = 0; i < num_weights; i++) + dequantized_raw_weights[i] = astc_helpers::g_dequant_tables.get_weight_tab(weight_ise_range).m_ISE_to_val[pWeights[i]]; + + uint8_t upsampled_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE + astc_helpers::upsample_weight_grid(block_w, block_h, grid_w, grid_h, dequantized_raw_weights, upsampled_weights); + + uint8_t trial_blk_raw_weights[MAX_ASTC_HDR_ENC_BLOCK_PIXELS]; // raw weights, NOT ISE + for (uint32_t i = 0; i < num_pixels; i++) + { + assert(pPixel_block_ofs[i] < num_block_pixels); + + trial_blk_raw_weights[i] = upsampled_weights[pPixel_block_ofs[i]]; + } + + //-- + + int cur_e[2][3]; + int cur_s = 0; + if (!decode_mode7_to_qlog12(pEndpoint_vals, cur_e, &cur_s, endpoint_ise_range)) + 
return false; + + cur_s <<= 4; + + vec3F block_mean_color_q16(calc_mean(num_pixels, pBlock_pixels_q16)); + + vec3F new_high_color_q16(block_mean_color_q16); + + const float one_over_num_pixels = 1.0f / (float)num_pixels; + + for (uint32_t i = 0; i < num_pixels; i++) + { + float lerp = trial_blk_raw_weights[i] * (1.0f / 64.0f); + + float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels; + new_high_color_q16[0] += k; + new_high_color_q16[1] += k; + new_high_color_q16[2] += k; + } + + // Given a set of selectors and a high color, try to compute a better S. + float t = 0.0f; + + for (uint32_t i = 0; i < num_pixels; i++) + { + float lerp = trial_blk_raw_weights[i] * (1.0f / 64.0f); + + t += (1.0f) - lerp; + } + + t *= one_over_num_pixels; + + if (fabs(t) < .0000125f) + return false; + + uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS]; + + uint32_t submode_used; + if (!pack_mode7(new_high_color_q16, (float)cur_s, endpoint_ise_range, trial_endpoints, weight_ise_range, coptions, first_submode, last_submode, false, submode_used)) + return false; + + int trial_e[2][3]; + if (!decode_mode7_to_qlog12(trial_endpoints, trial_e, nullptr, endpoint_ise_range)) + return false; + + vec3F cur_h_q16((float)(trial_e[1][0] << 4), (float)(trial_e[1][1] << 4), (float)(trial_e[1][2] << 4)); + + float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t; + //float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t; + //float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t; + float new_s_q16 = ceilf(s_r); + + if (!pack_mode7(new_high_color_q16, new_s_q16, endpoint_ise_range, trial_endpoints, weight_ise_range, coptions, first_submode, last_submode, false, submode_used)) + return false; + + if (!decode_mode7_to_qlog12(trial_endpoints, trial_e, nullptr, endpoint_ise_range)) + return false; + + // -- + + for (uint32_t i = 0; i < 3; i++) + { + cur_e[0][i] <<= 4; + cur_e[1][i] <<= 4; + + trial_e[0][i] <<= 4; + trial_e[1][i] <<= 4; + } + + const float R_WEIGHT = coptions.m_r_err_scale, G_WEIGHT 
= coptions.m_g_err_scale; + + double cur_error = 0, trial_error = 0; + + for (uint32_t p = 0; p < num_pixels; p++) + { + const half_float* pDesired_half = &pBlock_pixels_half[p][0]; + + const double desired_half_r_q = q(pDesired_half[0], coptions.m_q_log_bias), desired_half_g_q = q(pDesired_half[1], coptions.m_q_log_bias), desired_half_b_q = q(pDesired_half[2], coptions.m_q_log_bias); + + const uint32_t c = trial_blk_raw_weights[p]; + assert(c <= 64); + + { + half_float rf, gf, bf; + + { + uint32_t r0 = cur_e[0][0], r1 = cur_e[1][0]; + int ri = (r0 * (64 - c) + r1 * c + 32) / 64; + rf = astc_helpers::qlog16_to_half(ri); + } + + { + uint32_t g0 = cur_e[0][1], g1 = cur_e[1][1]; + int gi = (g0 * (64 - c) + g1 * c + 32) / 64; + gf = astc_helpers::qlog16_to_half(gi); + } + + { + uint32_t b0 = cur_e[0][2], b1 = cur_e[1][2]; + int bi = (b0 * (64 - c) + b1 * c + 32) / 64; + bf = astc_helpers::qlog16_to_half(bi); + } + + const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias); + + const double rd = decoded_half_q0 - desired_half_r_q, gd = decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q; + + cur_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + } + + { + half_float rf, gf, bf; + + { + uint32_t r0 = trial_e[0][0], r1 = trial_e[1][0]; + int ri = (r0 * (64 - c) + r1 * c + 32) / 64; + rf = astc_helpers::qlog16_to_half(ri); + } + + { + uint32_t g0 = trial_e[0][1], g1 = trial_e[1][1]; + int gi = (g0 * (64 - c) + g1 * c + 32) / 64; + gf = astc_helpers::qlog16_to_half(gi); + } + + { + uint32_t b0 = trial_e[0][2], b1 = trial_e[1][2]; + int bi = (b0 * (64 - c) + b1 * c + 32) / 64; + bf = astc_helpers::qlog16_to_half(bi); + } + + const double decoded_half_q0 = q(rf, coptions.m_q_log_bias), decoded_half_q1 = q(gf, coptions.m_q_log_bias), decoded_half_q2 = q(bf, coptions.m_q_log_bias); + + const double rd = decoded_half_q0 - desired_half_r_q, gd 
= decoded_half_q1 - desired_half_g_q, bd = decoded_half_q2 - desired_half_b_q; + + trial_error += R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + } + + } // p + + if (trial_error < cur_error) + { + memcpy(pEndpoint_vals, trial_endpoints, NUM_MODE7_ENDPOINTS); + return true; + } + + return false; +} + +bool refine_endpoints( + uint32_t cem, + uint32_t endpoint_ise_range, + uint8_t* pEndpoint_vals, // the endpoints to optimize + uint32_t block_w, uint32_t block_h, // block dimensions + uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid + uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[], + const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets + astc_hdr_codec_base_options& coptions, opt_mode_t opt_mode) +{ + if (cem == 7) + { + return refine_endpoints_mode7( + endpoint_ise_range, + pEndpoint_vals, + block_w, block_h, + grid_w, grid_h, pWeights, weight_ise_range, + num_pixels, pBlock_pixels_half, pBlock_pixels_q16, + pPixel_block_ofs, + coptions, + FIRST_MODE7_SUBMODE_INDEX, MAX_MODE7_SUBMODE_INDEX); + } + else if (cem == 11) + { + return refine_endpoints_mode11( + endpoint_ise_range, + pEndpoint_vals, + block_w, block_h, + grid_w, grid_h, pWeights, weight_ise_range, + num_pixels, pBlock_pixels_half, pBlock_pixels_q16, + pPixel_block_ofs, + coptions, + false, FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, opt_mode); + } + + return false; +} + +} // namespace basisu + diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_common.h b/thirdparty/basis_universal/encoder/basisu_astc_hdr_common.h new file mode 100644 index 000000000000..55be403fa31f --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_common.h @@ -0,0 +1,423 @@ +// File: basisu_astc_hdr_common.h +#pragma once +#include "basisu_enc.h" +#include "basisu_gpu_texture.h" +#include "../transcoder/basisu_astc_helpers.h" +#include 
"../transcoder/basisu_astc_hdr_core.h" + +namespace basisu +{ + const uint32_t MAX_ASTC_HDR_BLOCK_W = 6, MAX_ASTC_HDR_BLOCK_H = 6; + const uint32_t MAX_ASTC_HDR_ENC_BLOCK_PIXELS = 6 * 6; + + const uint32_t MODE11_TOTAL_SUBMODES = 8; // plus an extra hidden submode, directly encoded, for direct, so really 9 (see tables 99/100 of the ASTC spec) + const uint32_t MODE7_TOTAL_SUBMODES = 6; + + // [ise_range][0] = # levels + // [ise_range][1...] = lerp value [0,64] + // in ASTC order + // Supported ISE weight ranges: 0 to 11, 12 total + const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = astc_helpers::BISE_2_LEVELS; // ISE 0=2 levels + const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = astc_helpers::BISE_32_LEVELS; // ISE 11=16 levels + const uint32_t MIN_SUPPORTED_WEIGHT_LEVELS = 2; + const uint32_t MAX_SUPPORTED_WEIGHT_LEVELS = 32; + + extern const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][33]; + + const float Q_LOG_BIAS_4x4 = .125f; // the original UASTC HDR 4x4 log bias + const float Q_LOG_BIAS_6x6 = 1.0f; // the log bias both encoders use now + + const float LDR_TO_HDR_NITS = 100.0f; + + struct astc_hdr_codec_base_options + { + float m_r_err_scale, m_g_err_scale; + float m_q_log_bias; + + bool m_ultra_quant; + + // If true, the ASTC HDR compressor is allowed to more aggressively vary weight indices for slightly higher compression in non-fastest mode. This will hurt BC6H quality, however. 
+ bool m_allow_uber_mode; + + bool m_mode7_full_s_optimization; + + bool m_take_first_non_clamping_mode11_submode; + bool m_take_first_non_clamping_mode7_submode; + + bool m_disable_weight_plane_optimization; + + astc_hdr_codec_base_options() { init(); } + + void init(); + }; + + inline int get_bit( + int src_val, int src_bit) + { + assert(src_bit >= 0 && src_bit <= 31); + int bit = (src_val >> src_bit) & 1; + return bit; + } + + inline void pack_bit( + int& dst, int dst_bit, + int src_val, int src_bit = 0) + { + assert(dst_bit >= 0 && dst_bit <= 31); + int bit = get_bit(src_val, src_bit); + dst |= (bit << dst_bit); + } + + inline uint32_t get_max_qlog(uint32_t bits) + { + switch (bits) + { + case 7: return basist::MAX_QLOG7; + case 8: return basist::MAX_QLOG8; + case 9: return basist::MAX_QLOG9; + case 10: return basist::MAX_QLOG10; + case 11: return basist::MAX_QLOG11; + case 12: return basist::MAX_QLOG12; + case 16: return basist::MAX_QLOG16; + default: assert(0); break; + } + return 0; + } + +#if 0 + inline float get_max_qlog_val(uint32_t bits) + { + switch (bits) + { + case 7: return MAX_QLOG7_VAL; + case 8: return MAX_QLOG8_VAL; + case 9: return MAX_QLOG9_VAL; + case 10: return MAX_QLOG10_VAL; + case 11: return MAX_QLOG11_VAL; + case 12: return MAX_QLOG12_VAL; + case 16: return MAX_QLOG16_VAL; + default: assert(0); break; + } + return 0; + } +#endif + +#if 0 + // Input is the low 11 bits of the qlog + // Returns the 10-bit mantissa of the half float value + int qlog11_to_half_float_mantissa(int M) + { + assert(M <= 0x7FF); + int Mt; + if (M < 512) + Mt = 3 * M; + else if (M >= 1536) + Mt = 5 * M - 2048; + else + Mt = 4 * M - 512; + return (Mt >> 3); + } +#endif + + // Input is the 10-bit mantissa of the half float value + // Output is the 11-bit qlog value + // Inverse of qlog11_to_half_float_mantissa() + inline int half_float_mantissa_to_qlog11(int hf) + { + int q0 = (hf * 8 + 2) / 3; + int q1 = (hf * 8 + 2048 + 4) / 5; + + if (q0 < 512) + return q0; + else 
if (q1 >= 1536) + return q1; + + int q2 = (hf * 8 + 512 + 2) / 4; + return q2; + } + + inline int half_to_qlog16(int hf) + { + assert(!basist::half_is_signed((basist::half_float)hf) && !basist::is_half_inf_or_nan((basist::half_float)hf)); + + // extract 5 bits exponent, which is carried through to qlog16 unchanged + const int exp = (hf >> 10) & 0x1F; + + // extract and invert the 10 bit mantissa to nearest qlog11 (should be lossless) + const int mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF); + assert(mantissa <= 0x7FF); + + // Now combine to qlog16, which is what ASTC HDR interpolates using the [0-64] weights. + uint32_t qlog16 = (exp << 11) | mantissa; + + // should be a lossless operation + assert(astc_helpers::qlog16_to_half(qlog16) == hf); + + return qlog16; + } + + void interpolate_qlog12_colors( + const int e[2][3], + basist::half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range); + + bool get_astc_hdr_mode_11_block_colors( + const uint8_t* pEndpoints, + basist::half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range); + + bool get_astc_hdr_mode_7_block_colors( + const uint8_t* pEndpoints, + basist::half_float* pDecoded_half, + vec3F* pDecoded_float, + uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range); + + // Fast high precision piecewise linear approximation of log2(bias+x). + // Half may be zero, positive or denormal. No NaN/Inf/negative. 
+	// Returns the raw IEEE-754 bit pattern of (float(x) + log_bias). The bit pattern of a positive float
+	// is a piecewise linear approximation of its log2, which is what the error metrics compare.
+	BASISU_FORCE_INLINE uint32_t q2(basist::half_float x, float log_bias)
+	{
+		union { float f; int32_t i; uint32_t u; } fi;
+
+		fi.f = fast_half_to_float_pos_not_inf_or_nan(x);
+
+		assert(fi.f >= 0.0f);
+
+		fi.f += log_bias;
+
+		return fi.u;
+	}
+
+	// Same value as q2(), widened to double.
+	BASISU_FORCE_INLINE double q(basist::half_float x, float log_bias)
+	{
+		return (double)q2(x, log_bias); // approx log2f(fi.f), need to return double for the precision
+	}
+
+	double eval_selectors(
+		uint32_t num_pixels,
+		uint8_t* pWeights,
+		uint32_t ise_weight_range,
+		const basist::half_float* pBlock_pixels_half,
+		uint32_t num_weight_levels,
+		const basist::half_float* pDecoded_half,
+		const astc_hdr_codec_base_options& coptions,
+		uint32_t usable_selector_bitmask = UINT32_MAX);
+
+	double eval_selectors_dual_plane(
+		uint32_t channel_index,
+		uint32_t num_pixels,
+		uint8_t* pWeights0, uint8_t* pWeights1,
+		const basist::half_float* pBlock_pixels_half,
+		uint32_t num_weight_levels,
+		const basist::half_float* pDecoded_half,
+		const astc_hdr_codec_base_options& coptions,
+		uint32_t usable_selector_bitmask = UINT32_MAX);
+
+	double compute_block_error(uint32_t num_pixels, const basist::half_float* pOrig_block, const basist::half_float* pPacked_block, const astc_hdr_codec_base_options& coptions);
+
+	const uint32_t FIRST_MODE7_SUBMODE_INDEX = 0;
+	const uint32_t MAX_MODE7_SUBMODE_INDEX = 5;
+
+	bool pack_mode7(
+		const vec3F& high_color_q16, const float s_q16,
+		uint32_t ise_endpoint_range, uint8_t* pEndpoints,
+		uint32_t ise_weight_range, // only used for determining biasing during CEM 7 packing
+		const astc_hdr_codec_base_options& coptions,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used);
+
+	bool try_mode7(
+		uint32_t num_pixels,
+		uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+		const vec3F& high_color_q16, const float s_q16,
+		const basist::half_float block_pixels_half[][3],
+		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions,
+		uint32_t ise_endpoint_range,
+		int32_t first_submode = 0, int32_t last_submode = MAX_MODE7_SUBMODE_INDEX);
+
+	bool pack_mode11(
+		const vec3F& low_color_q16, const vec3F& high_color_q16,
+		uint32_t ise_endpoint_range, uint8_t* pEndpoints,
+		const astc_hdr_codec_base_options& coptions,
+		bool direct_only, int32_t first_submode, int32_t last_submode, bool ignore_clamping, uint32_t& submode_used);
+
+	bool try_mode11(uint32_t num_pixels,
+		uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used,
+		const vec3F& low_color_q16, const vec3F& high_color_q16,
+		const basist::half_float block_pixels_half[][3],
+		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
+		bool constrain_ise_weight_selectors,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping);
+
+	bool try_mode11_dual_plane(uint32_t channel_index, uint32_t num_pixels,
+		uint8_t* pEndpoints, uint8_t* pWeights0, uint8_t* pWeights1, double& cur_block_error, uint32_t& submode_used,
+		const vec3F& low_color_q16, const vec3F& high_color_q16,
+		const basist::half_float block_pixels_half[][3],
+		uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_base_options& coptions, bool direct_only, uint32_t ise_endpoint_range,
+		bool constrain_ise_weight_selectors,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping);
+
+	const int FIRST_MODE11_SUBMODE_INDEX = -1;
+	const int MAX_MODE11_SUBMODE_INDEX = 7;
+
+	// Selects how refine_endpoints() (CEM 11) and the mode 11 encoders solve for new endpoints.
+	enum opt_mode_t
+	{
+		cNoOpt,
+		cOrdinaryLeastSquares,
+		cWeightedLeastSquares,
+		cWeightedLeastSquaresHeavy,
+		cWeightedAverage
+	};
+
+	struct encode_astc_block_stats
+	{
+		uint32_t m_num_pixels;
+		vec3F m_mean_q16;
+		vec3F m_axis_q16;
+
+		void init(uint32_t num_pixels, const vec4F pBlock_pixels_q16[]);
+	};
+
+	double encode_astc_hdr_block_mode_11(
+		uint32_t num_pixels,
+		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		uint32_t ise_weight_range,
+		uint32_t& best_submode,
+		double cur_block_error,
+		uint8_t* blk_endpoints, uint8_t* blk_weights,
+		const astc_hdr_codec_base_options& coptions,
+		bool direct_only,
+		uint32_t ise_endpoint_range,
+		bool uber_mode,
+		bool constrain_ise_weight_selectors,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping,
+		opt_mode_t opt_mode,
+		const encode_astc_block_stats *pBlock_stats = nullptr);
+
+	double encode_astc_hdr_block_downsampled_mode_11(
+		uint32_t block_x, uint32_t block_y, uint32_t grid_x, uint32_t grid_y,
+		uint32_t ise_weight_range, uint32_t ise_endpoint_range,
+		uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		double cur_block_error,
+		int32_t first_submode, int32_t last_submode, bool ignore_clamping, opt_mode_t opt_mode,
+		uint8_t* pBlk_endpoints, uint8_t* pBlk_weights, uint32_t& best_submode,
+		const astc_hdr_codec_base_options& coptions,
+		const encode_astc_block_stats* pBlock_stats = nullptr);
+
+	double encode_astc_hdr_block_mode_11_dual_plane(
+		uint32_t num_pixels,
+		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		uint32_t channel_index, // 0-2
+		uint32_t ise_weight_range,
+		uint32_t& best_submode,
+		double cur_block_error,
+		uint8_t* blk_endpoints, uint8_t* blk_weights0, uint8_t* blk_weights1,
+		const astc_hdr_codec_base_options& coptions,
+		bool direct_only,
+		uint32_t ise_endpoint_range,
+		bool uber_mode,
+		bool constrain_ise_weight_selectors,
+		int32_t first_submode, int32_t last_submode,
+		bool ignore_clamping);
+
+	double encode_astc_hdr_block_mode_7(
+		uint32_t num_pixels,
+		const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		uint32_t ise_weight_range,
+		uint32_t& best_submode,
+		double cur_block_error,
+		uint8_t* blk_endpoints, //[4]
+		uint8_t* blk_weights, // [num_pixels]
+		const astc_hdr_codec_base_options& coptions,
+		uint32_t ise_endpoint_range,
+		int first_submode = 0, int last_submode = MAX_MODE7_SUBMODE_INDEX,
+		const encode_astc_block_stats *pBlock_stats = nullptr);
+
+	//--------------------------------------------------------------------------------------------------------------------------
+
+	struct mode11_log_desc
+	{
+		int32_t m_submode;
+		int32_t m_maj_comp;
+
+		// Or R0, G0, B0 if maj_comp==3 (direct)
+		int32_t m_a; // positive
+		int32_t m_c; // positive
+		int32_t m_b0; // positive
+
+		// Or R1, G1, B1 if maj_comp==3 (direct)
+		int32_t m_b1; // positive
+		int32_t m_d0; // if not direct, is signed
+		int32_t m_d1; // if not direct, is signed
+
+		// limits if not direct
+		int32_t m_a_bits, m_c_bits, m_b_bits, m_d_bits;
+		int32_t m_max_a_val, m_max_c_val, m_max_b_val, m_min_d_val, m_max_d_val;
+
+		void clear() { clear_obj(*this); }
+
+		bool is_direct() const { return m_maj_comp == 3; }
+	};
+
+	//--------------------------------------------------------------------------------------------------------------------------
+	bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range, bool early_out_if_clamped, int max_clamp_mag_accept_thresh);
+
+	bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, int val_q[2][3], int& max_clamp_mag, bool early_out_if_clamped = false, int max_clamp_mag_accept_thresh = 0);
+	bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag, bool early_out_if_clamped = false, int max_clamp_mag_accept_thresh = 0);
+	void pack_astc_mode11_direct(uint8_t* pEndpoints, vec3F l_q16, vec3F h_q16);
+
+	bool pack_mode11(mode11_log_desc& desc, uint8_t* pEndpoints);
+	void unpack_mode11(const uint8_t* pEndpoints, mode11_log_desc& desc);
+
+	void decode_cem_11_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index);
+	void decode_cem_7_config(const uint8_t* pEndpoints, int& submode_index, int& maj_index);
+
+	void dequantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_raw_weights);
+
+	const float* get_6x6_downsample_matrix(uint32_t grid_width, uint32_t grid_height);
+
+	void downsample_weight_grid(
+		const float* pMatrix_weights,
+		uint32_t bx, uint32_t by, // source/from dimension (block size)
+		uint32_t wx, uint32_t wy, // dest/to dimension (grid size)
+		const uint8_t* pSrc_weights, // these are dequantized weights, NOT ISE symbols, [by][bx]
+		uint8_t* pDst_weights); // [wy][wx]
+
+	// dequant_weight_ise_range is the ISE range of pSrc_weights; quant_weight_ise_range is the ISE range
+	// written to pDst_weights. (Parameter names match the definition and the dual plane variant.)
+	void downsample_ise_weights(
+		uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range,
+		uint32_t block_w, uint32_t block_h,
+		uint32_t grid_w, uint32_t grid_h,
+		const uint8_t* pSrc_weights, uint8_t* pDst_weights);
+
+	void downsample_ise_weights_dual_plane(
+		uint32_t dequant_weight_ise_range, uint32_t quant_weight_ise_range,
+		uint32_t block_w, uint32_t block_h,
+		uint32_t grid_w, uint32_t grid_h,
+		const uint8_t* pSrc_weights0, const uint8_t* pSrc_weights1,
+		uint8_t* pDst_weights);
+
+	bool refine_endpoints(
+		uint32_t cem,
+		uint32_t endpoint_ise_range,
+		uint8_t* pEndpoint_vals, // the endpoints to optimize
+		uint32_t block_w, uint32_t block_h, // block dimensions
+		uint32_t grid_w, uint32_t grid_h, const uint8_t* pWeights, uint32_t weight_ise_range, // weight grid
+		uint32_t num_pixels, const basist::half_float pBlock_pixels_half[][3], const vec4F pBlock_pixels_q16[],
+		const uint8_t* pPixel_block_ofs, // maps this subset's pixels to block offsets
+		astc_hdr_codec_base_options& coptions, opt_mode_t opt_mode);
+
+	extern bool g_astc_hdr_enc_initialized;
+
+	// This MUST be called before encoding any blocks.
+ void astc_hdr_enc_init(); + +} // namespace basisu + diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp b/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp deleted file mode 100644 index d698a7ff872b..000000000000 --- a/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.cpp +++ /dev/null @@ -1,3310 +0,0 @@ -// basisu_astc_hdr_enc.cpp -#include "basisu_astc_hdr_enc.h" -#include "../transcoder/basisu_transcoder.h" - -using namespace basist; - -namespace basisu -{ - -const float DEF_R_ERROR_SCALE = 2.0f; -const float DEF_G_ERROR_SCALE = 3.0f; - -static inline uint32_t get_max_qlog(uint32_t bits) -{ - switch (bits) - { - case 7: return MAX_QLOG7; - case 8: return MAX_QLOG8; - case 9: return MAX_QLOG9; - case 10: return MAX_QLOG10; - case 11: return MAX_QLOG11; - case 12: return MAX_QLOG12; - case 16: return MAX_QLOG16; - default: assert(0); break; - } - return 0; -} - -#if 0 -static inline float get_max_qlog_val(uint32_t bits) -{ - switch (bits) - { - case 7: return MAX_QLOG7_VAL; - case 8: return MAX_QLOG8_VAL; - case 9: return MAX_QLOG9_VAL; - case 10: return MAX_QLOG10_VAL; - case 11: return MAX_QLOG11_VAL; - case 12: return MAX_QLOG12_VAL; - case 16: return MAX_QLOG16_VAL; - default: assert(0); break; - } - return 0; -} -#endif - -static inline int get_bit( - int src_val, int src_bit) -{ - assert(src_bit >= 0 && src_bit <= 31); - int bit = (src_val >> src_bit) & 1; - return bit; -} - -static inline void pack_bit( - int& dst, int dst_bit, - int src_val, int src_bit = 0) -{ - assert(dst_bit >= 0 && dst_bit <= 31); - int bit = get_bit(src_val, src_bit); - dst |= (bit << dst_bit); -} - -//-------------------------------------------------------------------------------------------------------------------------- - -astc_hdr_codec_options::astc_hdr_codec_options() -{ - init(); -} - -void astc_hdr_codec_options::init() -{ - m_bc6h_err_weight = .85f; - m_r_err_scale = DEF_R_ERROR_SCALE; - m_g_err_scale = DEF_G_ERROR_SCALE; - - // Disabling 
by default to avoid transcoding outliers (try kodim26). The quality lost is very low. TODO: Could include the uber result in the output. - m_allow_uber_mode = false; - - // Must set best quality level first to set defaults. - set_quality_best(); - - set_quality_level(cDefaultLevel); -} - -void astc_hdr_codec_options::set_quality_best() -{ - m_mode11_direct_only = false; - - // highest achievable quality - m_use_solid = true; - - m_use_mode11 = true; - m_mode11_uber_mode = true; - m_first_mode11_weight_ise_range = MODE11_FIRST_ISE_RANGE; - m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; - m_first_mode11_submode = -1; - m_last_mode11_submode = 7; - - m_use_mode7_part1 = true; - m_first_mode7_part1_weight_ise_range = MODE7_PART1_FIRST_ISE_RANGE; - m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE; - - m_use_mode7_part2 = true; - m_mode7_part2_part_masks = UINT32_MAX; - m_first_mode7_part2_weight_ise_range = MODE7_PART2_FIRST_ISE_RANGE; - m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE; - - m_use_mode11_part2 = true; - m_mode11_part2_part_masks = UINT32_MAX; - m_first_mode11_part2_weight_ise_range = MODE11_PART2_FIRST_ISE_RANGE; - m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE; - - m_refine_weights = true; - - m_use_estimated_partitions = false; - m_max_estimated_partitions = 0; -} - -void astc_hdr_codec_options::set_quality_normal() -{ - m_use_solid = true; - - // We'll allow uber mode in normal if the user allows it. 
- m_use_mode11 = true; - m_mode11_uber_mode = true; - m_first_mode11_weight_ise_range = 6; - m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; - - m_use_mode7_part1 = true; - m_first_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE; - m_last_mode7_part1_weight_ise_range = MODE7_PART1_LAST_ISE_RANGE; - - m_use_mode7_part2 = true; - m_mode7_part2_part_masks = UINT32_MAX; - m_first_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE; - m_last_mode7_part2_weight_ise_range = MODE7_PART2_LAST_ISE_RANGE; - - m_use_mode11_part2 = true; - m_mode11_part2_part_masks = UINT32_MAX; - m_first_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE; - m_last_mode11_part2_weight_ise_range = MODE11_PART2_LAST_ISE_RANGE; - - m_refine_weights = true; -} - -void astc_hdr_codec_options::set_quality_fastest() -{ - m_use_solid = true; - - m_use_mode11 = true; - m_mode11_uber_mode = false; - m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; - m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; - - m_use_mode7_part1 = false; - m_use_mode7_part2 = false; - m_use_mode11_part2 = false; - - m_refine_weights = false; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -void astc_hdr_codec_options::set_quality_level(int level) -{ - level = clamp(level, cMinLevel, cMaxLevel); - - m_level = level; - - switch (level) - { - case 0: - { - set_quality_fastest(); - break; - } - case 1: - { - set_quality_normal(); - - m_first_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE - 1; - m_last_mode11_weight_ise_range = MODE11_LAST_ISE_RANGE; - - m_use_mode7_part1 = false; - m_use_mode7_part2 = false; - - m_use_estimated_partitions = true; - m_max_estimated_partitions = 1; - - m_mode11_part2_part_masks = 1 | 2; - m_mode7_part2_part_masks = 1 | 2; - break; - } - case 2: - { - set_quality_normal(); - - m_use_estimated_partitions = true; - m_max_estimated_partitions = 2; - - 
m_mode11_part2_part_masks = 1 | 2; - m_mode7_part2_part_masks = 1 | 2; - - break; - } - case 3: - { - set_quality_best(); - - m_use_estimated_partitions = true; - m_max_estimated_partitions = 2; - - m_mode11_part2_part_masks = 1 | 2 | 4 | 8; - m_mode7_part2_part_masks = 1 | 2 | 4 | 8; - - break; - } - case 4: - { - set_quality_best(); - - break; - } - } -} - -//-------------------------------------------------------------------------------------------------------------------------- - -#if 0 -static inline half_float qlog12_to_half_slow(uint32_t qlog12) -{ - return qlog_to_half_slow(qlog12, 12); -} -#endif - -// max usable qlog8 value is 247, 248=inf, >=249 is nan -// max usable qlog7 value is 123, 124=inf, >=125 is nan - -// To go from a smaller qlog to an larger one, shift left by X bits. - -//const uint32_t TOTAL_USABLE_QLOG8 = 248; // 0-247 are usable, 0=0, 247=60416.0, 246=55296.0 - -// for qlog7's shift left by 1 -//half_float g_qlog8_to_half[256]; -//float g_qlog8_to_float[256]; - -//half_float g_qlog12_to_half[4096]; -//float g_qlog12_to_float[4096]; - -static half_float g_qlog16_to_half[65536]; - -inline half_float qlog_to_half(uint32_t val, uint32_t bits) -{ - assert((bits >= 5) && (bits <= 16)); - assert(val < (1U << bits)); - return g_qlog16_to_half[val << (16 - bits)]; -} - -// nearest values given a positive half float value (only) -static uint16_t g_half_to_qlog7[32768], g_half_to_qlog8[32768], g_half_to_qlog9[32768], g_half_to_qlog10[32768], g_half_to_qlog11[32768], g_half_to_qlog12[32768]; - -const uint32_t HALF_TO_QLOG_TABS_BASE = 7; -static uint16_t* g_pHalf_to_qlog_tabs[8] = -{ - g_half_to_qlog7, - g_half_to_qlog8, - - g_half_to_qlog9, - g_half_to_qlog10, - - g_half_to_qlog11, - g_half_to_qlog12 -}; - -static inline uint32_t half_to_qlog7_12(half_float h, uint32_t bits) -{ - assert((bits >= HALF_TO_QLOG_TABS_BASE) && (bits <= 12)); - assert(h < 32768); - - return g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE][h]; -} - -#if 0 -// Input is the 
low 11 bits of the qlog -// Returns the 10-bit mantissa of the half float value -static int qlog11_to_half_float_mantissa(int M) -{ - assert(M <= 0x7FF); - int Mt; - if (M < 512) - Mt = 3 * M; - else if (M >= 1536) - Mt = 5 * M - 2048; - else - Mt = 4 * M - 512; - return (Mt >> 3); -} -#endif - -// Input is the 10-bit mantissa of the half float value -// Output is the 11-bit qlog value -// Inverse of qlog11_to_half_float_mantissa() -static inline int half_float_mantissa_to_qlog11(int hf) -{ - int q0 = (hf * 8 + 2) / 3; - int q1 = (hf * 8 + 2048 + 4) / 5; - - if (q0 < 512) - return q0; - else if (q1 >= 1536) - return q1; - - int q2 = (hf * 8 + 512 + 2) / 4; - return q2; -} - -static inline int half_to_qlog16(int hf) -{ - // extract 5 bits exponent, which is carried through to qlog16 unchanged - const int exp = (hf >> 10) & 0x1F; - - // extract and invert the 10 bit mantissa to nearest qlog11 (should be lossless) - const int mantissa = half_float_mantissa_to_qlog11(hf & 0x3FF); - assert(mantissa <= 0x7FF); - - // Now combine to qlog16, which is what ASTC HDR interpolates using the [0-64] weights. 
- uint32_t qlog16 = (exp << 11) | mantissa; - - // should be a lossless operation - assert(qlog16_to_half_slow(qlog16) == hf); - - return qlog16; -} - -static inline uint32_t quant_qlog16(uint32_t q16, uint32_t desired_bits) -{ - assert((desired_bits >= 7) && (desired_bits <= 12)); - assert(q16 <= 65535); - - const uint32_t shift = 16 - desired_bits; - uint32_t e = (q16 + (1U << (shift - 1U)) - 1U) >> shift; - - uint32_t max_val = (1U << desired_bits) - 1U; - e = minimum(e, max_val); - - return e; -} - -static void compute_half_to_qlog_table(uint32_t bits, uint16_t* pTable, const basisu::vector &qlog16_to_float) -{ - assert(bits >= 5 && bits <= 12); - const uint32_t max_val = (1 << bits) - 1; - - // For all positive half-floats - for (uint32_t h = 0; h < 32768; h++) - { - // Skip invalid values - if (is_half_inf_or_nan((half_float)h)) - continue; - const float desired_val = half_to_float((half_float)h); - - float best_err = 1e+30f; - uint32_t best_qlog = 0; - - // For all possible qlog's - for (uint32_t i = 0; i <= max_val; i++) - { - // Skip invalid values - float v = qlog16_to_float[i << (16 - bits)]; - if (std::isnan(v)) - continue; - - // Compute error - float err = fabs(v - desired_val); - - // Find best - if (err < best_err) - { - best_err = err; - best_qlog = i; - } - } - - pTable[h] = (uint16_t)best_qlog; - } - -#if 0 - uint32_t t = 0; - - const uint32_t nb = 12; - int nb_shift = 16 - nb; - - for (uint32_t q16 = 0; q16 < 65536; q16++) - { - half_float h = qlog16_to_half_slow(q16); - if (is_half_inf_or_nan(h)) - continue; - - int q7 = half_to_qlog7_12(h, nb); - - uint32_t best_err = UINT32_MAX, best_l = 0; - for (int l = 0; l < (1 << nb); l++) - { - int dec_q16 = l << nb_shift; - int err = iabs(dec_q16 - q16); - if (err < best_err) - { - best_err = err; - best_l = l; - } - } - - //int e = (q16 + 253) >> 9; // 345 - - int e = (q16 + (1 << (nb_shift - 1)) - 1) >> nb_shift; // 285 - if (best_l != e) - //if (q7 != best_l) - { - printf("q16=%u, h=%u, q7=%u, e=%u, 
best_l=%u\n", q16, h, q7, e, best_l); - t++; - } - } - - printf("Mismatches: %u\n", t); - exit(0); -#endif -} - -static void init_qlog_tables() -{ - basisu::vector qlog16_to_float(65536); - - // for all possible qlog16, compute the corresponding half float - for (uint32_t i = 0; i <= 65535; i++) - { - half_float h = qlog16_to_half_slow(i); - g_qlog16_to_half[i] = h; - - qlog16_to_float[i] = half_to_float(h); - } - - // for all possible half floats, find the nearest qlog5-12 float - for (uint32_t bits = HALF_TO_QLOG_TABS_BASE; bits <= 12; bits++) - { - compute_half_to_qlog_table(bits, g_pHalf_to_qlog_tabs[bits - HALF_TO_QLOG_TABS_BASE], qlog16_to_float); - } -} - -// [ise_range][0] = # levels -// [ise_range][1...] = lerp value [0,64] -// in ASTC order -// Supported ISE weight ranges: 0 to 10, 11 total -const uint32_t MIN_SUPPORTED_ISE_WEIGHT_INDEX = 1; // ISE 1=3 levels -const uint32_t MAX_SUPPORTED_ISE_WEIGHT_INDEX = 10; // ISE 10=24 levels - -static const uint8_t g_ise_weight_lerps[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][32] = -{ - { 0 }, // ise range=0 is invalid for 4x4 block sizes (<24 weight bits in the block) - { 3, 0, 32, 64 }, // 1 - { 4, 0, 21, 43, 64 }, // 2 - { 5, 0, 16, 32, 48, 64 }, // 3 - { 6, 0, 64, 12, 52, 25, 39 }, // 4 - { 8, 0, 9, 18, 27, 37, 46, 55, 64 }, // 5 - { 10, 0, 64, 7, 57, 14, 50, 21, 43, 28, 36 }, // 6 - { 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7 - { 16, 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }, // 8 - { 20, 0, 64, 16, 48, 3, 61, 19, 45, 6, 58, 23, 41, 9, 55, 26, 38, 13, 51, 29, 35 }, // 9 - { 24, 0, 64, 8, 56, 16, 48, 24, 40, 2, 62, 11, 53, 19, 45, 27, 37, 5, 59, 13, 51, 22, 42, 30, 34 } // 10 -}; - -//{ 12, 0, 64, 17, 47, 5, 59, 23, 41, 11, 53, 28, 36 }, // 7 -//static const uint8_t g_weight_order_7[12] = { 0, 4, 8, 2, 6, 10, 11, 7, 3, 9, 5, 1 }; - -static vec3F calc_mean(uint32_t num_pixels, const vec4F* pPixels) -{ - vec3F mean(0.0f); - - for (uint32_t i = 0; i < num_pixels; i++) - { - const vec4F& 
p = pPixels[i]; - - mean[0] += p[0]; - mean[1] += p[1]; - mean[2] += p[2]; - } - - return mean / static_cast(num_pixels); -} - -static vec3F calc_rgb_pca(uint32_t num_pixels, const vec4F* pPixels, const vec3F& mean_color) -{ - float cov[6] = { 0, 0, 0, 0, 0, 0 }; - - for (uint32_t i = 0; i < num_pixels; i++) - { - const vec4F& v = pPixels[i]; - - float r = v[0] - mean_color[0]; - float g = v[1] - mean_color[1]; - float b = v[2] - mean_color[2]; - - cov[0] += r * r; - cov[1] += r * g; - cov[2] += r * b; - cov[3] += g * g; - cov[4] += g * b; - cov[5] += b * b; - } - - float xr = .9f, xg = 1.0f, xb = .7f; - for (uint32_t iter = 0; iter < 3; iter++) - { - float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; - float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; - float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; - - float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b)); - - if (m > 1e-10f) - { - m = 1.0f / m; - - r *= m; - g *= m; - b *= m; - } - - xr = r; - xg = g; - xb = b; - } - - float len = xr * xr + xg * xg + xb * xb; - - vec3F axis; - if (len < 1e-10f) - axis.set(0.0f); - else - { - len = 1.0f / sqrtf(len); - - xr *= len; - xg *= len; - xb *= len; - - axis.set(xr, xg, xb, 0); - } - - if (axis.dot(axis) < .5f) - { - axis.set(1.0f, 1.0f, 1.0f, 0.0f); - axis.normalize_in_place(); - } - - return axis; -} - -static vec3F interp_color(const vec3F& mean, const vec3F& dir, float df, const aabb3F& colorspace_box, const aabb3F& input_box, bool* pInside = nullptr) -{ -#if 0 - assert(mean[0] >= input_box[0][0]); - assert(mean[1] >= input_box[0][1]); - assert(mean[2] >= input_box[0][2]); - assert(mean[0] <= input_box[1][0]); - assert(mean[1] <= input_box[1][1]); - assert(mean[2] <= input_box[1][2]); -#endif - - if (pInside) - *pInside = false; - - vec3F k(mean + dir * df); - if (colorspace_box.contains(k)) - { - if (pInside) - *pInside = true; - - return k; - } - - // starts inside - vec3F s(mean); - - // ends outside - vec3F e(mean + dir * df); - - // a ray guaranteed to 
go from the outside to inside - ray3F r(e, (s - e).normalize_in_place()); - vec3F c; - float t = 0.0f; - - intersection::result res = intersection::ray_aabb(c, t, r, input_box); - if (res != intersection::cSuccess) - c = k; - - return c; -} - -// all in Q16 space, 0-65535 -static bool compute_least_squares_endpoints_rgb( - uint32_t N, const uint8_t* pSelectors, const vec4F* pSelector_weights, - vec3F* pXl, vec3F* pXh, const vec4F* pColors, const aabb3F& input_box) -{ - // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf - // https://web.archive.org/web/20150319232457/http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf - // I did this in matrix form first, expanded out all the ops, then optimized it a bit. - float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; - float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; - float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; - float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; - - for (uint32_t i = 0; i < N; i++) - { - const uint32_t sel = pSelectors[i]; - z00 += pSelector_weights[sel][0]; - z10 += pSelector_weights[sel][1]; - z11 += pSelector_weights[sel][2]; - - float w = pSelector_weights[sel][3]; - q00_r += w * pColors[i][0]; - t_r += pColors[i][0]; - - q00_g += w * pColors[i][1]; - t_g += pColors[i][1]; - - q00_b += w * pColors[i][2]; - t_b += pColors[i][2]; - } - - q10_r = t_r - q00_r; - q10_g = t_g - q00_g; - q10_b = t_b - q00_b; - - z01 = z10; - - float det = z00 * z11 - z01 * z10; - if (det == 0.0f) - return false; - - det = 1.0f / det; - - float iz00, iz01, iz10, iz11; - iz00 = z11 * det; - iz01 = -z01 * det; - iz10 = -z10 * det; - iz11 = z00 * det; - - (*pXl)[0] = (float)(iz00 * q00_r + iz01 * q10_r); - (*pXh)[0] = (float)(iz10 * q00_r + iz11 * q10_r); - - (*pXl)[1] = (float)(iz00 * q00_g + iz01 * q10_g); - (*pXh)[1] = (float)(iz10 * q00_g + iz11 * q10_g); - - (*pXl)[2] = (float)(iz00 * q00_b + iz01 * q10_b); - (*pXh)[2] = (float)(iz10 * q00_b + iz11 * q10_b); 
- - for (uint32_t c = 0; c < 3; c++) - { - float l = (*pXl)[c], h = (*pXh)[c]; - - if (input_box.get_dim(c) < .0000125f) - { - l = input_box[0][c]; - h = input_box[1][c]; - } - - (*pXl)[c] = l; - (*pXh)[c] = h; - } - - vec3F mean((*pXl + *pXh) * .5f); - vec3F dir(*pXh - *pXl); - - float ln = dir.length(); - if (ln) - { - dir /= ln; - - float ld = (*pXl - mean).dot(dir); - float hd = (*pXh - mean).dot(dir); - - aabb3F colorspace_box(vec3F(0.0f), vec3F(MAX_QLOG16_VAL)); - - bool was_inside1 = false; - - vec3F l = interp_color(mean, dir, ld, colorspace_box, input_box, &was_inside1); - if (!was_inside1) - *pXl = l; - - bool was_inside2 = false; - vec3F h = interp_color(mean, dir, hd, colorspace_box, input_box, &was_inside2); - if (!was_inside2) - *pXh = h; - } - - pXl->clamp(0.0f, MAX_QLOG16_VAL); - pXh->clamp(0.0f, MAX_QLOG16_VAL); - - return true; -} - -static vec4F g_astc_ls_weights_ise[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; - -static uint8_t g_map_astc_to_linear_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][astc_index] -> linear index -static uint8_t g_map_linear_to_astc_order[MAX_SUPPORTED_ISE_WEIGHT_INDEX + 1][24]; // [ise_range][linear_index] -> astc_index - -static void encode_astc_hdr_init() -{ - // Precomputed weight constants used during least fit determination. 
For each entry: w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w - for (uint32_t range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; range++) - { - const uint32_t num_levels = g_ise_weight_lerps[range][0]; - assert((num_levels >= 3) && (num_levels <= 24)); - - for (uint32_t i = 0; i < num_levels; i++) - { - float w = g_ise_weight_lerps[range][1 + i] * (1.0f / 64.0f); - - g_astc_ls_weights_ise[range][i].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w); - } - } - - for (uint32_t ise_range = MIN_SUPPORTED_ISE_WEIGHT_INDEX; ise_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX; ise_range++) - { - const uint32_t num_levels = g_ise_weight_lerps[ise_range][0]; - assert((num_levels >= 3) && (num_levels <= 24)); - - uint32_t s[32]; - for (uint32_t i = 0; i < num_levels; i++) - s[i] = (g_ise_weight_lerps[ise_range][1 + i] << 8) + i; - - std::sort(s, s + num_levels); - - for (uint32_t i = 0; i < num_levels; i++) - g_map_linear_to_astc_order[ise_range][i] = (uint8_t)(s[i] & 0xFF); - - for (uint32_t i = 0; i < num_levels; i++) - g_map_astc_to_linear_order[ise_range][g_map_linear_to_astc_order[ise_range][i]] = (uint8_t)i; - } -} - -void interpolate_qlog12_colors( - const int e[2][3], - half_float* pDecoded_half, - vec3F* pDecoded_float, - uint32_t n, uint32_t ise_weight_range) -{ - assert((ise_weight_range >= MIN_SUPPORTED_ISE_WEIGHT_INDEX) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); - - for (uint32_t i = 0; i < 2; i++) - { - for (uint32_t j = 0; j < 3; j++) - { - assert(in_range(e[i][j], 0, 0xFFF)); - } - } - - for (uint32_t i = 0; i < n; i++) - { - const int c = g_ise_weight_lerps[ise_weight_range][1 + i]; - assert(c == (int)astc_helpers::dequant_bise_weight(i, ise_weight_range)); - - half_float rf, gf, bf; - - { - uint32_t r0 = e[0][0] << 4; - uint32_t r1 = e[1][0] << 4; - int ri = (r0 * (64 - c) + r1 * c + 32) / 64; - rf = qlog16_to_half_slow(ri); - } - - { - uint32_t g0 = e[0][1] << 4; - uint32_t g1 = e[1][1] << 4; - int gi = (g0 * 
(64 - c) + g1 * c + 32) / 64; - gf = qlog16_to_half_slow(gi); - } - - { - uint32_t b0 = e[0][2] << 4; - uint32_t b1 = e[1][2] << 4; - int bi = (b0 * (64 - c) + b1 * c + 32) / 64; - bf = qlog16_to_half_slow(bi); - } - - if (pDecoded_half) - { - pDecoded_half[i * 3 + 0] = rf; - pDecoded_half[i * 3 + 1] = gf; - pDecoded_half[i * 3 + 2] = bf; - } - - if (pDecoded_float) - { - pDecoded_float[i][0] = half_to_float(rf); - pDecoded_float[i][1] = half_to_float(gf); - pDecoded_float[i][2] = half_to_float(bf); - } - } -} - -// decoded in ASTC order, not linear order -// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded -bool get_astc_hdr_mode_11_block_colors( - const uint8_t* pEndpoints, - half_float* pDecoded_half, - vec3F* pDecoded_float, - uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range) -{ - assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); - - int e[2][3]; - if (!decode_mode11_to_qlog12(pEndpoints, e, ise_endpoint_range)) - return false; - - interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range); - - return true; -} - -// decoded in ASTC order, not linear order -// return false if the ISE endpoint quantization leads to non-valid endpoints being decoded -bool get_astc_hdr_mode_7_block_colors( - const uint8_t* pEndpoints, - half_float* pDecoded_half, - vec3F* pDecoded_float, - uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range) -{ - assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); - - int e[2][3]; - if (!decode_mode7_to_qlog12(pEndpoints, e, nullptr, ise_endpoint_range)) - return false; - - interpolate_qlog12_colors(e, pDecoded_half, pDecoded_float, n, ise_weight_range); - - return true; -} - -// Fast high precision piecewise linear approximation of log2(bias+x). -// Half may be zero, positive or denormal. No NaN/Inf/negative. 
-static inline double q(half_float x) -{ - union { float f; int32_t i; uint32_t u; } fi; - - fi.f = fast_half_to_float_pos_not_inf_or_nan(x); - - assert(fi.f >= 0.0f); - - fi.f += .125f; - - return (double)fi.u; // approx log2f(fi.f), need to return double for the precision -} - -double eval_selectors( - uint32_t num_pixels, - uint8_t* pWeights, - const half_float* pBlock_pixels_half, - uint32_t num_weight_levels, - const half_float* pDecoded_half, - const astc_hdr_codec_options& coptions, - uint32_t usable_selector_bitmask) -{ - assert((num_pixels >= 1) && (num_pixels <= 16)); - assert(usable_selector_bitmask); - - const float R_WEIGHT = coptions.m_r_err_scale; - const float G_WEIGHT = coptions.m_g_err_scale; - - double total_error = 0; - -#ifdef _DEBUG - for (uint32_t i = 0; i < num_weight_levels; i++) - { - assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 0])); - assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 1])); - assert(!is_half_inf_or_nan(pDecoded_half[i * 3 + 2])); - } -#endif - - for (uint32_t p = 0; p < num_pixels; p++) - { - const half_float* pDesired_half = &pBlock_pixels_half[p * 3]; - - double lowest_e = 1e+30f; - - // this is an approximation of MSLE - for (uint32_t i = 0; i < num_weight_levels; i++) - { - if (((1 << i) & usable_selector_bitmask) == 0) - continue; - - // compute piecewise linear approximation of log2(a+eps)-log2(b+eps), for each component, then MSLE - double rd = q(pDecoded_half[i * 3 + 0]) - q(pDesired_half[0]); - double gd = q(pDecoded_half[i * 3 + 1]) - q(pDesired_half[1]); - double bd = q(pDecoded_half[i * 3 + 2]) - q(pDesired_half[2]); - - double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; - - if (e < lowest_e) - { - lowest_e = e; - pWeights[p] = (uint8_t)i; - } - } - - total_error += lowest_e; - - } // p - - return total_error; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -double compute_block_error(const half_float* 
pOrig_block, const half_float* pPacked_block, const astc_hdr_codec_options& coptions) -{ - const float R_WEIGHT = coptions.m_r_err_scale; - const float G_WEIGHT = coptions.m_g_err_scale; - - double total_error = 0; - - for (uint32_t p = 0; p < 16; p++) - { - double rd = q(pOrig_block[p * 3 + 0]) - q(pPacked_block[p * 3 + 0]); - double gd = q(pOrig_block[p * 3 + 1]) - q(pPacked_block[p * 3 + 1]); - double bd = q(pOrig_block[p * 3 + 2]) - q(pPacked_block[p * 3 + 2]); - - double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; - - total_error += e; - } - - return total_error; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static inline int compute_clamped_val(int v, int l, int h, bool& did_clamp, int& max_clamp_mag) -{ - assert(l < h); - - if (v < l) - { - max_clamp_mag = basisu::maximum(max_clamp_mag, l - v); - - v = l; - did_clamp = true; - } - else if (v > h) - { - max_clamp_mag = basisu::maximum(max_clamp_mag, v - h); - - v = h; - did_clamp = true; - } - - return v; -} - -static bool pack_astc_mode11_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& low_q16, const vec3F& high_q16, int& max_clamp_mag) -{ - assert(submode <= 7); - - const uint8_t s_b_bits[8] = { 7, 8, 6, 7, 8, 6, 7, 6 }; - const uint8_t s_c_bits[8] = { 6, 6, 7, 7, 6, 7, 7, 7 }; - const uint8_t s_d_bits[8] = { 7, 6, 7, 6, 5, 6, 5, 6 }; - - const uint32_t a_bits = 9 + (submode >> 1); - const uint32_t b_bits = s_b_bits[submode]; - const uint32_t c_bits = s_c_bits[submode]; - const uint32_t d_bits = s_d_bits[submode]; - - const int max_a_val = (1 << a_bits) - 1; - const int max_b_val = (1 << b_bits) - 1; - const int max_c_val = (1 << c_bits) - 1; - - // The maximum usable value before it turns to NaN/Inf - const int max_a_qlog = get_max_qlog(a_bits); - - const int min_d_val = -(1 << (d_bits - 1)); - const int max_d_val = -min_d_val - 1; - assert((max_d_val - min_d_val + 1) == (1 << d_bits)); - - 
int val_q[2][3]; - - for (uint32_t c = 0; c < 3; c++) - { -#if 1 - // this is better - const half_float l = qlog16_to_half_slow((uint32_t)std::round(low_q16[c])); - val_q[0][c] = half_to_qlog7_12(l, a_bits); - - const half_float h = qlog16_to_half_slow((uint32_t)std::round(high_q16[c])); - val_q[1][c] = half_to_qlog7_12(h, a_bits); -#else - val_q[0][c] = quant_qlog16((uint32_t)std::round(low_q16[c]), a_bits); - val_q[1][c] = quant_qlog16((uint32_t)std::round(high_q16[c]), a_bits); -#endif - -#if 1 - if (val_q[0][c] == val_q[1][c]) - { -#if 0 - if (l <= h) -#else - if (low_q16[c] < high_q16[c]) -#endif - { - if (val_q[0][c]) - val_q[0][c]--; - - if (val_q[1][c] != max_a_val) - val_q[1][c]++; - } - else - { - if (val_q[0][c] != max_a_val) - val_q[0][c]++; - - if (val_q[1][c]) - val_q[1][c]--; - } - } -#endif - - val_q[0][c] = minimum(val_q[0][c], max_a_qlog); - val_q[1][c] = minimum(val_q[1][c], max_a_qlog); - } - - int highest_q = -1, highest_val = 0, highest_comp = 0; - - for (uint32_t v = 0; v < 2; v++) - { - for (uint32_t c = 0; c < 3; c++) - { - assert(val_q[v][c] >= 0 && val_q[v][c] <= max_a_val); - - if (val_q[v][c] > highest_q) - { - highest_q = val_q[v][c]; - highest_val = v; - highest_comp = c; - } - } - } - - const bool had_tie = (val_q[highest_val ^ 1][highest_comp] == highest_q); - - if (highest_val != 1) - { - for (uint32_t c = 0; c < 3; c++) - { - std::swap(val_q[0][c], val_q[1][c]); - } - } - - if (highest_comp) - { - std::swap(val_q[0][0], val_q[0][highest_comp]); - std::swap(val_q[1][0], val_q[1][highest_comp]); - } - - int orig_q[2][3]; - memcpy(orig_q, val_q, sizeof(val_q)); - - // val[1][0] is now guaranteed to be highest - int best_va = 0, best_vb0 = 0, best_vb1 = 0, best_vc = 0, best_vd0 = 0, best_vd1 = 0; - int best_max_clamp_mag = 0; - bool best_did_clamp = false; - int best_q[2][3] = { { 0, 0, 0}, { 0, 0, 0 } }; - BASISU_NOTE_UNUSED(best_q); - uint32_t best_dist = UINT_MAX; - - for (uint32_t pass = 0; pass < 2; pass++) - { - int trial_va = 
val_q[1][0]; - - assert(trial_va <= max_a_val); - assert(trial_va >= val_q[1][1]); - assert(trial_va >= val_q[1][2]); - - assert(trial_va >= val_q[0][0]); - assert(trial_va >= val_q[0][1]); - assert(trial_va >= val_q[0][2]); - - bool did_clamp = false; - int trial_max_clamp_mag = 0; - - int trial_vb0 = compute_clamped_val(trial_va - val_q[1][1], 0, max_b_val, did_clamp, trial_max_clamp_mag); - int trial_vb1 = compute_clamped_val(trial_va - val_q[1][2], 0, max_b_val, did_clamp, trial_max_clamp_mag); - int trial_vc = compute_clamped_val(trial_va - val_q[0][0], 0, max_c_val, did_clamp, trial_max_clamp_mag); - int trial_vd0 = compute_clamped_val((trial_va - trial_vb0 - trial_vc) - val_q[0][1], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag); - int trial_vd1 = compute_clamped_val((trial_va - trial_vb1 - trial_vc) - val_q[0][2], min_d_val, max_d_val, did_clamp, trial_max_clamp_mag); - - if (!did_clamp) - { - // Make sure decoder gets the expected values - assert(trial_va == val_q[1][0]); - assert(trial_va - trial_vb0 == val_q[1][1]); - assert(trial_va - trial_vb1 == val_q[1][2]); - - assert((trial_va - trial_vc) == val_q[0][0]); - assert((trial_va - trial_vb0 - trial_vc - trial_vd0) == val_q[0][1]); - assert((trial_va - trial_vb1 - trial_vc - trial_vd1) == val_q[0][2]); - } - - const int r_e0 = clamp(trial_va, 0, max_a_val); - const int r_e1 = clamp(trial_va - trial_vb0, 0, max_a_val); - const int r_e2 = clamp(trial_va - trial_vb1, 0, max_a_val); - - const int r_f0 = clamp(trial_va - trial_vc, 0, max_a_val); - const int r_f1 = clamp(trial_va - trial_vb0 - trial_vc - trial_vd0, 0, max_a_val); - const int r_f2 = clamp(trial_va - trial_vb1 - trial_vc - trial_vd1, 0, max_a_val); - - assert(r_e0 <= max_a_qlog); - assert(r_e1 <= max_a_qlog); - assert(r_e2 <= max_a_qlog); - - assert(r_f0 <= max_a_qlog); - assert(r_f1 <= max_a_qlog); - assert(r_f2 <= max_a_qlog); - - if ((!did_clamp) || (!had_tie)) - { - best_va = trial_va; - best_vb0 = trial_vb0; - best_vb1 = trial_vb1; - 
best_vc = trial_vc; - best_vd0 = trial_vd0; - best_vd1 = trial_vd1; - best_max_clamp_mag = trial_max_clamp_mag; - best_did_clamp = did_clamp; - - best_q[1][0] = r_e0; - best_q[1][1] = r_e1; - best_q[1][2] = r_e2; - best_q[0][0] = r_f0; - best_q[0][1] = r_f1; - best_q[0][2] = r_f2; - break; - } - - // we had a tie and it did clamp, try swapping L/H for a potential slight gain - - const uint32_t r_dist1 = basisu::square(r_e0 - val_q[1][0]) + basisu::square(r_e1 - val_q[1][1]) + basisu::square(r_e2 - val_q[1][2]); - const uint32_t r_dist0 = basisu::square(r_f0 - val_q[0][0]) + basisu::square(r_f1 - val_q[0][1]) + basisu::square(r_f2 - val_q[0][2]); - - const uint32_t total_dist = r_dist1 + r_dist0; - - if (total_dist < best_dist) - { - best_dist = total_dist; - - best_va = trial_va; - best_vb0 = trial_vb0; - best_vb1 = trial_vb1; - best_vc = trial_vc; - best_vd0 = trial_vd0; - best_vd1 = trial_vd1; - best_did_clamp = did_clamp; - - best_q[1][0] = r_e0; - best_q[1][1] = r_e1; - best_q[1][2] = r_e2; - best_q[0][0] = r_f0; - best_q[0][1] = r_f1; - best_q[0][2] = r_f2; - } - - for (uint32_t c = 0; c < 3; c++) - std::swap(val_q[0][c], val_q[1][c]); - } - - // pack bits now - int v0 = 0, v1 = 0, v2 = 0, v3 = 0, v4 = 0, v5 = 0; - - int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0; - switch (submode) - { - case 0: - x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); - break; - case 1: - x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); - break; - case 2: - x0 = get_bit(best_va, 9); x1 = get_bit(best_vc, 6); x2 = get_bit(best_vd0, 6); x3 = get_bit(best_vd1, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); - break; - case 3: - x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 9); x3 = get_bit(best_vc, 6); x4 = 
get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); - break; - case 4: - x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_vb0, 7); x3 = get_bit(best_vb1, 7); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10); - break; - case 5: - x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_vc, 7); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); - break; - case 6: - x0 = get_bit(best_vb0, 6); x1 = get_bit(best_vb1, 6); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_va, 9); x5 = get_bit(best_va, 10); - break; - case 7: - x0 = get_bit(best_va, 9); x1 = get_bit(best_va, 10); x2 = get_bit(best_va, 11); x3 = get_bit(best_vc, 6); x4 = get_bit(best_vd0, 5); x5 = get_bit(best_vd1, 5); - break; - default: - break; - } - - // write mode - pack_bit(v1, 7, submode, 0); - pack_bit(v2, 7, submode, 1); - pack_bit(v3, 7, submode, 2); - - // highest component - pack_bit(v4, 7, highest_comp, 0); - pack_bit(v5, 7, highest_comp, 1); - - // write bit 8 of va - pack_bit(v1, 6, best_va, 8); - - // extra bits - pack_bit(v2, 6, x0); - pack_bit(v3, 6, x1); - pack_bit(v4, 6, x2); - pack_bit(v5, 6, x3); - pack_bit(v4, 5, x4); - pack_bit(v5, 5, x5); - - v0 = best_va & 0xFF; - v1 |= (best_vc & 63); - v2 |= (best_vb0 & 63); - v3 |= (best_vb1 & 63); - v4 |= (best_vd0 & 31); - v5 |= (best_vd1 & 31); - - assert(in_range(v0, 0, 255) && in_range(v1, 0, 255) && in_range(v2, 0, 255) && in_range(v3, 0, 255) && in_range(v4, 0, 255) && in_range(v5, 0, 255)); - - pEndpoints[0] = (uint8_t)v0; - pEndpoints[1] = (uint8_t)v1; - pEndpoints[2] = (uint8_t)v2; - pEndpoints[3] = (uint8_t)v3; - pEndpoints[4] = (uint8_t)v4; - pEndpoints[5] = (uint8_t)v5; - -#ifdef _DEBUG - // Test for valid pack by unpacking - { - if (highest_comp) - { - std::swap(best_q[0][0], best_q[0][highest_comp]); - std::swap(best_q[1][0], best_q[1][highest_comp]); - - std::swap(orig_q[0][0], orig_q[0][highest_comp]); - std::swap(orig_q[1][0], 
orig_q[1][highest_comp]); - } - - int test_e[2][3]; - decode_mode11_to_qlog12(pEndpoints, test_e, astc_helpers::BISE_256_LEVELS); - for (uint32_t i = 0; i < 2; i++) - { - for (uint32_t j = 0; j < 3; j++) - { - assert(best_q[i][j] == test_e[i][j] >> (12 - a_bits)); - - if (!best_did_clamp) - { - assert((orig_q[i][j] == test_e[i][j] >> (12 - a_bits)) || - (orig_q[1 - i][j] == test_e[i][j] >> (12 - a_bits))); - } - } - } - } -#endif - - max_clamp_mag = best_max_clamp_mag; - - return best_did_clamp; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static void pack_astc_mode11_direct(uint8_t* pEndpoints, const vec3F& l_q16, const vec3F& h_q16) -{ - for (uint32_t i = 0; i < 3; i++) - { - // TODO: This goes from QLOG16->HALF->QLOG8/7 - half_float l_half = qlog16_to_half_slow(clamp((int)std::round(l_q16[i]), 0, 65535)); - half_float h_half = qlog16_to_half_slow(clamp((int)std::round(h_q16[i]), 0, 65535)); - - int l_q, h_q; - - if (i == 2) - { - l_q = g_half_to_qlog7[bounds_check((uint32_t)l_half, 0U, 32768U)]; - h_q = g_half_to_qlog7[bounds_check((uint32_t)h_half, 0U, 32768U)]; - - l_q = minimum(l_q, MAX_QLOG7); - h_q = minimum(h_q, MAX_QLOG7); - } - else - { - l_q = g_half_to_qlog8[bounds_check((uint32_t)l_half, 0U, 32768U)]; - h_q = g_half_to_qlog8[bounds_check((uint32_t)h_half, 0U, 32768U)]; - - l_q = minimum(l_q, MAX_QLOG8); - h_q = minimum(h_q, MAX_QLOG8); - } - -#if 1 - if (l_q == h_q) - { - const int m = (i == 2) ? 
MAX_QLOG7 : MAX_QLOG8; - - if (l_q16[i] <= h_q16[i]) - { - if (l_q) - l_q--; - - if (h_q != m) - h_q++; - } - else - { - if (h_q) - h_q--; - - if (l_q != m) - l_q++; - } - } -#endif - - if (i == 2) - { - assert(l_q <= (int)MAX_QLOG7 && h_q <= (int)MAX_QLOG7); - l_q |= 128; - h_q |= 128; - } - else - { - assert(l_q <= (int)MAX_QLOG8 && h_q <= (int)MAX_QLOG8); - } - - pEndpoints[2 * i + 0] = (uint8_t)l_q; - pEndpoints[2 * i + 1] = (uint8_t)h_q; - } -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static bool pack_astc_mode7_submode(uint32_t submode, uint8_t* pEndpoints, const vec3F& rgb_q16, float s_q16, int& max_clamp_mag, uint32_t ise_weight_range) -{ - assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); - - assert(submode <= 5); - max_clamp_mag = 0; - - static const uint8_t s_r_bits[6] = { 11, 11, 10, 9, 8, 7 }; - static const uint8_t s_g_b_bits[6] = { 5, 6, 5, 6, 7, 7 }; - static const uint8_t s_s_bits[6] = { 7, 5, 8, 7, 6, 7 }; - - // The precision of the components - const uint32_t prec_bits = s_r_bits[submode]; - - int qlog[4], pack_bits[4]; - - for (uint32_t i = 0; i < 4; i++) - { - const float f = (i == 3) ? s_q16 : rgb_q16[i]; - - // The # of bits the component is packed into - if (i == 0) - pack_bits[i] = s_r_bits[submode]; - else if (i == 3) - pack_bits[i] = s_s_bits[submode]; - else - pack_bits[i] = s_g_b_bits[submode]; - -#if 0 - // this is slightly worse - // TODO: going from qlog16 to half loses some precision. Then going from half to qlog 7-12 will have extra error. 
- half_float h = qlog_to_half(clamp((int)std::round(f), 0, MAX_QLOG16), 16); - qlog[i] = half_to_qlog7_12((half_float)bounds_check((uint32_t)h, 0U, 32768U), prec_bits); -#else - qlog[i] = quant_qlog16(clamp((int)std::round(f), 0, MAX_QLOG16), prec_bits); - - // Only bias if there are enough texel weights, 4=6 weights - if (ise_weight_range >= 4) - { - // Explictly bias the high color, and the scale up, to better exploit the weights. - // The quantized range also then encompases the complete input range. - const uint32_t max_val = (1 << prec_bits) - 1; - const uint32_t K = 3; - if (i == 3) - { - qlog[i] = minimum(qlog[i] + K * 2, max_val); - } - else - { - qlog[i] = minimum(qlog[i] + K, max_val); - } - } -#endif - - if (i != 3) - qlog[i] = minimum(qlog[i], get_max_qlog(prec_bits)); - - // If S=0, we lose freedom for the texel weights to add any value. - if ((i == 3) && (qlog[i] == 0)) - qlog[i] = 1; - } - - uint32_t maj_index = 0; - - bool did_clamp = false; - - if (submode != 5) - { - int largest_qlog = 0; - for (uint32_t i = 0; i < 3; i++) - { - if (qlog[i] > largest_qlog) - { - largest_qlog = qlog[i]; - maj_index = i; - } - } - - if (maj_index) - { - std::swap(qlog[0], qlog[maj_index]); - } - - assert(qlog[0] >= qlog[1]); - assert(qlog[0] >= qlog[2]); - - qlog[1] = qlog[0] - qlog[1]; - qlog[2] = qlog[0] - qlog[2]; - - for (uint32_t i = 1; i < 4; i++) - { - const int max_val = (1 << pack_bits[i]) - 1; - - if (qlog[i] > max_val) - { - max_clamp_mag = maximum(max_clamp_mag, qlog[i] - max_val); - qlog[i] = max_val; - did_clamp = true; - } - } - } - - for (uint32_t i = 0; i < 4; i++) - { - const int max_val = (1 << pack_bits[i]) - 1; (void)max_val; - - assert(qlog[i] <= max_val); - } - - int mode = 0; - - int r = qlog[0] & 63; // 6-bits - int g = qlog[1] & 31; // 5-bits - int b = qlog[2] & 31; // 5-bits - int s = qlog[3] & 31; // 5-bits - - int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x4 = 0, x5 = 0, x6 = 0; - - switch (submode) - { - case 0: - { - mode = (maj_index << 2) | 0; 
- assert((mode & 0xC) != 0xC); - - x0 = get_bit(qlog[0], 9); // R9 - x1 = get_bit(qlog[0], 8); // R8 - x2 = get_bit(qlog[0], 7); // R7 - x3 = get_bit(qlog[0], 10); // R10 - x4 = get_bit(qlog[0], 6); // R6 - x5 = get_bit(qlog[3], 6); // S6 - x6 = get_bit(qlog[3], 5); // S5 - break; - } - case 1: - { - mode = (maj_index << 2) | 1; - assert((mode & 0xC) != 0xC); - - x0 = get_bit(qlog[0], 8); // R8 - x1 = get_bit(qlog[1], 5); // G5 - x2 = get_bit(qlog[0], 7); // R7 - x3 = get_bit(qlog[2], 5); // B5 - x4 = get_bit(qlog[0], 6); // R6 - x5 = get_bit(qlog[0], 10); // R10 - x6 = get_bit(qlog[0], 9); // R9 - break; - } - case 2: - { - mode = (maj_index << 2) | 2; - assert((mode & 0xC) != 0xC); - - x0 = get_bit(qlog[0], 9); // R9 - x1 = get_bit(qlog[0], 8); // R8 - x2 = get_bit(qlog[0], 7); // R7 - x3 = get_bit(qlog[0], 6); // R6 - x4 = get_bit(qlog[3], 7); // S7 - x5 = get_bit(qlog[3], 6); // S6 - x6 = get_bit(qlog[3], 5); // S5 - break; - } - case 3: - { - mode = (maj_index << 2) | 3; - assert((mode & 0xC) != 0xC); - - x0 = get_bit(qlog[0], 8); // R8 - x1 = get_bit(qlog[1], 5); // G5 - x2 = get_bit(qlog[0], 7); // R7 - x3 = get_bit(qlog[2], 5); // B5 - x4 = get_bit(qlog[0], 6); // R6 - x5 = get_bit(qlog[3], 6); // S6 - x6 = get_bit(qlog[3], 5); // S5 - break; - } - case 4: - { - mode = maj_index | 0xC; // 0b1100 - assert((mode & 0xC) == 0xC); - assert(mode != 0xF); - - x0 = get_bit(qlog[1], 6); // G6 - x1 = get_bit(qlog[1], 5); // G5 - x2 = get_bit(qlog[2], 6); // B6 - x3 = get_bit(qlog[2], 5); // B5 - x4 = get_bit(qlog[0], 6); // R6 - x5 = get_bit(qlog[0], 7); // R7 - x6 = get_bit(qlog[3], 5); // S5 - break; - } - case 5: - { - mode = 0xF; - - x0 = get_bit(qlog[1], 6); // G6 - x1 = get_bit(qlog[1], 5); // G5 - x2 = get_bit(qlog[2], 6); // B6 - x3 = get_bit(qlog[2], 5); // B5 - x4 = get_bit(qlog[0], 6); // R6 - x5 = get_bit(qlog[3], 6); // S6 - x6 = get_bit(qlog[3], 5); // S5 - break; - } - default: - { - assert(0); - break; - } - } - - pEndpoints[0] = 
(uint8_t)((get_bit(mode, 1) << 7) | (get_bit(mode, 0) << 6) | r); - pEndpoints[1] = (uint8_t)((get_bit(mode, 2) << 7) | (x0 << 6) | (x1 << 5) | g); - pEndpoints[2] = (uint8_t)((get_bit(mode, 3) << 7) | (x2 << 6) | (x3 << 5) | b); - pEndpoints[3] = (uint8_t)((x4 << 7) | (x5 << 6) | (x6 << 5) | s); - -#ifdef _DEBUG - // Test for valid pack by unpacking - { - const int inv_shift = 12 - prec_bits; - - int unpacked_e[2][3]; - if (submode != 5) - { - unpacked_e[1][0] = left_shift32(qlog[0], inv_shift); - unpacked_e[1][1] = clamp(left_shift32((qlog[0] - qlog[1]), inv_shift), 0, 0xFFF); - unpacked_e[1][2] = clamp(left_shift32((qlog[0] - qlog[2]), inv_shift), 0, 0xFFF); - - unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF); - unpacked_e[0][1] = clamp(left_shift32(((qlog[0] - qlog[1]) - qlog[3]), inv_shift), 0, 0xFFF); - unpacked_e[0][2] = clamp(left_shift32(((qlog[0] - qlog[2]) - qlog[3]), inv_shift), 0, 0xFFF); - } - else - { - unpacked_e[1][0] = left_shift32(qlog[0], inv_shift); - unpacked_e[1][1] = left_shift32(qlog[1], inv_shift); - unpacked_e[1][2] = left_shift32(qlog[2], inv_shift); - - unpacked_e[0][0] = clamp(left_shift32((qlog[0] - qlog[3]), inv_shift), 0, 0xFFF); - unpacked_e[0][1] = clamp(left_shift32((qlog[1] - qlog[3]), inv_shift), 0, 0xFFF); - unpacked_e[0][2] = clamp(left_shift32((qlog[2] - qlog[3]), inv_shift), 0, 0xFFF); - } - - if (maj_index) - { - std::swap(unpacked_e[0][0], unpacked_e[0][maj_index]); - std::swap(unpacked_e[1][0], unpacked_e[1][maj_index]); - } - - int e[2][3]; - decode_mode7_to_qlog12_ise20(pEndpoints, e, nullptr); - - for (uint32_t i = 0; i < 3; i++) - { - assert(unpacked_e[0][i] == e[0][i]); - assert(unpacked_e[1][i] == e[1][i]); - } - } -#endif - - return did_clamp; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static void quantize_ise_endpoints(uint32_t ise_endpoint_range, const uint8_t* pSrc_endpoints, uint8_t 
*pDst_endpoints, uint32_t n) -{ - assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); - - if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) - { - memcpy(pDst_endpoints, pSrc_endpoints, n); - } - else - { - for (uint32_t i = 0; i < n; i++) - { - uint32_t v = pSrc_endpoints[i]; - assert(v <= 255); - - pDst_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_val_to_ise[v]; - } - } -} - -//-------------------------------------------------------------------------------------------------------------------------- - -// Note this could fail to find any valid solution if use_endpoint_range!=20. -// Returns true if improved. -static bool try_mode11(uint32_t num_pixels, - uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used, - vec3F& low_color_q16, const vec3F& high_color_q16, - half_float block_pixels_half[16][3], - uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, bool direct_only, uint32_t ise_endpoint_range, - bool constrain_ise_weight8_selectors, - int32_t first_submode, int32_t last_submode) // -1, 7 -{ - assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); - assert((num_weight_levels >= 3) && (num_weight_levels <= 32)); - assert((num_pixels >= 1) && (num_pixels <= 16)); - - bool improved_flag = false; - - half_float decoded_half[32][3]; - vec3F decoded_float[32]; - uint8_t orig_trial_endpoints[NUM_MODE11_ENDPOINTS], trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16]; - - if (direct_only) - { - first_submode = -1; - last_submode = -1; - } - - assert(first_submode <= last_submode); - assert((first_submode >= -1) && (first_submode <= 7)); - assert((last_submode >= -1) && (last_submode <= 7)); - - // TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done. 
- for (int submode = last_submode; submode >= first_submode; submode--) - { - bool did_clamp = false; - int max_clamp_mag = 0; - if (submode == -1) - { - // If it had to clamp with one of the submodes, try direct which can't clamp, but has low precision. - pack_astc_mode11_direct(orig_trial_endpoints, low_color_q16, high_color_q16); - } - else - { - did_clamp = pack_astc_mode11_submode(submode, orig_trial_endpoints, low_color_q16, high_color_q16, max_clamp_mag); - - // If it had to clamp and the clamp was too high, it'll distort the endpoint colors too much, which could lead to noticeable artifacts. - const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4; - if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH)) - continue; - } - - // This will distort the endpoints if the ISE endpoint range isn't 256 levels (20). - // It could massively distort the endpoints, but still result in a valid encoding. - quantize_ise_endpoints(ise_endpoint_range, orig_trial_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS); - - if (!get_astc_hdr_mode_11_block_colors(trial_endpoints, &decoded_half[0][0], decoded_float, num_weight_levels, ise_weight_range, ise_endpoint_range)) - continue; - - uint32_t usable_selector_bitmask = UINT32_MAX; - if ((constrain_ise_weight8_selectors) && (ise_weight_range == astc_helpers::BISE_16_LEVELS)) - usable_selector_bitmask = (1 << 0) | (1 << 1) | (1 << 4) | (1 << 5) | (1 << 10) | (1 << 11) | (1 << 14) | (1 << 15); - - double trial_blk_error = eval_selectors(num_pixels, trial_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions, usable_selector_bitmask); - if (trial_blk_error < cur_block_error) - { - cur_block_error = trial_blk_error; - memcpy(pEndpoints, trial_endpoints, NUM_MODE11_ENDPOINTS); - memcpy(pWeights, trial_weights, num_pixels); - submode_used = submode + 1; - improved_flag = true; - } - - // If it didn't clamp it was a lossless encode at this precision, so we can stop early as there's probably no use trying lower 
precision submodes. - // (Although it may be, because a lower precision pack could try nearby voxel coords.) - // However, at lower levels quantization may cause the decoded endpoints to be very distorted, so we need to evaluate up to direct. - if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) - { - if (!did_clamp) - break; - } - } - - return improved_flag; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static bool try_mode7( - uint32_t num_pixels, - uint8_t* pEndpoints, uint8_t* pWeights, double& cur_block_error, uint32_t& submode_used, - vec3F& high_color_q16, const float s_q16, - half_float block_pixels_half[16][3], - uint32_t num_weight_levels, uint32_t ise_weight_range, const astc_hdr_codec_options& coptions, - uint32_t ise_endpoint_range) -{ - assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); - assert((num_pixels >= 1) && (num_pixels <= 16)); - - bool improved_flag = false; - - half_float decoded_half[24][3]; - vec3F decoded_float[24]; - - uint8_t orig_trial_endpoints[NUM_MODE7_ENDPOINTS], trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16]; - - // TODO: First determine if a submode doesn't clamp first. If one is found, encode to that and we're done. - for (int submode = 0; submode <= 5; submode++) - { - int max_clamp_mag = 0; - const bool did_clamp = pack_astc_mode7_submode(submode, orig_trial_endpoints, high_color_q16, s_q16, max_clamp_mag, ise_weight_range); - - if (submode < 5) - { - const int MAX_CLAMP_MAG_ACCEPT_THRESH = 4; - if ((did_clamp) && (max_clamp_mag > MAX_CLAMP_MAG_ACCEPT_THRESH)) - continue; - } - - // This will distort the endpoints if the ISE endpoint range isn't 256 levels (20). - // It could massively distort the endpoints, but still result in a valid encoding. 
- quantize_ise_endpoints(ise_endpoint_range, orig_trial_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS); - - if (!get_astc_hdr_mode_7_block_colors(trial_endpoints, &decoded_half[0][0], decoded_float, num_weight_levels, ise_weight_range, ise_endpoint_range)) - continue; - - double trial_blk_error = eval_selectors(num_pixels, trial_weights, &block_pixels_half[0][0], num_weight_levels, &decoded_half[0][0], coptions); - if (trial_blk_error < cur_block_error) - { - cur_block_error = trial_blk_error; - memcpy(pEndpoints, trial_endpoints, NUM_MODE7_ENDPOINTS); - memcpy(pWeights, trial_weights, num_pixels); - submode_used = submode; - improved_flag = true; - } - - if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) - { - if (!did_clamp) - break; - } - } - - return improved_flag; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static double encode_astc_hdr_block_mode_11( - uint32_t num_pixels, - const vec4F* pBlock_pixels, - uint32_t ise_weight_range, - uint32_t& best_submode, - double cur_block_error, - uint8_t* blk_endpoints, uint8_t* blk_weights, - const astc_hdr_codec_options& coptions, - bool direct_only, - uint32_t ise_endpoint_range, - bool uber_mode, - bool constrain_ise_weight8_selectors, - int32_t first_submode, int32_t last_submode) -{ - assert((ise_weight_range >= 1) && (ise_weight_range <= MAX_SUPPORTED_ISE_WEIGHT_INDEX)); - assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); - assert((num_pixels >= 1) && (num_pixels <= 16)); - - best_submode = 0; - - half_float block_pixels_half[16][3]; - vec4F block_pixels_q16[16]; - - // TODO: This is done redundantly. 
- for (uint32_t i = 0; i < num_pixels; i++) - { - block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]); - block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]); - - block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]); - block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]); - - block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]); - block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]); - - block_pixels_q16[i][3] = 0.0f; - } - - const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range); - - // TODO: should match MAX_SUPPORTED_ISE_WEIGHT_INDEX - const uint32_t MAX_WEIGHT_LEVELS = 32; - (void)MAX_WEIGHT_LEVELS; - assert(num_weight_levels <= MAX_WEIGHT_LEVELS); - - vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16)); - vec3F block_axis_q16(calc_rgb_pca(num_pixels, block_pixels_q16, block_mean_color_q16)); - - aabb3F color_box_q16(cInitExpand); - - float l = 1e+30f, h = -1e+30f; - vec3F low_color_q16, high_color_q16; - - for (uint32_t i = 0; i < num_pixels; i++) - { - color_box_q16.expand(block_pixels_q16[i]); - - vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16); - float kd = k.dot(block_axis_q16); - - if (kd < l) - { - l = kd; - low_color_q16 = block_pixels_q16[i]; - } - - if (kd > h) - { - h = kd; - high_color_q16 = block_pixels_q16[i]; - } - } - - vec3F old_low_color_q16(low_color_q16), old_high_color_q16(high_color_q16); - for (uint32_t i = 0; i < 3; i++) - { - low_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 1.0f / 64.0f); - high_color_q16[i] = lerp(old_low_color_q16[i], old_high_color_q16[i], 63.0f / 64.0f); - } - - uint8_t trial_blk_endpoints[NUM_MODE11_ENDPOINTS]; - uint8_t trial_blk_weights[16]; - uint32_t trial_best_submode = 0; - - clear_obj(trial_blk_endpoints); - clear_obj(trial_blk_weights); - - double trial_blk_error = 1e+30f; - - bool did_improve = 
try_mode11(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode, - low_color_q16, high_color_q16, - block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, - first_submode, last_submode); - - // If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do. - if (!did_improve) - return cur_block_error; - - // Did the solution improve? - if (trial_blk_error < cur_block_error) - { - cur_block_error = trial_blk_error; - memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE11_ENDPOINTS); - memcpy(blk_weights, trial_blk_weights, num_pixels); - best_submode = trial_best_submode; - } - -#define USE_LEAST_SQUARES (1) -#if USE_LEAST_SQUARES - // least squares on the most promising trial weight indices found - const uint32_t NUM_LS_PASSES = 3; - - for (uint32_t pass = 0; pass < NUM_LS_PASSES; pass++) - { - vec3F l_q16, h_q16; - if (!compute_least_squares_endpoints_rgb(num_pixels, trial_blk_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16)) - break; - - bool was_improved = try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - l_q16, h_q16, - block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, - first_submode, last_submode); - - if (!was_improved) - break; - - // It's improved, so let's take the new weight indices. - memcpy(trial_blk_weights, blk_weights, num_pixels); - - } // pass -#endif - - if (uber_mode) - { - // Try varying the current best weight indices. This can be expanded/improved, but at potentially great cost. 
- - uint8_t temp_astc_weights[16]; - memcpy(temp_astc_weights, trial_blk_weights, num_pixels); - - uint32_t min_lin_sel = 256, max_lin_sel = 0; - for (uint32_t i = 0; i < num_pixels; i++) - { - const uint32_t astc_sel = temp_astc_weights[i]; - - const uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; - assert(lin_sel < num_weight_levels); - - min_lin_sel = minimumu(min_lin_sel, lin_sel); - max_lin_sel = maximumu(max_lin_sel, lin_sel); - } - - bool was_improved = false; - (void)was_improved; - - { - bool weights_changed = false; - uint8_t trial_weights[16]; - for (uint32_t i = 0; i < num_pixels; i++) - { - uint32_t astc_sel = temp_astc_weights[i]; - uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; - - if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1))) - { - lin_sel++; - weights_changed = true; - } - - trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel]; - } - - if (weights_changed) - { - vec3F l_q16, h_q16; - if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16)) - { - if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - l_q16, h_q16, - block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, - first_submode, last_submode)) - { - was_improved = true; - } - } - } - } - - { - bool weights_changed = false; - uint8_t trial_weights[16]; - for (uint32_t i = 0; i < num_pixels; i++) - { - uint32_t astc_sel = temp_astc_weights[i]; - uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; - - if ((lin_sel == max_lin_sel) && (lin_sel > 0)) - { - lin_sel--; - weights_changed = true; - } - - trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel]; - } - - if (weights_changed) - { - vec3F l_q16, h_q16; - if 
(compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16)) - { - if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - l_q16, h_q16, - block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, - first_submode, last_submode)) - { - was_improved = true; - } - } - } - } - - { - bool weights_changed = false; - uint8_t trial_weights[16]; - for (uint32_t i = 0; i < num_pixels; i++) - { - uint32_t astc_sel = temp_astc_weights[i]; - uint32_t lin_sel = g_map_astc_to_linear_order[ise_weight_range][astc_sel]; - - if ((lin_sel == max_lin_sel) && (lin_sel > 0)) - { - lin_sel--; - weights_changed = true; - } - else if ((lin_sel == min_lin_sel) && (lin_sel < (num_weight_levels - 1))) - { - lin_sel++; - weights_changed = true; - } - - trial_weights[i] = g_map_linear_to_astc_order[ise_weight_range][lin_sel]; - } - - if (weights_changed) - { - vec3F l_q16, h_q16; - if (compute_least_squares_endpoints_rgb(num_pixels, trial_weights, &g_astc_ls_weights_ise[ise_weight_range][0], &l_q16, &h_q16, block_pixels_q16, color_box_q16)) - { - if (try_mode11(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - l_q16, h_q16, - block_pixels_half, num_weight_levels, ise_weight_range, coptions, direct_only, ise_endpoint_range, constrain_ise_weight8_selectors, - first_submode, last_submode)) - { - was_improved = true; - } - } - } - } - } // uber_mode - - return cur_block_error; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static double encode_astc_hdr_block_mode_7( - uint32_t num_pixels, const vec4F* pBlock_pixels, - uint32_t ise_weight_range, - uint32_t& best_submode, - double cur_block_error, - uint8_t* blk_endpoints, //[4] - uint8_t* blk_weights, // [num_pixels] - const 
astc_hdr_codec_options& coptions, - uint32_t ise_endpoint_range) -{ - assert((num_pixels >= 1) && (num_pixels <= 16)); - assert((ise_weight_range >= 1) && (ise_weight_range <= 10)); - assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); - const uint32_t num_weight_levels = astc_helpers::get_ise_levels(ise_weight_range); - - const uint32_t MAX_WEIGHT_LEVELS = 24; - assert(num_weight_levels <= MAX_WEIGHT_LEVELS); - BASISU_NOTE_UNUSED(MAX_WEIGHT_LEVELS); - - best_submode = 0; - - half_float block_pixels_half[16][3]; - - vec4F block_pixels_q16[16]; - for (uint32_t i = 0; i < num_pixels; i++) - { - block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]); - block_pixels_q16[i][0] = (float)half_to_qlog16(block_pixels_half[i][0]); - - block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]); - block_pixels_q16[i][1] = (float)half_to_qlog16(block_pixels_half[i][1]); - - block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]); - block_pixels_q16[i][2] = (float)half_to_qlog16(block_pixels_half[i][2]); - - block_pixels_q16[i][3] = 0.0f; - } - - vec3F block_mean_color_q16(calc_mean(num_pixels, block_pixels_q16)); - - vec3F block_axis_q16(0.577350259f); - - aabb3F color_box_q16(cInitExpand); - - float l = 1e+30f, h = -1e+30f; - for (uint32_t i = 0; i < num_pixels; i++) - { - color_box_q16.expand(block_pixels_q16[i]); - - vec3F k(vec3F(block_pixels_q16[i]) - block_mean_color_q16); - float kd = k.dot(block_axis_q16); - - l = basisu::minimum(l, kd); - h = basisu::maximum(h, kd); - } - - vec3F low_color_q16(interp_color(block_mean_color_q16, block_axis_q16, l, color_box_q16, color_box_q16)); - vec3F high_color_q16(interp_color(block_mean_color_q16, block_axis_q16, h, color_box_q16, color_box_q16)); - - low_color_q16.clamp(0.0f, MAX_QLOG16_VAL); - high_color_q16.clamp(0.0f, MAX_QLOG16_VAL); - - vec3F diff(high_color_q16 
- low_color_q16); - float s_q16 = diff.dot(block_axis_q16) * block_axis_q16[0]; - - uint8_t trial_blk_endpoints[NUM_MODE7_ENDPOINTS]; - uint8_t trial_blk_weights[16]; - uint32_t trial_best_submode = 0; - - clear_obj(trial_blk_endpoints); - clear_obj(trial_blk_weights); - - double trial_blk_error = 1e+30f; - - bool did_improve = try_mode7(num_pixels, trial_blk_endpoints, trial_blk_weights, trial_blk_error, trial_best_submode, - high_color_q16, ceilf(s_q16), - block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range); - - // If we couldn't find ANY usable solution due to endpoint quantization, just return. There's nothing we can do. - if (!did_improve) - { - return cur_block_error; - } - - // Did the solution improve? - if (trial_blk_error < cur_block_error) - { - cur_block_error = trial_blk_error; - memcpy(blk_endpoints, trial_blk_endpoints, NUM_MODE7_ENDPOINTS); - memcpy(blk_weights, trial_blk_weights, num_pixels); - best_submode = trial_best_submode; - } - - const float one_over_num_pixels = 1.0f / (float)num_pixels; - - const uint32_t NUM_TRIALS = 2; - for (uint32_t trial = 0; trial < NUM_TRIALS; trial++) - { - // Given a set of selectors and S, try to compute a better high color - vec3F new_high_color_q16(block_mean_color_q16); - - int e[2][3]; - int cur_s = 0; - if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, &cur_s, ise_endpoint_range)) - break; - - cur_s <<= 4; - - for (uint32_t i = 0; i < num_pixels; i++) - { - uint32_t astc_sel = trial_blk_weights[i]; - float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f); - - float k = (float)cur_s * (1.0f - lerp) * one_over_num_pixels; - new_high_color_q16[0] += k; - new_high_color_q16[1] += k; - new_high_color_q16[2] += k; - } - - bool improved = try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - new_high_color_q16, (float)cur_s, - block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range); - - if 
(improved) - { - memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS); - memcpy(trial_blk_weights, blk_weights, num_pixels); - } - - // Given a set of selectors and a high color, try to compute a better S. - float t = 0.0f; - - for (uint32_t i = 0; i < num_pixels; i++) - { - uint32_t astc_sel = trial_blk_weights[i]; - float lerp = g_ise_weight_lerps[ise_weight_range][astc_sel + 1] * (1.0f / 64.0f); - - t += (1.0f) - lerp; - } - - t *= one_over_num_pixels; - - //int e[2][3]; - if (!decode_mode7_to_qlog12(trial_blk_endpoints, e, nullptr, ise_endpoint_range)) - break; - - vec3F cur_h_q16((float)(e[1][0] << 4), (float)(e[1][1] << 4), (float)(e[1][2] << 4)); - - if (fabs(t) > .0000125f) - { - float s_r = (cur_h_q16[0] - block_mean_color_q16[0]) / t; - float s_g = (cur_h_q16[1] - block_mean_color_q16[1]) / t; - float s_b = (cur_h_q16[2] - block_mean_color_q16[2]) / t; - - // TODO: gather statistics on these - if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - cur_h_q16, ceilf(s_r), - block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range)) - { - improved = true; - } - - if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - cur_h_q16, ceilf(s_g), - block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range)) - { - improved = true; - } - - if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - cur_h_q16, ceilf(s_b), - block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range)) - { - improved = true; - } - - if (try_mode7(num_pixels, blk_endpoints, blk_weights, cur_block_error, best_submode, - cur_h_q16, ceilf((s_r + s_g + s_b) / 3.0f), - block_pixels_half, num_weight_levels, ise_weight_range, coptions, ise_endpoint_range)) - { - improved = true; - } - } - - if (!improved) - break; - - memcpy(trial_blk_endpoints, blk_endpoints, NUM_MODE7_ENDPOINTS); - memcpy(trial_blk_weights, 
blk_weights, num_pixels); - - } // trial - - return cur_block_error; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static bool pack_solid(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const astc_hdr_codec_options& coptions) -{ - float r = 0.0f, g = 0.0f, b = 0.0f; - - const float LOG_BIAS = .125f; - - bool solid_block = true; - for (uint32_t i = 0; i < 16; i++) - { - if ((pBlock_linear_colors[0][0] != pBlock_linear_colors[i][0]) || - (pBlock_linear_colors[0][1] != pBlock_linear_colors[i][1]) || - (pBlock_linear_colors[0][2] != pBlock_linear_colors[i][2])) - { - solid_block = false; - } - - r += log2f(pBlock_linear_colors[i][0] + LOG_BIAS); - g += log2f(pBlock_linear_colors[i][1] + LOG_BIAS); - b += log2f(pBlock_linear_colors[i][2] + LOG_BIAS); - } - - if (solid_block) - { - r = pBlock_linear_colors[0][0]; - g = pBlock_linear_colors[0][1]; - b = pBlock_linear_colors[0][2]; - } - else - { - r = maximum(0.0f, powf(2.0f, r * (1.0f / 16.0f)) - LOG_BIAS); - g = maximum(0.0f, powf(2.0f, g * (1.0f / 16.0f)) - LOG_BIAS); - b = maximum(0.0f, powf(2.0f, b * (1.0f / 16.0f)) - LOG_BIAS); - - // for safety - r = minimum(r, MAX_HALF_FLOAT); - g = minimum(g, MAX_HALF_FLOAT); - b = minimum(b, MAX_HALF_FLOAT); - } - - half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b), ah = float_to_half_non_neg_no_nan_inf(1.0f); - - astc_hdr_pack_results results; - results.clear(); - - uint8_t* packed_blk = (uint8_t*)&results.m_solid_blk; - results.m_is_solid = true; - - packed_blk[0] = 0b11111100; - packed_blk[1] = 255; - packed_blk[2] = 255; - packed_blk[3] = 255; - packed_blk[4] = 255; - packed_blk[5] = 255; - packed_blk[6] = 255; - packed_blk[7] = 255; - - packed_blk[8] = (uint8_t)rh; - packed_blk[9] = (uint8_t)(rh >> 8); - packed_blk[10] = (uint8_t)gh; - packed_blk[11] = (uint8_t)(gh >> 8); - 
packed_blk[12] = (uint8_t)bh; - packed_blk[13] = (uint8_t)(bh >> 8); - packed_blk[14] = (uint8_t)ah; - packed_blk[15] = (uint8_t)(ah >> 8); - - results.m_best_block_error = 0; - - if (!solid_block) - { - const float R_WEIGHT = coptions.m_r_err_scale; - const float G_WEIGHT = coptions.m_g_err_scale; - - // This MUST match how errors are computed in eval_selectors(). - for (uint32_t i = 0; i < 16; i++) - { - half_float dr = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]), dg = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]), db = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]); - double rd = q(rh) - q(dr); - double gd = q(gh) - q(dg); - double bd = q(bh) - q(db); - - double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; - - results.m_best_block_error += e; - } - } - - const half_float hc[3] = { rh, gh, bh }; - - bc6h_enc_block_solid_color(&results.m_bc6h_block, hc); - - all_results.push_back(results); - - return solid_block; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static void pack_mode11( - const vec4F* pBlock_linear_colors, - basisu::vector& all_results, - const astc_hdr_codec_options& coptions, - uint32_t first_weight_ise_range, uint32_t last_weight_ise_range, bool constrain_ise_weight8_selectors) -{ - uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16]; - uint32_t trial_submode11 = 0; - - clear_obj(trial_endpoints); - clear_obj(trial_weights); - - for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++) - { - const bool direct_only = coptions.m_mode11_direct_only; - - uint32_t endpoint_ise_range = astc_helpers::BISE_256_LEVELS; - if (weight_ise_range == astc_helpers::BISE_16_LEVELS) - endpoint_ise_range = astc_helpers::BISE_192_LEVELS; - else - { - assert(weight_ise_range < astc_helpers::BISE_16_LEVELS); - } - - double trial_error = 
encode_astc_hdr_block_mode_11(16, pBlock_linear_colors, weight_ise_range, trial_submode11, 1e+30f, trial_endpoints, trial_weights, coptions, direct_only, - endpoint_ise_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, constrain_ise_weight8_selectors, coptions.m_first_mode11_submode, coptions.m_last_mode11_submode); - - if (trial_error < 1e+30f) - { - astc_hdr_pack_results results; - results.clear(); - - results.m_best_block_error = trial_error; - - results.m_best_submodes[0] = trial_submode11; - results.m_constrained_weights = constrain_ise_weight8_selectors; - - results.m_best_blk.m_num_partitions = 1; - results.m_best_blk.m_color_endpoint_modes[0] = 11; - results.m_best_blk.m_weight_ise_range = weight_ise_range; - results.m_best_blk.m_endpoint_ise_range = endpoint_ise_range; - - memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS); - memcpy(results.m_best_blk.m_weights, trial_weights, 16); - -#ifdef _DEBUG - { - half_float block_pixels_half[16][3]; - - vec4F block_pixels_q16[16]; - for (uint32_t i = 0; i < 16; i++) - { - block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]); - block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]); - block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]); - } - - half_float unpacked_astc_blk_rgba[4][4][4]; - bool res = astc_helpers::decode_block(results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); - assert(res); - - half_float unpacked_astc_blk_rgb[4][4][3]; - for (uint32_t y = 0; y < 4; y++) - for (uint32_t x = 0; x < 4; x++) - for (uint32_t c = 0; c < 3; c++) - unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c]; - - double cmp_err = compute_block_error(&block_pixels_half[0][0], &unpacked_astc_blk_rgb[0][0][0], coptions); - assert(results.m_best_block_error == cmp_err); - } -#endif - - // 
transcode to BC6H - assert(results.m_best_blk.m_color_endpoint_modes[0] == 11); - - // Get qlog12 endpoints - int e[2][3]; - bool success = decode_mode11_to_qlog12(results.m_best_blk.m_endpoints, e, results.m_best_blk.m_endpoint_ise_range); - assert(success); - BASISU_NOTE_UNUSED(success); - - // Transform endpoints to half float - half_float h_e[3][2] = - { - { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) }, - { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) }, - { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) } - }; - - // Transcode to bc6h - success = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block); - assert(success); - - all_results.push_back(results); - } - } -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static void pack_mode7_single_part(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const astc_hdr_codec_options& coptions) -{ - uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16]; - uint32_t trial_submode7 = 0; - - clear_obj(trial_endpoints); - clear_obj(trial_weights); - - for (uint32_t weight_ise_range = coptions.m_first_mode7_part1_weight_ise_range; weight_ise_range <= coptions.m_last_mode7_part1_weight_ise_range; weight_ise_range++) - { - const uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS; - - double trial_error = encode_astc_hdr_block_mode_7(16, pBlock_linear_colors, weight_ise_range, trial_submode7, 1e+30f, trial_endpoints, trial_weights, coptions, ise_endpoint_range); - - if (trial_error < 1e+30f) - { - astc_hdr_pack_results results; - results.clear(); - - results.m_best_block_error = trial_error; - - results.m_best_submodes[0] = trial_submode7; - - results.m_best_blk.m_num_partitions = 1; - results.m_best_blk.m_color_endpoint_modes[0] = 7; - results.m_best_blk.m_weight_ise_range = weight_ise_range; - results.m_best_blk.m_endpoint_ise_range = ise_endpoint_range; - - 
memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS); - memcpy(results.m_best_blk.m_weights, trial_weights, 16); - - // transcode to BC6H - assert(results.m_best_blk.m_color_endpoint_modes[0] == 7); - - // Get qlog12 endpoints - int e[2][3]; - if (!decode_mode7_to_qlog12(results.m_best_blk.m_endpoints, e, nullptr, results.m_best_blk.m_endpoint_ise_range)) - continue; - - // Transform endpoints to half float - half_float h_e[3][2] = - { - { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) }, - { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) }, - { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) } - }; - - // Transcode to bc6h - bool status = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block); - assert(status); - (void)status; - - all_results.push_back(results); - } - } -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static bool estimate_partition2(const vec4F* pBlock_pixels, int* pBest_parts, uint32_t num_best_parts) -{ - assert(num_best_parts <= basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); - - vec3F training_vecs[16], mean(0.0f); - - for (uint32_t i = 0; i < 16; i++) - { - vec3F& v = training_vecs[i]; - - v[0] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][0]); - v[1] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][1]); - v[2] = (float)float_to_half_non_neg_no_nan_inf(pBlock_pixels[i][2]); - - mean += v; - } - mean *= (1.0f / 16.0f); - - vec3F cluster_centroids[2] = { mean - vec3F(.1f), mean + vec3F(.1f) }; - - uint32_t cluster_pixels[2][16]; - uint32_t num_cluster_pixels[2]; - vec3F new_cluster_means[2]; - - for (uint32_t s = 0; s < 4; s++) - { - num_cluster_pixels[0] = 0; - num_cluster_pixels[1] = 0; - - new_cluster_means[0].clear(); - new_cluster_means[1].clear(); - - for (uint32_t i = 0; i < 16; i++) - { - float d0 = training_vecs[i].squared_distance(cluster_centroids[0]); - float d1 = 
training_vecs[i].squared_distance(cluster_centroids[1]); - - if (d0 < d1) - { - cluster_pixels[0][num_cluster_pixels[0]] = i; - new_cluster_means[0] += training_vecs[i]; - num_cluster_pixels[0]++; - } - else - { - cluster_pixels[1][num_cluster_pixels[1]] = i; - new_cluster_means[1] += training_vecs[i]; - num_cluster_pixels[1]++; - } - } - - if (!num_cluster_pixels[0] || !num_cluster_pixels[1]) - return false; - - cluster_centroids[0] = new_cluster_means[0] / (float)num_cluster_pixels[0]; - cluster_centroids[1] = new_cluster_means[1] / (float)num_cluster_pixels[1]; - } - - int desired_parts[4][4]; // [y][x] - for (uint32_t p = 0; p < 2; p++) - { - for (uint32_t i = 0; i < num_cluster_pixels[p]; i++) - { - const uint32_t pix_index = cluster_pixels[p][i]; - - desired_parts[pix_index >> 2][pix_index & 3] = p; - } - } - - uint32_t part_similarity[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2]; - - for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; part_index++) - { - const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; - - int total_sim_non_inv = 0; - int total_sim_inv = 0; - - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - int part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; - - if (part == desired_parts[y][x]) - total_sim_non_inv++; - - if ((part ^ 1) == desired_parts[y][x]) - total_sim_inv++; - } - } - - int total_sim = maximum(total_sim_non_inv, total_sim_inv); - - part_similarity[part_index] = (total_sim << 8) | part_index; - - } // part_index; - - std::sort(part_similarity, part_similarity + basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); - - for (uint32_t i = 0; i < num_best_parts; i++) - pBest_parts[i] = part_similarity[(basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 - 1) - i] & 0xFF; - - return true; -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static void 
pack_mode7_2part(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const astc_hdr_codec_options& coptions, - int num_estimated_partitions, const int *pEstimated_partitions, - uint32_t first_weight_ise_range, uint32_t last_weight_ise_range) -{ - assert(coptions.m_mode7_part2_part_masks); - - astc_helpers::log_astc_block trial_blk; - clear_obj(trial_blk); - trial_blk.m_grid_width = 4; - trial_blk.m_grid_height = 4; - - trial_blk.m_num_partitions = 2; - trial_blk.m_color_endpoint_modes[0] = 7; - trial_blk.m_color_endpoint_modes[1] = 7; - - uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; - - if (num_estimated_partitions) - { - first_part_index = 0; - last_part_index = num_estimated_partitions; - } - - for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter) - { - uint32_t part_index; - if (num_estimated_partitions) - { - part_index = pEstimated_partitions[part_index_iter]; - assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); - } - else - { - part_index = part_index_iter; - if (((1U << part_index) & coptions.m_mode7_part2_part_masks) == 0) - continue; - } - - const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc; - const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; - const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert; - - vec4F part_pixels[2][16]; - uint32_t pixel_part_index[4][4]; // [y][x] - uint32_t num_part_pixels[2] = { 0, 0 }; - - // Extract each subset's texels for this partition pattern - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; - if (invert_flag) - part = 1 - part; - - pixel_part_index[y][x] = part; - part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4]; - - num_part_pixels[part]++; - } - } - - 
trial_blk.m_partition_id = astc_pattern; - - for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++) - { - assert(weight_ise_range <= astc_helpers::BISE_8_LEVELS); - - uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS; - if (weight_ise_range == astc_helpers::BISE_5_LEVELS) - ise_endpoint_range = astc_helpers::BISE_192_LEVELS; - else if (weight_ise_range == astc_helpers::BISE_6_LEVELS) - ise_endpoint_range = astc_helpers::BISE_128_LEVELS; - else if (weight_ise_range == astc_helpers::BISE_8_LEVELS) - ise_endpoint_range = astc_helpers::BISE_80_LEVELS; - - uint8_t trial_endpoints[2][NUM_MODE7_ENDPOINTS], trial_weights[2][16]; - uint32_t trial_submode7[2]; - - clear_obj(trial_endpoints); - clear_obj(trial_weights); - clear_obj(trial_submode7); - - double total_trial_err = 0; - for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) - { - total_trial_err += encode_astc_hdr_block_mode_7( - num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0], - weight_ise_range, trial_submode7[pack_part_index], 1e+30f, - &trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, ise_endpoint_range); - - } // pack_part_index - - if (total_trial_err < 1e+30f) - { - trial_blk.m_weight_ise_range = weight_ise_range; - trial_blk.m_endpoint_ise_range = ise_endpoint_range; - - for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) - memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE7_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE7_ENDPOINTS); - - uint32_t src_pixel_index[2] = { 0, 0 }; - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - uint32_t p = pixel_part_index[y][x]; - trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++]; - } - } - - astc_hdr_pack_results results; - results.clear(); - - results.m_best_block_error = total_trial_err; - results.m_best_submodes[0] = 
trial_submode7[0]; - results.m_best_submodes[1] = trial_submode7[1]; - results.m_best_pat_index = part_index; - - results.m_best_blk = trial_blk; - - bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block); - assert(status); - BASISU_NOTE_UNUSED(status); - - all_results.push_back(results); - } - - } // weight_ise_range - - } // part_index -} - -//-------------------------------------------------------------------------------------------------------------------------- - -static void pack_mode11_2part(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const astc_hdr_codec_options& coptions, - int num_estimated_partitions, const int* pEstimated_partitions) -{ - assert(coptions.m_mode11_part2_part_masks); - - astc_helpers::log_astc_block trial_blk; - clear_obj(trial_blk); - trial_blk.m_grid_width = 4; - trial_blk.m_grid_height = 4; - - trial_blk.m_num_partitions = 2; - trial_blk.m_color_endpoint_modes[0] = 11; - trial_blk.m_color_endpoint_modes[1] = 11; - - uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; - - if (num_estimated_partitions) - { - first_part_index = 0; - last_part_index = num_estimated_partitions; - } - - for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter) - { - uint32_t part_index; - if (num_estimated_partitions) - { - part_index = pEstimated_partitions[part_index_iter]; - assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); - } - else - { - part_index = part_index_iter; - if (((1U << part_index) & coptions.m_mode11_part2_part_masks) == 0) - continue; - } - - const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc; - const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; - const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert; - - vec4F part_pixels[2][16]; - uint32_t pixel_part_index[4][4]; // [y][x] - 
uint32_t num_part_pixels[2] = { 0, 0 }; - - // Extract each subset's texels for this partition pattern - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; - if (invert_flag) - part = 1 - part; - - pixel_part_index[y][x] = part; - part_pixels[part][num_part_pixels[part]] = pBlock_linear_colors[x + y * 4]; - - num_part_pixels[part]++; - } - } - - trial_blk.m_partition_id = astc_pattern; - - for (uint32_t weight_ise_range = coptions.m_first_mode11_part2_weight_ise_range; weight_ise_range <= coptions.m_last_mode11_part2_weight_ise_range; weight_ise_range++) - { - bool direct_only = false; - uint32_t ise_endpoint_range = astc_helpers::BISE_64_LEVELS; - if (weight_ise_range == astc_helpers::BISE_4_LEVELS) - ise_endpoint_range = astc_helpers::BISE_40_LEVELS; - - uint8_t trial_endpoints[2][NUM_MODE11_ENDPOINTS], trial_weights[2][16]; - uint32_t trial_submode11[2]; - - clear_obj(trial_endpoints); - clear_obj(trial_weights); - clear_obj(trial_submode11); - - double total_trial_err = 0; - for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) - { - total_trial_err += encode_astc_hdr_block_mode_11( - num_part_pixels[pack_part_index], &part_pixels[pack_part_index][0], - weight_ise_range, trial_submode11[pack_part_index], 1e+30f, - &trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, - direct_only, ise_endpoint_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, false, - coptions.m_first_mode11_submode, coptions.m_last_mode11_submode); - - } // pack_part_index - - if (total_trial_err < 1e+30f) - { - trial_blk.m_weight_ise_range = weight_ise_range; - trial_blk.m_endpoint_ise_range = ise_endpoint_range; - - for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) - memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE11_ENDPOINTS], 
&trial_endpoints[pack_part_index][0], NUM_MODE11_ENDPOINTS); - - uint32_t src_pixel_index[2] = { 0, 0 }; - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - uint32_t p = pixel_part_index[y][x]; - trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++]; - } - } - - astc_hdr_pack_results results; - results.clear(); - - results.m_best_block_error = total_trial_err; - results.m_best_submodes[0] = trial_submode11[0]; - results.m_best_submodes[1] = trial_submode11[1]; - results.m_best_pat_index = part_index; - - results.m_best_blk = trial_blk; - - bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block); - assert(status); - BASISU_NOTE_UNUSED(status); - - all_results.push_back(results); - } - - } // weight_ise_range - - } // part_index -} - -//-------------------------------------------------------------------------------------------------------------------------- - -bool g_astc_hdr_enc_initialized; - -void astc_hdr_enc_init() -{ - if (g_astc_hdr_enc_initialized) - return; - - astc_hdr_core_init(); - - astc_helpers::init_tables(true); - - init_qlog_tables(); - - encode_astc_hdr_init(); - - g_astc_hdr_enc_initialized = true; -} - -bool astc_hdr_enc_block( - const float* pRGBPixels, - const astc_hdr_codec_options& coptions, - basisu::vector& all_results) -{ - assert(g_astc_hdr_enc_initialized); - if (!g_astc_hdr_enc_initialized) - { - // astc_hdr_enc_init() MUST be called first. - assert(0); - return false; - } - - all_results.resize(0); - - vec4F block_linear_colors[16]; - - // Sanity check the input block. - for (uint32_t i = 0; i < 16; i++) - { - for (uint32_t j = 0; j < 3; j++) - { - float v = pRGBPixels[i * 3 + j]; - - if (std::isinf(v) || std::isnan(v)) - { - // Input pixels cannot be NaN or +-Inf. - assert(0); - return false; - } - - if (v < 0.0f) - { - // Input pixels cannot be signed. - assert(0); - return false; - } - - if (v > MAX_HALF_FLOAT) - { - // Too large for half float. 
- assert(0); - return false; - } - - block_linear_colors[i][j] = v; - } - - block_linear_colors[i][3] = 1.0f; - } - - assert(coptions.m_use_solid || coptions.m_use_mode11 || coptions.m_use_mode7_part2 || coptions.m_use_mode7_part1 || coptions.m_use_mode11_part2); - - bool is_solid = false; - if (coptions.m_use_solid) - is_solid = pack_solid(block_linear_colors, all_results, coptions); - - if (!is_solid) - { - if (coptions.m_use_mode11) - { - const size_t cur_num_results = all_results.size(); - - pack_mode11(block_linear_colors, all_results, coptions, coptions.m_first_mode11_weight_ise_range, coptions.m_last_mode11_weight_ise_range, false); - - if (coptions.m_last_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS) - { - pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_16_LEVELS, true); - } - - // If we couldn't get any mode 11 results at all, and we were restricted to just trying weight ISE range 8 (which required endpoint quantization) then - // fall back to weight ISE range 7 (which doesn't need any endpoint quantization). - // This is to guarantee we always get at least 1 non-solid result. - if (all_results.size() == cur_num_results) - { - if (coptions.m_first_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS) - { - pack_mode11(block_linear_colors, all_results, coptions, astc_helpers::BISE_12_LEVELS, astc_helpers::BISE_12_LEVELS, false); - } - } - } - - if (coptions.m_use_mode7_part1) - { - // Mode 7 1-subset never requires endpoint quantization, so it cannot fail to find at least one usable solution. 
- pack_mode7_single_part(block_linear_colors, all_results, coptions); - } - - bool have_est = false; - int best_parts[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2]; - - if ((coptions.m_use_mode7_part2) || (coptions.m_use_mode11_part2)) - { - if (coptions.m_use_estimated_partitions) - have_est = estimate_partition2(block_linear_colors, best_parts, coptions.m_max_estimated_partitions); - } - - if (coptions.m_use_mode7_part2) - { - const size_t cur_num_results = all_results.size(); - - pack_mode7_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, - coptions.m_first_mode7_part2_weight_ise_range, coptions.m_last_mode7_part2_weight_ise_range); - - // If we couldn't find any packable 2-subset mode 7 results at weight levels >= 5 levels (which always requires endpoint quant), then try falling back to - // 5 levels which doesn't require endpoint quantization. - if (all_results.size() == cur_num_results) - { - if (coptions.m_first_mode7_part2_weight_ise_range >= astc_helpers::BISE_5_LEVELS) - { - pack_mode7_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, - astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_4_LEVELS); - } - } - } - - if (coptions.m_use_mode11_part2) - { - // This always requires endpoint quant, so it could fail to find any usable solutions. - pack_mode11_2part(block_linear_colors, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts); - } - } - - if (coptions.m_refine_weights) - { - // TODO: Move this above, do it once only. 
- basist::half_float rgb_pixels_half[16 * 3]; - for (uint32_t i = 0; i < 16; i++) - { - rgb_pixels_half[i * 3 + 0] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 0]); - rgb_pixels_half[i * 3 + 1] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 1]); - rgb_pixels_half[i * 3 + 2] = float_to_half_non_neg_no_nan_inf(pRGBPixels[i * 3 + 2]); - } - - for (uint32_t i = 0; i < all_results.size(); i++) - { - bool status = astc_hdr_refine_weights(rgb_pixels_half, all_results[i], coptions, coptions.m_bc6h_err_weight, &all_results[i].m_improved_via_refinement_flag); - assert(status); - BASISU_NOTE_UNUSED(status); - } - } - - return true; -} - -bool astc_hdr_pack_results_to_block(astc_blk& dst_blk, const astc_hdr_pack_results& results) -{ - assert(g_astc_hdr_enc_initialized); - if (!g_astc_hdr_enc_initialized) - return false; - - if (results.m_is_solid) - { - memcpy(&dst_blk, &results.m_solid_blk, sizeof(results.m_solid_blk)); - } - else - { - bool status = astc_helpers::pack_astc_block((astc_helpers::astc_block&)dst_blk, results.m_best_blk); - if (!status) - { - assert(0); - return false; - } - } - - return true; -} - -// Refines a block's chosen weight indices, balancing BC6H and ASTC HDR error. 
-bool astc_hdr_refine_weights(const half_float *pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool *pImproved_flag) -{ - if (pImproved_flag) - *pImproved_flag = false; - - if (cur_results.m_is_solid) - return true; - - const uint32_t total_weights = astc_helpers::get_ise_levels(cur_results.m_best_blk.m_weight_ise_range); - - assert((total_weights >= 3) && (total_weights <= 16)); - - double best_err[4][4]; - uint8_t best_weight[4][4]; - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - best_err[y][x] = 1e+30f; - best_weight[y][x] = 0; - } - } - - astc_hdr_pack_results temp_results; - - const float c_weights[3] = { coptions.m_r_err_scale, coptions.m_g_err_scale, 1.0f }; - - for (uint32_t weight_index = 0; weight_index < total_weights; weight_index++) - { - temp_results = cur_results; - for (uint32_t i = 0; i < 16; i++) - temp_results.m_best_blk.m_weights[i] = (uint8_t)weight_index; - - half_float unpacked_astc_blk_rgba[4][4][4]; - bool res = astc_helpers::decode_block(temp_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); - assert(res); - - basist::bc6h_block trial_bc6h_blk; - res = basist::astc_hdr_transcode_to_bc6h(temp_results.m_best_blk, trial_bc6h_blk); - assert(res); - - half_float unpacked_bc6h_blk[4][4][3]; - res = unpack_bc6h(&trial_bc6h_blk, unpacked_bc6h_blk, false); - assert(res); - BASISU_NOTE_UNUSED(res); - - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - double total_err = 0.0f; - - for (uint32_t c = 0; c < 3; c++) - { - const half_float orig_c = pSource_block[(x + y * 4) * 3 + c]; - const double orig_c_q = q(orig_c); - - const half_float astc_c = unpacked_astc_blk_rgba[y][x][c]; - const double astc_c_q = q(astc_c); - const double astc_e = square(astc_c_q - orig_c_q) * c_weights[c]; - - const half_float bc6h_c = unpacked_bc6h_blk[y][x][c]; - const double bc6h_c_q = q(bc6h_c); - const double bc6h_e = 
square(bc6h_c_q - orig_c_q) * c_weights[c]; - - const double overall_err = astc_e * (1.0f - bc6h_weight) + bc6h_e * bc6h_weight; - - total_err += overall_err; - - } // c - - if (total_err < best_err[y][x]) - { - best_err[y][x] = total_err; - best_weight[y][x] = (uint8_t)weight_index; - } - - } // x - } // y - - } // weight_index - - bool any_changed = false; - for (uint32_t i = 0; i < 16; i++) - { - if (cur_results.m_best_blk.m_weights[i] != best_weight[i >> 2][i & 3]) - { - any_changed = true; - break; - } - } - - if (any_changed) - { - memcpy(cur_results.m_best_blk.m_weights, best_weight, 16); - - { - bool res = basist::astc_hdr_transcode_to_bc6h(cur_results.m_best_blk, cur_results.m_bc6h_block); - assert(res); - BASISU_NOTE_UNUSED(res); - - half_float unpacked_astc_blk_rgba[4][4][4]; - res = astc_helpers::decode_block(cur_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); - assert(res); - - half_float unpacked_astc_blk_rgb[4][4][3]; - for (uint32_t y = 0; y < 4; y++) - for (uint32_t x = 0; x < 4; x++) - for (uint32_t c = 0; c < 3; c++) - unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c]; - - cur_results.m_best_block_error = compute_block_error(pSource_block, &unpacked_astc_blk_rgb[0][0][0], coptions); - } - - if (pImproved_flag) - *pImproved_flag = true; - } - - return true; -} - -void astc_hdr_block_stats::update(const astc_hdr_pack_results& log_blk) -{ - std::lock_guard lck(m_mutex); - - m_total_blocks++; - - if (log_blk.m_improved_via_refinement_flag) - m_total_refined++; - - if (log_blk.m_is_solid) - { - m_total_solid++; - } - else - { - int best_weight_range = log_blk.m_best_blk.m_weight_ise_range; - - if (log_blk.m_best_blk.m_color_endpoint_modes[0] == 7) - { - m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 6U)]++; - - if (log_blk.m_best_blk.m_num_partitions == 2) - { - m_total_mode7_2part++; - - m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 6U)]++; - m_total_2part++; - - 
m_weight_range_hist_7_2part[bounds_check(best_weight_range, 0, 11)]++; - - m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++; - } - else - { - m_total_mode7_1part++; - - m_weight_range_hist_7[bounds_check(best_weight_range, 0, 11)]++; - } - } - else - { - m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 9U)]++; - if (log_blk.m_constrained_weights) - m_total_mode11_1part_constrained_weights++; - - if (log_blk.m_best_blk.m_num_partitions == 2) - { - m_total_mode11_2part++; - - m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 9U)]++; - m_total_2part++; - - m_weight_range_hist_11_2part[bounds_check(best_weight_range, 0, 11)]++; - - m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++; - } - else - { - m_total_mode11_1part++; - - m_weight_range_hist_11[bounds_check(best_weight_range, 0, 11)]++; - } - } - } -} - -void astc_hdr_block_stats::print() -{ - std::lock_guard lck(m_mutex); - - assert(m_total_blocks); - if (!m_total_blocks) - return; - - printf("\nLow-level ASTC Encoder Statistics:\n"); - printf("Total blocks: %u\n", m_total_blocks); - printf("Total solid: %u %3.2f%%\n", m_total_solid, (m_total_solid * 100.0f) / m_total_blocks); - printf("Total refined: %u %3.2f%%\n", m_total_refined, (m_total_refined * 100.0f) / m_total_blocks); - - printf("Total mode 11, 1 partition: %u %3.2f%%\n", m_total_mode11_1part, (m_total_mode11_1part * 100.0f) / m_total_blocks); - printf("Total mode 11, 1 partition, constrained weights: %u %3.2f%%\n", m_total_mode11_1part_constrained_weights, (m_total_mode11_1part_constrained_weights * 100.0f) / m_total_blocks); - printf("Total mode 11, 2 partition: %u %3.2f%%\n", m_total_mode11_2part, (m_total_mode11_2part * 100.0f) / m_total_blocks); - - printf("Total mode 7, 1 partition: %u %3.2f%%\n", m_total_mode7_1part, (m_total_mode7_1part * 100.0f) / m_total_blocks); - printf("Total mode 7, 2 partition: %u %3.2f%%\n", m_total_mode7_2part, (m_total_mode7_2part * 100.0f) / m_total_blocks); 
- - printf("Total 2 partitions: %u %3.2f%%\n", m_total_2part, (m_total_2part * 100.0f) / m_total_blocks); - printf("\n"); - - printf("ISE texel weight range histogram mode 11:\n"); - for (uint32_t i = 1; i <= MODE11_LAST_ISE_RANGE; i++) - printf("%u %u\n", i, m_weight_range_hist_11[i]); - printf("\n"); - - printf("ISE texel weight range histogram mode 11, 2 partition:\n"); - for (uint32_t i = 1; i <= MODE11_PART2_LAST_ISE_RANGE; i++) - printf("%u %u\n", i, m_weight_range_hist_11_2part[i]); - printf("\n"); - - printf("ISE texel weight range histogram mode 7:\n"); - for (uint32_t i = 1; i <= MODE7_PART1_LAST_ISE_RANGE; i++) - printf("%u %u\n", i, m_weight_range_hist_7[i]); - printf("\n"); - - printf("ISE texel weight range histogram mode 7, 2 partition:\n"); - for (uint32_t i = 1; i <= MODE7_PART2_LAST_ISE_RANGE; i++) - printf("%u %u\n", i, m_weight_range_hist_7_2part[i]); - printf("\n"); - - printf("Mode 11 submode histogram:\n"); - for (uint32_t i = 0; i <= MODE11_TOTAL_SUBMODES; i++) // +1 because of the extra direct encoding - printf("%u %u\n", i, m_mode11_submode_hist[i]); - printf("\n"); - - printf("Mode 7 submode histogram:\n"); - for (uint32_t i = 0; i < MODE7_TOTAL_SUBMODES; i++) - printf("%u %u\n", i, m_mode7_submode_hist[i]); - printf("\n"); - - printf("Partition pattern table usage histogram:\n"); - for (uint32_t i = 0; i < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2; i++) - printf("%u:%u ", i, m_part_hist[i]); - printf("\n\n"); -} - -} // namespace basisu - diff --git a/thirdparty/basis_universal/encoder/basisu_comp.cpp b/thirdparty/basis_universal/encoder/basisu_comp.cpp index 81813257cd24..e9aa20f313bc 100644 --- a/thirdparty/basis_universal/encoder/basisu_comp.cpp +++ b/thirdparty/basis_universal/encoder/basisu_comp.cpp @@ -47,15 +47,18 @@ using namespace buminiz; #define BASISU_USE_STB_IMAGE_RESIZE_FOR_MIPMAP_GEN 0 #define DEBUG_CROP_TEXTURE_TO_64x64 (0) #define DEBUG_RESIZE_TEXTURE (0) -#define DEBUG_EXTRACT_SINGLE_BLOCK (0) namespace basisu { 
basis_compressor::basis_compressor() : m_pOpenCL_context(nullptr), + m_fmt_mode(basist::basis_tex_format::cETC1S), m_basis_file_size(0), m_basis_bits_per_texel(0.0f), m_total_blocks(0), + m_hdr_image_scale(1.0f), + m_ldr_to_hdr_upconversion_nit_multiplier(1.0f), + m_upconverted_any_ldr_images(false), m_any_source_image_has_alpha(false), m_opencl_failed(false) { @@ -107,7 +110,7 @@ namespace basisu { if (m_params.m_source_alpha_filenames.size()) { - debug_printf("Warning: Alpha channel image filenames are not supported in UASTC HDR mode.\n"); + debug_printf("Warning: Alpha channel image filenames are not yet supported in UASTC HDR/ASTC HDR modes.\n"); m_params.m_source_alpha_filenames.clear(); } } @@ -210,21 +213,25 @@ namespace basisu if ((m_params.m_compute_stats) && (!m_params.m_validate_output_data)) m_params.m_validate_output_data = true; + m_hdr_image_scale = 1.0f; + m_ldr_to_hdr_upconversion_nit_multiplier = 1.0f; + m_upconverted_any_ldr_images = false; + check_for_hdr_inputs(); if (m_params.m_debug) { debug_printf("basis_compressor::init:\n"); -#define PRINT_BOOL_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); -#define PRINT_INT_VALUE(v) debug_printf("%s: %i %u\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); -#define PRINT_UINT_VALUE(v) debug_printf("%s: %u %u\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); -#define PRINT_FLOAT_VALUE(v) debug_printf("%s: %f %u\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); +#define PRINT_BOOL_VALUE(v) fmt_debug_printf("{}: {} {}\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); +#define PRINT_INT_VALUE(v) fmt_debug_printf("{}: {} {}\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); +#define PRINT_UINT_VALUE(v) fmt_debug_printf("{}: {} {}\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); 
+#define PRINT_FLOAT_VALUE(v) fmt_debug_printf("{}: {} {}\n", BASISU_STRINGIZE2(v), static_cast(m_params.v), m_params.v.was_changed()); - debug_printf("Source LDR images: %u, HDR images: %u, filenames: %u, alpha filenames: %i, LDR mipmap images: %u, HDR mipmap images: %u\n", - m_params.m_source_images.size(), m_params.m_source_images_hdr.size(), - m_params.m_source_filenames.size(), m_params.m_source_alpha_filenames.size(), - m_params.m_source_mipmap_images.size(), m_params.m_source_mipmap_images_hdr.size()); + fmt_debug_printf("Source LDR images: {}, HDR images: {}, filenames: {}, alpha filenames: {}, LDR mipmap images: {}, HDR mipmap images: {}\n", + (uint64_t)m_params.m_source_images.size(), (uint64_t)m_params.m_source_images_hdr.size(), + (uint64_t)m_params.m_source_filenames.size(), (uint64_t)m_params.m_source_alpha_filenames.size(), + (uint64_t)m_params.m_source_mipmap_images.size(), (uint64_t)m_params.m_source_mipmap_images_hdr.size()); if (m_params.m_source_mipmap_images.size()) { @@ -243,6 +250,29 @@ namespace basisu } PRINT_BOOL_VALUE(m_hdr); + + switch (m_params.m_hdr_mode) + { + case hdr_modes::cUASTC_HDR_4X4: + { + fmt_debug_printf("m_hdr_mode: cUASTC_HDR_4X4\n"); + break; + } + case hdr_modes::cASTC_HDR_6X6: + { + fmt_debug_printf("m_hdr_mode: cASTC_HDR_6X6\n"); + break; + } + case hdr_modes::cASTC_HDR_6X6_INTERMEDIATE: + { + fmt_debug_printf("m_hdr_mode: cASTC_HDR_6X6_INTERMEDIATE\n"); + break; + } + default: + assert(false); + return false; + } + PRINT_BOOL_VALUE(m_uastc); PRINT_BOOL_VALUE(m_use_opencl); PRINT_BOOL_VALUE(m_y_flip); @@ -280,25 +310,25 @@ namespace basisu PRINT_INT_VALUE(m_mip_smallest_dimension); debug_printf("m_mip_filter: %s\n", m_params.m_mip_filter.c_str()); - debug_printf("m_max_endpoint_clusters: %u\n", m_params.m_max_endpoint_clusters); - debug_printf("m_max_selector_clusters: %u\n", m_params.m_max_selector_clusters); - debug_printf("m_quality_level: %i\n", m_params.m_quality_level); - debug_printf("UASTC HDR quality level: 
%u\n", m_params.m_uastc_hdr_options.m_level); + debug_printf("m_max_endpoint_clusters: %u\n", m_params.m_etc1s_max_endpoint_clusters); + debug_printf("m_max_selector_clusters: %u\n", m_params.m_etc1s_max_selector_clusters); + debug_printf("m_etc1s_quality_level: %i\n", m_params.m_etc1s_quality_level); + debug_printf("UASTC HDR 4x4 quality level: %u\n", m_params.m_uastc_hdr_4x4_options.m_level); debug_printf("m_tex_type: %u\n", m_params.m_tex_type); debug_printf("m_userdata0: 0x%X, m_userdata1: 0x%X\n", m_params.m_userdata0, m_params.m_userdata1); debug_printf("m_us_per_frame: %i (%f fps)\n", m_params.m_us_per_frame, m_params.m_us_per_frame ? 1.0f / (m_params.m_us_per_frame / 1000000.0f) : 0); - debug_printf("m_pack_uastc_flags: 0x%X\n", m_params.m_pack_uastc_flags); + debug_printf("m_pack_uastc_ldr_4x4_flags: 0x%X\n", m_params.m_pack_uastc_ldr_4x4_flags); - PRINT_BOOL_VALUE(m_rdo_uastc); - PRINT_FLOAT_VALUE(m_rdo_uastc_quality_scalar); - PRINT_INT_VALUE(m_rdo_uastc_dict_size); - PRINT_FLOAT_VALUE(m_rdo_uastc_max_allowed_rms_increase_ratio); - PRINT_FLOAT_VALUE(m_rdo_uastc_skip_block_rms_thresh); - PRINT_FLOAT_VALUE(m_rdo_uastc_max_smooth_block_error_scale); - PRINT_FLOAT_VALUE(m_rdo_uastc_smooth_block_max_std_dev); - PRINT_BOOL_VALUE(m_rdo_uastc_favor_simpler_modes_in_rdo_mode) - PRINT_BOOL_VALUE(m_rdo_uastc_multithreading); + PRINT_BOOL_VALUE(m_rdo_uastc_ldr_4x4); + PRINT_FLOAT_VALUE(m_rdo_uastc_ldr_4x4_quality_scalar); + PRINT_INT_VALUE(m_rdo_uastc_ldr_4x4_dict_size); + PRINT_FLOAT_VALUE(m_rdo_uastc_ldr_4x4_max_allowed_rms_increase_ratio); + PRINT_FLOAT_VALUE(m_rdo_uastc_ldr_4x4_skip_block_rms_thresh); + PRINT_FLOAT_VALUE(m_rdo_uastc_ldr_4x4_max_smooth_block_error_scale); + PRINT_FLOAT_VALUE(m_rdo_uastc_ldr_4x4_smooth_block_max_std_dev); + PRINT_BOOL_VALUE(m_rdo_uastc_ldr_4x4_favor_simpler_modes_in_rdo_mode) + PRINT_BOOL_VALUE(m_rdo_uastc_ldr_4x4_multithreading); PRINT_INT_VALUE(m_resample_width); PRINT_INT_VALUE(m_resample_height); @@ -323,8 +353,10 @@ namespace 
basisu } PRINT_BOOL_VALUE(m_validate_output_data); - PRINT_BOOL_VALUE(m_hdr_ldr_srgb_to_linear_conversion); - debug_printf("Allow UASTC HDR uber mode: %u\n", m_params.m_uastc_hdr_options.m_allow_uber_mode); + PRINT_BOOL_VALUE(m_ldr_hdr_upconversion_srgb_to_linear); + PRINT_FLOAT_VALUE(m_ldr_hdr_upconversion_nit_multiplier); + debug_printf("Allow UASTC HDR 4x4 uber mode: %u\n", m_params.m_uastc_hdr_4x4_options.m_allow_uber_mode); + debug_printf("UASTC HDR 4x4 ultra quant: %u\n", m_params.m_uastc_hdr_4x4_options.m_ultra_quant); PRINT_BOOL_VALUE(m_hdr_favor_astc); #undef PRINT_BOOL_VALUE @@ -345,6 +377,62 @@ namespace basisu return true; } + + void basis_compressor::pick_format_mode() + { + // Unfortunately due to the legacy of this code and backwards compat this is more complex than I would like. + m_fmt_mode = basist::basis_tex_format::cETC1S; + + if (m_params.m_hdr) + { + assert(m_params.m_uastc); + + switch (m_params.m_hdr_mode) + { + case hdr_modes::cUASTC_HDR_4X4: + m_fmt_mode = basist::basis_tex_format::cUASTC_HDR_4x4; + break; + case hdr_modes::cASTC_HDR_6X6: + m_fmt_mode = basist::basis_tex_format::cASTC_HDR_6x6; + break; + case hdr_modes::cASTC_HDR_6X6_INTERMEDIATE: + m_fmt_mode = basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE; + break; + default: + assert(0); + break; + } + } + else if (m_params.m_uastc) + { + m_fmt_mode = basist::basis_tex_format::cUASTC4x4; + } + + if (m_params.m_debug) + { + switch (m_fmt_mode) + { + case basist::basis_tex_format::cETC1S: + fmt_debug_printf("Format Mode: cETC1S\n"); + break; + case basist::basis_tex_format::cUASTC4x4: + fmt_debug_printf("Format Mode: cUASTC4x4\n"); + break; + case basist::basis_tex_format::cUASTC_HDR_4x4: + fmt_debug_printf("Format Mode: cUASTC_HDR_4x4\n"); + break; + case basist::basis_tex_format::cASTC_HDR_6x6: + fmt_debug_printf("Format Mode: cASTC_HDR_6x6\n"); + break; + case basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE: + fmt_debug_printf("Format Mode: cASTC_HDR_6x6_INTERMEDIATE\n"); + 
break; + default: + assert(0); + break; + } + } + } basis_compressor::error_code basis_compressor::process() { @@ -353,6 +441,9 @@ namespace basisu if (!read_dds_source_images()) return cECFailedReadingSourceImages; + // Note: After here m_params.m_hdr, m_params.m_uastc and m_fmt_mode cannot be changed. + pick_format_mode(); + if (!read_source_images()) return cECFailedReadingSourceImages; @@ -373,26 +464,49 @@ namespace basisu if (m_params.m_hdr) { - // UASTC HDR - printf("Mode: UASTC HDR Level %u\n", m_params.m_uastc_hdr_options.m_level); + if (m_params.m_hdr_mode == hdr_modes::cUASTC_HDR_4X4) + { + // UASTC 4x4 HDR + if (m_params.m_status_output) + printf("Mode: UASTC 4x4 HDR Level %u\n", m_params.m_uastc_hdr_4x4_options.m_level); - error_code ec = encode_slices_to_uastc_hdr(); - if (ec != cECSuccess) - return ec; + error_code ec = encode_slices_to_uastc_4x4_hdr(); + if (ec != cECSuccess) + return ec; + } + else + { + assert((m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6) || (m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6_INTERMEDIATE)); + + // ASTC 6x6 HDR + if (m_params.m_status_output) + { + fmt_printf("Mode: ASTC 6x6 HDR {}, Base Level: {}, Highest Level: {}, Lambda: {}, REC 2020: {}\n", + (m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6_INTERMEDIATE) ? 
"Intermediate" : "", + m_params.m_astc_hdr_6x6_options.m_master_comp_level, m_params.m_astc_hdr_6x6_options.m_highest_comp_level, + m_params.m_astc_hdr_6x6_options.m_lambda, m_params.m_astc_hdr_6x6_options.m_rec2020_bt2100_color_gamut); + } + + error_code ec = encode_slices_to_astc_6x6_hdr(); + if (ec != cECSuccess) + return ec; + } } else if (m_params.m_uastc) { - // UASTC - printf("Mode: UASTC LDR Level %u\n", m_params.m_pack_uastc_flags & cPackUASTCLevelMask); + // UASTC 4x4 LDR + if (m_params.m_status_output) + printf("Mode: UASTC LDR 4x4 Level %u\n", m_params.m_pack_uastc_ldr_4x4_flags & cPackUASTCLevelMask); - error_code ec = encode_slices_to_uastc(); + error_code ec = encode_slices_to_uastc_4x4_ldr(); if (ec != cECSuccess) return ec; } else { // ETC1S - printf("Mode: ETC1S Quality %i, Level %i\n", m_params.m_quality_level, (int)m_params.m_compression_level); + if (m_params.m_status_output) + printf("Mode: ETC1S Quality %i, Level %i\n", m_params.m_etc1s_quality_level, (int)m_params.m_compression_level); if (!process_frontend()) return cECFailedFrontEnd; @@ -419,9 +533,101 @@ namespace basisu return cECSuccess; } - basis_compressor::error_code basis_compressor::encode_slices_to_uastc_hdr() + basis_compressor::error_code basis_compressor::encode_slices_to_astc_6x6_hdr() + { + debug_printf("basis_compressor::encode_slices_to_astc_6x6_hdr\n"); + + interval_timer tm; + tm.start(); + + m_uastc_slice_textures.resize(m_slice_descs.size()); + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + m_uastc_slice_textures[slice_index].init(texture_format::cASTC_HDR_6x6, m_slice_descs[slice_index].m_orig_width, m_slice_descs[slice_index].m_orig_height); + + if (m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6) + m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cASTC_HDR_6x6; + else if (m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6_INTERMEDIATE) + m_uastc_backend_output.m_tex_format = 
basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE; + else + { + assert(0); + return cECFailedEncodeUASTC; + } + + m_uastc_backend_output.m_etc1s = false; + m_uastc_backend_output.m_srgb = false; + m_uastc_backend_output.m_slice_desc = m_slice_descs; + m_uastc_backend_output.m_slice_image_data.resize(m_slice_descs.size()); + m_uastc_backend_output.m_slice_image_crcs.resize(m_slice_descs.size()); + + astc_6x6_hdr::astc_hdr_6x6_global_config global_cfg(m_params.m_astc_hdr_6x6_options); + + global_cfg.m_image_stats = m_params.m_compute_stats; + global_cfg.m_debug_images = m_params.m_debug_images; + global_cfg.m_output_images = m_params.m_debug_images; + global_cfg.m_debug_output = m_params.m_debug; + global_cfg.m_status_output = m_params.m_status_output || m_params.m_debug; + + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + gpu_image& dst_tex = m_uastc_slice_textures[slice_index]; + uint8_vec &dst_buf = m_uastc_backend_output.m_slice_image_data[slice_index]; + + basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + (void)slice_desc; + + const imagef& source_image = m_slice_images_hdr[slice_index]; + assert(source_image.get_width() && source_image.get_height()); + + uint8_vec intermediate_tex_data, astc_tex_data; + + global_cfg.m_debug_image_prefix = m_params.m_astc_hdr_6x6_options.m_debug_image_prefix; + global_cfg.m_debug_image_prefix += fmt_string("slice_{}_", slice_index); + + global_cfg.m_output_image_prefix = m_params.m_astc_hdr_6x6_options.m_output_image_prefix; + global_cfg.m_output_image_prefix += fmt_string("slice_{}_", slice_index); + + if (m_params.m_debug) + fmt_debug_printf("----------------------------------------------------------------------------\n"); + + astc_6x6_hdr::result_metrics metrics; + bool status = astc_6x6_hdr::compress_photo(source_image, global_cfg, m_params.m_pJob_pool, intermediate_tex_data, astc_tex_data, metrics); + if (!status) + return cECFailedEncodeUASTC; + + if 
(m_params.m_debug) + fmt_debug_printf("----------------------------------------------------------------------------\n"); + + // Currently it always gives us both intermediate and RDO + assert(intermediate_tex_data.size()); + assert(astc_tex_data.size()); + assert((astc_tex_data.size() & 15) == 0); + assert(dst_tex.get_size_in_bytes() == astc_tex_data.size_in_bytes()); + + memcpy(dst_tex.get_ptr(), astc_tex_data.data(), astc_tex_data.size_in_bytes()); + + if (m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6) + { + dst_buf.resize(dst_tex.get_size_in_bytes()); + memcpy(&dst_buf[0], dst_tex.get_ptr(), dst_tex.get_size_in_bytes()); + } + else + { + assert(m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6_INTERMEDIATE); + + dst_buf.resize(intermediate_tex_data.size_in_bytes()); + memcpy(&dst_buf[0], intermediate_tex_data.get_ptr(), intermediate_tex_data.size_in_bytes()); + } + + m_uastc_backend_output.m_slice_image_crcs[slice_index] = basist::crc16(dst_buf.get_ptr(), dst_buf.size_in_bytes(), 0); + } + + return cECSuccess; + } + + basis_compressor::error_code basis_compressor::encode_slices_to_uastc_4x4_hdr() { - debug_printf("basis_compressor::encode_slices_to_uastc_hdr\n"); + debug_printf("basis_compressor::encode_slices_to_uastc_4x4_hdr\n"); interval_timer tm; tm.start(); @@ -432,24 +638,25 @@ namespace basisu m_uastc_backend_output.m_tex_format = basist::basis_tex_format::cUASTC_HDR_4x4; m_uastc_backend_output.m_etc1s = false; + m_uastc_backend_output.m_srgb = false; m_uastc_backend_output.m_slice_desc = m_slice_descs; m_uastc_backend_output.m_slice_image_data.resize(m_slice_descs.size()); m_uastc_backend_output.m_slice_image_crcs.resize(m_slice_descs.size()); if (!m_params.m_perceptual) { - m_params.m_uastc_hdr_options.m_r_err_scale = 1.0f; - m_params.m_uastc_hdr_options.m_g_err_scale = 1.0f; + m_params.m_uastc_hdr_4x4_options.m_r_err_scale = 1.0f; + m_params.m_uastc_hdr_4x4_options.m_g_err_scale = 1.0f; } - const float DEFAULT_BC6H_ERROR_WEIGHT = .85f; + const float 
DEFAULT_BC6H_ERROR_WEIGHT = .65f;// .85f; const float LOWEST_BC6H_ERROR_WEIGHT = .1f; - m_params.m_uastc_hdr_options.m_bc6h_err_weight = m_params.m_hdr_favor_astc ? LOWEST_BC6H_ERROR_WEIGHT : DEFAULT_BC6H_ERROR_WEIGHT; + m_params.m_uastc_hdr_4x4_options.m_bc6h_err_weight = m_params.m_hdr_favor_astc ? LOWEST_BC6H_ERROR_WEIGHT : DEFAULT_BC6H_ERROR_WEIGHT; std::atomic any_failures; - any_failures = false; + any_failures.store(false); - astc_hdr_block_stats enc_stats; + astc_hdr_4x4_block_stats enc_stats; struct uastc_blk_desc { @@ -514,23 +721,20 @@ namespace basisu const imagef& source_image = m_slice_images_hdr[slice_index]; std::atomic total_blocks_processed; - total_blocks_processed = 0; + total_blocks_processed.store(0); const uint32_t N = 256; for (uint32_t block_index_iter = 0; block_index_iter < total_blocks; block_index_iter += N) { const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(total_blocks, block_index_iter + N); - - // FIXME: This sucks, but we're having a stack size related problem with std::function with emscripten. 
-#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, &tex, &total_blocks_processed, &any_failures, &enc_stats, &unique_block_descs, &unique_block_desc_mutex] { -#endif BASISU_NOTE_UNUSED(num_blocks_y); - basisu::vector all_results; + basisu::vector all_results; all_results.reserve(256); for (uint32_t block_index = first_index; block_index < last_index; block_index++) @@ -538,6 +742,9 @@ namespace basisu const uint32_t block_x = block_index % num_blocks_x; const uint32_t block_y = block_index / num_blocks_x; + //if ((block_x == 176) && (block_y == 128)) + // printf("!"); + vec4F block_pixels[16]; source_image.extract_block_clamped(&block_pixels[0], block_x * 4, block_y * 4, 4, 4); @@ -558,17 +765,17 @@ namespace basisu rgb_pixels_half[i * 3 + 2] = float_to_half_non_neg_no_nan_inf(block_pixels[i][2]); } - bool status = astc_hdr_enc_block(&rgb_pixels[0], m_params.m_uastc_hdr_options, all_results); + bool status = astc_hdr_4x4_enc_block(&rgb_pixels[0], rgb_pixels_half, m_params.m_uastc_hdr_4x4_options, all_results); if (!status) { - any_failures = true; + any_failures.store(true); continue; } double best_err = 1e+30f; int best_result_index = -1; - const double bc6h_err_weight = m_params.m_uastc_hdr_options.m_bc6h_err_weight; + const double bc6h_err_weight = m_params.m_uastc_hdr_4x4_options.m_bc6h_err_weight; const double astc_err_weight = (1.0f - bc6h_err_weight); for (uint32_t i = 0; i < all_results.size(); i++) @@ -576,7 +783,7 @@ namespace basisu basist::half_float unpacked_bc6h_block[4 * 4 * 3]; unpack_bc6h(&all_results[i].m_bc6h_block, unpacked_bc6h_block, false); - all_results[i].m_bc6h_block_error = compute_block_error(rgb_pixels_half, unpacked_bc6h_block, m_params.m_uastc_hdr_options); + all_results[i].m_bc6h_block_error = compute_block_error(16, rgb_pixels_half, unpacked_bc6h_block, m_params.m_uastc_hdr_4x4_options); double overall_err = 
(all_results[i].m_bc6h_block_error * bc6h_err_weight) + (all_results[i].m_best_block_error * astc_err_weight); @@ -587,22 +794,22 @@ namespace basisu } } - const astc_hdr_pack_results& best_results = all_results[best_result_index]; + const astc_hdr_4x4_pack_results& best_results = all_results[best_result_index]; - astc_hdr_pack_results_to_block(dest_block, best_results); + astc_hdr_4x4_pack_results_to_block(dest_block, best_results); // Verify that this block is valid UASTC HDR and we can successfully transcode it to BC6H. // (Well, except in fastest mode.) - if (m_params.m_uastc_hdr_options.m_level > 0) + if (m_params.m_uastc_hdr_4x4_options.m_level > 0) { basist::bc6h_block transcoded_bc6h_blk; bool transcode_results = astc_hdr_transcode_to_bc6h(dest_block, transcoded_bc6h_blk); assert(transcode_results); if ((!transcode_results) && (!any_failures)) { - error_printf("basis_compressor::encode_slices_to_uastc_hdr: UASTC HDR block transcode check failed!\n"); + error_printf("basis_compressor::encode_slices_to_uastc_4x4_hdr: UASTC HDR block transcode check failed!\n"); - any_failures = true; + any_failures.store(true); continue; } } @@ -641,19 +848,15 @@ namespace basisu uint32_t val = total_blocks_processed; if (((val & 1023) == 1023) && m_params.m_status_output) { - debug_printf("basis_compressor::encode_slices_to_uastc_hdr: %3.1f%% done\n", static_cast(val) * 100.0f / total_blocks); + debug_printf("basis_compressor::encode_slices_to_uastc_4x4_hdr: %3.1f%% done\n", static_cast(val) * 100.0f / total_blocks); } } -#ifndef __EMSCRIPTEN__ }); -#endif } // block_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif if (any_failures) return cECFailedEncodeUASTC; @@ -665,7 +868,7 @@ namespace basisu } // slice_index - debug_printf("basis_compressor::encode_slices_to_uastc_hdr: Total time: %3.3f secs\n", tm.get_elapsed_secs()); + debug_printf("basis_compressor::encode_slices_to_uastc_4x4_hdr: Total time: %3.3f secs\n", tm.get_elapsed_secs()); if 
(m_params.m_debug) { @@ -722,9 +925,9 @@ namespace basisu return cECSuccess; } - basis_compressor::error_code basis_compressor::encode_slices_to_uastc() + basis_compressor::error_code basis_compressor::encode_slices_to_uastc_4x4_ldr() { - debug_printf("basis_compressor::encode_slices_to_uastc\n"); + debug_printf("basis_compressor::encode_slices_to_uastc_4x4_ldr\n"); m_uastc_slice_textures.resize(m_slice_descs.size()); for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) @@ -748,23 +951,20 @@ namespace basisu const image& source_image = m_slice_images[slice_index]; std::atomic total_blocks_processed; - total_blocks_processed = 0; + total_blocks_processed.store(0); const uint32_t N = 256; for (uint32_t block_index_iter = 0; block_index_iter < total_blocks; block_index_iter += N) { const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(total_blocks, block_index_iter + N); - - // FIXME: This sucks, but we're having a stack size related problem with std::function with emscripten. 
-#ifndef __EMSCRIPTEN__ + m_params.m_pJob_pool->add_job([this, first_index, last_index, num_blocks_x, num_blocks_y, total_blocks, &source_image, &tex, &total_blocks_processed] { -#endif BASISU_NOTE_UNUSED(num_blocks_y); - uint32_t uastc_flags = m_params.m_pack_uastc_flags; - if ((m_params.m_rdo_uastc) && (m_params.m_rdo_uastc_favor_simpler_modes_in_rdo_mode)) + uint32_t uastc_flags = m_params.m_pack_uastc_ldr_4x4_flags; + if ((m_params.m_rdo_uastc_ldr_4x4) && (m_params.m_rdo_uastc_ldr_4x4_favor_simpler_modes_in_rdo_mode)) uastc_flags |= cPackUASTCFavorSimplerModes; for (uint32_t block_index = first_index; block_index < last_index; block_index++) @@ -785,34 +985,30 @@ namespace basisu uint32_t val = total_blocks_processed; if (((val & 16383) == 16383) && m_params.m_status_output) { - debug_printf("basis_compressor::encode_slices_to_uastc: %3.1f%% done\n", static_cast(val) * 100.0f / total_blocks); + debug_printf("basis_compressor::encode_slices_to_uastc_4x4_ldr: %3.1f%% done\n", static_cast(val) * 100.0f / total_blocks); } } -#ifndef __EMSCRIPTEN__ }); -#endif } // block_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif - if (m_params.m_rdo_uastc) + if (m_params.m_rdo_uastc_ldr_4x4) { uastc_rdo_params rdo_params; - rdo_params.m_lambda = m_params.m_rdo_uastc_quality_scalar; - rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_max_allowed_rms_increase_ratio; - rdo_params.m_skip_block_rms_thresh = m_params.m_rdo_uastc_skip_block_rms_thresh; - rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_dict_size; - rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_max_smooth_block_error_scale; - rdo_params.m_max_smooth_block_std_dev = m_params.m_rdo_uastc_smooth_block_max_std_dev; + rdo_params.m_lambda = m_params.m_rdo_uastc_ldr_4x4_quality_scalar; + rdo_params.m_max_allowed_rms_increase_ratio = m_params.m_rdo_uastc_ldr_4x4_max_allowed_rms_increase_ratio; + rdo_params.m_skip_block_rms_thresh = 
m_params.m_rdo_uastc_ldr_4x4_skip_block_rms_thresh; + rdo_params.m_lz_dict_size = m_params.m_rdo_uastc_ldr_4x4_dict_size; + rdo_params.m_smooth_block_max_error_scale = m_params.m_rdo_uastc_ldr_4x4_max_smooth_block_error_scale; + rdo_params.m_max_smooth_block_std_dev = m_params.m_rdo_uastc_ldr_4x4_smooth_block_max_std_dev; bool status = uastc_rdo(tex.get_total_blocks(), (basist::uastc_block*)tex.get_ptr(), - (const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_flags, m_params.m_rdo_uastc_multithreading ? m_params.m_pJob_pool : nullptr, - (m_params.m_rdo_uastc_multithreading && m_params.m_pJob_pool) ? basisu::minimum(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0); + (const color_rgba *)m_source_blocks[slice_desc.m_first_block_index].m_pixels, rdo_params, m_params.m_pack_uastc_ldr_4x4_flags, m_params.m_rdo_uastc_ldr_4x4_multithreading ? m_params.m_pJob_pool : nullptr, + (m_params.m_rdo_uastc_ldr_4x4_multithreading && m_params.m_pJob_pool) ? basisu::minimum(4, (uint32_t)m_params.m_pJob_pool->get_total_threads()) : 0); if (!status) { return cECFailedUASTCRDOPostProcess; @@ -1013,6 +1209,8 @@ namespace basisu if (!src_img.clean_astc_hdr_pixels(basist::ASTC_HDR_MAX_VAL)) printf("Warning: clean_astc_hdr_pixels() had to modify the input image to encode to ASTC HDR - see previous warning(s).\n"); + m_hdr_image_scale = (float)hdr_image_scale; + float lowest_nonzero_val = 1e+30f; float lowest_val = 1e+30f; float highest_val = -1e+30f; @@ -1097,7 +1295,7 @@ namespace basisu if (m_params.m_status_output) { - printf("Read DDS file \"%s\", %s, %ux%u, %u mipmap levels\n", + printf("Read DDS file \"%s\", %s, %ux%u, %zu mipmap levels\n", m_params.m_source_filenames[i].c_str(), ldr_mips.size() ? "LDR" : "HDR", ldr_mips.size() ? 
ldr_mips[0].get_width() : hdr_mips[0].get_width(), @@ -1118,7 +1316,7 @@ namespace basisu if (ldr_mips.size() > 1) { - ldr_mips.erase(0U); + ldr_mips.erase_index(0U); m_params.m_source_mipmap_images.back().swap(ldr_mips); @@ -1138,7 +1336,7 @@ namespace basisu if (hdr_mips.size() > 1) { - hdr_mips.erase(0U); + hdr_mips.erase_index(0U); m_params.m_source_mipmap_images_hdr.back().swap(hdr_mips); @@ -1215,13 +1413,24 @@ namespace basisu // Load the source image if (m_params.m_hdr) { - if (!load_image_hdr(pSource_filename, file_image_hdr, m_params.m_hdr_ldr_srgb_to_linear_conversion)) + float upconversion_nit_multiplier = m_params.m_ldr_hdr_upconversion_nit_multiplier; + if (upconversion_nit_multiplier == 0.0f) + { + // Note: We used to use a normalized nit multiplier of 1.0 for UASTC HDR 4x4. We're now writing upconverted output files in absolute luminance (100 nits). + upconversion_nit_multiplier = LDR_TO_HDR_NITS; + } + + m_ldr_to_hdr_upconversion_nit_multiplier = upconversion_nit_multiplier; + if (!is_image_filename_hdr(pSource_filename)) + m_upconverted_any_ldr_images = true; + + if (!load_image_hdr(pSource_filename, file_image_hdr, m_params.m_ldr_hdr_upconversion_srgb_to_linear, upconversion_nit_multiplier, m_params.m_ldr_hdr_upconversion_black_bias)) { error_printf("Failed reading source image: %s\n", pSource_filename); return false; } - // For now, just slam alpha to 1.0f. UASTC HDR doesn't support alpha yet. + // TODO: For now, just slam alpha to 1.0f. None of our HDR encoders support alpha yet. 
for (uint32_t y = 0; y < file_image_hdr.get_height(); y++) for (uint32_t x = 0; x < file_image_hdr.get_width(); x++) file_image_hdr(x, y)[3] = 1.0f; @@ -1262,7 +1471,8 @@ namespace basisu return false; } - printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height()); + if (m_params.m_status_output) + printf("Read source alpha image \"%s\", %ux%u\n", pSource_alpha_image, alpha_data.get_width(), alpha_data.get_height()); alpha_data.crop(width, height); @@ -1358,24 +1568,6 @@ namespace basisu file_image.flip_y(); } -#if DEBUG_EXTRACT_SINGLE_BLOCK - const uint32_t block_x = 0; - const uint32_t block_y = 0; - - if (m_params.m_hdr) - { - imagef block_image(4, 4); - block_image_hdr.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image_hdr, 0); - file_image_hdr = block_image; - } - else - { - image block_image(4, 4); - block_image.blit(block_x * 4, block_y * 4, 4, 4, 0, 0, file_image, 0); - file_image = block_image; - } -#endif - #if DEBUG_CROP_TEXTURE_TO_64x64 if (m_params.m_hdr) file_image_hdr.resize(64, 64); @@ -1674,11 +1866,19 @@ namespace basisu } } - // Enlarge the source image to 4x4 block boundaries, duplicating edge pixels if necessary to avoid introducing extra colors into blocks. + // Enlarge the source image to block boundaries, duplicating edge pixels if necessary to avoid introducing extra colors into blocks. if (m_params.m_hdr) - pSlice_image_hdr->crop_dup_borders(pSlice_image_hdr->get_block_width(4) * 4, pSlice_image_hdr->get_block_height(4) * 4); + { + // Don't pad in 6x6 mode, the lower level compressor handles it. 
+ if (m_params.m_hdr_mode == hdr_modes::cUASTC_HDR_4X4) + { + pSlice_image_hdr->crop_dup_borders(pSlice_image_hdr->get_block_width(get_block_width()) * get_block_width(), pSlice_image_hdr->get_block_height(get_block_height()) * get_block_height()); + } + } else - pSlice_image->crop_dup_borders(pSlice_image->get_block_width(4) * 4, pSlice_image->get_block_height(4) * 4); + { + pSlice_image->crop_dup_borders(pSlice_image->get_block_width(get_block_width()) * get_block_width(), pSlice_image->get_block_height(get_block_height()) * get_block_height()); + } if (m_params.m_debug_images) { @@ -1688,7 +1888,7 @@ namespace basisu save_png(string_format("basis_debug_source_image_%u_slice_%u.png", source_file_index, slice_index).c_str(), *pSlice_image); } - const uint32_t dest_image_index = (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size()); + const size_t dest_image_index = (m_params.m_hdr ? m_slice_images_hdr.size() : m_slice_images.size()); enlarge_vector(m_stats, 1); @@ -1721,16 +1921,16 @@ namespace basisu slice_desc.m_width = pSlice_image_hdr->get_width(); slice_desc.m_height = pSlice_image_hdr->get_height(); - slice_desc.m_num_blocks_x = pSlice_image_hdr->get_block_width(4); - slice_desc.m_num_blocks_y = pSlice_image_hdr->get_block_height(4); + slice_desc.m_num_blocks_x = pSlice_image_hdr->get_block_width(get_block_width()); + slice_desc.m_num_blocks_y = pSlice_image_hdr->get_block_height(get_block_height()); } else { slice_desc.m_width = pSlice_image->get_width(); slice_desc.m_height = pSlice_image->get_height(); - slice_desc.m_num_blocks_x = pSlice_image->get_block_width(4); - slice_desc.m_num_blocks_y = pSlice_image->get_block_height(4); + slice_desc.m_num_blocks_x = pSlice_image->get_block_width(get_block_width()); + slice_desc.m_num_blocks_y = pSlice_image->get_block_height(get_block_height()); } slice_desc.m_num_macroblocks_x = (slice_desc.m_num_blocks_x + 1) >> 1; @@ -1926,6 +2126,10 @@ namespace basisu { 
debug_printf("basis_compressor::extract_source_blocks\n"); + // No need to extract blocks in 6x6 mode, but the 4x4 compressors want 4x4 blocks. + if ((m_fmt_mode == basist::basis_tex_format::cASTC_HDR_6x6) || (m_fmt_mode == basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE)) + return true; + if (m_params.m_hdr) m_source_blocks_hdr.resize(m_total_blocks); else @@ -2018,8 +2222,8 @@ namespace basisu const double total_texels = m_total_blocks * 16.0f; - int endpoint_clusters = m_params.m_max_endpoint_clusters; - int selector_clusters = m_params.m_max_selector_clusters; + int endpoint_clusters = m_params.m_etc1s_max_endpoint_clusters; + int selector_clusters = m_params.m_etc1s_max_selector_clusters; if (endpoint_clusters > basisu_frontend::cMaxEndpointClusters) { @@ -2032,9 +2236,9 @@ namespace basisu return false; } - if (m_params.m_quality_level != -1) + if (m_params.m_etc1s_quality_level != -1) { - const float quality = saturate(m_params.m_quality_level / 255.0f); + const float quality = saturate(m_params.m_etc1s_quality_level / 255.0f); const float bits_per_endpoint_cluster = 14.0f; const float max_desired_endpoint_cluster_bits_per_texel = 1.0f; // .15f @@ -2090,7 +2294,7 @@ namespace basisu debug_printf("Max endpoints: %u, max selectors: %u\n", endpoint_clusters, selector_clusters); - if (m_params.m_quality_level >= 223) + if (m_params.m_etc1s_quality_level >= 223) { if (!m_params.m_selector_rdo_thresh.was_changed()) { @@ -2101,7 +2305,7 @@ namespace basisu m_params.m_selector_rdo_thresh *= .25f; } } - else if (m_params.m_quality_level >= 192) + else if (m_params.m_etc1s_quality_level >= 192) { if (!m_params.m_endpoint_rdo_thresh.was_changed()) m_params.m_endpoint_rdo_thresh *= .5f; @@ -2109,7 +2313,7 @@ namespace basisu if (!m_params.m_selector_rdo_thresh.was_changed()) m_params.m_selector_rdo_thresh *= .5f; } - else if (m_params.m_quality_level >= 160) + else if (m_params.m_etc1s_quality_level >= 160) { if (!m_params.m_endpoint_rdo_thresh.was_changed()) 
m_params.m_endpoint_rdo_thresh *= .75f; @@ -2117,7 +2321,7 @@ namespace basisu if (!m_params.m_selector_rdo_thresh.was_changed()) m_params.m_selector_rdo_thresh *= .75f; } - else if (m_params.m_quality_level >= 129) + else if (m_params.m_etc1s_quality_level >= 129) { float l = (quality - 129 / 255.0f) / ((160 - 129) / 255.0f); @@ -2239,13 +2443,13 @@ namespace basisu backend_params.m_debug_images = m_params.m_debug_images; backend_params.m_etc1s = true; backend_params.m_compression_level = m_params.m_compression_level; - + if (!m_params.m_no_endpoint_rdo) backend_params.m_endpoint_rdo_quality_thresh = m_params.m_endpoint_rdo_thresh; if (!m_params.m_no_selector_rdo) backend_params.m_selector_rdo_quality_thresh = m_params.m_selector_rdo_thresh; - + backend_params.m_used_global_codebooks = m_frontend.get_params().m_pGlobal_codebooks != nullptr; backend_params.m_validate = m_params.m_validate_output_data; @@ -2274,34 +2478,36 @@ namespace basisu error_printf("basis_compressor::create_basis_file_and_transcode: basisu_backend:init() failed!\n"); return false; } - - const uint8_vec &comp_data = m_basis_file.get_compressed_data(); - m_output_basis_file = comp_data; + const uint8_vec& comp_data = m_basis_file.get_compressed_data(); - uint32_t total_orig_pixels = 0, total_texels = 0, total_orig_texels = 0; - (void)total_texels; + m_output_basis_file = comp_data; + uint32_t total_orig_pixels = 0; + for (uint32_t i = 0; i < m_slice_descs.size(); i++) { const basisu_backend_slice_desc& slice_desc = m_slice_descs[i]; total_orig_pixels += slice_desc.m_orig_width * slice_desc.m_orig_height; - total_texels += slice_desc.m_width * slice_desc.m_height; } m_basis_file_size = (uint32_t)comp_data.size(); - m_basis_bits_per_texel = total_orig_texels ? (comp_data.size() * 8.0f) / total_orig_texels : 0; + m_basis_bits_per_texel = total_orig_pixels ? 
(comp_data.size() * 8.0f) / total_orig_pixels : 0; debug_printf("Total .basis output file size: %u, %3.3f bits/texel\n", comp_data.size(), comp_data.size() * 8.0f / total_orig_pixels); + // HDR 6x6 TODO + // HACK HACK + const bool is_hdr_6x6 = m_params.m_hdr && (m_params.m_hdr_mode != hdr_modes::cUASTC_HDR_4X4); + if (m_params.m_validate_output_data) { interval_timer tm; tm.start(); basist::basisu_transcoder_init(); - + debug_printf("basist::basisu_transcoder_init: Took %f ms\n", tm.get_elapsed_ms()); // Verify the compressed data by transcoding it to ASTC (or ETC1)/BC7 and validating the CRC's. @@ -2330,6 +2536,7 @@ namespace basisu } tm.start(); + if (m_params.m_pGlobal_codebooks) { decoder.set_global_codebooks(m_params.m_pGlobal_codebooks); @@ -2347,24 +2554,31 @@ namespace basisu double total_time_etc1s_or_astc = 0; - for (uint32_t i = 0; i < m_slice_descs.size(); i++) + for (uint32_t slice_iter = 0; slice_iter < m_slice_descs.size(); slice_iter++) { + // Select either BC6H, UASTC LDR 4x4, or ETC1 basisu::texture_format tex_format = m_params.m_hdr ? texture_format::cBC6HUnsigned : (m_params.m_uastc ? texture_format::cUASTC4x4 : texture_format::cETC1); - basist::block_format format = m_params.m_hdr ? basist::block_format::cBC6H : (m_params.m_uastc ? basist::block_format::cUASTC_4x4 : basist::block_format::cETC1); + basist::block_format blk_format = m_params.m_hdr ? basist::block_format::cBC6H : (m_params.m_uastc ? 
basist::block_format::cUASTC_4x4 : basist::block_format::cETC1); gpu_image decoded_texture; decoded_texture.init( - tex_format, - m_slice_descs[i].m_width, m_slice_descs[i].m_height); + tex_format, + m_slice_descs[slice_iter].m_width, m_slice_descs[slice_iter].m_height); tm.start(); - + + const uint32_t block_size_x = basisu::get_block_width(tex_format); + const uint32_t block_size_y = basisu::get_block_height(tex_format); + const uint32_t num_dst_blocks_x = (m_slice_descs[slice_iter].m_orig_width + block_size_x - 1) / block_size_x; + const uint32_t num_dst_blocks_y = (m_slice_descs[slice_iter].m_orig_height + block_size_y - 1) / block_size_y; + const uint32_t total_dst_blocks = num_dst_blocks_x * num_dst_blocks_y; + uint32_t bytes_per_block = m_params.m_uastc ? 16 : 8; - if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, - reinterpret_cast(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, format, bytes_per_block)) + if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), slice_iter, + reinterpret_cast(decoded_texture.get_ptr()), total_dst_blocks, blk_format, bytes_per_block)) { - error_printf("Transcoding failed on slice %u!\n", i); + error_printf("Transcoding failed on slice %u!\n", slice_iter); return false; } @@ -2373,15 +2587,15 @@ namespace basisu if (encoded_output.m_tex_format == basist::basis_tex_format::cETC1S) { uint32_t image_crc16 = basist::crc16(decoded_texture.get_ptr(), decoded_texture.get_size_in_bytes(), 0); - if (image_crc16 != encoded_output.m_slice_image_crcs[i]) + if (image_crc16 != encoded_output.m_slice_image_crcs[slice_iter]) { - error_printf("Decoded image data CRC check failed on slice %u!\n", i); + error_printf("Decoded image data CRC check failed on slice %u!\n", slice_iter); return false; } - debug_printf("Decoded image data CRC check succeeded on slice %i\n", i); + debug_printf("Decoded image data CRC check succeeded on slice %i\n", slice_iter); } - 
m_decoded_output_textures[i] = decoded_texture; + m_decoded_output_textures[slice_iter] = decoded_texture; } double total_alt_transcode_time = 0; @@ -2389,23 +2603,44 @@ namespace basisu if (m_params.m_hdr) { - assert(basist::basis_is_format_supported(basist::transcoder_texture_format::cTFASTC_HDR_4x4_RGBA, basist::basis_tex_format::cUASTC_HDR_4x4)); - - for (uint32_t i = 0; i < m_slice_descs.size(); i++) + if (is_hdr_6x6) { - gpu_image decoded_texture; - decoded_texture.init(texture_format::cASTC_HDR_4x4, m_slice_descs[i].m_width, m_slice_descs[i].m_height); + assert(basist::basis_is_format_supported(basist::transcoder_texture_format::cTFASTC_HDR_6x6_RGBA, basist::basis_tex_format::cASTC_HDR_6x6)); + assert(basist::basis_is_format_supported(basist::transcoder_texture_format::cTFASTC_HDR_6x6_RGBA, basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE)); - tm.start(); + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + gpu_image decoded_texture; + decoded_texture.init(texture_format::cASTC_HDR_6x6, m_slice_descs[i].m_width, m_slice_descs[i].m_height); - if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, - reinterpret_cast(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cASTC_HDR_4x4, 16)) + if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, + reinterpret_cast(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cASTC_HDR_6x6, 16)) + { + error_printf("Transcoding failed to ASTC HDR on slice %u!\n", i); + return false; + } + + m_decoded_output_textures_astc_hdr[i] = decoded_texture; + } + } + else + { + assert(basist::basis_is_format_supported(basist::transcoder_texture_format::cTFASTC_HDR_4x4_RGBA, basist::basis_tex_format::cUASTC_HDR_4x4)); + + for (uint32_t i = 0; i < m_slice_descs.size(); i++) { - error_printf("Transcoding failed to ASTC HDR on slice %u!\n", i); - return false; + 
gpu_image decoded_texture; + decoded_texture.init(texture_format::cASTC_HDR_4x4, m_slice_descs[i].m_width, m_slice_descs[i].m_height); + + if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, + reinterpret_cast(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cASTC_HDR_4x4, 16)) + { + error_printf("Transcoding failed to ASTC HDR on slice %u!\n", i); + return false; + } + + m_decoded_output_textures_astc_hdr[i] = decoded_texture; } - - m_decoded_output_textures_astc_hdr[i] = decoded_texture; } } else @@ -2417,14 +2652,14 @@ namespace basisu { gpu_image decoded_texture; decoded_texture.init(texture_format::cBC7, m_slice_descs[i].m_width, m_slice_descs[i].m_height); - + if (!decoder.transcode_slice(&comp_data[0], (uint32_t)comp_data.size(), i, reinterpret_cast(decoded_texture.get_ptr()), m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y, basist::block_format::cBC7, 16)) { error_printf("Transcoding failed to BC7 on slice %u!\n", i); return false; } - + m_decoded_output_textures_bc7[i] = decoded_texture; } } @@ -2440,7 +2675,7 @@ namespace basisu bool status = m_decoded_output_textures[i].unpack_hdr(m_decoded_output_textures_bc6h_hdr_unpacked[i]); assert(status); BASISU_NOTE_UNUSED(status); - + // ASTC HDR status = m_decoded_output_textures_astc_hdr[i].unpack_hdr(m_decoded_output_textures_astc_hdr_unpacked[i]); assert(status); @@ -2459,21 +2694,24 @@ namespace basisu } } - debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", + debug_printf("Transcoded to %s in %3.3fms, %f texels/sec\n", m_params.m_hdr ? "BC6H" : (m_params.m_uastc ? 
"ASTC" : "ETC1"), total_time_etc1s_or_astc * 1000.0f, total_orig_pixels / total_time_etc1s_or_astc); if (total_alt_transcode_time != 0) debug_printf("Alternate transcode in %3.3fms, %f texels/sec\n", total_alt_transcode_time * 1000.0f, total_orig_pixels / total_alt_transcode_time); - for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + if (!is_hdr_6x6) { - const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; - const uint32_t total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; - BASISU_NOTE_UNUSED(total_blocks); + const uint32_t total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; + BASISU_NOTE_UNUSED(total_blocks); - assert(m_decoded_output_textures[slice_index].get_total_blocks() == total_blocks); + assert(m_decoded_output_textures[slice_index].get_total_blocks() == total_blocks); + } } } // if (m_params.m_validate_output_data) @@ -2489,15 +2727,17 @@ namespace basisu image srgb_img(width, height); + const float inv_upconversion_scale = (m_ldr_to_hdr_upconversion_nit_multiplier > 0.0f) ? 
(1.0f / m_ldr_to_hdr_upconversion_nit_multiplier) : 1.0f; + for (uint32_t y = 0; y < height; y++) { for (uint32_t x = 0; x < width; x++) { vec4F p(hdr_img(x, y)); - p[0] = clamp(p[0], 0.0f, 1.0f); - p[1] = clamp(p[1], 0.0f, 1.0f); - p[2] = clamp(p[2], 0.0f, 1.0f); + p[0] = clamp(p[0] * inv_upconversion_scale, 0.0f, 1.0f); + p[1] = clamp(p[1] * inv_upconversion_scale, 0.0f, 1.0f); + p[2] = clamp(p[2] * inv_upconversion_scale, 0.0f, 1.0f); int rc = (int)std::round(linear_to_srgb(p[0]) * 255.0f); int gc = (int)std::round(linear_to_srgb(p[1]) * 255.0f); @@ -2560,7 +2800,7 @@ namespace basisu return false; } - //if (m_params.m_status_output) + if (m_params.m_status_output) { printf("Wrote output .basis/.ktx2 file \"%s\"\n", output_filename.c_str()); } @@ -2583,14 +2823,13 @@ namespace basisu uint32_t total_texels = 0; for (uint32_t i = 0; i < m_slice_descs.size(); i++) - total_texels += (m_slice_descs[i].m_num_blocks_x * m_slice_descs[i].m_num_blocks_y) * 16; + total_texels += (m_slice_descs[i].m_orig_width * m_slice_descs[i].m_orig_height); - m_basis_bits_per_texel = comp_size * 8.0f / total_texels; + m_basis_bits_per_texel = ((float)comp_size * 8.0f) / total_texels; - debug_printf("Output file size: %u, LZ compressed file size: %u, %3.2f bits/texel\n", - (uint32_t)comp_data.size(), - (uint32_t)comp_size, - m_basis_bits_per_texel); + fmt_debug_printf("Output file size: {}, {3.2} bits/texel, LZ compressed file size: {}, {3.2} bits/texel\n", + (uint64_t)comp_data.size(), ((float)comp_data.size() * 8.0f) / total_texels, + (uint64_t)comp_size, m_basis_bits_per_texel); } m_stats.resize(m_slice_descs.size()); @@ -2662,6 +2901,26 @@ namespace basisu { printf("BC6H RGB: "); im.print_hp(); + //printf("\n"); + } + + im.calc(m_slice_images_hdr[slice_index], m_decoded_output_textures_astc_hdr_unpacked[slice_index], 0, 3, true, true); + s.m_basis_rgb_avg_log2_psnr = (float)im.m_psnr; + + if (m_params.m_print_stats) + { + printf("\nASTC Log2 RGB: "); + im.print_hp(); + } + + 
im.calc(m_slice_images_hdr[slice_index], m_decoded_output_textures_bc6h_hdr_unpacked[slice_index], 0, 3, true, true); + s.m_basis_rgb_avg_bc6h_log2_psnr = (float)im.m_psnr; + + if (m_params.m_print_stats) + { + printf("BC6H Log2 RGB: "); + im.print_hp(); + printf("\n"); } } @@ -2693,7 +2952,15 @@ namespace basisu astc_tex.override_dimensions(slice_desc.m_orig_width, slice_desc.m_orig_height); std::string filename1(out_basename + "_astc.astc"); - write_astc_file(filename1.c_str(), astc_tex.get_ptr(), 4, 4, slice_desc.m_orig_width, slice_desc.m_orig_height); + + uint32_t block_width = 4, block_height = 4; + if ((m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6) || (m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6_INTERMEDIATE)) + { + block_width = 6; + block_height = 6; + } + + write_astc_file(filename1.c_str(), astc_tex.get_ptr(), block_width, block_height, slice_desc.m_orig_width, slice_desc.m_orig_height); printf("Wrote .ASTC file %s\n", filename1.c_str()); std::string filename2(out_basename + "_astc.ktx"); @@ -2961,19 +3228,24 @@ namespace basisu return true; } + // colorModel=KTX2_KDF_DF_MODEL_ETC1S (0xA3) + // LDR ETC1S texture data in a custom format, with global codebooks static uint8_t g_ktx2_etc1s_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; static uint8_t g_ktx2_etc1s_alpha_dfd[60] = { 0x3C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x38,0x0,0xA3,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x3F,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF,0x40,0x0,0x3F,0xF,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; - static uint8_t g_ktx2_uastc_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x4,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; - static uint8_t 
g_ktx2_uastc_alpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; - - // HDR TODO - what is the best Khronos DFD to use for UASTC HDR? - static uint8_t g_ktx2_uastc_hdr_nonalpha_dfd[44] = + // colorModel=KTX2_KDF_DF_MODEL_UASTC_LDR_4X4 (0xA6) + // LDR UASTC 4x4 texture data in a custom block format + static uint8_t g_ktx2_uastc_ldr_4x4_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x4,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; + static uint8_t g_ktx2_uastc_ldr_4x4_alpha_dfd[44] = { 0x2C,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x2,0x0,0x28,0x0,0xA6,0x1,0x2,0x0,0x3,0x3,0x0,0x0,0x10,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x7F,0x3,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0xFF,0xFF,0xFF,0xFF }; + + // colorModel=KTX2_KDF_DF_MODEL_UASTC_HDR_4X4 (0xA7) + // Standard ASTC HDR 4x4 texture data but constrained for easy transcoding to BC6H, either highest quality or RDO optimized. 
+ static uint8_t g_ktx2_uastc_hdr_4x4_nonalpha_dfd[44] = { 0x2C,0x0,0x0,0x0, // 0 totalSize 0x0,0x0,0x0,0x0, // 1 descriptorType/vendorId 0x2,0x0,0x28,0x0, // 2 descriptorBlockSize/versionNumber - 0xA7,0x1,0x1,0x0, // 3 flags, transferFunction, colorPrimaries, colorModel + 0xA7,0x1,0x1,0x0, // 3 flags, transferFunction, colorPrimaries, colorModel (KTX2_KDF_DF_MODEL_UASTC_HDR_4X4) 0x3,0x3,0x0,0x0, // 4 texelBlockDimension0-texelBlockDimension3 0x10,0x0,0x0,0x0, // 5 bytesPlane0-bytesPlane3 0x0,0x0,0x0,0x0, // 6 bytesPlane4-bytesPlane7 @@ -2982,8 +3254,42 @@ namespace basisu 0x0,0x0,0x0,0x0, // 9 sampleLower (0.0) 0x00, 0x00, 0x80, 0x3F // 10 sampleHigher (1.0) }; - - void basis_compressor::get_dfd(uint8_vec &dfd, const basist::ktx2_header &header) + + // colorModel=KTX2_KDF_DF_MODEL_ASTC (0xA2) + // Standard ASTC HDR 6x6 texture data, either highest quality or RDO optimized. + static uint8_t g_ktx2_astc_hdr_6x6_nonalpha_dfd[44] = + { + 0x2C,0x0,0x0,0x0, // 0 totalSize + 0x0,0x0,0x0,0x0, // 1 descriptorType/vendorId + 0x2,0x0,0x28,0x0, // 2 descriptorBlockSize/versionNumber + 0xA2,0x1,0x1,0x0, // 3 flags, transferFunction, colorPrimaries, colorModel (0xA2/162, standard ASTC, KTX2_KDF_DF_MODEL_ASTC) + 0x5,0x5,0x0,0x0, // 4 texelBlockDimension0-texelBlockDimension3 + 0x10,0x0,0x0,0x0, // 5 bytesPlane0-bytesPlane3 + 0x0,0x0,0x0,0x0, // 6 bytesPlane4-bytesPlane7 + 0x0,0x0,0x7F,0x80, // 7 bitLength/bitOffset/channelType and Qualifer flags (KHR_DF_SAMPLE_DATATYPE_FLOAT etc.) 
+ 0x0,0x0,0x0,0x0, // 8 samplePosition0-samplePosition3 + 0x0,0x0,0x0,0x0, // 9 sampleLower (0.0) + 0x00, 0x00, 0x80, 0x3F // 10 sampleHigher (1.0) + }; + + // colorModel=KTX2_KDF_DF_MODEL_ASTC_HDR_6X6_INTERMEDIATE (0xA8) + // Our custom intermediate format that when decoded directly outputs ASTC HDR 6x6 + static uint8_t g_ktx2_astc_hdr_6x6_intermediate_nonalpha_dfd[44] = + { + 0x2C,0x0,0x0,0x0, // 0 totalSize + 0x0,0x0,0x0,0x0, // 1 descriptorType/vendorId + 0x2,0x0,0x28,0x0, // 2 descriptorBlockSize/versionNumber + 0xA8,0x1,0x1,0x0, // 3 flags, transferFunction, colorPrimaries, colorModel (KTX2_KDF_DF_MODEL_ASTC_HDR_6X6_INTERMEDIATE) + 0x5,0x5,0x0,0x0, // 4 texelBlockDimension0-texelBlockDimension3 + 0x10,0x0,0x0,0x0, // 5 bytesPlane0-bytesPlane3 + 0x0,0x0,0x0,0x0, // 6 bytesPlane4-bytesPlane7 + 0x0,0x0,0x7F,0x80, // 7 bitLength/bitOffset/channelType and Qualifer flags (KHR_DF_SAMPLE_DATATYPE_FLOAT etc.) + 0x0,0x0,0x0,0x0, // 8 samplePosition0-samplePosition3 + 0x0,0x0,0x0,0x0, // 9 sampleLower (0.0) + 0x00, 0x00, 0x80, 0x3F // 10 sampleHigher (1.0) + }; + + bool basis_compressor::get_dfd(uint8_vec &dfd, const basist::ktx2_header &header) { const uint8_t* pDFD; uint32_t dfd_len; @@ -2992,22 +3298,50 @@ namespace basisu { if (m_params.m_hdr) { - pDFD = g_ktx2_uastc_hdr_nonalpha_dfd; - dfd_len = sizeof(g_ktx2_uastc_hdr_nonalpha_dfd); + switch (m_params.m_hdr_mode) + { + case hdr_modes::cUASTC_HDR_4X4: + { + pDFD = g_ktx2_uastc_hdr_4x4_nonalpha_dfd; + dfd_len = sizeof(g_ktx2_uastc_hdr_4x4_nonalpha_dfd); + break; + } + case hdr_modes::cASTC_HDR_6X6: + { + pDFD = g_ktx2_astc_hdr_6x6_nonalpha_dfd; + dfd_len = sizeof(g_ktx2_astc_hdr_6x6_nonalpha_dfd); + break; + } + case hdr_modes::cASTC_HDR_6X6_INTERMEDIATE: + { + pDFD = g_ktx2_astc_hdr_6x6_intermediate_nonalpha_dfd; + dfd_len = sizeof(g_ktx2_astc_hdr_6x6_intermediate_nonalpha_dfd); + break; + } + default: + { + assert(0); + return false; + } + } } + // Must be LDR UASTC 4x4 else if (m_any_source_image_has_alpha) { - 
pDFD = g_ktx2_uastc_alpha_dfd; - dfd_len = sizeof(g_ktx2_uastc_alpha_dfd); + pDFD = g_ktx2_uastc_ldr_4x4_alpha_dfd; + dfd_len = sizeof(g_ktx2_uastc_ldr_4x4_alpha_dfd); } else { - pDFD = g_ktx2_uastc_nonalpha_dfd; - dfd_len = sizeof(g_ktx2_uastc_nonalpha_dfd); + pDFD = g_ktx2_uastc_ldr_4x4_nonalpha_dfd; + dfd_len = sizeof(g_ktx2_uastc_ldr_4x4_nonalpha_dfd); } } else { + // Must be ETC1S. + assert(!m_params.m_hdr); + if (m_any_source_image_has_alpha) { pDFD = g_ktx2_etc1s_alpha_dfd; @@ -3026,7 +3360,15 @@ namespace basisu memcpy(dfd.data(), pDFD, dfd_len); uint32_t dfd_bits = basisu::read_le_dword(dfd.data() + 3 * sizeof(uint32_t)); + + // Color primaries + if ((m_params.m_hdr) && (m_params.m_astc_hdr_6x6_options.m_rec2020_bt2100_color_gamut)) + { + dfd_bits &= ~(0xFF << 8); + dfd_bits |= (basist::KTX2_DF_PRIMARIES_BT2020 << 8); + } + // Transfer function dfd_bits &= ~(0xFF << 16); if (m_params.m_hdr) @@ -3068,14 +3410,55 @@ namespace basisu } basisu::write_le_dword(dfd.data() + 7 * sizeof(uint32_t), dfd_chan0); + + return true; } bool basis_compressor::create_ktx2_file() { - if (m_params.m_uastc) + //bool needs_global_data = false; + bool can_use_zstd = false; + + switch (m_fmt_mode) + { + case basist::basis_tex_format::cETC1S: + { + //needs_global_data = true; + break; + } + case basist::basis_tex_format::cUASTC4x4: + { + can_use_zstd = true; + break; + } + case basist::basis_tex_format::cUASTC_HDR_4x4: + { + can_use_zstd = true; + break; + } + case basist::basis_tex_format::cASTC_HDR_6x6: + { + can_use_zstd = true; + break; + } + case basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE: + { + //needs_global_data = true; + break; + } + default: + assert(0); + fmt_debug_printf("HERE 1\n"); + return false; + } + + if (can_use_zstd) { if ((m_params.m_ktx2_uastc_supercompression != basist::KTX2_SS_NONE) && (m_params.m_ktx2_uastc_supercompression != basist::KTX2_SS_ZSTANDARD)) + { + fmt_debug_printf("HERE 2\n"); return false; + } } const basisu_backend_output& 
backend_output = m_backend.get_output(); @@ -3083,7 +3466,7 @@ namespace basisu // Determine the width/height, number of array layers, mipmap levels, and the number of faces (1 for 2D, 6 for cubemap). // This does not support 1D or 3D. uint32_t base_width = 0, base_height = 0, total_layers = 0, total_levels = 0, total_faces = 1; - + for (uint32_t i = 0; i < m_slice_descs.size(); i++) { if ((m_slice_descs[i].m_mip_index == 0) && (!base_width)) @@ -3101,7 +3484,7 @@ namespace basisu if (m_params.m_tex_type == basist::cBASISTexTypeCubemapArray) { assert((total_layers % 6) == 0); - + total_layers /= 6; assert(total_layers >= 1); @@ -3115,17 +3498,33 @@ namespace basisu header.m_pixel_width = base_width; header.m_pixel_height = base_height; header.m_face_count = total_faces; - + if (m_params.m_hdr) - header.m_vk_format = basist::KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK; + { + if (m_params.m_hdr_mode == hdr_modes::cUASTC_HDR_4X4) + header.m_vk_format = basist::KTX2_FORMAT_ASTC_4x4_SFLOAT_BLOCK; + else if (m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6) + header.m_vk_format = basist::KTX2_FORMAT_ASTC_6x6_SFLOAT_BLOCK; + else + { + assert(m_params.m_hdr_mode == hdr_modes::cASTC_HDR_6X6_INTERMEDIATE); + + header.m_vk_format = basist::KTX2_VK_FORMAT_UNDEFINED; + } + } else + { + // Either ETC1S or UASTC LDR 4x4. + assert((m_fmt_mode == basist::basis_tex_format::cETC1S) || (m_fmt_mode == basist::basis_tex_format::cUASTC4x4)); + header.m_vk_format = basist::KTX2_VK_FORMAT_UNDEFINED; + } header.m_type_size = 1; header.m_level_count = total_levels; header.m_layer_count = (total_layers > 1) ? 
total_layers : 0; - if (m_params.m_uastc) + if (can_use_zstd) { switch (m_params.m_ktx2_uastc_supercompression) { @@ -3143,13 +3542,16 @@ namespace basisu #endif break; } - default: assert(0); return false; + default: + assert(0); + fmt_debug_printf("HERE 3\n"); + return false; } } basisu::vector level_data_bytes(total_levels); basisu::vector compressed_level_data_bytes(total_levels); - uint_vec slice_level_offsets(m_slice_descs.size()); + size_t_vec slice_level_offsets(m_slice_descs.size()); // This will append the texture data in the correct order (for each level: layer, then face). for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) @@ -3158,14 +3560,18 @@ namespace basisu slice_level_offsets[slice_index] = level_data_bytes[slice_desc.m_mip_index].size(); - if (m_params.m_uastc) - append_vector(level_data_bytes[slice_desc.m_mip_index], m_uastc_backend_output.m_slice_image_data[slice_index]); - else + if (m_fmt_mode == basist::basis_tex_format::cETC1S) + { append_vector(level_data_bytes[slice_desc.m_mip_index], backend_output.m_slice_image_data[slice_index]); + } + else + { + append_vector(level_data_bytes[slice_desc.m_mip_index], m_uastc_backend_output.m_slice_image_data[slice_index]); + } } - // UASTC supercompression - if ((m_params.m_uastc) && (header.m_supercompression_scheme == basist::KTX2_SS_ZSTANDARD)) + // Zstd Supercompression + if ((can_use_zstd) && (header.m_supercompression_scheme == basist::KTX2_SS_ZSTANDARD)) { #if BASISD_SUPPORT_KTX2_ZSTD for (uint32_t level_index = 0; level_index < total_levels; level_index++) @@ -3177,13 +3583,17 @@ namespace basisu m_params.m_ktx2_zstd_supercompression_level); if (ZSTD_isError(result)) + { + fmt_debug_printf("HERE 5\n"); return false; + } compressed_level_data_bytes[level_index].resize(result); } #else // Can't get here assert(0); + fmt_debug_printf("HERE 6\n"); return false; #endif } @@ -3192,11 +3602,11 @@ namespace basisu // No supercompression compressed_level_data_bytes = 
level_data_bytes; } - - uint8_vec etc1s_global_data; + + uint8_vec ktx2_global_data; // Create ETC1S global supercompressed data - if (!m_params.m_uastc) + if (m_fmt_mode == basist::basis_tex_format::cETC1S) { basist::ktx2_etc1s_global_data_header etc1s_global_data_header; clear_obj(etc1s_global_data_header); @@ -3241,31 +3651,58 @@ namespace basisu } } // slice_index - append_vector(etc1s_global_data, (const uint8_t*)&etc1s_global_data_header, sizeof(etc1s_global_data_header)); - append_vector(etc1s_global_data, (const uint8_t*)etc1s_image_descs.data(), etc1s_image_descs.size_in_bytes()); - append_vector(etc1s_global_data, backend_output.m_endpoint_palette); - append_vector(etc1s_global_data, backend_output.m_selector_palette); - append_vector(etc1s_global_data, backend_output.m_slice_image_tables); - + append_vector(ktx2_global_data, (const uint8_t*)&etc1s_global_data_header, sizeof(etc1s_global_data_header)); + append_vector(ktx2_global_data, (const uint8_t*)etc1s_image_descs.data(), etc1s_image_descs.size_in_bytes()); + append_vector(ktx2_global_data, backend_output.m_endpoint_palette); + append_vector(ktx2_global_data, backend_output.m_selector_palette); + append_vector(ktx2_global_data, backend_output.m_slice_image_tables); + + header.m_supercompression_scheme = basist::KTX2_SS_BASISLZ; + } + else if (m_fmt_mode == basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE) + { + basisu::vector image_descs(total_levels * total_layers * total_faces); + memset(image_descs.data(), 0, image_descs.size_in_bytes()); + + for (uint32_t slice_index = 0; slice_index < m_slice_descs.size(); slice_index++) + { + const basisu_backend_slice_desc& slice_desc = m_slice_descs[slice_index]; + + const uint32_t level_index = slice_desc.m_mip_index; + uint32_t layer_index = slice_desc.m_source_file_index; + uint32_t face_index = 0; + + if (m_params.m_tex_type == basist::cBASISTexTypeCubemapArray) + { + face_index = layer_index % 6; + layer_index /= 6; + } + + const uint32_t 
output_image_index = level_index * (total_layers * total_faces) + layer_index * total_faces + face_index; + + image_descs[output_image_index].m_rgb_slice_byte_length = m_uastc_backend_output.m_slice_image_data[slice_index].size(); + image_descs[output_image_index].m_rgb_slice_byte_offset = slice_level_offsets[slice_index]; + + } // slice_index + + append_vector(ktx2_global_data, (const uint8_t*)image_descs.data(), image_descs.size_in_bytes()); + header.m_supercompression_scheme = basist::KTX2_SS_BASISLZ; } // Key values basist::ktx2_transcoder::key_value_vec key_values(m_params.m_ktx2_key_values); - key_values.enlarge(1); - const char* pKTXwriter = "KTXwriter"; - key_values.back().m_key.resize(strlen(pKTXwriter) + 1); - memcpy(key_values.back().m_key.data(), pKTXwriter, strlen(pKTXwriter) + 1); + basist::ktx2_add_key_value(key_values, "KTXwriter", fmt_string("Basis Universal {}", BASISU_LIB_VERSION_STRING)); - char writer_id[128]; -#ifdef _MSC_VER - sprintf_s(writer_id, sizeof(writer_id), "Basis Universal %s", BASISU_LIB_VERSION_STRING); -#else - snprintf(writer_id, sizeof(writer_id), "Basis Universal %s", BASISU_LIB_VERSION_STRING); -#endif - key_values.back().m_value.resize(strlen(writer_id) + 1); - memcpy(key_values.back().m_value.data(), writer_id, strlen(writer_id) + 1); + if (m_params.m_hdr) + { + if (m_upconverted_any_ldr_images) + basist::ktx2_add_key_value(key_values, "LDRUpconversionMultiplier", fmt_string("{}", m_ldr_to_hdr_upconversion_nit_multiplier)); + + if (m_params.m_ldr_hdr_upconversion_srgb_to_linear) + basist::ktx2_add_key_value(key_values, "LDRUpconversionSRGBToLinear", "1"); + } key_values.sort(); @@ -3278,23 +3715,36 @@ namespace basisu // DFD uint8_vec dfd; - get_dfd(dfd, header); + if (!get_dfd(dfd, header)) + { + fmt_debug_printf("HERE 7\n"); + return false; + } - const uint32_t kvd_file_offset = sizeof(header) + sizeof(basist::ktx2_level_index) * total_levels + dfd.size(); + const uint32_t kvd_file_offset = sizeof(header) + 
sizeof(basist::ktx2_level_index) * total_levels + (uint32_t)dfd.size(); for (uint32_t pass = 0; pass < 2; pass++) { for (uint32_t i = 0; i < key_values.size(); i++) { if (key_values[i].m_key.size() < 2) + { + fmt_debug_printf("HERE 8\n"); return false; + } if (key_values[i].m_key.back() != 0) + { + fmt_debug_printf("HERE 9\n"); return false; + } const uint64_t total_len = (uint64_t)key_values[i].m_key.size() + (uint64_t)key_values[i].m_value.size(); if (total_len >= UINT32_MAX) + { + fmt_debug_printf("HERE 10\n"); return false; + } packed_uint<4> le_len((uint32_t)total_len); append_vector(key_value_data, (const uint8_t*)&le_len, sizeof(le_len)); @@ -3316,7 +3766,7 @@ namespace basisu #endif // Hack to ensure the KVD block ends on a 16 byte boundary, because we have no other official way of aligning the data. - uint32_t kvd_end_file_offset = kvd_file_offset + key_value_data.size(); + uint32_t kvd_end_file_offset = kvd_file_offset + (uint32_t)key_value_data.size(); uint32_t bytes_needed_to_pad = (16 - (kvd_end_file_offset & 15)) & 15; if (!bytes_needed_to_pad) { @@ -3326,7 +3776,10 @@ namespace basisu assert(!pass); if (pass) + { + fmt_debug_printf("HERE 11\n"); return false; + } if (bytes_needed_to_pad < 6) bytes_needed_to_pad += 16; @@ -3365,7 +3818,7 @@ namespace basisu // DFD const uint8_t* pDFD = dfd.data(); - uint32_t dfd_len = dfd.size(); + uint32_t dfd_len = (uint32_t)dfd.size(); header.m_dfd_byte_offset = m_output_ktx2_file.size(); header.m_dfd_byte_length = dfd_len; @@ -3382,17 +3835,17 @@ namespace basisu } // Global Supercompressed Data - if (etc1s_global_data.size()) + if (ktx2_global_data.size()) { uint32_t ofs = m_output_ktx2_file.size() & 7; uint32_t padding = (8 - ofs) & 7; for (uint32_t i = 0; i < padding; i++) m_output_ktx2_file.push_back(0); - header.m_sgd_byte_length = etc1s_global_data.size(); + header.m_sgd_byte_length = ktx2_global_data.size(); header.m_sgd_byte_offset = m_output_ktx2_file.size(); - append_vector(m_output_ktx2_file, 
etc1s_global_data); + append_vector(m_output_ktx2_file, ktx2_global_data); } // mipPadding @@ -3416,8 +3869,12 @@ namespace basisu for (int level = total_levels - 1; level >= 0; level--) { level_index_array[level].m_byte_length = compressed_level_data_bytes[level].size(); - if (m_params.m_uastc) + + //if (m_params.m_uastc) + if (can_use_zstd) + { level_index_array[level].m_uncompressed_byte_length = level_data_bytes[level].size(); + } level_index_array[level].m_byte_offset = m_output_ktx2_file.size(); append_vector(m_output_ktx2_file, compressed_level_data_bytes[level]); @@ -3429,7 +3886,15 @@ namespace basisu // Write final level index array memcpy(m_output_ktx2_file.data() + sizeof(header), level_index_array.data(), level_index_array.size_in_bytes()); - debug_printf("Total .ktx2 output file size: %u\n", m_output_ktx2_file.size()); + uint32_t total_orig_pixels = 0; + + for (uint32_t i = 0; i < m_slice_descs.size(); i++) + { + const basisu_backend_slice_desc& slice_desc = m_slice_descs[i]; + total_orig_pixels += slice_desc.m_orig_width * slice_desc.m_orig_height; + } + + debug_printf("Total .ktx2 output file size: %u, %3.3f bits/texel\n", m_output_ktx2_file.size(), ((float)m_output_ktx2_file.size() * 8.0f) / total_orig_pixels); return true; } @@ -3455,10 +3920,10 @@ namespace basisu results_vec.resize(params_vec.size()); std::atomic result; - result = true; + result.store(true); std::atomic opencl_failed; - opencl_failed = false; + opencl_failed.store(false); for (uint32_t pindex = 0; pindex < params_vec.size(); pindex++) { @@ -3485,14 +3950,14 @@ namespace basisu bool status = c.init(params); if (c.get_opencl_failed()) - opencl_failed = true; + opencl_failed.store(true); if (status) { basis_compressor::error_code ec = c.process(); if (c.get_opencl_failed()) - opencl_failed = true; + opencl_failed.store(true); results.m_error_code = ec; @@ -3530,6 +3995,7 @@ namespace basisu } static void* basis_compress( + basist::basis_tex_format mode, const basisu::vector 
*pSource_images, const basisu::vector *pSource_images_hdr, uint32_t flags_and_quality, float uastc_rdo_quality, @@ -3570,6 +4036,8 @@ namespace basisu // Initialize the compressor parameter struct basis_compressor_params comp_params; + comp_params.set_format_mode(mode); + comp_params.m_pJob_pool = &jp; comp_params.m_y_flip = (flags_and_quality & cFlagYFlip) != 0; @@ -3618,25 +4086,15 @@ namespace basisu comp_params.m_mip_gen = (flags_and_quality & (cFlagGenMipsWrap | cFlagGenMipsClamp)) != 0; comp_params.m_mip_wrapping = (flags_and_quality & cFlagGenMipsWrap) != 0; - if ((pSource_images_hdr) || (flags_and_quality & cFlagHDR)) + if (mode == basist::basis_tex_format::cUASTC4x4) { - // In UASTC HDR mode, the compressor will jam this to true anyway. - // And there's no need to set UASTC LDR or ETC1S options. - comp_params.m_uastc = true; + comp_params.m_pack_uastc_ldr_4x4_flags = flags_and_quality & cPackUASTCLevelMask; + comp_params.m_rdo_uastc_ldr_4x4 = (flags_and_quality & cFlagUASTCRDO) != 0; + comp_params.m_rdo_uastc_ldr_4x4_quality_scalar = uastc_rdo_quality; } - else + else if (mode == basist::basis_tex_format::cETC1S) { - comp_params.m_uastc = (flags_and_quality & cFlagUASTC) != 0; - if (comp_params.m_uastc) - { - comp_params.m_pack_uastc_flags = flags_and_quality & cPackUASTCLevelMask; - comp_params.m_rdo_uastc = (flags_and_quality & cFlagUASTCRDO) != 0; - comp_params.m_rdo_uastc_quality_scalar = uastc_rdo_quality; - } - else - { - comp_params.m_quality_level = basisu::maximum(1, flags_and_quality & 255); - } + comp_params.m_etc1s_quality_level = basisu::maximum(1, flags_and_quality & 255); } comp_params.m_create_ktx2_file = (flags_and_quality & cFlagKTX2) != 0; @@ -3654,14 +4112,16 @@ namespace basisu comp_params.m_print_stats = (flags_and_quality & cFlagPrintStats) != 0; comp_params.m_status_output = (flags_and_quality & cFlagPrintStatus) != 0; - if ((flags_and_quality & cFlagHDR) || (pSource_images_hdr)) + if (mode == 
basist::basis_tex_format::cUASTC_HDR_4x4) { - comp_params.m_hdr = true; - comp_params.m_uastc_hdr_options.set_quality_level(flags_and_quality & cPackUASTCLevelMask); + comp_params.m_uastc_hdr_4x4_options.set_quality_level(flags_and_quality & cPackUASTCLevelMask); + } + else if ((mode == basist::basis_tex_format::cASTC_HDR_6x6) || (mode == basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE)) + { + comp_params.m_astc_hdr_6x6_options.set_user_level(flags_and_quality & cPackUASTCLevelMask); + comp_params.m_astc_hdr_6x6_options.m_lambda = uastc_rdo_quality; + comp_params.m_astc_hdr_6x6_options.m_rec2020_bt2100_color_gamut = (flags_and_quality & cFlagREC2020) != 0; } - - if (flags_and_quality & cFlagHDRLDRImageSRGBToLinearConversion) - comp_params.m_hdr_ldr_srgb_to_linear_conversion = true; // Create the compressor, initialize it, and process the input basis_compressor comp; @@ -3707,24 +4167,27 @@ namespace basisu } void* basis_compress( + basist::basis_tex_format mode, const basisu::vector& source_images, uint32_t flags_and_quality, float uastc_rdo_quality, size_t* pSize, image_stats* pStats) { - return basis_compress(&source_images, nullptr, flags_and_quality, uastc_rdo_quality, pSize, pStats); + return basis_compress(mode, &source_images, nullptr, flags_and_quality, uastc_rdo_quality, pSize, pStats); } void* basis_compress( + basist::basis_tex_format mode, const basisu::vector& source_images_hdr, - uint32_t flags_and_quality, + uint32_t flags_and_quality, float lambda, size_t* pSize, image_stats* pStats) { - return basis_compress(nullptr, &source_images_hdr, flags_and_quality, 0.0f, pSize, pStats); + return basis_compress(mode, nullptr, &source_images_hdr, flags_and_quality, lambda, pSize, pStats); } void* basis_compress( + basist::basis_tex_format mode, const uint8_t* pImageRGBA, uint32_t width, uint32_t height, uint32_t pitch_in_pixels, uint32_t flags_and_quality, float uastc_rdo_quality, size_t* pSize, @@ -3754,7 +4217,7 @@ namespace basisu for (uint32_t y = 0; 
y < height; y++) memcpy(source_image[0].get_ptr() + y * width, (const color_rgba*)pImageRGBA + y * pitch_in_pixels, width * sizeof(color_rgba)); - return basis_compress(source_image, flags_and_quality, uastc_rdo_quality, pSize, pStats); + return basis_compress(mode, source_image, flags_and_quality, uastc_rdo_quality, pSize, pStats); } void basis_free_data(void* p) @@ -3807,6 +4270,7 @@ namespace basisu { tm.start(); void* pComp_data = basis_compress( + basist::basis_tex_format::cETC1S, images, flags_and_quality, 1.0f, &comp_size, @@ -3829,6 +4293,7 @@ namespace basisu { tm.start(); void* pComp_data = basis_compress( + basist::basis_tex_format::cETC1S, images, flags_and_quality | cFlagUseOpenCL, 1.0f, &comp_size, diff --git a/thirdparty/basis_universal/encoder/basisu_comp.h b/thirdparty/basis_universal/encoder/basisu_comp.h index 1cc75fc8a385..ffa1fdf2aae2 100644 --- a/thirdparty/basis_universal/encoder/basisu_comp.h +++ b/thirdparty/basis_universal/encoder/basisu_comp.h @@ -18,10 +18,11 @@ #include "basisu_basis_file.h" #include "../transcoder/basisu_transcoder.h" #include "basisu_uastc_enc.h" -#include "basisu_astc_hdr_enc.h" +#include "basisu_uastc_hdr_4x4_enc.h" +#include "basisu_astc_hdr_6x6_enc.h" -#define BASISU_LIB_VERSION 150 -#define BASISU_LIB_VERSION_STRING "1.50" +#define BASISU_LIB_VERSION 160 +#define BASISU_LIB_VERSION_STRING "1.60" #ifndef BASISD_SUPPORT_KTX2 #error BASISD_SUPPORT_KTX2 is undefined @@ -76,6 +77,8 @@ namespace basisu m_height = 0; m_basis_rgb_avg_psnr = 0.0f; + m_basis_rgb_avg_log2_psnr = 0.0f; + m_basis_rgba_avg_psnr = 0.0f; m_basis_a_avg_psnr = 0.0f; m_basis_luma_709_psnr = 0.0f; @@ -83,6 +86,7 @@ namespace basisu m_basis_luma_709_ssim = 0.0f; m_basis_rgb_avg_bc6h_psnr = 0.0f; + m_basis_rgb_avg_bc6h_log2_psnr = 0.0f; m_bc7_rgb_avg_psnr = 0.0f; m_bc7_rgba_avg_psnr = 0.0f; @@ -105,6 +109,8 @@ namespace basisu // .basis/.ktx2 compressed (LDR: ETC1S or UASTC statistics, HDR: transcoded BC6H statistics) float m_basis_rgb_avg_psnr; + 
float m_basis_rgb_avg_log2_psnr; + float m_basis_rgba_avg_psnr; float m_basis_a_avg_psnr; float m_basis_luma_709_psnr; @@ -113,6 +119,7 @@ namespace basisu // UASTC HDR only. float m_basis_rgb_avg_bc6h_psnr; + float m_basis_rgb_avg_bc6h_log2_psnr; // LDR: BC7 statistics float m_bc7_rgb_avg_psnr; @@ -131,6 +138,17 @@ namespace basisu bool m_opencl_failed; }; + enum class hdr_modes + { + // standard but constrained ASTC HDR 4x4 tex data that can be rapidly transcoded to BC6H + cUASTC_HDR_4X4, + // standard RDO optimized or non-RDO (highest quality) ASTC HDR 6x6 tex data that can be rapidly re-encoded to BC6H + cASTC_HDR_6X6, + // a custom intermediate format based off ASTC HDR that can be rapidly decoded straight to ASTC HDR or re-encoded to BC6H + cASTC_HDR_6X6_INTERMEDIATE, + cTotal + }; + template struct bool_param { @@ -220,21 +238,23 @@ namespace basisu m_endpoint_rdo_thresh(BASISU_DEFAULT_ENDPOINT_RDO_THRESH, 0.0f, 1e+10f), m_mip_scale(1.0f, .000125f, 4.0f), m_mip_smallest_dimension(1, 1, 16384), - m_max_endpoint_clusters(512), - m_max_selector_clusters(512), - m_quality_level(-1), - m_pack_uastc_flags(cPackUASTCLevelDefault), - m_rdo_uastc_quality_scalar(1.0f, 0.001f, 50.0f), - m_rdo_uastc_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX), - m_rdo_uastc_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f), - m_rdo_uastc_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f), - m_rdo_uastc_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f), - m_rdo_uastc_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f), + m_etc1s_max_endpoint_clusters(512), + m_etc1s_max_selector_clusters(512), + m_etc1s_quality_level(-1), + m_pack_uastc_ldr_4x4_flags(cPackUASTCLevelDefault), + m_rdo_uastc_ldr_4x4_quality_scalar(1.0f, 0.001f, 50.0f), + 
m_rdo_uastc_ldr_4x4_dict_size(BASISU_RDO_UASTC_DICT_SIZE_DEFAULT, BASISU_RDO_UASTC_DICT_SIZE_MIN, BASISU_RDO_UASTC_DICT_SIZE_MAX), + m_rdo_uastc_ldr_4x4_max_smooth_block_error_scale(UASTC_RDO_DEFAULT_SMOOTH_BLOCK_MAX_ERROR_SCALE, 1.0f, 300.0f), + m_rdo_uastc_ldr_4x4_smooth_block_max_std_dev(UASTC_RDO_DEFAULT_MAX_SMOOTH_BLOCK_STD_DEV, .01f, 65536.0f), + m_rdo_uastc_ldr_4x4_max_allowed_rms_increase_ratio(UASTC_RDO_DEFAULT_MAX_ALLOWED_RMS_INCREASE_RATIO, .01f, 100.0f), + m_rdo_uastc_ldr_4x4_skip_block_rms_thresh(UASTC_RDO_DEFAULT_SKIP_BLOCK_RMS_THRESH, .01f, 100.0f), m_resample_width(0, 1, 16384), m_resample_height(0, 1, 16384), m_resample_factor(0.0f, .00125f, 100.0f), m_ktx2_uastc_supercompression(basist::KTX2_SS_NONE), m_ktx2_zstd_supercompression_level(6, INT_MIN, INT_MAX), + m_ldr_hdr_upconversion_nit_multiplier(0.0f, 0.0f, basist::MAX_HALF_FLOAT), + m_ldr_hdr_upconversion_black_bias(0.0f, 0.0f, 1.0f), m_pJob_pool(nullptr) { clear(); @@ -243,6 +263,9 @@ namespace basisu void clear() { m_uastc.clear(); + m_hdr.clear(); + m_hdr_mode = hdr_modes::cUASTC_HDR_4X4; + m_use_opencl.clear(); m_status_output.clear(); @@ -290,24 +313,24 @@ namespace basisu m_mip_fast.clear(); m_mip_smallest_dimension.clear(); - m_max_endpoint_clusters = 0; - m_max_selector_clusters = 0; - m_quality_level = -1; + m_etc1s_max_endpoint_clusters = 0; + m_etc1s_max_selector_clusters = 0; + m_etc1s_quality_level = -1; m_tex_type = basist::cBASISTexType2D; m_userdata0 = 0; m_userdata1 = 0; m_us_per_frame = 0; - m_pack_uastc_flags = cPackUASTCLevelDefault; - m_rdo_uastc.clear(); - m_rdo_uastc_quality_scalar.clear(); - m_rdo_uastc_max_smooth_block_error_scale.clear(); - m_rdo_uastc_smooth_block_max_std_dev.clear(); - m_rdo_uastc_max_allowed_rms_increase_ratio.clear(); - m_rdo_uastc_skip_block_rms_thresh.clear(); - m_rdo_uastc_favor_simpler_modes_in_rdo_mode.clear(); - m_rdo_uastc_multithreading.clear(); + m_pack_uastc_ldr_4x4_flags = cPackUASTCLevelDefault; + m_rdo_uastc_ldr_4x4.clear(); + 
m_rdo_uastc_ldr_4x4_quality_scalar.clear(); + m_rdo_uastc_ldr_4x4_max_smooth_block_error_scale.clear(); + m_rdo_uastc_ldr_4x4_smooth_block_max_std_dev.clear(); + m_rdo_uastc_ldr_4x4_max_allowed_rms_increase_ratio.clear(); + m_rdo_uastc_ldr_4x4_skip_block_rms_thresh.clear(); + m_rdo_uastc_ldr_4x4_favor_simpler_modes_in_rdo_mode.clear(); + m_rdo_uastc_ldr_4x4_multithreading.clear(); m_resample_width.clear(); m_resample_height.clear(); @@ -323,19 +346,80 @@ namespace basisu m_validate_output_data.clear(); - m_hdr_ldr_srgb_to_linear_conversion.clear(); + m_ldr_hdr_upconversion_srgb_to_linear.clear(); m_hdr_favor_astc.clear(); + m_uastc_hdr_4x4_options.init(); + m_astc_hdr_6x6_options.clear(); + + m_ldr_hdr_upconversion_nit_multiplier.clear(); + m_ldr_hdr_upconversion_black_bias.clear(); + m_pJob_pool = nullptr; } - + + // Configures the compressor's mode by setting the proper parameters (which were preserved for backwards compatibility with old code). + void set_format_mode(basist::basis_tex_format m) + { + switch (m) + { + case basist::basis_tex_format::cETC1S: + { + m_hdr = false; + m_uastc = false; + m_hdr_mode = hdr_modes::cUASTC_HDR_4X4; // doesn't matter + break; + } + case basist::basis_tex_format::cUASTC4x4: + { + m_hdr = false; + m_uastc = true; + m_hdr_mode = hdr_modes::cUASTC_HDR_4X4; // doesn't matter + break; + } + case basist::basis_tex_format::cUASTC_HDR_4x4: + { + m_hdr = true; + m_uastc = true; + m_hdr_mode = hdr_modes::cUASTC_HDR_4X4; + break; + } + case basist::basis_tex_format::cASTC_HDR_6x6: + { + m_hdr = true; + m_uastc = true; + m_hdr_mode = hdr_modes::cASTC_HDR_6X6; + break; + } + case basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE: + { + m_hdr = true; + m_uastc = true; + m_hdr_mode = hdr_modes::cASTC_HDR_6X6_INTERMEDIATE; + break; + } + default: + assert(0); + break; + } + } + + // By default we generate LDR ETC1S data. + // if m_uastc is true but m_hdr is not true, we generate UASTC 4x4 LDR data (8bpp with or without RDO). 
+ // if m_uastc is true and m_hdr is true, we generate 4x4 or 6x6 HDR data (either standard ASTC, constrained ASTC, RDO ASTC, or intermediate), controlled by m_hdr_mode. + // True to generate UASTC .basis/.KTX2 file data, otherwise ETC1S. + // Should be true for any non-ETC1S format (UASTC 4x4 LDR, UASTC 4x4 HDR, RDO ASTC 6x6 HDR, and ASTC 6x6 HDR intermediate). bool_param m_uastc; - // Set m_hdr to true to switch to UASTC HDR mode. + // Set m_hdr to true to switch to UASTC HDR mode. m_hdr_mode then controls which format is output. + // m_hdr_mode then controls which format is output (4x4, 6x6, or 6x6 intermediate). bool_param m_hdr; + // If m_hdr is true, this specifies which mode we operate in (currently UASTC 4x4 HDR or ASTC 6x6 HDR). Defaults to UASTC 4x4 HDR for backwards compatibility. + hdr_modes m_hdr_mode; + bool_param m_use_opencl; // If m_read_source_images is true, m_source_filenames (and optionally m_source_alpha_filenames) contains the filenames of PNG etc. images to read. @@ -426,30 +510,31 @@ namespace basisu bool_param m_mip_fast; param m_mip_smallest_dimension; - // Codebook size (quality) control. - // If m_quality_level != -1, it controls the quality level. It ranges from [1,255] or [BASISU_QUALITY_MIN, BASISU_QUALITY_MAX]. + // ETC1S codebook size (quality) control. + // If m_etc1s_quality_level != -1, it controls the quality level. It ranges from [1,255] or [BASISU_QUALITY_MIN, BASISU_QUALITY_MAX]. // Otherwise m_max_endpoint_clusters/m_max_selector_clusters controls the codebook sizes directly. - uint32_t m_max_endpoint_clusters; - uint32_t m_max_selector_clusters; - int m_quality_level; + uint32_t m_etc1s_max_endpoint_clusters; + uint32_t m_etc1s_max_selector_clusters; + int m_etc1s_quality_level; - // m_tex_type, m_userdata0, m_userdata1, m_framerate - These fields go directly into the Basis file header. + // m_tex_type, m_userdata0, m_userdata1, m_framerate - These fields go directly into the .basis file header. 
basist::basis_texture_type m_tex_type; uint32_t m_userdata0; uint32_t m_userdata1; uint32_t m_us_per_frame; + // UASTC LDR 4x4 parameters // cPackUASTCLevelDefault, etc. - uint32_t m_pack_uastc_flags; - bool_param m_rdo_uastc; - param m_rdo_uastc_quality_scalar; - param m_rdo_uastc_dict_size; - param m_rdo_uastc_max_smooth_block_error_scale; - param m_rdo_uastc_smooth_block_max_std_dev; - param m_rdo_uastc_max_allowed_rms_increase_ratio; - param m_rdo_uastc_skip_block_rms_thresh; - bool_param m_rdo_uastc_favor_simpler_modes_in_rdo_mode; - bool_param m_rdo_uastc_multithreading; + uint32_t m_pack_uastc_ldr_4x4_flags; + bool_param m_rdo_uastc_ldr_4x4; + param m_rdo_uastc_ldr_4x4_quality_scalar; + param m_rdo_uastc_ldr_4x4_dict_size; + param m_rdo_uastc_ldr_4x4_max_smooth_block_error_scale; + param m_rdo_uastc_ldr_4x4_smooth_block_max_std_dev; + param m_rdo_uastc_ldr_4x4_max_allowed_rms_increase_ratio; + param m_rdo_uastc_ldr_4x4_skip_block_rms_thresh; + bool_param m_rdo_uastc_ldr_4x4_favor_simpler_modes_in_rdo_mode; + bool_param m_rdo_uastc_ldr_4x4_multithreading; param m_resample_width; param m_resample_height; @@ -465,13 +550,26 @@ namespace basisu param m_ktx2_zstd_supercompression_level; bool_param m_ktx2_srgb_transfer_func; - astc_hdr_codec_options m_uastc_hdr_options; + uastc_hdr_4x4_codec_options m_uastc_hdr_4x4_options; + astc_6x6_hdr::astc_hdr_6x6_global_config m_astc_hdr_6x6_options; bool_param m_validate_output_data; - // If true, LDR images (such as PNG) will be converted to normalized [0,1] linear light (via a sRGB->Linear conversion) and then processed as HDR. - // Otherwise, LDR images will be processed as HDR as-is. - bool_param m_hdr_ldr_srgb_to_linear_conversion; + // LDR->HDR upconversion parameters. + // + // If true, LDR images (such as PNG) will be converted to normalized [0,1] linear light (via a sRGB->Linear conversion), or absolute luminance (nits or candelas per meter squared), and then processed as HDR. 
+ // Otherwise, LDR images are assumed to already be in linear light (i.e. they don't use the sRGB transfer function). + bool_param m_ldr_hdr_upconversion_srgb_to_linear; + + // m_ldr_hdr_upconversion_nit_multiplier is only used when loading SDR/LDR images and compressing to an HDR output format. + // By default m_ldr_hdr_upconversion_nit_multiplier is 0. It's an override for the default. + // When loading LDR images, a default multiplier of 1.0 will be used in UASTC 4x4 HDR mode. Partially for backwards compatibility with previous library releases, and also because it doesn't really matter with this encoder what the multiplier is. + // With the 6x6 HDR encoder it does matter because it expects inputs in absolute nits, so the LDR upconversion luminance multiplier default will be 100 nits. (Most SDR monitors were/are 80-100 nits or so.) + param m_ldr_hdr_upconversion_nit_multiplier; + + // The optional sRGB space bias to use during LDR->HDR upconversion. Should be between [0,.49] or so. Only applied on black (0.0) color components. + // Defaults to no bias (0.0f). + param m_ldr_hdr_upconversion_black_bias; // If true, ASTC HDR quality is favored more than BC6H quality. Otherwise it's a rough balance. bool_param m_hdr_favor_astc; @@ -526,8 +624,10 @@ namespace basisu private: basis_compressor_params m_params; - + opencl_context_ptr m_pOpenCL_context; + + basist::basis_tex_format m_fmt_mode; basisu::vector m_slice_images; basisu::vector m_slice_images_hdr; @@ -543,6 +643,7 @@ namespace basisu basisu_frontend m_frontend; + // These are 4x4 blocks. pixel_block_vec m_source_blocks; pixel_block_hdr_vec m_source_blocks_hdr; @@ -572,6 +673,15 @@ namespace basisu basisu::vector m_uastc_slice_textures; basisu_backend_output m_uastc_backend_output; + // The amount the HDR input has to be scaled up in case it had to be rescaled to fit into half floats. + float m_hdr_image_scale; + + // The upconversion multiplier used to load LDR images in HDR mode. 
+ float m_ldr_to_hdr_upconversion_nit_multiplier; + + // True if any loaded source images were LDR and upconverted to HDR. + bool m_upconverted_any_ldr_images; + bool m_any_source_image_has_alpha; bool m_opencl_failed; @@ -588,14 +698,48 @@ namespace basisu bool create_basis_file_and_transcode(); bool write_hdr_debug_images(const char* pBasename, const imagef& img, uint32_t width, uint32_t height); bool write_output_files_and_compute_stats(); - error_code encode_slices_to_uastc_hdr(); - error_code encode_slices_to_uastc(); + error_code encode_slices_to_astc_6x6_hdr(); + error_code encode_slices_to_uastc_4x4_hdr(); + error_code encode_slices_to_uastc_4x4_ldr(); bool generate_mipmaps(const imagef& img, basisu::vector& mips, bool has_alpha); bool generate_mipmaps(const image &img, basisu::vector &mips, bool has_alpha); bool validate_texture_type_constraints(); bool validate_ktx2_constraints(); - void get_dfd(uint8_vec& dfd, const basist::ktx2_header& hdr); + bool get_dfd(uint8_vec& dfd, const basist::ktx2_header& hdr); bool create_ktx2_file(); + void pick_format_mode(); + + uint32_t get_block_width() const + { + if (m_params.m_hdr) + { + switch (m_params.m_hdr_mode) + { + case hdr_modes::cASTC_HDR_6X6: + case hdr_modes::cASTC_HDR_6X6_INTERMEDIATE: + return 6; + default: + break; + } + } + return 4; + } + + uint32_t get_block_height() const + { + if (m_params.m_hdr) + { + switch (m_params.m_hdr_mode) + { + case hdr_modes::cASTC_HDR_6X6: + case hdr_modes::cASTC_HDR_6X6_INTERMEDIATE: + return 6; + default: + break; + } + } + return 4; + } }; // Alternative simple C-style wrapper API around the basis_compressor class. 
@@ -628,21 +772,19 @@ namespace basisu cFlagKTX2UASTCSuperCompression = 1 << 12, // use KTX2 Zstd supercompression on UASTC files cFlagSRGB = 1 << 13, // input texture is sRGB, use perceptual colorspace metrics, also use sRGB filtering during mipmap gen, and also sets KTX2 output transfer func to sRGB - cFlagGenMipsClamp = 1 << 14, // generate mipmaps with clamp addressing - cFlagGenMipsWrap = 1 << 15, // generate mipmaps with wrap addressing - - cFlagYFlip = 1 << 16, // flip source image on Y axis before compression + cFlagGenMipsClamp = 1 << 14, // generate mipmaps with clamp addressing + cFlagGenMipsWrap = 1 << 15, // generate mipmaps with wrap addressing - cFlagUASTC = 1 << 17, // use UASTC compression vs. ETC1S - cFlagUASTCRDO = 1 << 18, // use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar) + cFlagYFlip = 1 << 16, // flip source image on Y axis before compression - cFlagPrintStats = 1 << 19, // print image stats to stdout - cFlagPrintStatus = 1 << 20, // print status to stdout + cFlagUASTCRDO = 1 << 17, // use RDO postprocessing when generating UASTC files (must set uastc_rdo_quality to the quality scalar) - cFlagHDR = 1 << 21, // Force encoder into HDR mode, even if source image is LDR. - cFlagHDRLDRImageSRGBToLinearConversion = 1 << 22, // In HDR mode, convert LDR source images to linear before encoding. + cFlagPrintStats = 1 << 18, // print image stats to stdout + cFlagPrintStatus = 1 << 19, // print status to stdout - cFlagDebugImages = 1 << 23 // enable status output + cFlagDebugImages = 1 << 20, // enable status output + + cFlagREC2020 = 1 << 21 // ASTC 6x6 modes: treat input as REC 2020 vs. the default 709 }; // This function accepts an array of source images. @@ -652,6 +794,7 @@ namespace basisu // basisu_encoder_init() MUST be called first! // LDR version. To compress the LDR source image as HDR: Use the cFlagHDR flag. 
void* basis_compress( + basist::basis_tex_format mode, const basisu::vector &source_images, uint32_t flags_and_quality, float uastc_rdo_quality, size_t* pSize, @@ -660,14 +803,16 @@ namespace basisu // HDR-only version. // Important: The returned block MUST be manually freed using basis_free_data(). void* basis_compress( + basist::basis_tex_format mode, const basisu::vector& source_images_hdr, - uint32_t flags_and_quality, + uint32_t flags_and_quality, float lambda, size_t* pSize, image_stats* pStats = nullptr); // This function only accepts a single LDR source image. It's just a wrapper for basis_compress() above. // Important: The returned block MUST be manually freed using basis_free_data(). void* basis_compress( + basist::basis_tex_format mode, const uint8_t* pImageRGBA, uint32_t width, uint32_t height, uint32_t pitch_in_pixels, uint32_t flags_and_quality, float uastc_rdo_quality, size_t* pSize, diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp index fff98e830148..5987685ae712 100644 --- a/thirdparty/basis_universal/encoder/basisu_enc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp @@ -21,7 +21,9 @@ #include "jpgd.h" #include "pvpngreader.h" #include "basisu_opencl.h" -#include "basisu_astc_hdr_enc.h" +#include "basisu_uastc_hdr_4x4_enc.h" +#include "basisu_astc_hdr_6x6_enc.h" + #include #ifndef TINYEXR_USE_ZFP @@ -47,10 +49,13 @@ namespace basisu { uint64_t interval_timer::g_init_ticks, interval_timer::g_freq; double interval_timer::g_timer_freq; + #if BASISU_SUPPORT_SSE bool g_cpu_supports_sse41; #endif + fast_linear_to_srgb g_fast_linear_to_srgb; + uint8_t g_hamming_dist[256] = { 0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, @@ -175,7 +180,7 @@ namespace basisu bool g_library_initialized; std::mutex g_encoder_init_mutex; - + // Encoder library initialization (just call once at startup) bool basisu_encoder_init(bool use_opencl, bool opencl_force_serialization) { @@ -185,7 +190,7 @@ 
namespace basisu return true; detect_sse41(); - + basist::basisu_transcoder_init(); pack_etc1_solid_color_init(); //uastc_init(); @@ -201,6 +206,7 @@ namespace basisu astc_hdr_enc_init(); basist::bc6h_enc_init(); + astc_6x6_hdr::global_init(); g_library_initialized = true; return true; @@ -215,15 +221,40 @@ namespace basisu void error_vprintf(const char* pFmt, va_list args) { - char buf[8192]; + const uint32_t BUF_SIZE = 256; + char buf[BUF_SIZE]; -#ifdef _WIN32 - vsprintf_s(buf, sizeof(buf), pFmt, args); -#else - vsnprintf(buf, sizeof(buf), pFmt, args); -#endif + va_list args_copy; + va_copy(args_copy, args); + int total_chars = vsnprintf(buf, sizeof(buf), pFmt, args_copy); + va_end(args_copy); - fprintf(stderr, "ERROR: %s", buf); + if (total_chars < 0) + { + assert(0); + return; + } + + if (total_chars >= (int)BUF_SIZE) + { + basisu::vector var_buf(total_chars + 1); + + va_copy(args_copy, args); + int total_chars_retry = vsnprintf(var_buf.data(), var_buf.size(), pFmt, args_copy); + va_end(args_copy); + + if (total_chars_retry < 0) + { + assert(0); + return; + } + + fprintf(stderr, "ERROR: %s", var_buf.data()); + } + else + { + fprintf(stderr, "ERROR: %s", buf); + } } void error_printf(const char *pFmt, ...) @@ -234,6 +265,18 @@ namespace basisu va_end(args); } +#if defined(_WIN32) + void platform_sleep(uint32_t ms) + { + Sleep(ms); + } +#else + void platform_sleep(uint32_t ms) + { + // TODO + } +#endif + #if defined(_WIN32) inline void query_counter(timer_ticks* pTicks) { @@ -331,6 +374,8 @@ namespace basisu return ticks * g_timer_freq; } + // Note this is linear<->sRGB, NOT REC709 which uses slightly different equations/transfer functions. + // However the gamuts/white points of REC709 and sRGB are the same. 
float linear_to_srgb(float l) { assert(l >= 0.0f && l <= 1.0f); @@ -339,7 +384,7 @@ namespace basisu else return saturate(1.055f * powf(l, 1.0f / 2.4f) - .055f); } - + float srgb_to_linear(float s) { assert(s >= 0.0f && s <= 1.0f); @@ -418,7 +463,8 @@ namespace basisu uint32_t width = 0, height = 0, num_chans = 0; void* pImage = pv_png::load_png(pBuf, buf_size, 4, width, height, num_chans); - if (!pBuf) + + if (!pImage) { error_printf("pv_png::load_png failed while loading image \"%s\"\n", pFilename); return false; @@ -457,6 +503,26 @@ namespace basisu return true; } + bool load_jpg(const uint8_t* pBuf, size_t buf_size, image& img) + { + if (buf_size > INT_MAX) + { + assert(0); + return false; + } + + int width = 0, height = 0, actual_comps = 0; + uint8_t* pImage_data = jpgd::decompress_jpeg_image_from_memory(pBuf, (int)buf_size, &width, &height, &actual_comps, 4, jpgd::jpeg_decoder::cFlagBoxChromaFiltering); + if (!pImage_data) + return false; + + img.init(pImage_data, width, height, 4); + + free(pImage_data); + + return true; + } + bool load_image(const char* pFilename, image& img) { std::string ext(string_get_extension(std::string(pFilename))); @@ -478,7 +544,7 @@ namespace basisu return false; } - static void convert_ldr_to_hdr_image(imagef &img, const image &ldr_img, bool ldr_srgb_to_linear) + static void convert_ldr_to_hdr_image(imagef &img, const image &ldr_img, bool ldr_srgb_to_linear, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f) { img.resize(ldr_img.get_width(), ldr_img.get_height()); @@ -491,23 +557,41 @@ namespace basisu vec4F& d = img(x, y); if (ldr_srgb_to_linear) { - // TODO: Multiply by 100-200 nits? 
- d[0] = srgb_to_linear(c[0] * (1.0f / 255.0f)); - d[1] = srgb_to_linear(c[1] * (1.0f / 255.0f)); - d[2] = srgb_to_linear(c[2] * (1.0f / 255.0f)); + float r = (float)c[0]; + float g = (float)c[1]; + float b = (float)c[2]; + + if (ldr_black_bias > 0.0f) + { + // ASTC HDR is noticeably weaker dealing with blocks containing some pixels with components set to 0. + // Add a very slight bias less than .5 to avoid this difficulity. When the HDR image is mapped to SDR sRGB and rounded back to 8-bits, this bias will still result in zero. + // (FWIW, in reality, a physical monitor would be unlikely to have a perfectly zero black level.) + // This is purely optional and on most images it doesn't matter visually. + if (r == 0.0f) + r = ldr_black_bias; + if (g == 0.0f) + g = ldr_black_bias; + if (b == 0.0f) + b = ldr_black_bias; + } + + // Compute how much linear light would be emitted by a SDR 80-100 nit monitor. + d[0] = srgb_to_linear(r * (1.0f / 255.0f)) * linear_nit_multiplier; + d[1] = srgb_to_linear(g * (1.0f / 255.0f)) * linear_nit_multiplier; + d[2] = srgb_to_linear(b * (1.0f / 255.0f)) * linear_nit_multiplier; } else { - d[0] = c[0] * (1.0f / 255.0f); - d[1] = c[1] * (1.0f / 255.0f); - d[2] = c[2] * (1.0f / 255.0f); + d[0] = c[0] * (1.0f / 255.0f) * linear_nit_multiplier; + d[1] = c[1] * (1.0f / 255.0f) * linear_nit_multiplier; + d[2] = c[2] * (1.0f / 255.0f) * linear_nit_multiplier; } d[3] = c[3] * (1.0f / 255.0f); } } } - bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear) + bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear, float linear_nit_multiplier, float ldr_black_bias) { if ((!pMem) || (!mem_size)) { @@ -571,13 +655,22 @@ namespace basisu break; } + case hdr_image_type::cHITJPGImage: + { + image ldr_img; + if (!load_jpg(static_cast(pMem), mem_size, ldr_img)) + 
return false; + + convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias); + break; + } case hdr_image_type::cHITPNGImage: { image ldr_img; if (!load_png(static_cast(pMem), mem_size, ldr_img)) return false; - convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear); + convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias); break; } case hdr_image_type::cHITEXRImage: @@ -605,8 +698,21 @@ namespace basisu return true; } + + bool is_image_filename_hdr(const char *pFilename) + { + std::string ext(string_get_extension(std::string(pFilename))); + + if (ext.length() == 0) + return false; + + const char* pExt = ext.c_str(); + + return ((strcasecmp(pExt, "hdr") == 0) || (strcasecmp(pExt, "exr") == 0)); + } - bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear) + // TODO: move parameters to struct, add a HDR clean flag to eliminate NaN's/Inf's + bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear, float linear_nit_multiplier, float ldr_black_bias) { std::string ext(string_get_extension(std::string(pFilename))); @@ -637,7 +743,7 @@ namespace basisu if (!load_image(pFilename, ldr_img)) return false; - convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear); + convert_ldr_to_hdr_image(img, ldr_img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias); } return true; @@ -1002,7 +1108,7 @@ namespace basisu return false; } - if ((src_w == dst_w) && (src_h == dst_h)) + if ((src_w == dst_w) && (src_h == dst_h) && (filter_scale == 1.0f)) { dst = src; return true; @@ -1652,7 +1758,7 @@ namespace basisu uint32_t a = max_index / num_syms, b = max_index % num_syms; - const uint32_t ofs = m_entries_picked.size(); + const size_t ofs = m_entries_picked.size(); m_entries_picked.push_back(a); m_entries_picked.push_back(b); @@ -2002,6 +2108,34 @@ namespace basisu m_psnr = m_rms ? 
(float)clamp(log10(255.0 / m_rms) * 20.0f, 0.0f, 100.0f) : 100.0f; } + void print_image_metrics(const image& a, const image& b) + { + image_metrics im; + im.calc(a, b, 0, 3); + im.print("RGB "); + + im.calc(a, b, 0, 4); + im.print("RGBA "); + + im.calc(a, b, 0, 1); + im.print("R "); + + im.calc(a, b, 1, 1); + im.print("G "); + + im.calc(a, b, 2, 1); + im.print("B "); + + im.calc(a, b, 3, 1); + im.print("A "); + + im.calc(a, b, 0, 0); + im.print("Y 709 "); + + im.calc(a, b, 0, 0, true, true); + im.print("Y 601 "); + } + void fill_buffer_with_random_bytes(void *pBuf, size_t size, uint32_t seed) { rand r(seed); @@ -2079,9 +2213,11 @@ namespace basisu } job_pool::job_pool(uint32_t num_threads) : - m_num_active_jobs(0), - m_kill_flag(false) + m_num_active_jobs(0) { + m_kill_flag.store(false); + m_num_active_workers.store(0); + assert(num_threads >= 1U); debug_printf("job_pool::job_pool: %u total threads\n", num_threads); @@ -2100,11 +2236,23 @@ namespace basisu debug_printf("job_pool::~job_pool\n"); // Notify all workers that they need to die right now. - m_kill_flag = true; + m_kill_flag.store(true); m_has_work.notify_all(); - // Wait for all workers to die. +#ifdef __EMSCRIPTEN__ + for ( ; ; ) + { + if (m_num_active_workers.load() <= 0) + break; + std::this_thread::sleep_for(std::chrono::milliseconds(50)); + } + + // At this point all worker threads should be exiting or exited. + // We could call detach(), but this seems to just call join() anyway. +#endif + + // Wait for all worker threads to exit. for (uint32_t i = 0; i < m_threads.size(); i++) m_threads[i].join(); } @@ -2157,13 +2305,26 @@ namespace basisu } // The queue is empty, now wait for all active jobs to finish up. 
+#ifndef __EMSCRIPTEN__ m_no_more_jobs.wait(lock, [this]{ return !m_num_active_jobs; } ); +#else + // Avoid infinite blocking + for (; ; ) + { + if (m_no_more_jobs.wait_for(lock, std::chrono::milliseconds(50), [this] { return !m_num_active_jobs; })) + { + break; + } + } +#endif } void job_pool::job_thread(uint32_t index) { BASISU_NOTE_UNUSED(index); //debug_printf("job_pool::job_thread: starting %u\n", index); + + m_num_active_workers.fetch_add(1); while (true) { @@ -2199,6 +2360,8 @@ namespace basisu m_no_more_jobs.notify_all(); } + m_num_active_workers.fetch_add(-1); + //debug_printf("job_pool::job_thread: exiting\n"); } @@ -3314,7 +3477,7 @@ namespace basisu return true; } - bool write_exr(const char* pFilename, imagef& img, uint32_t n_chans, uint32_t flags) + bool write_exr(const char* pFilename, const imagef& img, uint32_t n_chans, uint32_t flags) { assert((n_chans == 1) || (n_chans == 3) || (n_chans == 4)); @@ -3483,11 +3646,14 @@ namespace basisu // Very basic global Reinhard tone mapping, output converted to sRGB with no dithering, alpha is carried through unchanged. // Only used for debugging/development. 
- void tonemap_image_reinhard(image &ldr_img, const imagef &hdr_img, float exposure) + void tonemap_image_reinhard(image &ldr_img, const imagef &hdr_img, float exposure, bool add_noise, bool per_component, bool luma_scaling) { uint32_t width = hdr_img.get_width(), height = hdr_img.get_height(); ldr_img.resize(width, height); + + rand r; + r.seed(128); for (uint32_t y = 0; y < height; y++) { @@ -3495,32 +3661,84 @@ namespace basisu { vec4F c(hdr_img(x, y)); - for (uint32_t t = 0; t < 3; t++) + if (per_component) { - if (c[t] <= 0.0f) + for (uint32_t t = 0; t < 3; t++) { - c[t] = 0.0f; + if (c[t] <= 0.0f) + { + c[t] = 0.0f; + } + else + { + c[t] *= exposure; + c[t] = c[t] / (1.0f + c[t]); + } } - else + } + else + { + c[0] *= exposure; + c[1] *= exposure; + c[2] *= exposure; + + const float L = 0.2126f * c[0] + 0.7152f * c[1] + 0.0722f * c[2]; + + float Lmapped = 0.0f; + if (L > 0.0f) { - c[t] *= exposure; - c[t] = c[t] / (1.0f + c[t]); + //Lmapped = L / (1.0f + L); + //Lmapped /= L; + + Lmapped = 1.0f / (1.0f + L); + } + + c[0] = c[0] * Lmapped; + c[1] = c[1] * Lmapped; + c[2] = c[2] * Lmapped; + + if (luma_scaling) + { + // Keeps the ratio of r/g/b intact + float m = maximum(c[0], c[1], c[2]); + if (m > 1.0f) + { + c /= m; + } } } c.clamp(0.0f, 1.0f); - c[0] = linear_to_srgb(c[0]) * 255.0f; - c[1] = linear_to_srgb(c[1]) * 255.0f; - c[2] = linear_to_srgb(c[2]) * 255.0f; c[3] = c[3] * 255.0f; color_rgba& o = ldr_img(x, y); - - o[0] = (uint8_t)std::round(c[0]); - o[1] = (uint8_t)std::round(c[1]); - o[2] = (uint8_t)std::round(c[2]); - o[3] = (uint8_t)std::round(c[3]); + + if (add_noise) + { + c[0] = linear_to_srgb(c[0]) * 255.0f; + c[1] = linear_to_srgb(c[1]) * 255.0f; + c[2] = linear_to_srgb(c[2]) * 255.0f; + + const float NOISE_AMP = .5f; + c[0] += r.frand(-NOISE_AMP, NOISE_AMP); + c[1] += r.frand(-NOISE_AMP, NOISE_AMP); + c[2] += r.frand(-NOISE_AMP, NOISE_AMP); + + c.clamp(0.0f, 255.0f); + + o[0] = (uint8_t)fast_roundf_int(c[0]); + o[1] = 
(uint8_t)fast_roundf_int(c[1]); + o[2] = (uint8_t)fast_roundf_int(c[2]); + o[3] = (uint8_t)fast_roundf_int(c[3]); + } + else + { + o[0] = g_fast_linear_to_srgb.convert(c[0]); + o[1] = g_fast_linear_to_srgb.convert(c[1]); + o[2] = g_fast_linear_to_srgb.convert(c[2]); + o[3] = (uint8_t)fast_roundf_int(c[3]); + } } } } @@ -3681,5 +3899,69 @@ namespace basisu return true; } + + bool tonemap_image_compressive2(image& dst_img, const imagef& hdr_test_img) + { + const uint32_t width = hdr_test_img.get_width(); + const uint32_t height = hdr_test_img.get_height(); + + dst_img.resize(width, height); + dst_img.set_all(color_rgba(0, 0, 0, 255)); + + basisu::vector half_img(width * 3 * height); + + uint32_t low_h = UINT32_MAX, high_h = 0; + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + const vec4F& p = hdr_test_img(x, y); + + for (uint32_t i = 0; i < 3; i++) + { + float f = p[i]; + + if (std::isnan(f) || std::isinf(f)) + f = 0.0f; + else if (f < 0.0f) + f = 0.0f; + else if (f > basist::MAX_HALF_FLOAT) + f = basist::MAX_HALF_FLOAT; + + uint32_t h = basist::float_to_half(f); + + low_h = minimum(low_h, h); + high_h = maximum(high_h, h); + + half_img[(x + y * width) * 3 + i] = (basist::half_float)h; + + } // i + } // x + } // y + + if (low_h == high_h) + return false; + + for (uint32_t y = 0; y < height; y++) + { + for (uint32_t x = 0; x < width; x++) + { + for (uint32_t i = 0; i < 3; i++) + { + basist::half_float h = half_img[(x + y * width) * 3 + i]; + float f = (float)(h - low_h) / (float)(high_h - low_h); + + int iv = basisu::clamp((int)std::round(f * 255.0f), 0, 255); + + dst_img(x, y)[i] = (uint8_t)iv; + + } // i + } // x + } // y + + return true; + } + } // namespace basisu diff --git a/thirdparty/basis_universal/encoder/basisu_enc.h b/thirdparty/basis_universal/encoder/basisu_enc.h index 780605e7b861..5373b60c5aaf 100644 --- a/thirdparty/basis_universal/encoder/basisu_enc.h +++ b/thirdparty/basis_universal/encoder/basisu_enc.h @@ 
-64,8 +64,19 @@ namespace basisu void error_vprintf(const char* pFmt, va_list args); void error_printf(const char *pFmt, ...); - // Helpers + template + inline void fmt_error_printf(const char* pFmt, Args&&... args) + { + std::string res; + if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward(args))... })) + return; + error_printf("%s", res.c_str()); + } + void platform_sleep(uint32_t ms); + + // Helpers + inline uint8_t clamp255(int32_t i) { return (uint8_t)((i & 0xFFFFFF00U) ? (~(i >> 31)) : i); @@ -98,6 +109,17 @@ namespace basisu return (uint8_t)((v + (v >> 8)) >> 8); } + inline int fast_roundf_int(float x) + { + return (x >= 0.0f) ? (int)(x + 0.5f) : (int)(x - 0.5f); + } + + inline int fast_floorf_int(float x) + { + int xi = (int)x; // Truncate towards zero + return ((x < 0.0f) && (x != (float)xi)) ? (xi - 1) : xi; + } + inline uint64_t read_bits(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) { assert(codesize <= 64); @@ -168,6 +190,15 @@ namespace basisu } bool string_begins_with(const std::string& str, const char* pPhrase); + + // Case sensitive, returns -1 if can't find + inline int string_find_first(const std::string& str, const char* pPhrase) + { + size_t res = str.find(pPhrase, 0); + if (res == std::string::npos) + return -1; + return (int)res; + } // Hashing @@ -209,12 +240,23 @@ namespace basisu template struct bit_hasher { - std::size_t operator()(const Key& k) const + inline std::size_t operator()(const Key& k) const { return hash_hsieh(reinterpret_cast(&k), sizeof(k)); } }; + struct string_hasher + { + inline std::size_t operator()(const std::string& k) const + { + size_t l = k.size(); + if (!l) + return 0; + return hash_hsieh(reinterpret_cast(k.c_str()), l); + } + }; + class running_stat { public: @@ -297,7 +339,7 @@ namespace basisu }; // Linear algebra - + template class vec { @@ -318,7 +360,7 @@ namespace basisu inline vec(const vec &other) { for (uint32_t i = 0; i < N; i++) m_v[i] = other.m_v[i]; } template 
inline vec(const vec &other) { set(other); } - inline T operator[](uint32_t i) const { assert(i < N); return m_v[i]; } + inline const T& operator[](uint32_t i) const { assert(i < N); return m_v[i]; } inline T &operator[](uint32_t i) { assert(i < N); return m_v[i]; } inline T getX() const { return m_v[0]; } @@ -327,6 +369,7 @@ namespace basisu inline T getW() const { static_assert(N >= 4, "N too small"); return m_v[3]; } inline bool operator==(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) if (m_v[i] != rhs.m_v[i]) return false; return true; } + inline bool operator!=(const vec& rhs) const { return !(*this == rhs); } inline bool operator<(const vec &rhs) const { for (uint32_t i = 0; i < N; i++) { if (m_v[i] < rhs.m_v[i]) return true; else if (m_v[i] != rhs.m_v[i]) return false; } return false; } inline void set_zero() { for (uint32_t i = 0; i < N; i++) m_v[i] = 0; } @@ -433,6 +476,8 @@ namespace basisu inline vec &normalize_in_place() { T len = length(); if (len != 0.0f) *this *= (1.0f / len); return *this; } + inline vec get_normalized() const { vec res(*this); res.normalize_in_place(); return res; } + inline vec &clamp(T l, T h) { for (uint32_t i = 0; i < N; i++) @@ -440,6 +485,14 @@ namespace basisu return *this; } + static vec component_mul(const vec& a, const vec& b) + { + vec res; + for (uint32_t i = 0; i < N; i++) + res[i] = a[i] * b[i]; + return res; + } + static vec component_min(const vec& a, const vec& b) { vec res; @@ -455,6 +508,14 @@ namespace basisu res[i] = maximum(a[i], b[i]); return res; } + + static vec lerp(const vec& a, const vec& b, float s) + { + vec res; + for (uint32_t i = 0; i < N; i++) + res[i] = basisu::lerp(a[i], b[i], s); + return res; + } }; typedef vec<4, double> vec4D; @@ -462,12 +523,17 @@ namespace basisu typedef vec<2, double> vec2D; typedef vec<1, double> vec1D; + typedef vec<6, float> vec6F; + typedef vec<5, float> vec5F; typedef vec<4, float> vec4F; typedef vec<3, float> vec3F; typedef vec<2, float> vec2F; typedef 
vec<1, float> vec1F; typedef vec<16, float> vec16F; + + template struct bitwise_copyable< vec > { enum { cFlag = true }; }; + template struct bitwise_movable< vec > { enum { cFlag = true }; }; template class matrix @@ -514,6 +580,9 @@ namespace basisu } }; + template struct bitwise_copyable< matrix > { enum { cFlag = true }; }; + template struct bitwise_movable< matrix > { enum { cFlag = true }; }; + template inline VectorType compute_pca_from_covar(matrix &cmatrix) { @@ -759,6 +828,8 @@ namespace basisu std::atomic m_kill_flag; + std::atomic m_num_active_workers; + void job_thread(uint32_t index); }; @@ -962,6 +1033,9 @@ namespace basisu inline int get_709_luma() const { return (13938U * m_comps[0] + 46869U * m_comps[1] + 4729U * m_comps[2] + 32768U) >> 16U; } inline int get_luma(bool luma_601) const { return luma_601 ? get_601_luma() : get_709_luma(); } + inline uint32_t get_bgra_uint32() const { return b | (g << 8) | (r << 16) | (a << 24); } + inline uint32_t get_rgba_uint32() const { return r | (g << 8) | (b << 16) | (a << 24); } + inline basist::color32 get_color32() const { return basist::color32(r, g, b, a); @@ -1135,23 +1209,7 @@ namespace basisu return true; } - - inline std::string string_format(const char* pFmt, ...) - { - char buf[2048]; - - va_list args; - va_start(args, pFmt); -#ifdef _WIN32 - vsprintf_s(buf, sizeof(buf), pFmt, args); -#else - vsnprintf(buf, sizeof(buf), pFmt, args); -#endif - va_end(args); - - return std::string(buf); - } - + inline std::string string_tolower(const std::string& s) { std::string result(s); @@ -1710,7 +1768,7 @@ namespace basisu // This SSE function takes pointers to void types, so do some sanity checks. 
assert(sizeof(TrainingVectorType) == sizeof(float) * 16); assert(sizeof(training_vec_with_weight) == sizeof(std::pair)); - update_covar_matrix_16x16_sse41(node.m_training_vecs.size(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix); + update_covar_matrix_16x16_sse41(node.m_training_vecs.size_u32(), m_training_vecs.data(), &node.m_origin, node.m_training_vecs.data(), &cmatrix); #endif } @@ -2019,9 +2077,7 @@ namespace basisu for (uint32_t thread_iter = 0; thread_iter < max_threads; thread_iter++) { -#ifndef __EMSCRIPTEN__ pJob_pool->add_job( [thread_iter, &local_clusters, &local_parent_clusters, &success_flags, &quantizers, &initial_codebook, &q, &limit_clusterizers, &max_codebook_size, &max_threads, &max_parent_codebook_size] { -#endif Quantizer& lq = quantizers[thread_iter]; uint_vec& cluster_indices = initial_codebook[thread_iter]; @@ -2062,15 +2118,11 @@ namespace basisu } } -#ifndef __EMSCRIPTEN__ } ); -#endif } // thread_iter -#ifndef __EMSCRIPTEN__ pJob_pool->wait_for_all(); -#endif uint32_t total_clusters = 0, total_parent_clusters = 0; @@ -2353,6 +2405,48 @@ namespace basisu { } + bitwise_coder(const bitwise_coder& other) : + m_bytes(other.m_bytes), + m_bit_buffer(other.m_bit_buffer), + m_bit_buffer_size(other.m_bit_buffer_size), + m_total_bits(other.m_total_bits) + { + } + + bitwise_coder(bitwise_coder&& other) : + m_bytes(std::move(other.m_bytes)), + m_bit_buffer(other.m_bit_buffer), + m_bit_buffer_size(other.m_bit_buffer_size), + m_total_bits(other.m_total_bits) + { + } + + bitwise_coder& operator= (const bitwise_coder& rhs) + { + if (this == &rhs) + return *this; + + m_bytes = rhs.m_bytes; + m_bit_buffer = rhs.m_bit_buffer; + m_bit_buffer_size = rhs.m_bit_buffer_size; + m_total_bits = rhs.m_total_bits; + + return *this; + } + + bitwise_coder& operator= (bitwise_coder&& rhs) + { + if (this == &rhs) + return *this; + + m_bytes = std::move(rhs.m_bytes); + m_bit_buffer = rhs.m_bit_buffer; + m_bit_buffer_size = 
rhs.m_bit_buffer_size; + m_total_bits = rhs.m_total_bits; + + return *this; + } + inline void clear() { clear_vector(m_bytes); @@ -2370,8 +2464,12 @@ namespace basisu } inline const uint8_vec &get_bytes() const { return m_bytes; } + inline uint8_vec& get_bytes() { return m_bytes; } + + inline void reserve(uint32_t size) { m_bytes.reserve(size); } inline uint64_t get_total_bits() const { return m_total_bits; } + inline uint32_t get_total_bits_u32() const { assert(m_total_bits <= UINT32_MAX); return static_cast(m_total_bits); } inline void clear_total_bits() { m_total_bits = 0; } inline void init(uint32_t reserve_size = 1024) @@ -2495,16 +2593,27 @@ namespace basisu } uint32_t emit_huffman_table(const huffman_encoding_table &tab); + + void append(const bitwise_coder& other) + { + for (uint32_t i = 0; i < other.m_bytes.size(); i++) + put_bits(other.m_bytes[i], 8); + + if (other.m_bit_buffer_size) + put_bits(other.m_bit_buffer, other.m_bit_buffer_size); + } private: uint8_vec m_bytes; uint32_t m_bit_buffer, m_bit_buffer_size; uint64_t m_total_bits; - void append_byte(uint8_t c) + inline void append_byte(uint8_t c) { - m_bytes.resize(m_bytes.size() + 1); - m_bytes.back() = c; + //m_bytes.resize(m_bytes.size() + 1); + //m_bytes.back() = c; + + m_bytes.push_back(c); } static void end_nonzero_run(uint16_vec &syms, uint32_t &run_size, uint32_t len); @@ -2672,6 +2781,31 @@ namespace basisu *this = other; } + image(image&& other) : + m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch), + m_pixels(std::move(other.m_pixels)) + { + other.m_width = 0; + other.m_height = 0; + other.m_pitch = 0; + } + + image& operator= (image&& rhs) + { + if (this != &rhs) + { + m_width = rhs.m_width; + m_height = rhs.m_height; + m_pitch = rhs.m_pitch; + m_pixels = std::move(rhs.m_pixels); + + rhs.m_width = 0; + rhs.m_height = 0; + rhs.m_pitch = 0; + } + return *this; + } + image &swap(image &other) { std::swap(m_width, other.m_width); @@ -2702,6 +2836,12 @@ namespace basisu 
return *this; } + image& match_dimensions(const image& other) + { + resize(other.get_width(), other.get_height()); + return *this; + } + image &resize(uint32_t w, uint32_t h, uint32_t p = UINT32_MAX, const color_rgba& background = g_black_color) { return crop(w, h, p, background); @@ -2913,7 +3053,7 @@ namespace basisu const int sx = src_x + x; if (sx < 0) continue; - else if (sx >= (int)src.get_height()) + else if (sx >= (int)src.get_width()) break; set_clipped(dst_x + x, dst_y + y, src(sx, sy)); @@ -2955,6 +3095,8 @@ namespace basisu return *this; } + inline bool is_valid() const { return m_width > 0; } + inline uint32_t get_width() const { return m_width; } inline uint32_t get_height() const { return m_height; } inline uint32_t get_pitch() const { return m_pitch; } @@ -3038,8 +3180,56 @@ namespace basisu return *this; } + void swap_rb() + { + for (auto& v : m_pixels) + std::swap(v.r, v.b); + } + void debug_text(uint32_t x_ofs, uint32_t y_ofs, uint32_t x_scale, uint32_t y_scale, const color_rgba &fg, const color_rgba *pBG, bool alpha_only, const char* p, ...); + vec4F get_filtered_vec4F(float x, float y) const + { + x -= .5f; + y -= .5f; + + int ix = (int)floorf(x); + int iy = (int)floorf(y); + float wx = x - ix; + float wy = y - iy; + + color_rgba a(get_clamped(ix, iy)); + color_rgba b(get_clamped(ix + 1, iy)); + color_rgba c(get_clamped(ix, iy + 1)); + color_rgba d(get_clamped(ix + 1, iy + 1)); + + vec4F result; + + for (uint32_t i = 0; i < 4; i++) + { + const float top = lerp((float)a[i], (float)b[i], wx); + const float bot = lerp((float)c[i], (float)d[i], wx); + const float m = lerp((float)top, (float)bot, wy); + + result[i] = m; + } + + return result; + } + + // (x,y) - Continuous coordinates, where pixel centers are at (.5,.5), valid image coords are [0,width] and [0,height]. Clamp addressing. 
+ color_rgba get_filtered(float x, float y) const + { + const vec4F fresult(get_filtered_vec4F(x, y)); + + color_rgba result; + + for (uint32_t i = 0; i < 4; i++) + result[i] = (uint8_t)clamp((int)(fresult[i] + .5f), 0, 255); + + return result; + } + private: uint32_t m_width, m_height, m_pitch; // all in pixels color_rgba_vec m_pixels; @@ -3069,6 +3259,31 @@ namespace basisu *this = other; } + imagef(imagef&& other) : + m_width(other.m_width), m_height(other.m_height), m_pitch(other.m_pitch), + m_pixels(std::move(other.m_pixels)) + { + other.m_width = 0; + other.m_height = 0; + other.m_pitch = 0; + } + + imagef& operator= (imagef&& rhs) + { + if (this != &rhs) + { + m_width = rhs.m_width; + m_height = rhs.m_height; + m_pitch = rhs.m_pitch; + m_pixels = std::move(rhs.m_pixels); + + rhs.m_width = 0; + rhs.m_height = 0; + rhs.m_pitch = 0; + } + return *this; + } + imagef &swap(imagef &other) { std::swap(m_width, other.m_width); @@ -3118,6 +3333,12 @@ namespace basisu return *this; } + imagef& match_dimensions(const imagef& other) + { + resize(other.get_width(), other.get_height()); + return *this; + } + imagef &resize(const imagef &other, uint32_t p = UINT32_MAX, const vec4F& background = vec4F(0,0,0,1)) { return resize(other.get_width(), other.get_height(), p, background); @@ -3248,7 +3469,7 @@ namespace basisu const int sx = src_x + x; if (sx < 0) continue; - else if (sx >= (int)src.get_height()) + else if (sx >= (int)src.get_width()) break; set_clipped(dst_x + x, dst_y + y, src(sx, sy)); @@ -3274,10 +3495,12 @@ namespace basisu return *this; } + inline bool is_valid() const { return m_width > 0; } + inline uint32_t get_width() const { return m_width; } inline uint32_t get_height() const { return m_height; } inline uint32_t get_pitch() const { return m_pitch; } - inline uint32_t get_total_pixels() const { return m_width * m_height; } + inline uint64_t get_total_pixels() const { return (uint64_t)m_width * m_height; } inline uint32_t get_block_width(uint32_t w) const 
{ return (m_width + (w - 1)) / w; } inline uint32_t get_block_height(uint32_t h) const { return (m_height + (h - 1)) / h; } @@ -3315,7 +3538,7 @@ namespace basisu { if (!nan_msg) { - fprintf(stderr, "One or more pixels was NaN, setting to 0.\n"); + fprintf(stderr, "One or more input pixels was NaN, setting to 0.\n"); nan_msg = true; } } @@ -3324,7 +3547,7 @@ namespace basisu { if (!inf_msg) { - fprintf(stderr, "One or more pixels was INF, setting to 0.\n"); + fprintf(stderr, "One or more input pixels was INF, setting to 0.\n"); inf_msg = true; } } @@ -3333,7 +3556,7 @@ namespace basisu { if (!neg_zero_msg) { - fprintf(stderr, "One or more pixels was -0, setting them to 0.\n"); + fprintf(stderr, "One or more input pixels was -0, setting them to 0.\n"); neg_zero_msg = true; } } @@ -3350,7 +3573,7 @@ namespace basisu if (!neg_msg) { - fprintf(stderr, "One or more pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n"); + fprintf(stderr, "One or more input pixels was negative -- setting these pixel components to 0 because ASTC HDR doesn't support signed values.\n"); neg_msg = true; } @@ -3363,7 +3586,7 @@ namespace basisu if (!clamp_msg) { - fprintf(stderr, "One or more pixels had to be clamped to %f.\n", highest_mag); + fprintf(stderr, "One or more input pixels had to be clamped to %f.\n", highest_mag); clamp_msg = true; } @@ -3385,6 +3608,45 @@ namespace basisu return *this; } + + bool has_alpha(uint32_t channel = 3) const + { + for (uint32_t y = 0; y < m_height; ++y) + for (uint32_t x = 0; x < m_width; ++x) + if ((*this)(x, y)[channel] != 1.0f) + return true; + + return false; + } + + vec4F get_filtered_vec4F(float x, float y) const + { + x -= .5f; + y -= .5f; + + int ix = (int)floorf(x); + int iy = (int)floorf(y); + float wx = x - ix; + float wy = y - iy; + + vec4F a(get_clamped(ix, iy)); + vec4F b(get_clamped(ix + 1, iy)); + vec4F c(get_clamped(ix, iy + 1)); + vec4F d(get_clamped(ix + 1, iy + 1)); + + vec4F 
result; + + for (uint32_t i = 0; i < 4; i++) + { + const float top = lerp((float)a[i], (float)b[i], wx); + const float bot = lerp((float)c[i], (float)d[i], wx); + const float m = lerp((float)top, (float)bot, wy); + + result[i] = m; + } + + return result; + } private: uint32_t m_width, m_height, m_pitch; // all in pixels @@ -3402,6 +3664,52 @@ namespace basisu float linear_to_srgb(float l); float srgb_to_linear(float s); + class fast_linear_to_srgb + { + public: + fast_linear_to_srgb() + { + init(); + } + + void init() + { + for (int i = 0; i < LINEAR_TO_SRGB_TABLE_SIZE; ++i) + { + float l = (float)i * (1.0f / (LINEAR_TO_SRGB_TABLE_SIZE - 1)); + m_linear_to_srgb_table[i] = (uint8_t)basisu::fast_floorf_int(255.0f * basisu::linear_to_srgb(l)); + } + + float srgb_to_linear[256]; + for (int i = 0; i < 256; i++) + srgb_to_linear[i] = basisu::srgb_to_linear((float)i / 255.0f); + + for (int i = 0; i < 256; i++) + m_srgb_to_linear_thresh[i] = (srgb_to_linear[i] + srgb_to_linear[basisu::minimum(i + 1, 255)]) * .5f; + } + + inline uint8_t convert(float l) const + { + assert((l >= 0.0f) && (l <= 1.0f)); + int j = basisu::fast_roundf_int((LINEAR_TO_SRGB_TABLE_SIZE - 1) * l); + + assert((j >= 0) && (j < LINEAR_TO_SRGB_TABLE_SIZE)); + int b = m_linear_to_srgb_table[j]; + + b += (l > m_srgb_to_linear_thresh[b]); + + return (uint8_t)b; + } + + private: + static constexpr int LINEAR_TO_SRGB_TABLE_SIZE = 2048; + uint8_t m_linear_to_srgb_table[LINEAR_TO_SRGB_TABLE_SIZE]; + + float m_srgb_to_linear_thresh[256]; + }; + + extern fast_linear_to_srgb g_fast_linear_to_srgb; + // Image metrics class image_metrics @@ -3438,6 +3746,8 @@ namespace basisu void calc(const image &a, const image &b, uint32_t first_chan = 0, uint32_t total_chans = 0, bool avg_comp_error = true, bool use_601_luma = false); }; + void print_image_metrics(const image& a, const image& b); + // Image saving/loading/resampling bool load_png(const uint8_t* pBuf, size_t buf_size, image& img, const char* pFilename = nullptr); 
@@ -3450,15 +3760,22 @@ namespace basisu bool load_qoi(const char* pFilename, image& img); bool load_jpg(const char *pFilename, image& img); + bool load_jpg(const uint8_t* pBuf, size_t buf_size, image& img); inline bool load_jpg(const std::string &filename, image &img) { return load_jpg(filename.c_str(), img); } // Currently loads .PNG, .TGA, or .JPG bool load_image(const char* pFilename, image& img); inline bool load_image(const std::string &filename, image &img) { return load_image(filename.c_str(), img); } + bool is_image_filename_hdr(const char* pFilename); + // Supports .HDR and most (but not all) .EXR's (see TinyEXR). - bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true); - inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true) { return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear); } + bool load_image_hdr(const char* pFilename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f); + + inline bool load_image_hdr(const std::string& filename, imagef& img, bool ldr_srgb_to_linear = true, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f) + { + return load_image_hdr(filename.c_str(), img, ldr_srgb_to_linear, linear_nit_multiplier, ldr_black_bias); + } enum class hdr_image_type { @@ -3466,10 +3783,11 @@ namespace basisu cHITRGBAFloat = 1, cHITPNGImage = 2, cHITEXRImage = 3, - cHITHDRImage = 4 + cHITHDRImage = 4, + cHITJPGImage = 5 }; - bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear); + bool load_image_hdr(const void* pMem, size_t mem_size, imagef& img, uint32_t width, uint32_t height, hdr_image_type img_type, bool ldr_srgb_to_linear, float linear_nit_multiplier = 1.0f, float ldr_black_bias = 0.0f); uint8_t *read_tga(const uint8_t *pBuf, uint32_t buf_size, int &width, int &height, int &n_chans); 
uint8_t *read_tga(const char *pFilename, int &width, int &height, int &n_chans); @@ -3512,7 +3830,7 @@ namespace basisu }; // Supports 1 (Y), 3 (RGB), or 4 (RGBA) channel images. - bool write_exr(const char* pFilename, imagef& img, uint32_t n_chans, uint32_t flags); + bool write_exr(const char* pFilename, const imagef& img, uint32_t n_chans, uint32_t flags); enum { @@ -3572,102 +3890,6 @@ namespace basisu inline double get_interval_timer() { return interval_timer::ticks_to_secs(interval_timer::get_ticks()); } - // 2D array - - template - class vector2D - { - typedef basisu::vector TVec; - - uint32_t m_width, m_height; - TVec m_values; - - public: - vector2D() : - m_width(0), - m_height(0) - { - } - - vector2D(uint32_t w, uint32_t h) : - m_width(0), - m_height(0) - { - resize(w, h); - } - - vector2D(const vector2D &other) - { - *this = other; - } - - vector2D &operator= (const vector2D &other) - { - if (this != &other) - { - m_width = other.m_width; - m_height = other.m_height; - m_values = other.m_values; - } - return *this; - } - - inline bool operator== (const vector2D &rhs) const - { - return (m_width == rhs.m_width) && (m_height == rhs.m_height) && (m_values == rhs.m_values); - } - - inline uint32_t size_in_bytes() const { return (uint32_t)m_values.size() * sizeof(m_values[0]); } - - inline const T &operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; } - inline T &operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; } - - inline const T &operator[] (uint32_t i) const { return m_values[i]; } - inline T &operator[] (uint32_t i) { return m_values[i]; } - - inline const T &at_clamped(int x, int y) const { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } - inline T &at_clamped(int x, int y) { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } - - void clear() - { - m_width = 0; - m_height = 0; - 
m_values.clear(); - } - - void set_all(const T&val) - { - vector_set_all(m_values, val); - } - - inline const T* get_ptr() const { return &m_values[0]; } - inline T* get_ptr() { return &m_values[0]; } - - vector2D &resize(uint32_t new_width, uint32_t new_height) - { - if ((m_width == new_width) && (m_height == new_height)) - return *this; - - TVec oldVals(new_width * new_height); - oldVals.swap(m_values); - - const uint32_t w = minimum(m_width, new_width); - const uint32_t h = minimum(m_height, new_height); - - if ((w) && (h)) - { - for (uint32_t y = 0; y < h; y++) - for (uint32_t x = 0; x < w; x++) - m_values[x + y * new_width] = oldVals[x + y * m_width]; - } - - m_width = new_width; - m_height = new_height; - - return *this; - } - }; - inline FILE *fopen_safe(const char *pFilename, const char *pMode) { #ifdef _WIN32 @@ -3723,12 +3945,14 @@ namespace basisu }; typedef basisu::vector pixel_block_hdr_vec; - void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure); + void tonemap_image_reinhard(image& ldr_img, const imagef& hdr_img, float exposure, bool add_noise = false, bool per_component = true, bool luma_scaling = false); bool tonemap_image_compressive(image& dst_img, const imagef& hdr_test_img); + bool tonemap_image_compressive2(image& dst_img, const imagef& hdr_test_img); // Intersection enum eClear { cClear = 0 }; enum eInitExpand { cInitExpand = 0 }; + enum eIdentity { cIdentity = 0 }; template class ray @@ -3845,6 +4069,7 @@ namespace basisu typedef vec_interval vec_interval3F; typedef vec_interval vec_interval4F; + typedef vec_interval1F aabb1F; typedef vec_interval2F aabb2F; typedef vec_interval3F aabb3F; @@ -4004,18 +4229,19 @@ namespace basisu return result; } + union fu32 + { + uint32_t u; + float f; + }; + // Supports positive and denormals only. No NaN or Inf. 
- inline float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h) + BASISU_FORCE_INLINE float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h) { assert(!basist::half_is_signed(h) && !basist::is_half_inf_or_nan(h)); - - union fu32 - { - uint32_t u; - float f; - }; - - static const fu32 K = { 0x77800000 }; + + // add 112 to the exponent (112+half float's exp bias of 15=float32's bias of 127) + static const fu32 K = { 0x77800000 }; fu32 o; o.u = h << 13; @@ -4023,7 +4249,62 @@ namespace basisu return o.f; } + + // Positive, negative, or denormals. No NaN or Inf. Clamped to MAX_HALF_FLOAT. + inline basist::half_float fast_float_to_half_trunc_no_nan_or_inf(float f) + { + assert(!isnan(f) && !isinf(f)); + + // Sutract 112 from the exponent, to change the bias from 127 to 15. + static const fu32 g_f_to_h{ 0x7800000 }; -} // namespace basisu + fu32 fu; + + fu.f = minimum((float)basist::MAX_HALF_FLOAT, fabsf(f)) * g_f_to_h.f; + return (basist::half_float)(((fu.u >> (23 - 10)) & 0x7FFF) | ((f < 0.0f) ? 0x8000 : 0)); + } + + inline basist::half_float fast_float_to_half_trunc_no_clamp_neg_nan_or_inf(float f) + { + assert(!isnan(f) && !isinf(f)); + assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT)); + + // Sutract 112 from the exponent, to change the bias from 127 to 15. + static const fu32 g_f_to_h{ 0x7800000 }; + + fu32 fu; + + fu.f = f * g_f_to_h.f; + + return (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF); + } + + inline basist::half_float fast_float_to_half_no_clamp_neg_nan_or_inf(float f) + { + assert(!isnan(f) && !isinf(f)); + assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT)); + + // Sutract 112 from the exponent, to change the bias from 127 to 15. 
+ static const fu32 g_f_to_h{ 0x7800000 }; + + fu32 fu; + + fu.f = f * g_f_to_h.f; + + uint32_t h = (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF); + + // round to even or nearest + uint32_t mant = fu.u & 8191; // examine lowest 13 bits + uint32_t inc = (mant > 4096) | ((mant == 4096) & (h & 1)); + h += inc; + + if (h > basist::MAX_HALF_FLOAT_AS_INT_BITS) + h = basist::MAX_HALF_FLOAT_AS_INT_BITS; + + return (basist::half_float)h; + } + +} // namespace basisu +#include "basisu_math.h" diff --git a/thirdparty/basis_universal/encoder/basisu_frontend.cpp b/thirdparty/basis_universal/encoder/basisu_frontend.cpp index 750f706aa538..99ac2aa9dd49 100644 --- a/thirdparty/basis_universal/encoder/basisu_frontend.cpp +++ b/thirdparty/basis_universal/encoder/basisu_frontend.cpp @@ -353,9 +353,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job([this, first_index, last_index] { -#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -388,15 +386,11 @@ namespace basisu m_block_selector_cluster_index[block_index] = best_index; } -#ifndef __EMSCRIPTEN__ }); -#endif } -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif m_encoded_blocks.resize(m_total_blocks); for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++) @@ -425,9 +419,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job([this, first_index, last_index, pass] { -#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -475,15 +467,11 @@ namespace basisu } // block_index -#ifndef __EMSCRIPTEN__ }); -#endif } -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif m_endpoint_clusters.resize(0); 
m_endpoint_clusters.resize(endpoints.size()); @@ -501,9 +489,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job([this, first_index, last_index] { -#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -535,15 +521,11 @@ namespace basisu m_block_selector_cluster_index[block_index] = best_index; } -#ifndef __EMSCRIPTEN__ }); -#endif } -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif m_encoded_blocks.resize(m_total_blocks); for (uint32_t block_index = 0; block_index < m_total_blocks; block_index++) @@ -573,7 +555,7 @@ namespace basisu debug_printf("introduce_special_selector_clusters\n"); uint32_t total_blocks_relocated = 0; - const uint32_t initial_selector_clusters = (uint32_t)m_selector_cluster_block_indices.size(); + const uint32_t initial_selector_clusters = m_selector_cluster_block_indices.size_u32(); bool_vec block_relocated_flags(m_total_blocks); @@ -595,7 +577,7 @@ namespace basisu debug_printf("Introducing sel %u\n", sel); - const uint32_t new_selector_cluster_index = (uint32_t)m_optimized_cluster_selectors.size(); + const uint32_t new_selector_cluster_index = m_optimized_cluster_selectors.size_u32(); m_optimized_cluster_selectors.push_back(blk); @@ -675,7 +657,7 @@ namespace basisu { debug_printf("optimize_selector_codebook\n"); - const uint32_t orig_total_selector_clusters = (uint32_t)m_optimized_cluster_selectors.size(); + const uint32_t orig_total_selector_clusters = m_optimized_cluster_selectors.size_u32(); bool_vec selector_cluster_was_used(m_optimized_cluster_selectors.size()); for (uint32_t i = 0; i < m_total_blocks; i++) @@ -787,9 +769,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job([this, first_index, 
last_index] { -#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -830,15 +810,11 @@ namespace basisu blk.set_selector(x, y, selectors[x + y * 4]); } -#ifndef __EMSCRIPTEN__ }); -#endif } -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif } // use_cpu @@ -859,9 +835,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &training_vecs] { -#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -883,15 +857,11 @@ namespace basisu } // block_index; -#ifndef __EMSCRIPTEN__ } ); -#endif } // block_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif } void basisu_frontend::generate_endpoint_clusters() @@ -970,7 +940,7 @@ namespace basisu } if (m_params.m_debug_stats) - debug_printf("Total endpoint clusters: %u, parent clusters: %u\n", (uint32_t)m_endpoint_clusters.size(), (uint32_t)m_endpoint_parent_clusters.size()); + debug_printf("Total endpoint clusters: %u, parent clusters: %u\n", m_endpoint_clusters.size_u32(), m_endpoint_parent_clusters.size_u32()); } // Iterate through each array of endpoint cluster block indices and set the m_block_endpoint_clusters_indices[][] array to indicaste which cluster index each block uses. 
@@ -1040,11 +1010,9 @@ namespace basisu for (uint32_t cluster_index_iter = 0; cluster_index_iter < m_endpoint_clusters.size(); cluster_index_iter += N) { const uint32_t first_index = cluster_index_iter; - const uint32_t last_index = minimum((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N); + const uint32_t last_index = minimum(m_endpoint_clusters.size_u32(), cluster_index_iter + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index] { -#endif for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { @@ -1112,15 +1080,11 @@ namespace basisu } } // cluster_index -#ifndef __EMSCRIPTEN__ } ); -#endif } // cluster_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif vector_sort(m_subblock_endpoint_quant_err_vec); } @@ -1131,19 +1095,19 @@ namespace basisu generate_block_endpoint_clusters(); - int num_new_endpoint_clusters = m_params.m_max_endpoint_clusters - (uint32_t)m_endpoint_clusters.size(); + int num_new_endpoint_clusters = m_params.m_max_endpoint_clusters - m_endpoint_clusters.size_u32(); if (num_new_endpoint_clusters <= 0) return; compute_endpoint_subblock_error_vec(); - const uint32_t num_orig_endpoint_clusters = (uint32_t)m_endpoint_clusters.size(); + const uint32_t num_orig_endpoint_clusters = m_endpoint_clusters.size_u32(); std::unordered_set training_vector_was_relocated; uint_vec cluster_sizes(num_orig_endpoint_clusters); for (uint32_t i = 0; i < num_orig_endpoint_clusters; i++) - cluster_sizes[i] = (uint32_t)m_endpoint_clusters[i].size(); + cluster_sizes[i] = m_endpoint_clusters[i].size_u32(); std::unordered_set ignore_cluster; @@ -1259,7 +1223,7 @@ namespace basisu // TODO: Get this working when step>0 if (m_params.m_pOpenCL_context && !step) { - const uint32_t total_clusters = m_endpoint_clusters.size(); + const uint32_t total_clusters = (uint32_t)m_endpoint_clusters.size(); basisu::vector pixel_clusters(total_clusters); @@ -1342,7 +1306,7 @@ 
namespace basisu } // cluster_indices_iter - uint32_t* pSorted = radix_sort(colors.size(), colors.data(), colors2.data(), 0, 3); + uint32_t* pSorted = radix_sort((uint32_t)colors.size(), colors.data(), colors2.data(), 0, 3); const uint64_t first_pixel_index = input_pixels.size(); @@ -1522,9 +1486,7 @@ namespace basisu const uint32_t first_index = cluster_index_iter; const uint32_t last_index = minimum((uint32_t)m_endpoint_clusters.size(), cluster_index_iter + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job([this, first_index, last_index, step] { -#endif for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { @@ -1643,15 +1605,11 @@ namespace basisu } // cluster_index -#ifndef __EMSCRIPTEN__ }); -#endif } // cluster_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif } debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs()); @@ -1726,14 +1684,14 @@ namespace basisu { // For the OpenCL kernel, we order the parent endpoint clusters by smallest to largest for efficiency. // We also prepare an array of block info structs that point into this new parent endpoint cluster array. 
- const uint32_t total_parent_clusters = m_endpoint_clusters_within_each_parent_cluster.size(); + const uint32_t total_parent_clusters = (uint32_t)m_endpoint_clusters_within_each_parent_cluster.size(); basisu::vector cl_block_info_structs(m_total_blocks); // the size of each parent cluster, in total clusters uint_vec parent_cluster_sizes(total_parent_clusters); for (uint32_t i = 0; i < total_parent_clusters; i++) - parent_cluster_sizes[i] = m_endpoint_clusters_within_each_parent_cluster[i].size(); + parent_cluster_sizes[i] = (uint32_t)m_endpoint_clusters_within_each_parent_cluster[i].size(); uint_vec first_parent_cluster_ofs(total_parent_clusters); uint32_t cur_ofs = 0; @@ -1818,9 +1776,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job([this, first_index, last_index, &best_cluster_indices, &block_clusters] { -#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -1951,15 +1907,11 @@ namespace basisu } // block_index -#ifndef __EMSCRIPTEN__ }); -#endif } // block_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif } // use_cpu @@ -2104,9 +2056,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job([this, first_index, last_index] { -#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -2131,15 +2081,11 @@ namespace basisu } // block_index -#ifndef __EMSCRIPTEN__ }); -#endif } // block_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif } // use_cpu @@ -2204,9 +2150,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( 
[this, first_index, last_index, &training_vecs] { -#endif for (uint32_t block_index = first_index; block_index < last_index; block_index++) { @@ -2233,15 +2177,11 @@ namespace basisu } // block_index -#ifndef __EMSCRIPTEN__ } ); -#endif } // block_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif vec16F_clusterizer selector_clusterizer; for (uint32_t i = 0; i < m_total_blocks; i++) @@ -2335,9 +2275,7 @@ namespace basisu const uint32_t first_index = cluster_index_iter; const uint32_t last_index = minimum((uint32_t)total_selector_clusters, cluster_index_iter + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job([this, first_index, last_index] { -#endif for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { @@ -2406,15 +2344,11 @@ namespace basisu } // cluster_index -#ifndef __EMSCRIPTEN__ }); -#endif } // cluster_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif debug_printf("Elapsed time: %3.3f secs\n", tm.get_elapsed_secs()); @@ -2506,7 +2440,7 @@ namespace basisu if ((m_params.m_pOpenCL_context) && m_use_hierarchical_selector_codebooks) { - const uint32_t num_parent_clusters = m_selector_clusters_within_each_parent_cluster.size(); + const uint32_t num_parent_clusters = m_selector_clusters_within_each_parent_cluster.size_u32(); basisu::vector selector_structs; selector_structs.reserve(m_optimized_cluster_selectors.size()); @@ -2534,7 +2468,7 @@ namespace basisu selector_cluster_indices.push_back(selector_cluster_index); } - cur_ofs += m_selector_clusters_within_each_parent_cluster[parent_index].size(); + cur_ofs += m_selector_clusters_within_each_parent_cluster[parent_index].size_u32(); } const uint32_t total_input_selectors = cur_ofs; @@ -2549,7 +2483,7 @@ namespace basisu block_structs[i].m_etc_color5_inten.a = (uint8_t)blk.get_inten_table(0); block_structs[i].m_first_selector = parent_selector_cluster_offsets[parent_selector_cluster]; - 
block_structs[i].m_num_selectors = m_selector_clusters_within_each_parent_cluster[parent_selector_cluster].size(); + block_structs[i].m_num_selectors = m_selector_clusters_within_each_parent_cluster[parent_selector_cluster].size_u32(); } uint_vec output_selector_cluster_indices(m_total_blocks); @@ -2615,10 +2549,8 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(m_total_blocks, first_index + N); - #ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &unpacked_optimized_cluster_selectors] { - #endif - + int prev_best_cluster_index = 0; for (uint32_t block_index = first_index; block_index < last_index; block_index++) @@ -2756,15 +2688,11 @@ namespace basisu } // block_index - #ifndef __EMSCRIPTEN__ } ); - #endif } // block_index_iter - #ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); - #endif for (uint32_t i = 0; i < m_selector_cluster_block_indices.size(); i++) { @@ -3081,9 +3009,7 @@ namespace basisu const uint32_t first_index = cluster_index_iter; const uint32_t last_index = minimum((uint32_t)new_endpoint_cluster_block_indices.size(), cluster_index_iter + N); -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->add_job( [this, first_index, last_index, &cluster_improved, &cluster_valid, &new_endpoint_cluster_block_indices, &pBlock_selector_indices ] { -#endif for (uint32_t cluster_index = first_index; cluster_index < last_index; cluster_index++) { @@ -3173,15 +3099,11 @@ namespace basisu } // cluster_index -#ifndef __EMSCRIPTEN__ } ); -#endif } // cluster_index_iter -#ifndef __EMSCRIPTEN__ m_params.m_pJob_pool->wait_for_all(); -#endif uint32_t total_unused_clusters = 0; uint32_t total_improved_clusters = 0; diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp index 648cfb47aecb..028ac3f31417 100644 --- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp +++ 
b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp @@ -19,6 +19,8 @@ #include "basisu_bc7enc.h" #include "../transcoder/basisu_astc_hdr_core.h" +#define BASISU_USE_GOOGLE_ASTC_DECODER (1) + namespace basisu { //------------------------------------------------------------------------------------------------ @@ -1421,6 +1423,7 @@ namespace basisu case texture_format::cBC6HUnsigned: case texture_format::cASTC_HDR_4x4: case texture_format::cUASTC_HDR_4x4: + case texture_format::cASTC_HDR_6x6: { // Can't unpack HDR blocks in unpack_block() because it returns 32bpp pixel data. assert(0); @@ -1487,15 +1490,44 @@ namespace basisu { switch (fmt) { + case texture_format::cASTC_HDR_6x6: + { +#if BASISU_USE_GOOGLE_ASTC_DECODER + bool status = basisu_astc::astc::decompress_hdr(&pPixels[0][0], (uint8_t*)pBlock, 6, 6); + assert(status); + if (!status) + return false; +#else + // Use our decoder + basist::half_float half_block[6 * 6][4]; + + astc_helpers::log_astc_block log_blk; + if (!astc_helpers::unpack_block(pBlock, log_blk, 6, 6)) + return false; + if (!astc_helpers::decode_block(log_blk, half_block, 6, 6, astc_helpers::cDecodeModeHDR16)) + return false; + + for (uint32_t p = 0; p < (6 * 6); p++) + { + pPixels[p][0] = basist::half_to_float(half_block[p][0]); + pPixels[p][1] = basist::half_to_float(half_block[p][1]); + pPixels[p][2] = basist::half_to_float(half_block[p][2]); + pPixels[p][3] = basist::half_to_float(half_block[p][3]); + } +#endif + return true; + } case texture_format::cASTC_HDR_4x4: case texture_format::cUASTC_HDR_4x4: { -#if 1 +#if BASISU_USE_GOOGLE_ASTC_DECODER + // Use Google's decoder bool status = basisu_astc::astc::decompress_hdr(&pPixels[0][0], (uint8_t*)pBlock, 4, 4); assert(status); if (!status) return false; #else + // Use our decoder basist::half_float half_block[16][4]; astc_helpers::log_astc_block log_blk; @@ -1592,10 +1624,8 @@ namespace basisu bool gpu_image::unpack_hdr(imagef& img) const { - if ((m_fmt != texture_format::cASTC_HDR_4x4) 
&& - (m_fmt != texture_format::cUASTC_HDR_4x4) && - (m_fmt != texture_format::cBC6HUnsigned) && - (m_fmt != texture_format::cBC6HSigned)) + if ((m_fmt != texture_format::cASTC_HDR_4x4) && (m_fmt != texture_format::cUASTC_HDR_4x4) && (m_fmt != texture_format::cASTC_HDR_6x6) && + (m_fmt != texture_format::cBC6HUnsigned) && (m_fmt != texture_format::cBC6HSigned)) { // Can't call on LDR images, at least currently. (Could unpack the LDR data and convert to float.) assert(0); @@ -1643,6 +1673,7 @@ namespace basisu KTX_RG = 0x8227, KTX_RGB = 0x1907, KTX_RGBA = 0x1908, + KTX_COMPRESSED_RGB_S3TC_DXT1_EXT = 0x83F0, KTX_COMPRESSED_RGBA_S3TC_DXT5_EXT = 0x83F3, KTX_COMPRESSED_RED_RGTC1_EXT = 0x8DBB, @@ -1655,11 +1686,42 @@ namespace basisu KTX_COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT = 0x8E8F, KTX_COMPRESSED_RGB_PVRTC_4BPPV1_IMG = 0x8C00, KTX_COMPRESSED_RGBA_PVRTC_4BPPV1_IMG = 0x8C02, + KTX_COMPRESSED_RGBA_ASTC_4x4_KHR = 0x93B0, + KTX_COMPRESSED_RGBA_ASTC_5x4_KHR = 0x93B1, + KTX_COMPRESSED_RGBA_ASTC_5x5_KHR = 0x93B2, + KTX_COMPRESSED_RGBA_ASTC_6x5_KHR = 0x93B3, + KTX_COMPRESSED_RGBA_ASTC_6x6_KHR = 0x93B4, + KTX_COMPRESSED_RGBA_ASTC_8x5_KHR = 0x93B5, + KTX_COMPRESSED_RGBA_ASTC_8x6_KHR = 0x93B6, + KTX_COMPRESSED_RGBA_ASTC_8x8_KHR = 0x93B7, + KTX_COMPRESSED_RGBA_ASTC_10x5_KHR = 0x93B8, + KTX_COMPRESSED_RGBA_ASTC_10x6_KHR = 0x93B9, + KTX_COMPRESSED_RGBA_ASTC_10x8_KHR = 0x93BA, + KTX_COMPRESSED_RGBA_ASTC_10x10_KHR = 0x93BB, + KTX_COMPRESSED_RGBA_ASTC_12x10_KHR = 0x93BC, + KTX_COMPRESSED_RGBA_ASTC_12x12_KHR = 0x93BD, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR = 0x93D0, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR = 0x93D1, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR = 0x93D2, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR = 0x93D3, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR = 0x93D4, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR = 0x93D5, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR = 0x93D6, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR = 0x93D7, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR = 0x93D8, + 
KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR = 0x93D9, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR = 0x93DA, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR = 0x93DB, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR = 0x93DC, + KTX_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR = 0x93DD, + KTX_COMPRESSED_RGBA_UASTC_4x4_KHR = 0x94CC, // TODO - Use proper value! + KTX_ATC_RGB_AMD = 0x8C92, KTX_ATC_RGBA_INTERPOLATED_ALPHA_AMD = 0x87EE, + KTX_COMPRESSED_RGB_FXT1_3DFX = 0x86B0, KTX_COMPRESSED_RGBA_FXT1_3DFX = 0x86B1, KTX_COMPRESSED_RGBA_PVRTC_4BPPV2_IMG = 0x9138, @@ -1836,6 +1898,13 @@ namespace basisu base_internal_fmt = KTX_RGBA; break; } + case texture_format::cASTC_HDR_6x6: + { + internal_fmt = KTX_COMPRESSED_RGBA_ASTC_6x6_KHR; + // TODO: should we write RGB? We don't support generating HDR 6x6 with alpha. + base_internal_fmt = KTX_RGBA; + break; + } // We use different enums for HDR vs. LDR ASTC, but internally they are both just ASTC. case texture_format::cASTC_LDR_4x4: case texture_format::cASTC_HDR_4x4: diff --git a/thirdparty/basis_universal/encoder/basisu_math.h b/thirdparty/basis_universal/encoder/basisu_math.h new file mode 100644 index 000000000000..3e56747bea70 --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_math.h @@ -0,0 +1,3146 @@ +// File: basisu_math.h +#pragma once + +// TODO: Would prefer this in the basisu namespace, but to avoid collisions with the existing vec/matrix classes I'm placing this in "bu_math". +namespace bu_math +{ + // Cross-platform 1.0f/sqrtf(x) approximation. See https://en.wikipedia.org/wiki/Fast_inverse_square_root#cite_note-37. + // Would prefer using SSE1 etc. but that would require implementing multiple versions and platform divergence (needing more testing). 
+ BASISU_FORCE_INLINE float inv_sqrt(float v) + { + union + { + float flt; + uint32_t ui; + } un; + + un.flt = v; + un.ui = 0x5F1FFFF9UL - (un.ui >> 1); + + return 0.703952253f * un.flt * (2.38924456f - v * (un.flt * un.flt)); + } + + inline float smoothstep(float edge0, float edge1, float x) + { + assert(edge1 != edge0); + + // Scale, and clamp x to 0..1 range + x = basisu::saturate((x - edge0) / (edge1 - edge0)); + + return x * x * (3.0f - 2.0f * x); + } + + template + class vec : public basisu::rel_ops > + { + public: + typedef T scalar_type; + enum + { + num_elements = N + }; + + inline vec() + { + } + + inline vec(basisu::eClear) + { + clear(); + } + + inline vec(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = other.m_s[i]; + } + + template + inline vec(const vec& other) + { + set(other); + } + + template + inline vec(const vec& other, T w) + { + *this = other; + m_s[N - 1] = w; + } + + template + inline explicit vec(Args... args) + { + static_assert(sizeof...(args) <= N); + set(args...); + } + + inline void clear() + { + if (N > 4) + memset(m_s, 0, sizeof(m_s)); + else + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = 0; + } + } + + template + inline vec& set(const vec& other) + { + if ((void*)this == (void*)&other) + return *this; + const uint32_t m = basisu::minimum(N, ON); + uint32_t i; + for (i = 0; i < m; i++) + m_s[i] = static_cast(other[i]); + for (; i < N; i++) + m_s[i] = 0; + return *this; + } + + inline vec& set_component(uint32_t index, T val) + { + assert(index < N); + m_s[index] = val; + return *this; + } + + inline vec& set_all(T val) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = val; + return *this; + } + + template + inline vec& set(Args... args) + { + static_assert(sizeof...(args) <= N); + + // Initialize using parameter pack expansion + T values[] = { static_cast(args)... 
}; + + // Special case if setting with a scalar + if (sizeof...(args) == 1) + { + set_all(values[0]); + } + else + { + // Copy the values into the vector + for (std::size_t i = 0; i < sizeof...(args); ++i) + { + m_s[i] = values[i]; + } + + // Zero-initialize the remaining elements (if any) + if (sizeof...(args) < N) + { + std::fill(m_s + sizeof...(args), m_s + N, T{}); + } + } + + return *this; + } + + inline vec& set(const T* pValues) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = pValues[i]; + return *this; + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i) + { + return set(static_cast(other[i])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j) + { + return set(static_cast(other[i]), static_cast(other[j])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j, uint32_t k) + { + return set(static_cast(other[i]), static_cast(other[j]), static_cast(other[k])); + } + + template + inline vec& swizzle_set(const vec& other, uint32_t i, uint32_t j, uint32_t k, uint32_t l) + { + return set(static_cast(other[i]), static_cast(other[j]), static_cast(other[k]), static_cast(other[l])); + } + + inline vec& operator=(const vec& rhs) + { + if (this != &rhs) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = rhs.m_s[i]; + } + return *this; + } + + template + inline vec& operator=(const vec& other) + { + if ((void*)this == (void*)&other) + return *this; + + uint32_t s = basisu::minimum(N, O); + + uint32_t i; + for (i = 0; i < s; i++) + m_s[i] = static_cast(other[i]); + + for (; i < N; i++) + m_s[i] = 0; + + return *this; + } + + inline bool operator==(const vec& rhs) const + { + for (uint32_t i = 0; i < N; i++) + if (!(m_s[i] == rhs.m_s[i])) + return false; + return true; + } + + inline bool operator<(const vec& rhs) const + { + for (uint32_t i = 0; i < N; i++) + { + if (m_s[i] < rhs.m_s[i]) + return true; + else if (!(m_s[i] == rhs.m_s[i])) + return false; + } + + return false; + } + + 
inline T operator[](uint32_t i) const + { + assert(i < N); + return m_s[i]; + } + + inline T& operator[](uint32_t i) + { + assert(i < N); + return m_s[i]; + } + + template + inline uint64_t get_component_bits_as_uint() const + { + static_assert(index < N); + static_assert((sizeof(T) == sizeof(uint16_t)) || (sizeof(T) == sizeof(uint32_t)) || (sizeof(T) == sizeof(uint64_t)), "Unsupported type"); + + if (sizeof(T) == sizeof(uint16_t)) + return *reinterpret_cast(&m_s[index]); + else if (sizeof(T) == sizeof(uint32_t)) + return *reinterpret_cast(&m_s[index]); + else if (sizeof(T) == sizeof(uint64_t)) + return *reinterpret_cast(&m_s[index]); + else + { + assert(0); + return 0; + } + } + + inline T get_x(void) const + { + return m_s[0]; + } + inline T get_y(void) const + { + static_assert(N >= 2); + return m_s[1]; + } + inline T get_z(void) const + { + static_assert(N >= 3); + return m_s[2]; + } + inline T get_w(void) const + { + static_assert(N >= 4); + return m_s[3]; + } + + inline vec get_x_vector() const + { + return broadcast<0>(); + } + inline vec get_y_vector() const + { + return broadcast<1>(); + } + inline vec get_z_vector() const + { + return broadcast<2>(); + } + inline vec get_w_vector() const + { + return broadcast<3>(); + } + + inline T get_component(uint32_t i) const + { + return (*this)[i]; + } + + inline vec& set_x(T v) + { + m_s[0] = v; + return *this; + } + inline vec& set_y(T v) + { + static_assert(N >= 2); + m_s[1] = v; + return *this; + } + inline vec& set_z(T v) + { + static_assert(N >= 3); + m_s[2] = v; + return *this; + } + inline vec& set_w(T v) + { + static_assert(N >= 4); + m_s[3] = v; + return *this; + } + + inline const T* get_ptr() const + { + return reinterpret_cast(&m_s[0]); + } + inline T* get_ptr() + { + return reinterpret_cast(&m_s[0]); + } + + inline vec as_point() const + { + vec result(*this); + result[N - 1] = 1; + return result; + } + + inline vec as_dir() const + { + vec result(*this); + result[N - 1] = 0; + return result; + } + + 
inline vec<2, T> select2(uint32_t i, uint32_t j) const + { + assert((i < N) && (j < N)); + return vec<2, T>(m_s[i], m_s[j]); + } + + inline vec<3, T> select3(uint32_t i, uint32_t j, uint32_t k) const + { + assert((i < N) && (j < N) && (k < N)); + return vec<3, T>(m_s[i], m_s[j], m_s[k]); + } + + inline vec<4, T> select4(uint32_t i, uint32_t j, uint32_t k, uint32_t l) const + { + assert((i < N) && (j < N) && (k < N) && (l < N)); + return vec<4, T>(m_s[i], m_s[j], m_s[k], m_s[l]); + } + + inline bool is_dir() const + { + return m_s[N - 1] == 0; + } + inline bool is_vector() const + { + return is_dir(); + } + inline bool is_point() const + { + return m_s[N - 1] == 1; + } + + inline vec project() const + { + vec result(*this); + if (result[N - 1]) + result /= result[N - 1]; + return result; + } + + inline vec broadcast(unsigned i) const + { + return vec((*this)[i]); + } + + template + inline vec broadcast() const + { + return vec((*this)[i]); + } + + inline vec swizzle(uint32_t i, uint32_t j) const + { + return vec((*this)[i], (*this)[j]); + } + + inline vec swizzle(uint32_t i, uint32_t j, uint32_t k) const + { + return vec((*this)[i], (*this)[j], (*this)[k]); + } + + inline vec swizzle(uint32_t i, uint32_t j, uint32_t k, uint32_t l) const + { + return vec((*this)[i], (*this)[j], (*this)[k], (*this)[l]); + } + + inline vec operator-() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = -m_s[i]; + return result; + } + + inline vec operator+() const + { + return *this; + } + + inline vec& operator+=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] += other.m_s[i]; + return *this; + } + + inline vec& operator-=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] -= other.m_s[i]; + return *this; + } + + inline vec& operator*=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] *= other.m_s[i]; + return *this; + } + + inline vec& operator/=(const vec& other) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] 
/= other.m_s[i]; + return *this; + } + + inline vec& operator*=(T s) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] *= s; + return *this; + } + + inline vec& operator/=(T s) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] /= s; + return *this; + } + + friend inline vec operator*(const vec& lhs, T val) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] * val; + return result; + } + + friend inline vec operator*(T val, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = val * rhs.m_s[i]; + return result; + } + + friend inline vec operator/(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] / rhs.m_s[i]; + return result; + } + + friend inline vec operator/(const vec& lhs, T val) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] / val; + return result; + } + + friend inline vec operator+(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] + rhs.m_s[i]; + return result; + } + + friend inline vec operator-(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result.m_s[i] = lhs.m_s[i] - rhs.m_s[i]; + return result; + } + + static inline vec<3, T> cross2(const vec& a, const vec& b) + { + static_assert(N >= 2); + return vec<3, T>(0, 0, a[0] * b[1] - a[1] * b[0]); + } + + inline vec<3, T> cross2(const vec& b) const + { + return cross2(*this, b); + } + + static inline vec<3, T> cross3(const vec& a, const vec& b) + { + static_assert(N >= 3); + return vec<3, T>(a[1] * b[2] - a[2] * b[1], a[2] * b[0] - a[0] * b[2], a[0] * b[1] - a[1] * b[0]); + } + + inline vec<3, T> cross3(const vec& b) const + { + return cross3(*this, b); + } + + static inline vec<3, T> cross(const vec& a, const vec& b) + { + static_assert(N >= 2); + + if (N == 2) + return cross2(a, b); + else + return cross3(a, b); + } + + inline vec<3, T> cross(const 
vec& b) const + { + static_assert(N >= 2); + return cross(*this, b); + } + + inline T dot(const vec& rhs) const + { + return dot(*this, rhs); + } + + inline vec dot_vector(const vec& rhs) const + { + return vec(dot(*this, rhs)); + } + + static inline T dot(const vec& lhs, const vec& rhs) + { + T result = lhs.m_s[0] * rhs.m_s[0]; + for (uint32_t i = 1; i < N; i++) + result += lhs.m_s[i] * rhs.m_s[i]; + return result; + } + + inline T dot2(const vec& rhs) const + { + static_assert(N >= 2); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1]; + } + + inline T dot3(const vec& rhs) const + { + static_assert(N >= 3); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1] + m_s[2] * rhs.m_s[2]; + } + + inline T dot4(const vec& rhs) const + { + static_assert(N >= 4); + return m_s[0] * rhs.m_s[0] + m_s[1] * rhs.m_s[1] + m_s[2] * rhs.m_s[2] + m_s[3] * rhs.m_s[3]; + } + + inline T norm(void) const + { + T sum = m_s[0] * m_s[0]; + for (uint32_t i = 1; i < N; i++) + sum += m_s[i] * m_s[i]; + return sum; + } + + inline T length(void) const + { + return sqrt(norm()); + } + + inline T squared_distance(const vec& rhs) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + } + return dist2; + } + + inline T squared_distance(const vec& rhs, T early_out) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + if (dist2 > early_out) + break; + } + return dist2; + } + + inline T distance(const vec& rhs) const + { + T dist2 = 0; + for (uint32_t i = 0; i < N; i++) + { + T d = m_s[i] - rhs.m_s[i]; + dist2 += d * d; + } + return sqrt(dist2); + } + + inline vec inverse() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = m_s[i] ? 
(1.0f / m_s[i]) : 0; + return result; + } + + // returns squared length (norm) + inline double normalize(const vec* pDefaultVec = NULL) + { + double n = m_s[0] * m_s[0]; + for (uint32_t i = 1; i < N; i++) + n += m_s[i] * m_s[i]; + + if (n != 0) + *this *= static_cast(1.0f / sqrt(n)); + else if (pDefaultVec) + *this = *pDefaultVec; + return n; + } + + inline double normalize3(const vec* pDefaultVec = NULL) + { + static_assert(N >= 3); + + double n = m_s[0] * m_s[0] + m_s[1] * m_s[1] + m_s[2] * m_s[2]; + + if (n != 0) + *this *= static_cast((1.0f / sqrt(n))); + else if (pDefaultVec) + *this = *pDefaultVec; + return n; + } + + inline vec& normalize_in_place(const vec* pDefaultVec = NULL) + { + normalize(pDefaultVec); + return *this; + } + + inline vec& normalize3_in_place(const vec* pDefaultVec = NULL) + { + normalize3(pDefaultVec); + return *this; + } + + inline vec get_normalized(const vec* pDefaultVec = NULL) const + { + vec result(*this); + result.normalize(pDefaultVec); + return result; + } + + inline vec get_normalized3(const vec* pDefaultVec = NULL) const + { + vec result(*this); + result.normalize3(pDefaultVec); + return result; + } + + inline vec& clamp(T l, T h) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = static_cast(basisu::clamp(m_s[i], l, h)); + return *this; + } + + inline vec& saturate() + { + return clamp(0.0f, 1.0f); + } + + inline vec& clamp(const vec& l, const vec& h) + { + for (uint32_t i = 0; i < N; i++) + m_s[i] = static_cast(basisu::clamp(m_s[i], l[i], h[i])); + return *this; + } + + inline bool is_within_bounds(const vec& l, const vec& h) const + { + for (uint32_t i = 0; i < N; i++) + if ((m_s[i] < l[i]) || (m_s[i] > h[i])) + return false; + + return true; + } + + inline bool is_within_bounds(T l, T h) const + { + for (uint32_t i = 0; i < N; i++) + if ((m_s[i] < l) || (m_s[i] > h)) + return false; + + return true; + } + + inline uint32_t get_major_axis(void) const + { + T m = fabs(m_s[0]); + uint32_t r = 0; + for (uint32_t i = 1; i < N; 
i++) + { + const T c = fabs(m_s[i]); + if (c > m) + { + m = c; + r = i; + } + } + return r; + } + + inline uint32_t get_minor_axis(void) const + { + T m = fabs(m_s[0]); + uint32_t r = 0; + for (uint32_t i = 1; i < N; i++) + { + const T c = fabs(m_s[i]); + if (c < m) + { + m = c; + r = i; + } + } + return r; + } + + inline void get_projection_axes(uint32_t& u, uint32_t& v) const + { + const int axis = get_major_axis(); + if (m_s[axis] < 0.0f) + { + v = basisu::next_wrap(axis, N); + u = basisu::next_wrap(v, N); + } + else + { + u = basisu::next_wrap(axis, N); + v = basisu::next_wrap(u, N); + } + } + + inline T get_absolute_minimum(void) const + { + T result = fabs(m_s[0]); + for (uint32_t i = 1; i < N; i++) + result = basisu::minimum(result, fabs(m_s[i])); + return result; + } + + inline T get_absolute_maximum(void) const + { + T result = fabs(m_s[0]); + for (uint32_t i = 1; i < N; i++) + result = basisu::maximum(result, fabs(m_s[i])); + return result; + } + + inline T get_minimum(void) const + { + T result = m_s[0]; + for (uint32_t i = 1; i < N; i++) + result = basisu::minimum(result, m_s[i]); + return result; + } + + inline T get_maximum(void) const + { + T result = m_s[0]; + for (uint32_t i = 1; i < N; i++) + result = basisu::maximum(result, m_s[i]); + return result; + } + + inline vec& remove_unit_direction(const vec& dir) + { + *this -= (dot(dir) * dir); + return *this; + } + + inline vec get_remove_unit_direction(const vec& dir) const + { + return *this - (dot(dir) * dir); + } + + inline bool all_less(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] >= b.m_s[i]) + return false; + return true; + } + + inline bool all_less_equal(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] > b.m_s[i]) + return false; + return true; + } + + inline bool all_greater(const vec& b) const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] <= b.m_s[i]) + return false; + return true; + } + + inline bool all_greater_equal(const vec& b) 
const + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] < b.m_s[i]) + return false; + return true; + } + + inline vec negate_xyz() const + { + vec ret; + + ret[0] = -m_s[0]; + if (N >= 2) + ret[1] = -m_s[1]; + if (N >= 3) + ret[2] = -m_s[2]; + + for (uint32_t i = 3; i < N; i++) + ret[i] = m_s[i]; + + return ret; + } + + inline vec& invert() + { + for (uint32_t i = 0; i < N; i++) + if (m_s[i] != 0.0f) + m_s[i] = 1.0f / m_s[i]; + return *this; + } + + inline scalar_type perp_dot(const vec& b) const + { + static_assert(N == 2); + return m_s[0] * b.m_s[1] - m_s[1] * b.m_s[0]; + } + + inline vec perp() const + { + static_assert(N == 2); + return vec(-m_s[1], m_s[0]); + } + + inline vec get_floor() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = floor(m_s[i]); + return result; + } + + inline vec get_ceil() const + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = ceil(m_s[i]); + return result; + } + + inline T get_total() const + { + T res = m_s[0]; + for (uint32_t i = 1; i < N; i++) + res += m_s[i]; + return res; + } + + // static helper methods + + static inline vec mul_components(const vec& lhs, const vec& rhs) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = lhs.m_s[i] * rhs.m_s[i]; + return result; + } + + static inline vec mul_add_components(const vec& a, const vec& b, const vec& c) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = a.m_s[i] * b.m_s[i] + c.m_s[i]; + return result; + } + + static inline vec make_axis(uint32_t i) + { + vec result; + result.clear(); + result[i] = 1; + return result; + } + + static inline vec equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] == b[i]); + return ret; + } + + static inline vec not_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] != b[i]); + return ret; + } + + static inline vec less_mask(const vec& a, const vec& b) + { + vec ret; + 
for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] < b[i]); + return ret; + } + + static inline vec less_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] <= b[i]); + return ret; + } + + static inline vec greater_equals_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] >= b[i]); + return ret; + } + + static inline vec greater_mask(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret[i] = (a[i] > b[i]); + return ret; + } + + static inline vec component_max(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = basisu::maximum(a.m_s[i], b.m_s[i]); + return ret; + } + + static inline vec component_min(const vec& a, const vec& b) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = basisu::minimum(a.m_s[i], b.m_s[i]); + return ret; + } + + static inline vec lerp(const vec& a, const vec& b, float t) + { + vec ret; + for (uint32_t i = 0; i < N; i++) + ret.m_s[i] = a.m_s[i] + (b.m_s[i] - a.m_s[i]) * t; + return ret; + } + + static inline bool equal_tol(const vec& a, const vec& b, float t) + { + for (uint32_t i = 0; i < N; i++) + if (!basisu::equal_tol(a.m_s[i], b.m_s[i], t)) + return false; + return true; + } + + inline bool equal_tol(const vec& b, float t) const + { + return equal_tol(*this, b, t); + } + + static inline vec make_random(basisu::rand& r, float l, float h) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = r.frand(l, h); + return result; + } + + static inline vec make_random(basisu::rand& r, const vec& l, const vec& h) + { + vec result; + for (uint32_t i = 0; i < N; i++) + result[i] = r.frand(l[i], h[i]); + return result; + } + + void print() const + { + for (uint32_t c = 0; c < N; c++) + printf("%3.3f ", (*this)[c]); + printf("\n"); + } + + protected: + T m_s[N]; + }; + + typedef vec<1, double> vec1D; + typedef vec<2, double> vec2D; + typedef vec<3, double> 
vec3D; + typedef vec<4, double> vec4D; + + typedef vec<1, float> vec1F; + + typedef vec<2, float> vec2F; + typedef basisu::vector vec2F_array; + + typedef vec<3, float> vec3F; + typedef basisu::vector vec3F_array; + + typedef vec<4, float> vec4F; + typedef basisu::vector vec4F_array; + + typedef vec<2, uint32_t> vec2U; + typedef vec<3, uint32_t> vec3U; + typedef vec<2, int> vec2I; + typedef vec<3, int> vec3I; + typedef vec<4, int> vec4I; + + typedef vec<2, int16_t> vec2I16; + typedef vec<3, int16_t> vec3I16; + + inline vec2F rotate_point_2D(const vec2F& p, float rad) + { + float c = cosf(rad); + float s = sinf(rad); + + float x = p[0]; + float y = p[1]; + + return vec2F(x * c - y * s, x * s + y * c); + } + + //-------------------------------------------------------------- + + // Matrix/vector cheat sheet, because confusingly, depending on how matrices are stored in memory people can use opposite definitions of "rows", "cols", etc. + // See http://www.mindcontrol.org/~hplus/graphics/matrix-layout.html + // + // So in this simple row-major general matrix class: + // matrix=[NumRows][NumCols] or [R][C], i.e. a 3x3 matrix stored in memory will appear as: R0C0, R0C1, R0C2, R1C0, R1C1, R1C2, etc. + // Matrix multiplication: [R0,C0]*[R1,C1]=[R0,C1], C0 must equal R1 + // + // In this class: + // A "row vector" type is a vector of size # of matrix cols, 1xC. It's the vector type that is used to store the matrix rows. + // A "col vector" type is a vector of size # of matrix rows, Rx1. It's a vector type large enough to hold each matrix column. 
+ // + // Subrow/col vectors: last component is assumed to be either 0 (a "vector") or 1 (a "point") + // "subrow vector": vector/point of size # cols-1, 1x(C-1) + // "subcol vector": vector/point of size # rows-1, (R-1)x1 + // + // D3D style: + // vec*matrix, row vector on left (vec dotted against columns) + // [1,4]*[4,4]=[1,4] + // abcd * A B C D + // A B C D + // A B C D + // A B C D + // = e f g h + // + // Now confusingly, in the matrix transform method for vec*matrix below the vector's type is "col_vec", because col_vec will have the proper size for non-square matrices. But the vector on the left is written as row vector, argh. + // + // + // OGL style: + // matrix*vec, col vector on right (vec dotted against rows): + // [4,4]*[4,1]=[4,1] + // + // A B C D * e = e + // A B C D f f + // A B C D g g + // A B C D h h + + template + Z& matrix_mul_helper(Z& result, const X& lhs, const Y& rhs) + { + static_assert((int)Z::num_rows == (int)X::num_rows); + static_assert((int)Z::num_cols == (int)Y::num_cols); + static_assert((int)X::num_cols == (int)Y::num_rows); + assert(((void*)&result != (void*)&lhs) && ((void*)&result != (void*)&rhs)); + for (int r = 0; r < X::num_rows; r++) + for (int c = 0; c < Y::num_cols; c++) + { + typename Z::scalar_type s = lhs(r, 0) * rhs(0, c); + for (uint32_t i = 1; i < X::num_cols; i++) + s += lhs(r, i) * rhs(i, c); + result(r, c) = s; + } + return result; + } + + template + Z& matrix_mul_helper_transpose_lhs(Z& result, const X& lhs, const Y& rhs) + { + static_assert((int)Z::num_rows == (int)X::num_cols); + static_assert((int)Z::num_cols == (int)Y::num_cols); + static_assert((int)X::num_rows == (int)Y::num_rows); + assert(((void*)&result != (void*)&lhs) && ((void*)&result != (void*)&rhs)); + for (int r = 0; r < X::num_cols; r++) + for (int c = 0; c < Y::num_cols; c++) + { + typename Z::scalar_type s = lhs(0, r) * rhs(0, c); + for (uint32_t i = 1; i < X::num_rows; i++) + s += lhs(i, r) * rhs(i, c); + result(r, c) = s; + } + return 
result; + } + + template + Z& matrix_mul_helper_transpose_rhs(Z& result, const X& lhs, const Y& rhs) + { + static_assert((int)Z::num_rows == (int)X::num_rows); + static_assert((int)Z::num_cols == (int)Y::num_rows); + static_assert((int)X::num_cols == (int)Y::num_cols); + assert(((void*)&result != (void*)&lhs) && ((void*)&result != (void*)&rhs)); + for (int r = 0; r < X::num_rows; r++) + for (int c = 0; c < Y::num_rows; c++) + { + typename Z::scalar_type s = lhs(r, 0) * rhs(c, 0); + for (uint32_t i = 1; i < X::num_cols; i++) + s += lhs(r, i) * rhs(c, i); + result(r, c) = s; + } + return result; + } + + template + class matrix + { + public: + typedef T scalar_type; + enum + { + num_rows = R, + num_cols = C + }; + + typedef vec col_vec; + typedef vec < (R > 1) ? (R - 1) : 0, T > subcol_vec; + + typedef vec row_vec; + typedef vec < (C > 1) ? (C - 1) : 0, T > subrow_vec; + + inline matrix() + { + } + + inline matrix(basisu::eClear) + { + clear(); + } + + inline matrix(basisu::eIdentity) + { + set_identity_matrix(); + } + + inline matrix(const T* p) + { + set(p); + } + + inline matrix(const matrix& other) + { + for (uint32_t i = 0; i < R; i++) + m_rows[i] = other.m_rows[i]; + } + + inline matrix& operator=(const matrix& rhs) + { + if (this != &rhs) + for (uint32_t i = 0; i < R; i++) + m_rows[i] = rhs.m_rows[i]; + return *this; + } + + inline matrix(T val00, T val01, + T val10, T val11) + { + set(val00, val01, val10, val11); + } + + inline matrix(T val00, T val01, + T val10, T val11, + T val20, T val21) + { + set(val00, val01, val10, val11, val20, val21); + } + + inline matrix(T val00, T val01, T val02, + T val10, T val11, T val12, + T val20, T val21, T val22) + { + set(val00, val01, val02, val10, val11, val12, val20, val21, val22); + } + + inline matrix(T val00, T val01, T val02, T val03, + T val10, T val11, T val12, T val13, + T val20, T val21, T val22, T val23, + T val30, T val31, T val32, T val33) + { + set(val00, val01, val02, val03, val10, val11, val12, val13, 
val20, val21, val22, val23, val30, val31, val32, val33); + } + + inline matrix(T val00, T val01, T val02, T val03, + T val10, T val11, T val12, T val13, + T val20, T val21, T val22, T val23) + { + set(val00, val01, val02, val03, val10, val11, val12, val13, val20, val21, val22, val23); + } + + inline void set(const float* p) + { + for (uint32_t i = 0; i < R; i++) + { + m_rows[i].set(p); + p += C; + } + } + + inline void set(T val00, T val01, + T val10, T val11) + { + m_rows[0].set(val00, val01); + if (R >= 2) + { + m_rows[1].set(val10, val11); + + for (uint32_t i = 2; i < R; i++) + m_rows[i].clear(); + } + } + + inline void set(T val00, T val01, + T val10, T val11, + T val20, T val21) + { + m_rows[0].set(val00, val01); + if (R >= 2) + { + m_rows[1].set(val10, val11); + + if (R >= 3) + { + m_rows[2].set(val20, val21); + + for (uint32_t i = 3; i < R; i++) + m_rows[i].clear(); + } + } + } + + inline void set(T val00, T val01, T val02, + T val10, T val11, T val12, + T val20, T val21, T val22) + { + m_rows[0].set(val00, val01, val02); + if (R >= 2) + { + m_rows[1].set(val10, val11, val12); + if (R >= 3) + { + m_rows[2].set(val20, val21, val22); + + for (uint32_t i = 3; i < R; i++) + m_rows[i].clear(); + } + } + } + + inline void set(T val00, T val01, T val02, T val03, + T val10, T val11, T val12, T val13, + T val20, T val21, T val22, T val23, + T val30, T val31, T val32, T val33) + { + m_rows[0].set(val00, val01, val02, val03); + if (R >= 2) + { + m_rows[1].set(val10, val11, val12, val13); + if (R >= 3) + { + m_rows[2].set(val20, val21, val22, val23); + + if (R >= 4) + { + m_rows[3].set(val30, val31, val32, val33); + + for (uint32_t i = 4; i < R; i++) + m_rows[i].clear(); + } + } + } + } + + inline void set(T val00, T val01, T val02, T val03, + T val10, T val11, T val12, T val13, + T val20, T val21, T val22, T val23) + { + m_rows[0].set(val00, val01, val02, val03); + if (R >= 2) + { + m_rows[1].set(val10, val11, val12, val13); + if (R >= 3) + { + m_rows[2].set(val20, 
val21, val22, val23); + + for (uint32_t i = 3; i < R; i++) + m_rows[i].clear(); + } + } + } + + inline uint32_t get_num_rows() const + { + return num_rows; + } + + inline uint32_t get_num_cols() const + { + return num_cols; + } + + inline uint32_t get_total_elements() const + { + return num_rows * num_cols; + } + + inline T operator()(uint32_t r, uint32_t c) const + { + assert((r < R) && (c < C)); + return m_rows[r][c]; + } + + inline T& operator()(uint32_t r, uint32_t c) + { + assert((r < R) && (c < C)); + return m_rows[r][c]; + } + + inline const row_vec& operator[](uint32_t r) const + { + assert(r < R); + return m_rows[r]; + } + + inline row_vec& operator[](uint32_t r) + { + assert(r < R); + return m_rows[r]; + } + + inline const row_vec& get_row(uint32_t r) const + { + return (*this)[r]; + } + + inline row_vec& get_row(uint32_t r) + { + return (*this)[r]; + } + + inline void set_row(uint32_t r, const row_vec& v) + { + (*this)[r] = v; + } + + inline col_vec get_col(uint32_t c) const + { + assert(c < C); + col_vec result; + for (uint32_t i = 0; i < R; i++) + result[i] = m_rows[i][c]; + return result; + } + + inline void set_col(uint32_t c, const col_vec& col) + { + assert(c < C); + for (uint32_t i = 0; i < R; i++) + m_rows[i][c] = col[i]; + } + + inline void set_col(uint32_t c, const subcol_vec& col) + { + assert(c < C); + for (uint32_t i = 0; i < (R - 1); i++) + m_rows[i][c] = col[i]; + + m_rows[R - 1][c] = 0.0f; + } + + inline const row_vec& get_translate() const + { + return m_rows[R - 1]; + } + + inline matrix& set_translate(const row_vec& r) + { + m_rows[R - 1] = r; + return *this; + } + + inline matrix& set_translate(const subrow_vec& r) + { + m_rows[R - 1] = row_vec(r).as_point(); + return *this; + } + + inline const T* get_ptr() const + { + return reinterpret_cast(&m_rows[0]); + } + inline T* get_ptr() + { + return reinterpret_cast(&m_rows[0]); + } + + inline matrix& operator+=(const matrix& other) + { + for (uint32_t i = 0; i < R; i++) + m_rows[i] += 
other.m_rows[i]; + return *this; + } + + inline matrix& operator-=(const matrix& other) + { + for (uint32_t i = 0; i < R; i++) + m_rows[i] -= other.m_rows[i]; + return *this; + } + + inline matrix& operator*=(T val) + { + for (uint32_t i = 0; i < R; i++) + m_rows[i] *= val; + return *this; + } + + inline matrix& operator/=(T val) + { + for (uint32_t i = 0; i < R; i++) + m_rows[i] /= val; + return *this; + } + + inline matrix& operator*=(const matrix& other) + { + matrix result; + matrix_mul_helper(result, *this, other); + *this = result; + return *this; + } + + friend inline matrix operator+(const matrix& lhs, const matrix& rhs) + { + matrix result; + for (uint32_t i = 0; i < R; i++) + result[i] = lhs.m_rows[i] + rhs.m_rows[i]; + return result; + } + + friend inline matrix operator-(const matrix& lhs, const matrix& rhs) + { + matrix result; + for (uint32_t i = 0; i < R; i++) + result[i] = lhs.m_rows[i] - rhs.m_rows[i]; + return result; + } + + friend inline matrix operator*(const matrix& lhs, T val) + { + matrix result; + for (uint32_t i = 0; i < R; i++) + result[i] = lhs.m_rows[i] * val; + return result; + } + + friend inline matrix operator/(const matrix& lhs, T val) + { + matrix result; + for (uint32_t i = 0; i < R; i++) + result[i] = lhs.m_rows[i] / val; + return result; + } + + friend inline matrix operator*(T val, const matrix& rhs) + { + matrix result; + for (uint32_t i = 0; i < R; i++) + result[i] = val * rhs.m_rows[i]; + return result; + } + +#if 0 + template + friend inline matrix operator*(const matrix& lhs, const matrix& rhs) + { + matrix result; + return matrix_mul_helper(result, lhs, rhs); + } +#endif + friend inline matrix operator*(const matrix& lhs, const matrix& rhs) + { + matrix result; + return matrix_mul_helper(result, lhs, rhs); + } + + friend inline row_vec operator*(const col_vec& a, const matrix& b) + { + return transform(a, b); + } + + inline matrix operator+() const + { + return *this; + } + + inline matrix operator-() const + { + matrix 
result; + for (uint32_t i = 0; i < R; i++) + result[i] = -m_rows[i]; + return result; + } + + inline matrix& clear() + { + for (uint32_t i = 0; i < R; i++) + m_rows[i].clear(); + return *this; + } + + inline matrix& set_zero_matrix() + { + clear(); + return *this; + } + + inline matrix& set_identity_matrix() + { + for (uint32_t i = 0; i < R; i++) + { + m_rows[i].clear(); + m_rows[i][i] = 1.0f; + } + return *this; + } + + inline matrix& set_scale_matrix(float s) + { + clear(); + for (int i = 0; i < (R - 1); i++) + m_rows[i][i] = s; + m_rows[R - 1][C - 1] = 1.0f; + return *this; + } + + inline matrix& set_scale_matrix(const row_vec& s) + { + clear(); + for (uint32_t i = 0; i < R; i++) + m_rows[i][i] = s[i]; + return *this; + } + + inline matrix& set_scale_matrix(float x, float y) + { + set_identity_matrix(); + m_rows[0].set_x(x); + m_rows[1].set_y(y); + return *this; + } + + inline matrix& set_scale_matrix(float x, float y, float z) + { + set_identity_matrix(); + m_rows[0].set_x(x); + m_rows[1].set_y(y); + m_rows[2].set_z(z); + return *this; + } + + inline matrix& set_translate_matrix(const row_vec& s) + { + set_identity_matrix(); + set_translate(s); + return *this; + } + + inline matrix& set_translate_matrix(float x, float y) + { + set_identity_matrix(); + set_translate(row_vec(x, y).as_point()); + return *this; + } + + inline matrix& set_translate_matrix(float x, float y, float z) + { + set_identity_matrix(); + set_translate(row_vec(x, y, z).as_point()); + return *this; + } + + inline matrix get_transposed() const + { + static_assert(R == C); + + matrix result; + for (uint32_t i = 0; i < R; i++) + for (uint32_t j = 0; j < C; j++) + result.m_rows[i][j] = m_rows[j][i]; + return result; + } + + inline matrix get_transposed_nonsquare() const + { + matrix result; + for (uint32_t i = 0; i < R; i++) + for (uint32_t j = 0; j < C; j++) + result[j][i] = m_rows[i][j]; + return result; + } + + inline matrix& transpose_in_place() + { + matrix result; + for (uint32_t i = 0; i < 
R; i++) + for (uint32_t j = 0; j < C; j++) + result.m_rows[i][j] = m_rows[j][i]; + *this = result; + return *this; + } + + // Frobenius Norm + T get_norm() const + { + T result = 0; + + for (uint32_t i = 0; i < R; i++) + for (uint32_t j = 0; j < C; j++) + result += m_rows[i][j] * m_rows[i][j]; + + return static_cast(sqrt(result)); + } + + inline matrix get_power(T p) const + { + matrix result; + + for (uint32_t i = 0; i < R; i++) + for (uint32_t j = 0; j < C; j++) + result[i][j] = static_cast(pow(m_rows[i][j], p)); + + return result; + } + + inline matrix<1, R, T> numpy_dot(const matrix<1, C, T>& b) const + { + matrix<1, R, T> result; + + for (uint32_t r = 0; r < R; r++) + { + T sum = 0; + for (uint32_t c = 0; c < C; c++) + sum += m_rows[r][c] * b[0][c]; + + result[0][r] = static_cast(sum); + } + + return result; + } + + bool invert(matrix& result) const + { + static_assert(R == C); + + result.set_identity_matrix(); + + matrix mat(*this); + + for (uint32_t c = 0; c < C; c++) + { + uint32_t max_r = c; + for (uint32_t r = c + 1; r < R; r++) + if (fabs(mat[r][c]) > fabs(mat[max_r][c])) + max_r = r; + + if (mat[max_r][c] == 0.0f) + { + result.set_identity_matrix(); + return false; + } + + std::swap(mat[c], mat[max_r]); + std::swap(result[c], result[max_r]); + + result[c] /= mat[c][c]; + mat[c] /= mat[c][c]; + + for (uint32_t row = 0; row < R; row++) + { + if (row != c) + { + const row_vec temp(mat[row][c]); + mat[row] -= row_vec::mul_components(mat[c], temp); + result[row] -= row_vec::mul_components(result[c], temp); + } + } + } + + return true; + } + + matrix& invert_in_place() + { + matrix result; + invert(result); + *this = result; + return *this; + } + + matrix get_inverse() const + { + matrix result; + invert(result); + return result; + } + + T get_det() const + { + static_assert(R == C); + return det_helper(*this, R); + } + + bool equal_tol(const matrix& b, float tol) const + { + for (uint32_t r = 0; r < R; r++) + if (!row_vec::equal_tol(m_rows[r], b.m_rows[r], 
tol)) + return false; + return true; + } + + bool is_square() const + { + return R == C; + } + + double get_trace() const + { + static_assert(is_square()); + + T total = 0; + for (uint32_t i = 0; i < R; i++) + total += (*this)(i, i); + + return total; + } + + void print() const + { + for (uint32_t r = 0; r < R; r++) + { + for (uint32_t c = 0; c < C; c++) + printf("%3.7f ", (*this)(r, c)); + printf("\n"); + } + } + + // This method transforms a vec by a matrix (D3D-style: row vector on left). + // Confusingly, note that the data type is named "col_vec", but mathematically it's actually written as a row vector (of size equal to the # matrix rows, which is why it's called a "col_vec" in this class). + // 1xR * RxC = 1xC + // This dots against the matrix columns. + static inline row_vec transform(const col_vec& a, const matrix& b) + { + row_vec result(b[0] * a[0]); + for (uint32_t r = 1; r < R; r++) + result += b[r] * a[r]; + return result; + } + + // This method transforms a vec by a matrix (D3D-style: row vector on left). + // Last component of vec is assumed to be 1. + static inline row_vec transform_point(const col_vec& a, const matrix& b) + { + row_vec result(0); + for (int r = 0; r < (R - 1); r++) + result += b[r] * a[r]; + result += b[R - 1]; + return result; + } + + // This method transforms a vec by a matrix (D3D-style: row vector on left). + // Last component of vec is assumed to be 0. + static inline row_vec transform_vector(const col_vec& a, const matrix& b) + { + row_vec result(0); + for (int r = 0; r < (R - 1); r++) + result += b[r] * a[r]; + return result; + } + + // This method transforms a vec by a matrix (D3D-style: row vector on left). + // Last component of vec is assumed to be 1. + static inline subcol_vec transform_point(const subcol_vec& a, const matrix& b) + { + subcol_vec result(0); + for (int r = 0; r < static_cast(R); r++) + { + const T s = (r < subcol_vec::num_elements) ? 
a[r] : 1.0f; + for (int c = 0; c < static_cast(C - 1); c++) + result[c] += b[r][c] * s; + } + return result; + } + + // This method transforms a vec by a matrix (D3D-style: row vector on left). + // Last component of vec is assumed to be 0. + static inline subcol_vec transform_vector(const subcol_vec& a, const matrix& b) + { + subcol_vec result(0); + for (int r = 0; r < static_cast(R - 1); r++) + { + const T s = a[r]; + for (int c = 0; c < static_cast(C - 1); c++) + result[c] += b[r][c] * s; + } + return result; + } + + // Like transform() above, but the matrix is effectively transposed before the multiply. + static inline col_vec transform_transposed(const col_vec& a, const matrix& b) + { + static_assert(R == C); + col_vec result; + for (uint32_t r = 0; r < R; r++) + result[r] = b[r].dot(a); + return result; + } + + // Like transform() above, but the matrix is effectively transposed before the multiply. + // Last component of vec is assumed to be 0. + static inline col_vec transform_vector_transposed(const col_vec& a, const matrix& b) + { + static_assert(R == C); + col_vec result; + for (uint32_t r = 0; r < R; r++) + { + T s = 0; + for (uint32_t c = 0; c < (C - 1); c++) + s += b[r][c] * a[c]; + + result[r] = s; + } + return result; + } + + // This method transforms a vec by a matrix (D3D-style: row vector on left), but the matrix is effectively transposed before the multiply. + // Last component of vec is assumed to be 1. + static inline subcol_vec transform_point_transposed(const subcol_vec& a, const matrix& b) + { + static_assert(R == C); + subcol_vec result(0); + for (int r = 0; r < R; r++) + { + const T s = (r < subcol_vec::num_elements) ? a[r] : 1.0f; + for (int c = 0; c < (C - 1); c++) + result[c] += b[c][r] * s; + } + return result; + } + + // This method transforms a vec by a matrix (D3D-style: row vector on left), but the matrix is effectively transposed before the multiply. + // Last component of vec is assumed to be 0. 
+ static inline subcol_vec transform_vector_transposed(const subcol_vec& a, const matrix& b) + { + static_assert(R == C); + subcol_vec result(0); + for (int r = 0; r < static_cast(R - 1); r++) + { + const T s = a[r]; + for (int c = 0; c < static_cast(C - 1); c++) + result[c] += b[c][r] * s; + } + return result; + } + + // This method transforms a matrix by a vector (OGL style, col vector on the right). + // Note that the data type is named "row_vec", but mathematically it's actually written as a column vector (of size equal to the # matrix cols). + // RxC * Cx1 = Rx1 + // This dots against the matrix rows. + static inline col_vec transform(const matrix& b, const row_vec& a) + { + col_vec result; + for (int r = 0; r < static_cast(R); r++) + result[r] = b[r].dot(a); + return result; + } + + // This method transforms a matrix by a vector (OGL style, col vector on the right), except the matrix is effectively transposed before the multiply. + // Note that the data type is named "row_vec", but mathematically it's actually written as a column vector (of size equal to the # matrix cols). + // RxC * Cx1 = Rx1 + // This dots against the matrix cols. + static inline col_vec transform_transposed(const matrix& b, const row_vec& a) + { + static_assert(R == C); + row_vec result(b[0] * a[0]); + for (int r = 1; r < static_cast(R); r++) + result += b[r] * a[r]; + return col_vec(result); + } + + static inline matrix& mul_components(matrix& result, const matrix& lhs, const matrix& rhs) + { + for (uint32_t r = 0; r < R; r++) + result[r] = row_vec::mul_components(lhs[r], rhs[r]); + return result; + } + + static inline matrix& concat(matrix& lhs, const matrix& rhs) + { + return matrix_mul_helper(lhs, matrix(lhs), rhs); + } + + inline matrix& concat_in_place(const matrix& rhs) + { + return concat(*this, rhs); + } + + static inline matrix& multiply(matrix& result, const matrix& lhs, const matrix& rhs) + { + matrix temp; + matrix* pResult = ((&result == &lhs) || (&result == &rhs)) ? 
&temp : &result; + + matrix_mul_helper(*pResult, lhs, rhs); + if (pResult != &result) + result = *pResult; + + return result; + } + + static matrix make_zero_matrix() + { + matrix result; + result.clear(); + return result; + } + + static matrix make_identity_matrix() + { + matrix result; + result.set_identity_matrix(); + return result; + } + + static matrix make_translate_matrix(const row_vec& t) + { + return matrix(basisu::cIdentity).set_translate(t); + } + + static matrix make_translate_matrix(float x, float y) + { + return matrix(basisu::cIdentity).set_translate_matrix(x, y); + } + + static matrix make_translate_matrix(float x, float y, float z) + { + return matrix(basisu::cIdentity).set_translate_matrix(x, y, z); + } + + static inline matrix make_scale_matrix(float s) + { + return matrix().set_scale_matrix(s); + } + + static inline matrix make_scale_matrix(const row_vec& s) + { + return matrix().set_scale_matrix(s); + } + + static inline matrix make_scale_matrix(float x, float y) + { + static_assert(R >= 3 && C >= 3); + matrix result; + result.set_identity_matrix(); + result.m_rows[0][0] = x; + result.m_rows[1][1] = y; + return result; + } + + static inline matrix make_scale_matrix(float x, float y, float z) + { + static_assert(R >= 4 && C >= 4); + matrix result; + result.set_identity_matrix(); + result.m_rows[0][0] = x; + result.m_rows[1][1] = y; + result.m_rows[2][2] = z; + return result; + } + + // Helpers derived from Graphics Gems 1 and 2 (Matrices and Transformations, Ronald N. 
Goldman) + static matrix make_rotate_matrix(const vec<3, T>& axis, T ang) + { + static_assert(R >= 3 && C >= 3); + + vec<3, T> norm_axis(axis.get_normalized()); + + double cos_a = cos(ang); + double inv_cos_a = 1.0f - cos_a; + + double sin_a = sin(ang); + + const T x = norm_axis[0]; + const T y = norm_axis[1]; + const T z = norm_axis[2]; + + const double x2 = norm_axis[0] * norm_axis[0]; + const double y2 = norm_axis[1] * norm_axis[1]; + const double z2 = norm_axis[2] * norm_axis[2]; + + matrix result; + result.set_identity_matrix(); + + result[0][0] = (T)((inv_cos_a * x2) + cos_a); + result[1][0] = (T)((inv_cos_a * x * y) + (sin_a * z)); + result[2][0] = (T)((inv_cos_a * x * z) - (sin_a * y)); + + result[0][1] = (T)((inv_cos_a * x * y) - (sin_a * z)); + result[1][1] = (T)((inv_cos_a * y2) + cos_a); + result[2][1] = (T)((inv_cos_a * y * z) + (sin_a * x)); + + result[0][2] = (T)((inv_cos_a * x * z) + (sin_a * y)); + result[1][2] = (T)((inv_cos_a * y * z) - (sin_a * x)); + result[2][2] = (T)((inv_cos_a * z2) + cos_a); + + return result; + } + + static inline matrix make_rotate_matrix(T ang) + { + static_assert(R >= 2 && C >= 2); + + matrix ret(basisu::cIdentity); + + const T sin_a = static_cast(sin(ang)); + const T cos_a = static_cast(cos(ang)); + + ret[0][0] = +cos_a; + ret[0][1] = -sin_a; + ret[1][0] = +sin_a; + ret[1][1] = +cos_a; + + return ret; + } + + static inline matrix make_rotate_matrix(uint32_t axis, T ang) + { + vec<3, T> axis_vec; + axis_vec.clear(); + axis_vec[axis] = 1.0f; + return make_rotate_matrix(axis_vec, ang); + } + + static inline matrix make_cross_product_matrix(const vec<3, scalar_type>& c) + { + static_assert((num_rows >= 3) && (num_cols >= 3)); + matrix ret(basisu::cClear); + ret[0][1] = c[2]; + ret[0][2] = -c[1]; + ret[1][0] = -c[2]; + ret[1][2] = c[0]; + ret[2][0] = c[1]; + ret[2][1] = -c[0]; + return ret; + } + + static inline matrix make_reflection_matrix(const vec<4, scalar_type>& n, const vec<4, scalar_type>& q) + { + 
static_assert((num_rows == 4) && (num_cols == 4)); + matrix ret; + assert(n.is_vector() && q.is_vector()); + ret = make_identity_matrix() - 2.0f * make_tensor_product_matrix(n, n); + ret.set_translate((2.0f * q.dot(n) * n).as_point()); + return ret; + } + + static inline matrix make_tensor_product_matrix(const row_vec& v, const row_vec& w) + { + matrix ret; + for (int r = 0; r < num_rows; r++) + ret[r] = row_vec::mul_components(v.broadcast(r), w); + return ret; + } + + static inline matrix make_uniform_scaling_matrix(const vec<4, scalar_type>& q, scalar_type c) + { + static_assert((num_rows == 4) && (num_cols == 4)); + assert(q.is_vector()); + matrix ret; + ret = c * make_identity_matrix(); + ret.set_translate(((1.0f - c) * q).as_point()); + return ret; + } + + static inline matrix make_nonuniform_scaling_matrix(const vec<4, scalar_type>& q, scalar_type c, const vec<4, scalar_type>& w) + { + static_assert((num_rows == 4) && (num_cols == 4)); + assert(q.is_vector() && w.is_vector()); + matrix ret; + ret = make_identity_matrix() - (1.0f - c) * make_tensor_product_matrix(w, w); + ret.set_translate(((1.0f - c) * q.dot(w) * w).as_point()); + return ret; + } + + // n = normal of plane, q = point on plane + static inline matrix make_ortho_projection_matrix(const vec<4, scalar_type>& n, const vec<4, scalar_type>& q) + { + assert(n.is_vector() && q.is_vector()); + matrix ret; + ret = make_identity_matrix() - make_tensor_product_matrix(n, n); + ret.set_translate((q.dot(n) * n).as_point()); + return ret; + } + + static inline matrix make_parallel_projection(const vec<4, scalar_type>& n, const vec<4, scalar_type>& q, const vec<4, scalar_type>& w) + { + assert(n.is_vector() && q.is_vector() && w.is_vector()); + matrix ret; + ret = make_identity_matrix() - (make_tensor_product_matrix(n, w) / (w.dot(n))); + ret.set_translate(((q.dot(n) / w.dot(n)) * w).as_point()); + return ret; + } + + protected: + row_vec m_rows[R]; + + static T det_helper(const matrix& a, uint32_t n) + { + // 
Algorithm ported from Numerical Recipes in C. + T d; + matrix m; + if (n == 2) + d = a(0, 0) * a(1, 1) - a(1, 0) * a(0, 1); + else + { + d = 0; + for (uint32_t j1 = 1; j1 <= n; j1++) + { + for (uint32_t i = 2; i <= n; i++) + { + int j2 = 1; + for (uint32_t j = 1; j <= n; j++) + { + if (j != j1) + { + m(i - 2, j2 - 1) = a(i - 1, j - 1); + j2++; + } + } + } + d += (((1 + j1) & 1) ? -1.0f : 1.0f) * a(1 - 1, j1 - 1) * det_helper(m, n - 1); + } + } + return d; + } + }; + + typedef matrix<2, 2, float> matrix22F; + typedef matrix<2, 2, double> matrix22D; + + typedef matrix<3, 3, float> matrix33F; + typedef matrix<3, 3, double> matrix33D; + + typedef matrix<4, 4, float> matrix44F; + typedef matrix<4, 4, double> matrix44D; + + typedef matrix<8, 8, float> matrix88F; + + // These helpers create good old D3D-style matrices. + inline matrix44F matrix44F_make_perspective_offcenter_lh(float l, float r, float b, float t, float nz, float fz) + { + float two_nz = 2.0f * nz; + float one_over_width = 1.0f / (r - l); + float one_over_height = 1.0f / (t - b); + + matrix44F view_to_proj; + view_to_proj[0].set(two_nz * one_over_width, 0.0f, 0.0f, 0.0f); + view_to_proj[1].set(0.0f, two_nz * one_over_height, 0.0f, 0.0f); + view_to_proj[2].set(-(l + r) * one_over_width, -(t + b) * one_over_height, fz / (fz - nz), 1.0f); + view_to_proj[3].set(0.0f, 0.0f, -view_to_proj[2][2] * nz, 0.0f); + return view_to_proj; + } + + // fov_y: full Y field of view (radians) + // aspect: viewspace width/height + inline matrix44F matrix44F_make_perspective_fov_lh(float fov_y, float aspect, float nz, float fz) + { + double sin_fov = sin(0.5f * fov_y); + double cos_fov = cos(0.5f * fov_y); + + float y_scale = static_cast(cos_fov / sin_fov); + float x_scale = static_cast(y_scale / aspect); + + matrix44F view_to_proj; + view_to_proj[0].set(x_scale, 0, 0, 0); + view_to_proj[1].set(0, y_scale, 0, 0); + view_to_proj[2].set(0, 0, fz / (fz - nz), 1); + view_to_proj[3].set(0, 0, -nz * fz / (fz - nz), 0); + return 
view_to_proj; + } + + inline matrix44F matrix44F_make_ortho_offcenter_lh(float l, float r, float b, float t, float nz, float fz) + { + matrix44F view_to_proj; + view_to_proj[0].set(2.0f / (r - l), 0.0f, 0.0f, 0.0f); + view_to_proj[1].set(0.0f, 2.0f / (t - b), 0.0f, 0.0f); + view_to_proj[2].set(0.0f, 0.0f, 1.0f / (fz - nz), 0.0f); + view_to_proj[3].set((l + r) / (l - r), (t + b) / (b - t), nz / (nz - fz), 1.0f); + return view_to_proj; + } + + inline matrix44F matrix44F_make_ortho_lh(float w, float h, float nz, float fz) + { + return matrix44F_make_ortho_offcenter_lh(-w * .5f, w * .5f, -h * .5f, h * .5f, nz, fz); + } + + inline matrix44F matrix44F_make_projection_to_screen_d3d(int x, int y, int w, int h, float min_z, float max_z) + { + matrix44F proj_to_screen; + proj_to_screen[0].set(w * .5f, 0.0f, 0.0f, 0.0f); + proj_to_screen[1].set(0, h * -.5f, 0.0f, 0.0f); + proj_to_screen[2].set(0, 0.0f, max_z - min_z, 0.0f); + proj_to_screen[3].set(x + w * .5f, y + h * .5f, min_z, 1.0f); + return proj_to_screen; + } + + inline matrix44F matrix44F_make_lookat_lh(const vec3F& camera_pos, const vec3F& look_at, const vec3F& camera_up, float camera_roll_ang_in_radians) + { + vec4F col2(look_at - camera_pos); + assert(col2.is_vector()); + if (col2.normalize() == 0.0f) + col2.set(0, 0, 1, 0); + + vec4F col1(camera_up); + assert(col1.is_vector()); + if (!col2[0] && !col2[2]) + col1.set(-1.0f, 0.0f, 0.0f, 0.0f); + + if ((col1.dot(col2)) > .9999f) + col1.set(0.0f, 1.0f, 0.0f, 0.0f); + + vec4F col0(vec4F::cross3(col1, col2).normalize_in_place()); + col1 = vec4F::cross3(col2, col0).normalize_in_place(); + + matrix44F rotm(matrix44F::make_identity_matrix()); + rotm.set_col(0, col0); + rotm.set_col(1, col1); + rotm.set_col(2, col2); + return matrix44F::make_translate_matrix(-camera_pos[0], -camera_pos[1], -camera_pos[2]) * rotm * matrix44F::make_rotate_matrix(2, camera_roll_ang_in_radians); + } + + template R matrix_NxN_create_DCT() + { + assert(R::num_rows == R::num_cols); + + const 
uint32_t N = R::num_cols; + + R result; + for (uint32_t k = 0; k < N; k++) + { + for (uint32_t n = 0; n < N; n++) + { + double f; + + if (!k) + f = 1.0f / sqrt(float(N)); + else + f = sqrt(2.0f / float(N)) * cos((basisu::cPiD * (2.0f * float(n) + 1.0f) * float(k)) / (2.0f * float(N))); + + result(k, n) = static_cast(f); + } + } + + return result; + } + + template R matrix_NxN_DCT(const R& a, const R& dct) + { + R temp; + matrix_mul_helper(temp, dct, a); + R result; + matrix_mul_helper_transpose_rhs(result, temp, dct); + return result; + } + + template R matrix_NxN_IDCT(const R& b, const R& dct) + { + R temp; + matrix_mul_helper_transpose_lhs(temp, dct, b); + R result; + matrix_mul_helper(result, temp, dct); + return result; + } + + template matrix matrix_kronecker_product(const X& a, const Y& b) + { + matrix result; + + for (uint32_t r = 0; r < X::num_rows; r++) + { + for (uint32_t c = 0; c < X::num_cols; c++) + { + for (uint32_t i = 0; i < Y::num_rows; i++) + for (uint32_t j = 0; j < Y::num_cols; j++) + result(r * Y::num_rows + i, c * Y::num_cols + j) = a(r, c) * b(i, j); + } + } + + return result; + } + + template matrix matrix_combine_vertically(const X& a, const Y& b) + { + matrix result; + + for (uint32_t r = 0; r < X::num_rows; r++) + for (uint32_t c = 0; c < X::num_cols; c++) + result(r, c) = a(r, c); + + for (uint32_t r = 0; r < Y::num_rows; r++) + for (uint32_t c = 0; c < Y::num_cols; c++) + result(r + X::num_rows, c) = b(r, c); + + return result; + } + + inline matrix88F get_haar8() + { + matrix22F haar2( + 1, 1, + 1, -1); + matrix22F i2( + 1, 0, + 0, 1); + matrix44F i4( + 1, 0, 0, 0, + 0, 1, 0, 0, + 0, 0, 1, 0, + 0, 0, 0, 1); + + matrix<1, 2, float> b0; b0(0, 0) = 1; b0(0, 1) = 1; + matrix<1, 2, float> b1; b1(0, 0) = 1.0f; b1(0, 1) = -1.0f; + + matrix<2, 4, float> haar4_0 = matrix_kronecker_product(haar2, b0); + matrix<2, 4, float> haar4_1 = matrix_kronecker_product(i2, b1); + + matrix<4, 4, float> haar4 = matrix_combine_vertically(haar4_0, haar4_1); + + 
matrix<4, 8, float> haar8_0 = matrix_kronecker_product(haar4, b0); + matrix<4, 8, float> haar8_1 = matrix_kronecker_product(i4, b1); + + haar8_0[2] *= sqrtf(2); + haar8_0[3] *= sqrtf(2); + haar8_1 *= 2.0f; + + matrix<8, 8, float> haar8 = matrix_combine_vertically(haar8_0, haar8_1); + + return haar8; + } + + inline matrix44F get_haar4() + { + const float sqrt2 = 1.4142135623730951f; + + return matrix44F( + .5f * 1, .5f * 1, .5f * 1, .5f * 1, + .5f * 1, .5f * 1, .5f * -1, .5f * -1, + .5f * sqrt2, .5f * -sqrt2, 0, 0, + 0, 0, .5f * sqrt2, .5f * -sqrt2); + } + + template + inline matrix<2, 2, T> get_inverse_2x2(const matrix<2, 2, T>& m) + { + double a = m[0][0]; + double b = m[0][1]; + double c = m[1][0]; + double d = m[1][1]; + + double det = a * d - b * c; + if (det != 0.0f) + det = 1.0f / det; + + matrix<2, 2, T> result; + result[0][0] = static_cast(d * det); + result[0][1] = static_cast(-b * det); + result[1][0] = static_cast(-c * det); + result[1][1] = static_cast(a * det); + return result; + } + +} // namespace bu_math + +namespace basisu +{ + class tracked_stat + { + public: + tracked_stat() { clear(); } + + inline void clear() { m_num = 0; m_total = 0; m_total2 = 0; } + + inline void update(int32_t val) { m_num++; m_total += val; m_total2 += val * val; } + + inline tracked_stat& operator += (uint32_t val) { update(val); return *this; } + + inline uint32_t get_number_of_values() { return m_num; } + inline uint64_t get_total() const { return m_total; } + inline uint64_t get_total2() const { return m_total2; } + + inline float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }; + inline float get_std_dev() const { return m_num ? 
sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; } + inline float get_variance() const { float s = get_std_dev(); return s * s; } + + private: + uint32_t m_num; + int64_t m_total; + int64_t m_total2; + }; + + class tracked_stat_dbl + { + public: + tracked_stat_dbl() { clear(); } + + inline void clear() { m_num = 0; m_total = 0; m_total2 = 0; } + + inline void update(double val) { m_num++; m_total += val; m_total2 += val * val; } + + inline tracked_stat_dbl& operator += (double val) { update(val); return *this; } + + inline uint64_t get_number_of_values() { return m_num; } + inline double get_total() const { return m_total; } + inline double get_total2() const { return m_total2; } + + inline double get_average() const { return m_num ? m_total / (double)m_num : 0.0f; }; + inline double get_std_dev() const { return m_num ? sqrt((double)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; } + inline double get_variance() const { double s = get_std_dev(); return s * s; } + + private: + uint64_t m_num; + double m_total; + double m_total2; + }; + + template + struct stats + { + uint32_t m_n; + FloatType m_total, m_total_sq; // total, total of squares values + FloatType m_avg, m_avg_sq; // mean, mean of the squared values + FloatType m_rms; // sqrt(m_avg_sq) + FloatType m_std_dev, m_var; // population standard deviation and variance + FloatType m_mad; // mean absolute deviation + FloatType m_min, m_max, m_range; // min and max values, and max-min + FloatType m_len; // length of values as a vector (Euclidean norm or L2 norm) + FloatType m_coeff_of_var; // coefficient of variation (std_dev/mean), High CV: Indicates greater variability relative to the mean, meaning the data values are more spread out, + // Low CV : Indicates less variability relative to the mean, meaning the data values are more consistent. 
+ + FloatType m_skewness; // Skewness = 0: The data is perfectly symmetric around the mean, + // Skewness > 0: The data is positively skewed (right-skewed), + // Skewness < 0: The data is negatively skewed (left-skewed) + // 0-.5 approx. symmetry, .5-1 moderate skew, >= 1 highly skewed + + FloatType m_kurtosis; // Excess Kurtosis: Kurtosis = 0: The distribution has normal kurtosis (mesokurtic) + // Kurtosis > 0: The distribution is leptokurtic, with heavy tails and a sharp peak + // Kurtosis < 0: The distribution is platykurtic, with light tails and a flatter peak + + bool m_any_zero; + + FloatType m_median; + uint32_t m_median_index; + + stats() + { + clear(); + } + + void clear() + { + m_n = 0; + m_total = 0, m_total_sq = 0; + m_avg = 0, m_avg_sq = 0; + m_rms = 0; + m_std_dev = 0, m_var = 0; + m_mad = 0; + m_min = BIG_FLOAT_VAL, m_max = -BIG_FLOAT_VAL; m_range = 0.0f; + m_len = 0; + m_coeff_of_var = 0; + m_skewness = 0; + m_kurtosis = 0; + m_any_zero = false; + + m_median = 0; + m_median_index = 0; + } + + template + void calc_median(uint32_t n, const T* pVals, uint32_t stride = 1) + { + m_median = 0; + m_median_index = 0; + + if (!n) + return; + + basisu::vector< std::pair > vals(n); + + for (uint32_t i = 0; i < n; i++) + { + vals[i].first = pVals[i * stride]; + vals[i].second = i; + } + + std::sort(vals.begin(), vals.end(), [](const std::pair& a, const std::pair& b) { + return a.first < b.first; + }); + + m_median = vals[n / 2].first; + if ((n & 1) == 0) + m_median = (m_median + vals[(n / 2) - 1].first) * .5f; + + m_median_index = vals[n / 2].second; + } + + template + void calc(uint32_t n, const T* pVals, uint32_t stride = 1, bool calc_median_flag = false) + { + clear(); + + if (!n) + return; + + if (calc_median_flag) + calc_median(n, pVals, stride); + + m_n = n; + + for (uint32_t i = 0; i < n; i++) + { + FloatType v = (FloatType)pVals[i * stride]; + + if (v == 0.0f) + m_any_zero = true; + + m_total += v; + m_total_sq += v * v; + + if (!i) + { + m_min = v; + 
m_max = v; + } + else + { + m_min = minimum(m_min, v); + m_max = maximum(m_max, v); + } + } + + m_range = m_max - m_min; + + m_len = sqrt(m_total_sq); + + const FloatType nd = (FloatType)n; + + m_avg = m_total / nd; + m_avg_sq = m_total_sq / nd; + m_rms = sqrt(m_avg_sq); + + for (uint32_t i = 0; i < n; i++) + { + FloatType v = (FloatType)pVals[i * stride]; + FloatType d = v - m_avg; + + const FloatType d2 = d * d; + const FloatType d3 = d2 * d; + const FloatType d4 = d3 * d; + + m_var += d2; + m_mad += fabs(d); + m_skewness += d3; + m_kurtosis += d4; + } + + m_var /= nd; + m_mad /= nd; + + m_std_dev = sqrt(m_var); + + m_coeff_of_var = (m_avg != 0.0f) ? (m_std_dev / fabs(m_avg)) : 0.0f; + + FloatType k3 = m_std_dev * m_std_dev * m_std_dev; + FloatType k4 = k3 * m_std_dev; + m_skewness = (k3 != 0.0f) ? ((m_skewness / nd) / k3) : 0.0f; + m_kurtosis = (k4 != 0.0f) ? (((m_kurtosis / nd) / k4) - 3.0f) : 0.0f; + } + + // Only compute average, variance and standard deviation. + template + void calc_simplified(uint32_t n, const T* pVals, uint32_t stride = 1) + { + clear(); + + if (!n) + return; + + m_n = n; + + for (uint32_t i = 0; i < n; i++) + { + FloatType v = (FloatType)pVals[i * stride]; + + m_total += v; + } + + const FloatType nd = (FloatType)n; + + m_avg = m_total / nd; + + for (uint32_t i = 0; i < n; i++) + { + FloatType v = (FloatType)pVals[i * stride]; + FloatType d = v - m_avg; + + const FloatType d2 = d * d; + + m_var += d2; + } + + m_var /= nd; + m_std_dev = sqrt(m_var); + } + }; + + template + struct comparative_stats + { + FloatType m_cov; // covariance + FloatType m_pearson; // Pearson Correlation Coefficient (r) [-1,1] + FloatType m_mse; // mean squared error + FloatType m_rmse; // root mean squared error + FloatType m_mae; // mean abs error + FloatType m_rmsle; // root mean squared log error + FloatType m_euclidean_dist; // euclidean distance between values as vectors + FloatType m_cosine_sim; // normalized dot products of values as vectors + FloatType 
m_min_diff, m_max_diff; // minimum/maximum abs difference between values + + comparative_stats() + { + clear(); + } + + void clear() + { + m_cov = 0; + m_pearson = 0; + m_mse = 0; + m_rmse = 0; + m_mae = 0; + m_rmsle = 0; + m_euclidean_dist = 0; + m_cosine_sim = 0; + m_min_diff = 0; + m_max_diff = 0; + } + + template + void calc(uint32_t n, const T* pA, const T* pB, uint32_t a_stride = 1, uint32_t b_stride = 1, const stats *pA_stats = nullptr, const stats *pB_stats = nullptr) + { + clear(); + if (!n) + return; + + stats temp_a_stats; + if (!pA_stats) + { + pA_stats = &temp_a_stats; + temp_a_stats.calc(n, pA, a_stride); + } + + stats temp_b_stats; + if (!pB_stats) + { + pB_stats = &temp_b_stats; + temp_b_stats.calc(n, pB, b_stride); + } + + for (uint32_t i = 0; i < n; i++) + { + const FloatType fa = (FloatType)pA[i * a_stride]; + const FloatType fb = (FloatType)pB[i * b_stride]; + + if ((pA_stats->m_min >= 0.0f) && (pB_stats->m_min >= 0.0f)) + { + const FloatType ld = log(fa + 1.0f) - log(fb + 1.0f); + m_rmsle += ld * ld; + } + + const FloatType diff = fa - fb; + const FloatType abs_diff = fabs(diff); + + m_mse += diff * diff; + m_mae += abs_diff; + + m_min_diff = i ? 
minimum(m_min_diff, abs_diff) : abs_diff; + m_max_diff = maximum(m_max_diff, abs_diff); + + const FloatType da = fa - pA_stats->m_avg; + const FloatType db = fb - pB_stats->m_avg; + m_cov += da * db; + + m_cosine_sim += fa * fb; + } + + const FloatType nd = (FloatType)n; + + m_euclidean_dist = sqrt(m_mse); + + m_mse /= nd; + m_rmse = sqrt(m_mse); + + m_mae /= nd; + + m_cov /= nd; + + FloatType dv = (pA_stats->m_std_dev * pB_stats->m_std_dev); + if (dv != 0.0f) + m_pearson = m_cov / dv; + + if ((pA_stats->m_min >= 0.0) && (pB_stats->m_min >= 0.0f)) + m_rmsle = sqrt(m_rmsle / nd); + + FloatType c = pA_stats->m_len * pB_stats->m_len; + if (c != 0.0f) + m_cosine_sim /= c; + else + m_cosine_sim = 0.0f; + } + + // Only computes Pearson, cov, mse, rmse, Euclidean distance + template + void calc_pearson(uint32_t n, const T* pA, const T* pB, uint32_t a_stride = 1, uint32_t b_stride = 1, const stats* pA_stats = nullptr, const stats* pB_stats = nullptr) + { + clear(); + if (!n) + return; + + stats temp_a_stats; + if (!pA_stats) + { + pA_stats = &temp_a_stats; + temp_a_stats.calc(n, pA, a_stride); + } + + stats temp_b_stats; + if (!pB_stats) + { + pB_stats = &temp_b_stats; + temp_b_stats.calc(n, pB, b_stride); + } + + for (uint32_t i = 0; i < n; i++) + { + const FloatType fa = (FloatType)pA[i * a_stride]; + const FloatType fb = (FloatType)pB[i * b_stride]; + + const FloatType diff = fa - fb; + + m_mse += diff * diff; + + const FloatType da = fa - pA_stats->m_avg; + const FloatType db = fb - pB_stats->m_avg; + m_cov += da * db; + } + + const FloatType nd = (FloatType)n; + + m_euclidean_dist = sqrt(m_mse); + + m_mse /= nd; + m_rmse = sqrt(m_mse); + + m_cov /= nd; + + FloatType dv = (pA_stats->m_std_dev * pB_stats->m_std_dev); + if (dv != 0.0f) + m_pearson = m_cov / dv; + } + + // Only computes MSE, RMSE, eclidiean distance, and covariance. 
+ template + void calc_simplified(uint32_t n, const T* pA, const T* pB, uint32_t a_stride = 1, uint32_t b_stride = 1, const stats* pA_stats = nullptr, const stats* pB_stats = nullptr) + { + clear(); + if (!n) + return; + + stats temp_a_stats; + if (!pA_stats) + { + pA_stats = &temp_a_stats; + temp_a_stats.calc(n, pA, a_stride); + } + + stats temp_b_stats; + if (!pB_stats) + { + pB_stats = &temp_b_stats; + temp_b_stats.calc(n, pB, b_stride); + } + + for (uint32_t i = 0; i < n; i++) + { + const FloatType fa = (FloatType)pA[i * a_stride]; + const FloatType fb = (FloatType)pB[i * b_stride]; + + const FloatType diff = fa - fb; + + m_mse += diff * diff; + + const FloatType da = fa - pA_stats->m_avg; + const FloatType db = fb - pB_stats->m_avg; + m_cov += da * db; + } + + const FloatType nd = (FloatType)n; + + m_euclidean_dist = sqrt(m_mse); + + m_mse /= nd; + m_rmse = sqrt(m_mse); + + m_cov /= nd; + } + + // Only computes covariance. + template + void calc_cov(uint32_t n, const T* pA, const T* pB, uint32_t a_stride = 1, uint32_t b_stride = 1, const stats* pA_stats = nullptr, const stats* pB_stats = nullptr) + { + clear(); + if (!n) + return; + + stats temp_a_stats; + if (!pA_stats) + { + pA_stats = &temp_a_stats; + temp_a_stats.calc(n, pA, a_stride); + } + + stats temp_b_stats; + if (!pB_stats) + { + pB_stats = &temp_b_stats; + temp_b_stats.calc(n, pB, b_stride); + } + + for (uint32_t i = 0; i < n; i++) + { + const FloatType fa = (FloatType)pA[i * a_stride]; + const FloatType fb = (FloatType)pB[i * b_stride]; + + const FloatType da = fa - pA_stats->m_avg; + const FloatType db = fb - pB_stats->m_avg; + m_cov += da * db; + } + + const FloatType nd = (FloatType)n; + + m_cov /= nd; + } + }; + + class stat_history + { + public: + stat_history(uint32_t size) + { + init(size); + } + + void init(uint32_t size) + { + clear(); + + m_samples.reserve(size); + m_samples.resize(0); + m_max_samples = size; + } + + inline void clear() + { + m_samples.resize(0); + m_max_samples = 0; + } 
+ + inline void update(double val) + { + m_samples.push_back(val); + + if (m_samples.size() > m_max_samples) + m_samples.erase_index(0); + } + + inline size_t size() + { + return m_samples.size(); + } + + struct stats + { + double m_avg = 0; + double m_std_dev = 0; + double m_var = 0; + double m_mad = 0; + double m_min_val = 0; + double m_max_val = 0; + + void clear() + { + basisu::clear_obj(*this); + } + }; + + inline void get_stats(stats& s) + { + s.clear(); + + if (m_samples.empty()) + return; + + double total = 0, total2 = 0; + + for (size_t i = 0; i < m_samples.size(); i++) + { + const double v = m_samples[i]; + + total += v; + total2 += v * v; + + if (!i) + { + s.m_min_val = v; + s.m_max_val = v; + } + else + { + s.m_min_val = basisu::minimum(s.m_min_val, v); + s.m_max_val = basisu::maximum(s.m_max_val, v); + } + } + + const double n = (double)m_samples.size(); + + s.m_avg = total / n; + s.m_std_dev = sqrt((n * total2 - total * total)) / n; + s.m_var = (n * total2 - total * total) / (n * n); + + double sc = 0; + for (size_t i = 0; i < m_samples.size(); i++) + { + const double v = m_samples[i]; + s.m_mad += fabs(v - s.m_avg); + + sc += basisu::square(v - s.m_avg); + } + sc = sqrt(sc / n); + + s.m_mad /= n; + } + + private: + uint32_t m_max_samples; + basisu::vector m_samples; + }; + + // bfloat16 helpers, see: + // https://en.wikipedia.org/wiki/Bfloat16_floating-point_format + + typedef union + { + uint32_t u; + float f; + } float32_union; + + typedef uint16_t bfloat16; + + inline float bfloat16_to_float(bfloat16 bfloat16) + { + float32_union float_union; + float_union.u = ((uint32_t)bfloat16) << 16; + return float_union.f; + } + + inline bfloat16 float_to_bfloat16(float input, bool round_flag = true) + { + float32_union float_union; + float_union.f = input; + + uint32_t exponent = (float_union.u >> 23) & 0xFF; + + // Check if the number is denormalized in float32 (exponent == 0) + if (exponent == 0) + { + // Handle denormalized float32 as zero in bfloat16 + 
return 0x0000; + } + + // Extract the top 16 bits (sign, exponent, and 7 most significant bits of the mantissa) + uint32_t upperBits = float_union.u >> 16; + + if (round_flag) + { + // Check the most significant bit of the lower 16 bits for rounding + uint32_t lowerBits = float_union.u & 0xFFFF; + + // Round to nearest or even + if ((lowerBits & 0x8000) && + ((lowerBits > 0x8000) || ((lowerBits == 0x8000) && (upperBits & 1))) + ) + { + // Round up + upperBits += 1; + + // Check for overflow in the exponent after rounding up + if (((upperBits & 0x7F80) == 0x7F80) && ((upperBits & 0x007F) == 0)) + { + // Exponent overflow (the upper bits became all 1s) + // Set the result to infinity + upperBits = (upperBits & 0x8000) | 0x7F80; // Preserve the sign bit, set exponent to 0xFF, and mantissa to 0 + } + } + } + + return (bfloat16)upperBits; + } + + inline int bfloat16_get_exp(bfloat16 v) + { + return (int)((v >> 7) & 0xFF) - 127; + } + + inline int bfloat16_get_mantissa(bfloat16 v) + { + return (v & 0x7F); + } + + inline int bfloat16_get_sign(bfloat16 v) + { + return (v & 0x8000) ? -1 : 1; + } + + inline bool bfloat16_is_nan_or_inf(bfloat16 v) + { + return ((v >> 7) & 0xFF) == 0xFF; + } + + inline bool bfloat16_is_zero(bfloat16 v) + { + return (v & 0x7FFF) == 0; + } + + inline bfloat16 bfloat16_init(int sign, int exp, int mant) + { + uint16_t res = (sign < 0) ? 
0x8000 : 0; + + assert((exp >= -126) && (res <= 127)); + res |= ((exp + 127) << 7); + + assert((mant >= 0) && (mant < 128)); + res |= mant; + + return res; + } + + +} // namespace basisu + diff --git a/thirdparty/basis_universal/encoder/basisu_opencl.cpp b/thirdparty/basis_universal/encoder/basisu_opencl.cpp index e0611c18eefb..200cff50788a 100644 --- a/thirdparty/basis_universal/encoder/basisu_opencl.cpp +++ b/thirdparty/basis_universal/encoder/basisu_opencl.cpp @@ -789,7 +789,7 @@ namespace basisu struct opencl_context { - uint32_t m_ocl_total_pixel_blocks; + size_t m_ocl_total_pixel_blocks; cl_mem m_ocl_pixel_blocks; cl_command_queue m_command_queue; @@ -907,7 +907,7 @@ namespace basisu }; #pragma pack(pop) - bool opencl_set_pixel_blocks(opencl_context_ptr pContext, uint32_t total_blocks, const cl_pixel_block* pPixel_blocks) + bool opencl_set_pixel_blocks(opencl_context_ptr pContext, size_t total_blocks, const cl_pixel_block* pPixel_blocks) { if (!opencl_is_available()) return false; @@ -938,9 +938,11 @@ namespace basisu assert(pContext->m_ocl_pixel_blocks); if (!pContext->m_ocl_pixel_blocks) return false; + + assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX); cl_encode_etc1s_param_struct ps; - ps.m_total_blocks = pContext->m_ocl_total_pixel_blocks; + ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks; ps.m_perceptual = perceptual; ps.m_total_perms = total_perms; @@ -1062,9 +1064,11 @@ namespace basisu assert(pContext->m_ocl_pixel_blocks); if (!pContext->m_ocl_pixel_blocks) return false; + + assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX); cl_rec_param_struct ps; - ps.m_total_blocks = pContext->m_ocl_total_pixel_blocks; + ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks; ps.m_perceptual = perceptual; bool status = false; @@ -1118,8 +1122,10 @@ namespace basisu if (!pContext->m_ocl_pixel_blocks) return false; + assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX); + fosc_param_struct ps; - ps.m_total_blocks = 
pContext->m_ocl_total_pixel_blocks; + ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks; ps.m_perceptual = perceptual; bool status = false; @@ -1170,8 +1176,10 @@ namespace basisu if (!pContext->m_ocl_pixel_blocks) return false; + assert(pContext->m_ocl_total_pixel_blocks <= INT_MAX); + ds_param_struct ps; - ps.m_total_blocks = pContext->m_ocl_total_pixel_blocks; + ps.m_total_blocks = (int)pContext->m_ocl_total_pixel_blocks; ps.m_perceptual = perceptual; bool status = false; @@ -1232,7 +1240,7 @@ namespace basisu BASISU_NOTE_UNUSED(context); } - bool opencl_set_pixel_blocks(opencl_context_ptr pContext, uint32_t total_blocks, const cl_pixel_block* pPixel_blocks) + bool opencl_set_pixel_blocks(opencl_context_ptr pContext, size_t total_blocks, const cl_pixel_block* pPixel_blocks) { BASISU_NOTE_UNUSED(pContext); BASISU_NOTE_UNUSED(total_blocks); diff --git a/thirdparty/basis_universal/encoder/basisu_opencl.h b/thirdparty/basis_universal/encoder/basisu_opencl.h index 2546a18dabbe..b44f288b7f94 100644 --- a/thirdparty/basis_universal/encoder/basisu_opencl.h +++ b/thirdparty/basis_universal/encoder/basisu_opencl.h @@ -43,7 +43,7 @@ namespace basisu // Must match BASISU_ETC1_CLUSTER_FIT_ORDER_TABLE_SIZE const uint32_t OPENCL_ENCODE_ETC1S_MAX_PERMS = 165; - bool opencl_set_pixel_blocks(opencl_context_ptr pContext, uint32_t total_blocks, const cl_pixel_block* pPixel_blocks); + bool opencl_set_pixel_blocks(opencl_context_ptr pContext, size_t total_blocks, const cl_pixel_block* pPixel_blocks); bool opencl_encode_etc1s_blocks(opencl_context_ptr pContext, etc_block* pOutput_blocks, bool perceptual, uint32_t total_perms); diff --git a/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp index 46cd837376ee..11c7ec2f60ae 100644 --- a/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp +++ b/thirdparty/basis_universal/encoder/basisu_resample_filters.cpp @@ -20,8 +20,7 @@ namespace basisu 
{ -#define BOX_FILTER_SUPPORT (0.5f) - static float box_filter(float t) /* pulse/Fourier window */ + float box_filter(float t) /* pulse/Fourier window */ { // make_clist() calls the filter function with t inverted (pos = left, neg = right) if ((t >= -0.5f) && (t < 0.5f)) @@ -29,9 +28,8 @@ namespace basisu else return 0.0f; } - -#define TENT_FILTER_SUPPORT (1.0f) - static float tent_filter(float t) /* box (*) box, bilinear/triangle */ + + float tent_filter(float t) /* box (*) box, bilinear/triangle */ { if (t < 0.0f) t = -t; @@ -42,8 +40,7 @@ namespace basisu return 0.0f; } -#define BELL_SUPPORT (1.5f) - static float bell_filter(float t) /* box (*) box (*) box */ + float bell_filter(float t) /* box (*) box (*) box */ { if (t < 0.0f) t = -t; @@ -201,13 +198,12 @@ namespace basisu return (0.0f); } -#define GAUSSIAN_SUPPORT (1.25f) - static float gaussian_filter(float t) // with blackman window + float gaussian_filter(float t) // with blackman window { if (t < 0) t = -t; - if (t < GAUSSIAN_SUPPORT) - return clean(exp(-2.0f * t * t) * sqrt(2.0f / M_PI) * blackman_exact_window(t / GAUSSIAN_SUPPORT)); + if (t < BASISU_GAUSSIAN_FILTER_SUPPORT) + return clean(exp(-2.0f * t * t) * sqrt(2.0f / M_PI) * blackman_exact_window(t / BASISU_GAUSSIAN_FILTER_SUPPORT)); else return 0.0f; } @@ -310,9 +306,9 @@ namespace basisu const resample_filter g_resample_filters[] = { - { "box", box_filter, BOX_FILTER_SUPPORT }, - { "tent", tent_filter, TENT_FILTER_SUPPORT }, - { "bell", bell_filter, BELL_SUPPORT }, + { "box", box_filter, BASISU_BOX_FILTER_SUPPORT }, + { "tent", tent_filter, BASISU_TENT_FILTER_SUPPORT }, + { "bell", bell_filter, BASISU_BELL_FILTER_SUPPORT }, { "b-spline", B_spline_filter, B_SPLINE_SUPPORT }, { "mitchell", mitchell_filter, MITCHELL_SUPPORT }, { "blackman", blackman_filter, BLACKMAN_SUPPORT }, @@ -321,7 +317,7 @@ namespace basisu { "lanczos6", lanczos6_filter, LANCZOS6_SUPPORT }, { "lanczos12", lanczos12_filter, LANCZOS12_SUPPORT }, { "kaiser", kaiser_filter, 
KAISER_SUPPORT }, - { "gaussian", gaussian_filter, GAUSSIAN_SUPPORT }, + { "gaussian", gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT }, { "catmullrom", catmull_rom_filter, CATMULL_ROM_SUPPORT }, { "quadratic_interp", quadratic_interp_filter, QUADRATIC_SUPPORT }, { "quadratic_approx", quadratic_approx_filter, QUADRATIC_SUPPORT }, diff --git a/thirdparty/basis_universal/encoder/basisu_resampler.cpp b/thirdparty/basis_universal/encoder/basisu_resampler.cpp index a00c63335d09..fa06298528ba 100644 --- a/thirdparty/basis_universal/encoder/basisu_resampler.cpp +++ b/thirdparty/basis_universal/encoder/basisu_resampler.cpp @@ -573,7 +573,7 @@ namespace basisu /* Don't deallocate a contibutor list * if the user passed us one of their own. - */ + */ if ((m_Pclist_x) && (!m_clist_x_forced)) { diff --git a/thirdparty/basis_universal/encoder/basisu_resampler.h b/thirdparty/basis_universal/encoder/basisu_resampler.h index ac1ef73d7f3e..fc1918ec854e 100644 --- a/thirdparty/basis_universal/encoder/basisu_resampler.h +++ b/thirdparty/basis_universal/encoder/basisu_resampler.h @@ -113,6 +113,8 @@ namespace basisu Resample_Real filter_scale, Resample_Real src_ofs); + static void free_clist(Contrib_List* p) { if (p) { free(p->p); free(p); } } + private: Resampler(); Resampler(const Resampler &o); diff --git a/thirdparty/basis_universal/encoder/basisu_resampler_filters.h b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h index 4d66ac2c7031..c96416b1aa7e 100644 --- a/thirdparty/basis_universal/encoder/basisu_resampler_filters.h +++ b/thirdparty/basis_universal/encoder/basisu_resampler_filters.h @@ -29,6 +29,18 @@ namespace basisu extern const resample_filter g_resample_filters[]; extern const int g_num_resample_filters; + + const float BASISU_BOX_FILTER_SUPPORT = 0.5f; + float box_filter(float t); /* pulse/Fourier window */ + + const float BASISU_TENT_FILTER_SUPPORT = 1.0f; + float tent_filter(float t); /* box (*) box, bilinear/triangle */ + + const float 
BASISU_GAUSSIAN_FILTER_SUPPORT = 1.25f; + float gaussian_filter(float t); // with blackman window + + const float BASISU_BELL_FILTER_SUPPORT = 1.5f; + float bell_filter(float t); /* box (*) box (*) box */ int find_resample_filter(const char *pName); diff --git a/thirdparty/basis_universal/encoder/basisu_ssim.cpp b/thirdparty/basis_universal/encoder/basisu_ssim.cpp index 608ce937fcdd..4cdf3a48d68f 100644 --- a/thirdparty/basis_universal/encoder/basisu_ssim.cpp +++ b/thirdparty/basis_universal/encoder/basisu_ssim.cpp @@ -91,6 +91,8 @@ namespace basisu void gaussian_filter(imagef &dst, const imagef &orig_img, uint32_t odd_filter_width, float sigma_sqr, bool wrapping, uint32_t width_divisor, uint32_t height_divisor) { + assert(&dst != &orig_img); + assert(odd_filter_width && (odd_filter_width & 1)); odd_filter_width |= 1; diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp index 51f6e979d458..7e0a2b1df36f 100644 --- a/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_uastc_enc.cpp @@ -3813,32 +3813,7 @@ namespace basisu return hash_hsieh((const uint8_t*)&s, sizeof(s)); } }; - - class tracked_stat - { - public: - tracked_stat() { clear(); } - - void clear() { m_num = 0; m_total = 0; m_total2 = 0; } - - void update(uint32_t val) { m_num++; m_total += val; m_total2 += val * val; } - - tracked_stat& operator += (uint32_t val) { update(val); return *this; } - - uint32_t get_number_of_values() { return m_num; } - uint64_t get_total() const { return m_total; } - uint64_t get_total2() const { return m_total2; } - - float get_average() const { return m_num ? (float)m_total / m_num : 0.0f; }; - float get_std_dev() const { return m_num ? 
sqrtf((float)(m_num * m_total2 - m_total * m_total)) / m_num : 0.0f; } - float get_variance() const { float s = get_std_dev(); return s * s; } - - private: - uint32_t m_num; - uint64_t m_total; - uint64_t m_total2; - }; - + static bool uastc_rdo_blocks(uint32_t first_index, uint32_t last_index, basist::uastc_block* pBlocks, const color_rgba* pBlock_pixels, const uastc_rdo_params& params, uint32_t flags, uint32_t &total_skipped, uint32_t &total_refined, uint32_t &total_modified, uint32_t &total_smooth) { @@ -4150,9 +4125,7 @@ namespace basisu const uint32_t first_index = block_index_iter; const uint32_t last_index = minimum(num_blocks, block_index_iter + blocks_per_job); -#ifndef __EMSCRIPTEN__ pJob_pool->add_job([first_index, last_index, pBlocks, pBlock_pixels, ¶ms, flags, &total_skipped, &total_modified, &total_refined, &total_smooth, &all_succeeded, &stat_mutex] { -#endif uint32_t job_skipped = 0, job_modified = 0, job_refined = 0, job_smooth = 0; @@ -4168,16 +4141,12 @@ namespace basisu total_smooth += job_smooth; } -#ifndef __EMSCRIPTEN__ } ); -#endif } // block_index_iter -#ifndef __EMSCRIPTEN__ pJob_pool->wait_for_all(); -#endif status = all_succeeded; } diff --git a/thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.cpp b/thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.cpp new file mode 100644 index 000000000000..dd9d6fbb369c --- /dev/null +++ b/thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.cpp @@ -0,0 +1,1277 @@ +// basisu_uastc_hdr_4x4_enc.cpp +#include "basisu_uastc_hdr_4x4_enc.h" +#include "../transcoder/basisu_transcoder.h" + +using namespace basist; + +namespace basisu +{ + +const uint32_t UHDR_MODE11_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, UHDR_MODE11_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS; +const uint32_t UHDR_MODE7_PART1_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, UHDR_MODE7_PART1_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS; +const uint32_t UHDR_MODE7_PART2_FIRST_ISE_RANGE = 
astc_helpers::BISE_3_LEVELS, UHDR_MODE7_PART2_LAST_ISE_RANGE = astc_helpers::BISE_8_LEVELS; +const uint32_t UHDR_MODE11_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, UHDR_MODE11_PART2_LAST_ISE_RANGE = astc_helpers::BISE_4_LEVELS; + +uastc_hdr_4x4_codec_options::uastc_hdr_4x4_codec_options() : + astc_hdr_codec_base_options() +{ + init(); +} + +void uastc_hdr_4x4_codec_options::init() +{ + astc_hdr_codec_base_options::init(); + + // This was the log bias we used on the initial release. It's too low. + //m_q_log_bias = Q_LOG_BIAS_4x4; + + m_q_log_bias = Q_LOG_BIAS_6x6; + + m_bc6h_err_weight = .85f; + +#if 0 + // HACK HACK + m_disable_weight_plane_optimization = true; + m_take_first_non_clamping_mode11_submode = false; + m_take_first_non_clamping_mode7_submode = false; +#endif + + // Must set the quality level at least once to reset this struct. + set_quality_level(cDefaultLevel); +} + +void uastc_hdr_4x4_codec_options::set_quality_best() +{ + // highest achievable quality + m_mode11_direct_only = false; + + m_use_solid = true; + + m_use_mode11_part1 = true; + m_mode11_uber_mode = true; + m_first_mode11_weight_ise_range = UHDR_MODE11_FIRST_ISE_RANGE; + m_last_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE; + m_first_mode11_submode = -1; + m_last_mode11_submode = 7; + + m_use_mode7_part1 = true; + m_first_mode7_part1_weight_ise_range = UHDR_MODE7_PART1_FIRST_ISE_RANGE; + m_last_mode7_part1_weight_ise_range = UHDR_MODE7_PART1_LAST_ISE_RANGE; + m_mode7_full_s_optimization = true; + + m_use_mode7_part2 = true; + m_mode7_part2_part_masks = UINT32_MAX; + m_first_mode7_part2_weight_ise_range = UHDR_MODE7_PART2_FIRST_ISE_RANGE; + m_last_mode7_part2_weight_ise_range = UHDR_MODE7_PART2_LAST_ISE_RANGE; + + m_use_mode11_part2 = true; + m_mode11_part2_part_masks = UINT32_MAX; + m_first_mode11_part2_weight_ise_range = UHDR_MODE11_PART2_FIRST_ISE_RANGE; + m_last_mode11_part2_weight_ise_range = UHDR_MODE11_PART2_LAST_ISE_RANGE; + + m_refine_weights = true; + + 
m_use_estimated_partitions = false; + m_max_estimated_partitions = 0; +} + +void uastc_hdr_4x4_codec_options::set_quality_normal() +{ + m_use_solid = true; + + // We'll allow uber mode in normal if the user allows it. + m_use_mode11_part1 = true; + m_mode11_uber_mode = true; + m_first_mode11_weight_ise_range = 6; + m_last_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE; + + m_use_mode7_part1 = true; + m_first_mode7_part1_weight_ise_range = UHDR_MODE7_PART1_LAST_ISE_RANGE; + m_last_mode7_part1_weight_ise_range = UHDR_MODE7_PART1_LAST_ISE_RANGE; + + m_use_mode7_part2 = true; + m_mode7_part2_part_masks = UINT32_MAX; + m_first_mode7_part2_weight_ise_range = UHDR_MODE7_PART2_LAST_ISE_RANGE; + m_last_mode7_part2_weight_ise_range = UHDR_MODE7_PART2_LAST_ISE_RANGE; + + m_use_mode11_part2 = true; + m_mode11_part2_part_masks = UINT32_MAX; + m_first_mode11_part2_weight_ise_range = UHDR_MODE11_PART2_LAST_ISE_RANGE; + m_last_mode11_part2_weight_ise_range = UHDR_MODE11_PART2_LAST_ISE_RANGE; + + m_refine_weights = true; +} + +void uastc_hdr_4x4_codec_options::set_quality_fastest() +{ + m_use_solid = true; + + m_use_mode11_part1 = true; + m_mode11_uber_mode = false; + m_first_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE; + m_last_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE; + + m_use_mode7_part1 = false; + m_mode7_full_s_optimization = false; + + m_use_mode7_part2 = false; + m_use_mode11_part2 = false; + + m_refine_weights = false; +} + +void uastc_hdr_4x4_codec_options::set_quality_level(int level) +{ + level = clamp(level, cMinLevel, cMaxLevel); + + m_level = level; + + // First ensure all options are set to best. 
+ set_quality_best(); + + switch (level) + { + case 0: + { + set_quality_fastest(); + break; + } + case 1: + { + set_quality_normal(); + + m_first_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE - 1; + m_last_mode11_weight_ise_range = UHDR_MODE11_LAST_ISE_RANGE; + + m_use_mode7_part1 = false; + m_mode7_full_s_optimization = false; + m_use_mode7_part2 = false; + + m_use_estimated_partitions = true; + m_max_estimated_partitions = 1; + + m_mode11_part2_part_masks = 1 | 2; + m_mode7_part2_part_masks = 1 | 2; + + // TODO: Disabling this hurts BC6H quality, but significantly speeds up compression. + //m_refine_weights = false; + break; + } + case 2: + { + set_quality_normal(); + + m_use_estimated_partitions = true; + m_max_estimated_partitions = 2; + + m_mode11_part2_part_masks = 1 | 2; + m_mode7_part2_part_masks = 1 | 2; + + break; + } + case 3: + { + m_use_estimated_partitions = true; + m_max_estimated_partitions = 2; + + m_mode11_part2_part_masks = 1 | 2 | 4 | 8; + m_mode7_part2_part_masks = 1 | 2 | 4 | 8; + + break; + } + default: + { + // best options already set + break; + } + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static bool pack_solid(const vec4F* pBlock_linear_colors, basisu::vector& all_results, const uastc_hdr_4x4_codec_options& coptions) +{ + float r = 0.0f, g = 0.0f, b = 0.0f; + + const float LOG_BIAS = .125f; + + bool solid_block = true; + for (uint32_t i = 0; i < 16; i++) + { + if ((pBlock_linear_colors[0][0] != pBlock_linear_colors[i][0]) || + (pBlock_linear_colors[0][1] != pBlock_linear_colors[i][1]) || + (pBlock_linear_colors[0][2] != pBlock_linear_colors[i][2])) + { + solid_block = false; + } + + r += log2f(pBlock_linear_colors[i][0] + LOG_BIAS); + g += log2f(pBlock_linear_colors[i][1] + LOG_BIAS); + b += log2f(pBlock_linear_colors[i][2] + LOG_BIAS); + } + + if (solid_block) + { + r = pBlock_linear_colors[0][0]; + g = pBlock_linear_colors[0][1]; + b = 
pBlock_linear_colors[0][2]; + } + else + { + r = maximum(0.0f, powf(2.0f, r * (1.0f / 16.0f)) - LOG_BIAS); + g = maximum(0.0f, powf(2.0f, g * (1.0f / 16.0f)) - LOG_BIAS); + b = maximum(0.0f, powf(2.0f, b * (1.0f / 16.0f)) - LOG_BIAS); + + // for safety + r = minimum(r, MAX_HALF_FLOAT); + g = minimum(g, MAX_HALF_FLOAT); + b = minimum(b, MAX_HALF_FLOAT); + } + + half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b), ah = float_to_half_non_neg_no_nan_inf(1.0f); + + astc_hdr_4x4_pack_results results; + results.clear(); + + uint8_t* packed_blk = (uint8_t*)&results.m_solid_blk; + results.m_is_solid = true; + + packed_blk[0] = 0b11111100; + packed_blk[1] = 255; + packed_blk[2] = 255; + packed_blk[3] = 255; + packed_blk[4] = 255; + packed_blk[5] = 255; + packed_blk[6] = 255; + packed_blk[7] = 255; + + packed_blk[8] = (uint8_t)rh; + packed_blk[9] = (uint8_t)(rh >> 8); + packed_blk[10] = (uint8_t)gh; + packed_blk[11] = (uint8_t)(gh >> 8); + packed_blk[12] = (uint8_t)bh; + packed_blk[13] = (uint8_t)(bh >> 8); + packed_blk[14] = (uint8_t)ah; + packed_blk[15] = (uint8_t)(ah >> 8); + + results.m_best_block_error = 0; + + if (!solid_block) + { + const float R_WEIGHT = coptions.m_r_err_scale; + const float G_WEIGHT = coptions.m_g_err_scale; + + // This MUST match how errors are computed in eval_selectors(). 
+ for (uint32_t i = 0; i < 16; i++) + { + half_float dr = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]), dg = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]), db = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]); + double rd = q(rh, Q_LOG_BIAS_4x4) - q(dr, Q_LOG_BIAS_4x4); + double gd = q(gh, Q_LOG_BIAS_4x4) - q(dg, Q_LOG_BIAS_4x4); + double bd = q(bh, Q_LOG_BIAS_4x4) - q(db, Q_LOG_BIAS_4x4); + + double e = R_WEIGHT * (rd * rd) + G_WEIGHT * (gd * gd) + bd * bd; + + results.m_best_block_error += e; + } + } + + const half_float hc[3] = { rh, gh, bh }; + + bc6h_enc_block_solid_color(&results.m_bc6h_block, hc); + + all_results.push_back(results); + + return solid_block; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void pack_mode11( + const vec4F* pBlock_linear_colors, const half_float pBlock_pixels_half[16][3], const vec4F pBlock_pixels_q16[16], + basisu::vector& all_results, + const uastc_hdr_4x4_codec_options& coptions, + uint32_t first_weight_ise_range, uint32_t last_weight_ise_range, bool constrain_ise_weight_selectors) +{ + BASISU_NOTE_UNUSED(pBlock_linear_colors); + assert(first_weight_ise_range <= last_weight_ise_range); + + uint8_t trial_endpoints[NUM_MODE11_ENDPOINTS], trial_weights[16]; + uint32_t trial_submode11 = 0; + + clear_obj(trial_endpoints); + clear_obj(trial_weights); + + for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++) + { + const bool direct_only = coptions.m_mode11_direct_only; + + uint32_t endpoint_ise_range = astc_helpers::BISE_256_LEVELS; + if (weight_ise_range == astc_helpers::BISE_16_LEVELS) + endpoint_ise_range = astc_helpers::BISE_192_LEVELS; + else + { + assert(weight_ise_range < astc_helpers::BISE_16_LEVELS); + } + + double trial_error = encode_astc_hdr_block_mode_11(16, pBlock_pixels_half, pBlock_pixels_q16, weight_ise_range, 
trial_submode11, BIG_FLOAT_VAL, trial_endpoints, trial_weights, coptions, direct_only, + endpoint_ise_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, constrain_ise_weight_selectors, coptions.m_first_mode11_submode, coptions.m_last_mode11_submode, false, cOrdinaryLeastSquares); + + if (trial_error < BIG_FLOAT_VAL) + { + astc_hdr_4x4_pack_results results; + results.clear(); + + results.m_best_block_error = trial_error; + + results.m_best_submodes[0] = trial_submode11; + results.m_constrained_weights = constrain_ise_weight_selectors; + + results.m_best_blk.m_num_partitions = 1; + results.m_best_blk.m_color_endpoint_modes[0] = 11; + results.m_best_blk.m_weight_ise_range = (uint8_t)weight_ise_range; + results.m_best_blk.m_endpoint_ise_range = (uint8_t)endpoint_ise_range; + + memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE11_ENDPOINTS); + memcpy(results.m_best_blk.m_weights, trial_weights, 16); + +#ifdef _DEBUG + // Sanity checking + { + half_float block_pixels_half[16][3]; + + for (uint32_t i = 0; i < 16; i++) + { + block_pixels_half[i][0] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][0]); + block_pixels_half[i][1] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][1]); + block_pixels_half[i][2] = float_to_half_non_neg_no_nan_inf(pBlock_linear_colors[i][2]); + } + + half_float unpacked_astc_blk_rgba[4][4][4]; + bool res = astc_helpers::decode_block(results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); + assert(res); + + half_float unpacked_astc_blk_rgb[4][4][3]; + for (uint32_t y = 0; y < 4; y++) + for (uint32_t x = 0; x < 4; x++) + for (uint32_t c = 0; c < 3; c++) + unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c]; + + double cmp_err = compute_block_error(16, &block_pixels_half[0][0], &unpacked_astc_blk_rgb[0][0][0], coptions); + assert(results.m_best_block_error == cmp_err); + } +#endif + + // transcode to BC6H + 
assert(results.m_best_blk.m_color_endpoint_modes[0] == 11); + + // Get qlog12 endpoints + int e[2][3]; + bool success = decode_mode11_to_qlog12(results.m_best_blk.m_endpoints, e, results.m_best_blk.m_endpoint_ise_range); + assert(success); + BASISU_NOTE_UNUSED(success); + + // Transform endpoints to half float + half_float h_e[3][2] = + { + { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) }, + { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) }, + { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) } + }; + + // Transcode to bc6h + success = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block); + assert(success); + + all_results.push_back(results); + } + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void pack_mode7_single_part( + const half_float pBlock_pixels_half[16][3], const vec4F pBlock_pixels_q16[16], + basisu::vector& all_results, const uastc_hdr_4x4_codec_options& coptions, + uint32_t first_mode7_part1_weight_ise_range, uint32_t last_mode7_part1_weight_ise_range) +{ + assert(first_mode7_part1_weight_ise_range <= last_mode7_part1_weight_ise_range); + + uint8_t trial_endpoints[NUM_MODE7_ENDPOINTS], trial_weights[16]; + uint32_t trial_submode7 = 0; + + clear_obj(trial_endpoints); + clear_obj(trial_weights); + + for (uint32_t weight_ise_range = first_mode7_part1_weight_ise_range; weight_ise_range <= last_mode7_part1_weight_ise_range; weight_ise_range++) + { + const uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS; + + double trial_error = encode_astc_hdr_block_mode_7(16, pBlock_pixels_half, pBlock_pixels_q16, weight_ise_range, trial_submode7, BIG_FLOAT_VAL, trial_endpoints, trial_weights, coptions, ise_endpoint_range); + + if (trial_error < BIG_FLOAT_VAL) + { + astc_hdr_4x4_pack_results results; + results.clear(); + + results.m_best_block_error = trial_error; + + results.m_best_submodes[0] = trial_submode7; + + 
results.m_best_blk.m_num_partitions = 1; + results.m_best_blk.m_color_endpoint_modes[0] = 7; + results.m_best_blk.m_weight_ise_range = (uint8_t)weight_ise_range; + results.m_best_blk.m_endpoint_ise_range = (uint8_t)ise_endpoint_range; + + memcpy(results.m_best_blk.m_endpoints, trial_endpoints, NUM_MODE7_ENDPOINTS); + memcpy(results.m_best_blk.m_weights, trial_weights, 16); + + // transcode to BC6H + assert(results.m_best_blk.m_color_endpoint_modes[0] == 7); + + // Get qlog12 endpoints + int e[2][3]; + if (!decode_mode7_to_qlog12(results.m_best_blk.m_endpoints, e, nullptr, results.m_best_blk.m_endpoint_ise_range)) + continue; + + // Transform endpoints to half float + half_float h_e[3][2] = + { + { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) }, + { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) }, + { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) } + }; + + // Transcode to bc6h + bool status = transcode_bc6h_1subset(h_e, results.m_best_blk, results.m_bc6h_block); + assert(status); + (void)status; + + all_results.push_back(results); + } + } +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static bool estimate_partition( + const half_float pBlock_pixels_half[16][3], + int* pBest_parts, uint32_t num_best_parts) +{ + assert(num_best_parts <= basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + + vec3F training_vecs[16], mean(0.0f); + + for (uint32_t i = 0; i < 16; i++) + { + vec3F& v = training_vecs[i]; + + v[0] = (float)pBlock_pixels_half[i][0]; + v[1] = (float)pBlock_pixels_half[i][1]; + v[2] = (float)pBlock_pixels_half[i][2]; + + mean += v; + } + mean *= (1.0f / 16.0f); + + vec3F cluster_centroids[2] = { mean - vec3F(.1f), mean + vec3F(.1f) }; + + uint32_t cluster_pixels[2][16]; + uint32_t num_cluster_pixels[2]; + vec3F new_cluster_means[2]; + + for (uint32_t s = 0; s < 4; s++) + { + num_cluster_pixels[0] = 0; + num_cluster_pixels[1] = 0; + + 
new_cluster_means[0].clear(); + new_cluster_means[1].clear(); + + for (uint32_t i = 0; i < 16; i++) + { + float d0 = training_vecs[i].squared_distance(cluster_centroids[0]); + float d1 = training_vecs[i].squared_distance(cluster_centroids[1]); + + if (d0 < d1) + { + cluster_pixels[0][num_cluster_pixels[0]] = i; + new_cluster_means[0] += training_vecs[i]; + num_cluster_pixels[0]++; + } + else + { + cluster_pixels[1][num_cluster_pixels[1]] = i; + new_cluster_means[1] += training_vecs[i]; + num_cluster_pixels[1]++; + } + } + + if (!num_cluster_pixels[0] || !num_cluster_pixels[1]) + return false; + + cluster_centroids[0] = new_cluster_means[0] / (float)num_cluster_pixels[0]; + cluster_centroids[1] = new_cluster_means[1] / (float)num_cluster_pixels[1]; + } + + int desired_parts[4][4]; // [y][x] + for (uint32_t p = 0; p < 2; p++) + { + for (uint32_t i = 0; i < num_cluster_pixels[p]; i++) + { + const uint32_t pix_index = cluster_pixels[p][i]; + + desired_parts[pix_index >> 2][pix_index & 3] = p; + } + } + + uint32_t part_similarity[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2]; + + for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; part_index++) + { + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; + + int total_sim_non_inv = 0; + int total_sim_inv = 0; + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + int part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + + if (part == desired_parts[y][x]) + total_sim_non_inv++; + + if ((part ^ 1) == desired_parts[y][x]) + total_sim_inv++; + } + } + + int total_sim = maximum(total_sim_non_inv, total_sim_inv); + + part_similarity[part_index] = (total_sim << 8) | part_index; + + } // part_index; + + std::sort(part_similarity, part_similarity + basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + + for (uint32_t i = 0; i < num_best_parts; i++) + pBest_parts[i] = part_similarity[(basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2 - 1) - i] 
& 0xFF; + + return true; +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void pack_mode7_2part( + const half_float pBlock_pixels_half[16][3], const vec4F pBlock_pixels_q16[16], + basisu::vector& all_results, const uastc_hdr_4x4_codec_options& coptions, + int num_estimated_partitions, const int *pEstimated_partitions, + uint32_t first_weight_ise_range, uint32_t last_weight_ise_range) +{ + assert(coptions.m_mode7_part2_part_masks); + + astc_helpers::log_astc_block trial_blk; + clear_obj(trial_blk); + trial_blk.m_grid_width = 4; + trial_blk.m_grid_height = 4; + + trial_blk.m_num_partitions = 2; + trial_blk.m_color_endpoint_modes[0] = 7; + trial_blk.m_color_endpoint_modes[1] = 7; + + uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; + + if (num_estimated_partitions) + { + first_part_index = 0; + last_part_index = num_estimated_partitions; + } + + for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter) + { + uint32_t part_index; + if (num_estimated_partitions) + { + part_index = pEstimated_partitions[part_index_iter]; + assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + } + else + { + part_index = part_index_iter; + if (((1U << part_index) & coptions.m_mode7_part2_part_masks) == 0) + continue; + } + + const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc; + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; + const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert; + + half_float part_pixels_half[2][16][3]; + vec4F part_pixels_q16[2][16]; + + uint32_t pixel_part_index[4][4]; // [y][x] + uint32_t num_part_pixels[2] = { 0, 0 }; + + // Extract each subset's texels for this partition pattern + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t part 
= basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + if (invert_flag) + part = 1 - part; + + pixel_part_index[y][x] = part; + + const uint32_t n = num_part_pixels[part]; + + part_pixels_half[part][n][0] = pBlock_pixels_half[x + y * 4][0]; + part_pixels_half[part][n][1] = pBlock_pixels_half[x + y * 4][1]; + part_pixels_half[part][n][2] = pBlock_pixels_half[x + y * 4][2]; + part_pixels_q16[part][n] = pBlock_pixels_q16[x + y * 4]; + + num_part_pixels[part] = n + 1; + } + } + + trial_blk.m_partition_id = (uint16_t)astc_pattern; + + for (uint32_t weight_ise_range = first_weight_ise_range; weight_ise_range <= last_weight_ise_range; weight_ise_range++) + { + assert(weight_ise_range <= astc_helpers::BISE_8_LEVELS); + + uint32_t ise_endpoint_range = astc_helpers::BISE_256_LEVELS; + if (weight_ise_range == astc_helpers::BISE_5_LEVELS) + ise_endpoint_range = astc_helpers::BISE_192_LEVELS; + else if (weight_ise_range == astc_helpers::BISE_6_LEVELS) + ise_endpoint_range = astc_helpers::BISE_128_LEVELS; + else if (weight_ise_range == astc_helpers::BISE_8_LEVELS) + ise_endpoint_range = astc_helpers::BISE_80_LEVELS; + + uint8_t trial_endpoints[2][NUM_MODE7_ENDPOINTS], trial_weights[2][16]; + uint32_t trial_submode7[2]; + + clear_obj(trial_endpoints); + clear_obj(trial_weights); + clear_obj(trial_submode7); + + double total_trial_err = 0; + for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) + { + total_trial_err += encode_astc_hdr_block_mode_7( + num_part_pixels[pack_part_index], part_pixels_half[pack_part_index], part_pixels_q16[pack_part_index], + weight_ise_range, trial_submode7[pack_part_index], BIG_FLOAT_VAL, + &trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, ise_endpoint_range); + + } // pack_part_index + + if (total_trial_err < BIG_FLOAT_VAL) + { + trial_blk.m_weight_ise_range = (uint8_t)weight_ise_range; + trial_blk.m_endpoint_ise_range = (uint8_t)ise_endpoint_range; + + for (uint32_t pack_part_index = 0; 
pack_part_index < 2; pack_part_index++) + memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE7_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE7_ENDPOINTS); + + uint32_t src_pixel_index[2] = { 0, 0 }; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t p = pixel_part_index[y][x]; + trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++]; + } + } + + astc_hdr_4x4_pack_results results; + results.clear(); + + results.m_best_block_error = total_trial_err; + results.m_best_submodes[0] = trial_submode7[0]; + results.m_best_submodes[1] = trial_submode7[1]; + results.m_best_pat_index = part_index; + + results.m_best_blk = trial_blk; + + bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block); + assert(status); + BASISU_NOTE_UNUSED(status); + + all_results.push_back(results); + } + + } // weight_ise_range + + } // part_index +} + +//-------------------------------------------------------------------------------------------------------------------------- + +static void pack_mode11_2part( + const half_float pBlock_pixels_half[16][3], const vec4F pBlock_pixels_q16[16], + basisu::vector& all_results, const uastc_hdr_4x4_codec_options& coptions, + int num_estimated_partitions, const int* pEstimated_partitions) +{ + assert(coptions.m_mode11_part2_part_masks); + + astc_helpers::log_astc_block trial_blk; + clear_obj(trial_blk); + trial_blk.m_grid_width = 4; + trial_blk.m_grid_height = 4; + + trial_blk.m_num_partitions = 2; + trial_blk.m_color_endpoint_modes[0] = 11; + trial_blk.m_color_endpoint_modes[1] = 11; + + uint32_t first_part_index = 0, last_part_index = basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; + + if (num_estimated_partitions) + { + first_part_index = 0; + last_part_index = num_estimated_partitions; + } + + for (uint32_t part_index_iter = first_part_index; part_index_iter < last_part_index; ++part_index_iter) + { + uint32_t part_index; + if (num_estimated_partitions) + 
{ + part_index = pEstimated_partitions[part_index_iter]; + assert(part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + } + else + { + part_index = part_index_iter; + if (((1U << part_index) & coptions.m_mode11_part2_part_masks) == 0) + continue; + } + + const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc; + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; + const bool invert_flag = basist::g_astc_bc7_common_partitions2[part_index].m_invert; + + half_float part_pixels_half[2][16][3]; + vec4F part_pixels_q16[2][16]; + + uint32_t pixel_part_index[4][4]; // [y][x] + uint32_t num_part_pixels[2] = { 0, 0 }; + + // Extract each subset's texels for this partition pattern + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t part = basist::g_bc7_partition2[16 * bc7_pattern + x + y * 4]; + if (invert_flag) + part = 1 - part; + + pixel_part_index[y][x] = part; + + const uint32_t n = num_part_pixels[part]; + + part_pixels_half[part][n][0] = pBlock_pixels_half[x + y * 4][0]; + part_pixels_half[part][n][1] = pBlock_pixels_half[x + y * 4][1]; + part_pixels_half[part][n][2] = pBlock_pixels_half[x + y * 4][2]; + part_pixels_q16[part][n] = pBlock_pixels_q16[x + y * 4]; + + num_part_pixels[part] = n + 1; + } + } + + trial_blk.m_partition_id = (uint16_t)astc_pattern; + + for (uint32_t weight_ise_range = coptions.m_first_mode11_part2_weight_ise_range; weight_ise_range <= coptions.m_last_mode11_part2_weight_ise_range; weight_ise_range++) + { + bool direct_only = false; + uint32_t ise_endpoint_range = astc_helpers::BISE_64_LEVELS; + if (weight_ise_range == astc_helpers::BISE_4_LEVELS) + ise_endpoint_range = astc_helpers::BISE_40_LEVELS; + + uint8_t trial_endpoints[2][NUM_MODE11_ENDPOINTS], trial_weights[2][16]; + uint32_t trial_submode11[2]; + + clear_obj(trial_endpoints); + clear_obj(trial_weights); + clear_obj(trial_submode11); + + double total_trial_err = 0; + for 
(uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) + { + total_trial_err += encode_astc_hdr_block_mode_11( + num_part_pixels[pack_part_index], part_pixels_half[pack_part_index], part_pixels_q16[pack_part_index], + weight_ise_range, trial_submode11[pack_part_index], BIG_FLOAT_VAL, + &trial_endpoints[pack_part_index][0], &trial_weights[pack_part_index][0], coptions, + direct_only, ise_endpoint_range, coptions.m_mode11_uber_mode && (weight_ise_range >= astc_helpers::BISE_4_LEVELS) && coptions.m_allow_uber_mode, false, + coptions.m_first_mode11_submode, coptions.m_last_mode11_submode, false, cOrdinaryLeastSquares); + + } // pack_part_index + + if (total_trial_err < BIG_FLOAT_VAL) + { + trial_blk.m_weight_ise_range = (uint8_t)weight_ise_range; + trial_blk.m_endpoint_ise_range = (uint8_t)ise_endpoint_range; + + for (uint32_t pack_part_index = 0; pack_part_index < 2; pack_part_index++) + memcpy(&trial_blk.m_endpoints[pack_part_index * NUM_MODE11_ENDPOINTS], &trial_endpoints[pack_part_index][0], NUM_MODE11_ENDPOINTS); + + uint32_t src_pixel_index[2] = { 0, 0 }; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t p = pixel_part_index[y][x]; + trial_blk.m_weights[x + y * 4] = trial_weights[p][src_pixel_index[p]++]; + } + } + + astc_hdr_4x4_pack_results results; + results.clear(); + + results.m_best_block_error = total_trial_err; + results.m_best_submodes[0] = trial_submode11[0]; + results.m_best_submodes[1] = trial_submode11[1]; + results.m_best_pat_index = part_index; + + results.m_best_blk = trial_blk; + + bool status = transcode_bc6h_2subsets(part_index, results.m_best_blk, results.m_bc6h_block); + assert(status); + BASISU_NOTE_UNUSED(status); + + all_results.push_back(results); + } + + } // weight_ise_range + + } // part_index +} + +bool astc_hdr_4x4_enc_block( + const float* pRGBPixels, const basist::half_float *pRGBPixelsHalf, + const uastc_hdr_4x4_codec_options& coptions, + basisu::vector& all_results) +{ + 
assert(g_astc_hdr_enc_initialized); + if (!g_astc_hdr_enc_initialized) + { + // astc_hdr_enc_init() MUST be called first. + assert(0); + return false; + } + + assert(coptions.m_use_solid || coptions.m_use_mode11_part1 || coptions.m_use_mode7_part2 || coptions.m_use_mode7_part1 || coptions.m_use_mode11_part2); + + all_results.resize(0); + + const half_float (*pBlock_pixels_half)[16][3] = reinterpret_cast(pRGBPixelsHalf); + + vec4F block_linear_colors[16]; + vec4F block_pixels_q16[16]; + + bool is_greyscale = true; + + for (uint32_t i = 0; i < 16; i++) + { + const float fr = pRGBPixels[i * 3 + 0], fg = pRGBPixels[i * 3 + 1], fb = pRGBPixels[i * 3 + 2]; + + // Sanity check the input block. + assert((fr >= 0) && (fr <= MAX_HALF_FLOAT) && (!std::isinf(fr)) && (!std::isnan(fr))); + assert((fg >= 0) && (fg <= MAX_HALF_FLOAT) && (!std::isinf(fg)) && (!std::isnan(fg))); + assert((fb >= 0) && (fb <= MAX_HALF_FLOAT) && (!std::isinf(fb)) && (!std::isnan(fb))); + + block_linear_colors[i].set(fr, fg, fb, 1.0f); + + const half_float hr = (*pBlock_pixels_half)[i][0]; + assert(hr == basist::float_to_half(fr)); + block_pixels_q16[i][0] = (float)half_to_qlog16(hr); + + const half_float hg = (*pBlock_pixels_half)[i][1]; + assert(hg == basist::float_to_half(fg)); + block_pixels_q16[i][1] = (float)half_to_qlog16(hg); + + const half_float hb = (*pBlock_pixels_half)[i][2]; + assert(hb == basist::float_to_half(fb)); + block_pixels_q16[i][2] = (float)half_to_qlog16(hb); + + block_pixels_q16[i][3] = 0.0f; + + if ((hr != hg) || (hr != hb)) + is_greyscale = false; + } // i + + bool is_solid = false; + if (coptions.m_use_solid) + is_solid = pack_solid(block_linear_colors, all_results, coptions); + + if (!is_solid) + { + if ((is_greyscale) && (coptions.m_level == 0)) + { + // Special case if it's a pure grayscale block - just try mode 7. 
+ pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, 1, 1); + pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, UHDR_MODE7_PART1_LAST_ISE_RANGE, UHDR_MODE7_PART1_LAST_ISE_RANGE); + } + else + { + if (coptions.m_use_mode11_part1) + { + const size_t cur_num_results = all_results.size(); + + pack_mode11(block_linear_colors, *pBlock_pixels_half, block_pixels_q16, all_results, coptions, coptions.m_first_mode11_weight_ise_range, coptions.m_last_mode11_weight_ise_range, false); + + if (coptions.m_last_mode11_weight_ise_range >= astc_helpers::BISE_12_LEVELS) + { + // Try constrained weights if we're allowed to use 12/16 level ISE weight modes + pack_mode11(block_linear_colors, *pBlock_pixels_half, block_pixels_q16, all_results, coptions, maximum(coptions.m_first_mode11_weight_ise_range, astc_helpers::BISE_12_LEVELS), coptions.m_last_mode11_weight_ise_range, true); + } + + // If we couldn't get any mode 11 results at all, and we were restricted to just trying weight ISE range 8 (which required endpoint quantization) then + // fall back to weight ISE range 7 (which doesn't need any endpoint quantization). + // This is to guarantee we always get at least 1 non-solid result. + if (all_results.size() == cur_num_results) + { + if (coptions.m_first_mode11_weight_ise_range == astc_helpers::BISE_16_LEVELS) + { + pack_mode11(block_linear_colors, *pBlock_pixels_half, block_pixels_q16, all_results, coptions, astc_helpers::BISE_12_LEVELS, astc_helpers::BISE_12_LEVELS, false); + } + } + } + + if (coptions.m_use_mode7_part1) + { + // Mode 7 1-subset never requires endpoint quantization, so it cannot fail to find at least one usable solution. 
+ pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, coptions.m_first_mode7_part1_weight_ise_range, coptions.m_last_mode7_part1_weight_ise_range); + } + else if (is_greyscale) + { + // Special case if it's a pure grayscale block and mode 7 was disabled - try it anyway, because mode 11 has worse B channel quantization. + pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, 1, 1); + pack_mode7_single_part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, UHDR_MODE7_PART1_LAST_ISE_RANGE, UHDR_MODE7_PART1_LAST_ISE_RANGE); + } + } + + bool have_est = false; + int best_parts[basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2]; + + if ((coptions.m_use_mode7_part2) || (coptions.m_use_mode11_part2)) + { + if (coptions.m_use_estimated_partitions) + have_est = estimate_partition(*pBlock_pixels_half, best_parts, coptions.m_max_estimated_partitions); + } + + if (coptions.m_use_mode7_part2) + { + const size_t cur_num_results = all_results.size(); + + pack_mode7_2part(*pBlock_pixels_half, block_pixels_q16, + all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, + coptions.m_first_mode7_part2_weight_ise_range, coptions.m_last_mode7_part2_weight_ise_range); + + // If we couldn't find any packable 2-subset mode 7 results at weight levels >= 5 levels (which always requires endpoint quant), then try falling back to + // 5 levels which doesn't require endpoint quantization. + if (all_results.size() == cur_num_results) + { + if (coptions.m_first_mode7_part2_weight_ise_range >= astc_helpers::BISE_5_LEVELS) + { + pack_mode7_2part(*pBlock_pixels_half, block_pixels_q16, + all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts, + astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_4_LEVELS); + } + } + } + + if (coptions.m_use_mode11_part2) + { + // This always requires endpoint quant, so it could fail to find any usable solutions. 
+ pack_mode11_2part(*pBlock_pixels_half, block_pixels_q16, all_results, coptions, have_est ? coptions.m_max_estimated_partitions : 0, best_parts); + } + + if (coptions.m_refine_weights) + { + // TODO: This is quite slow. + for (uint32_t i = 0; i < all_results.size(); i++) + { + bool status = astc_hdr_4x4_refine_weights(pRGBPixelsHalf, all_results[i], coptions, coptions.m_bc6h_err_weight, &all_results[i].m_improved_via_refinement_flag); + assert(status); + BASISU_NOTE_UNUSED(status); + } + } + + } // !is_solid + + return true; +} + +bool astc_hdr_4x4_pack_results_to_block(astc_blk& dst_blk, const astc_hdr_4x4_pack_results& results) +{ + assert(g_astc_hdr_enc_initialized); + if (!g_astc_hdr_enc_initialized) + return false; + + if (results.m_is_solid) + { + memcpy(&dst_blk, &results.m_solid_blk, sizeof(results.m_solid_blk)); + } + else + { + bool status = astc_helpers::pack_astc_block((astc_helpers::astc_block&)dst_blk, results.m_best_blk); + if (!status) + { + assert(0); + return false; + } + } + + return true; +} + +// Refines a block's chosen weight indices, balancing BC6H and ASTC HDR error. 
+bool astc_hdr_4x4_refine_weights(const half_float *pSource_block, + astc_hdr_4x4_pack_results& cur_results, const uastc_hdr_4x4_codec_options& coptions, float bc6h_weight, bool *pImproved_flag) +{ + if (pImproved_flag) + *pImproved_flag = false; + + if (cur_results.m_is_solid) + return true; + + const uint32_t total_weights = astc_helpers::get_ise_levels(cur_results.m_best_blk.m_weight_ise_range); + assert((total_weights >= MIN_SUPPORTED_WEIGHT_LEVELS) && (total_weights <= MAX_SUPPORTED_WEIGHT_LEVELS)); + + double best_err[4][4]; + uint8_t best_weight[4][4]; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + best_err[y][x] = BIG_FLOAT_VAL; + best_weight[y][x] = 0; + } + } + + astc_hdr_4x4_pack_results temp_results; + + const float c_weights[3] = { coptions.m_r_err_scale, coptions.m_g_err_scale, 1.0f }; + + for (uint32_t weight_index = 0; weight_index < total_weights; weight_index++) + { + temp_results = cur_results; + for (uint32_t i = 0; i < 16; i++) + temp_results.m_best_blk.m_weights[i] = (uint8_t)weight_index; + + half_float unpacked_astc_blk_rgba[4][4][4]; + bool res = astc_helpers::decode_block(temp_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); + assert(res); + + basist::bc6h_block trial_bc6h_blk; + res = basist::astc_hdr_transcode_to_bc6h(temp_results.m_best_blk, trial_bc6h_blk); + assert(res); + + half_float unpacked_bc6h_blk[4][4][3]; + res = unpack_bc6h(&trial_bc6h_blk, unpacked_bc6h_blk, false); + assert(res); + BASISU_NOTE_UNUSED(res); + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + double total_err = 0.0f; + + for (uint32_t c = 0; c < 3; c++) + { + const half_float orig_c = pSource_block[(x + y * 4) * 3 + c]; + const double orig_c_q = q(orig_c, Q_LOG_BIAS_4x4); + + const half_float astc_c = unpacked_astc_blk_rgba[y][x][c]; + const double astc_c_q = q(astc_c, Q_LOG_BIAS_4x4); + const double astc_e = square(astc_c_q - orig_c_q) * c_weights[c]; + + const 
half_float bc6h_c = unpacked_bc6h_blk[y][x][c]; + const double bc6h_c_q = q(bc6h_c, Q_LOG_BIAS_4x4); + const double bc6h_e = square(bc6h_c_q - orig_c_q) * c_weights[c]; + + const double overall_err = astc_e * (1.0f - bc6h_weight) + bc6h_e * bc6h_weight; + + total_err += overall_err; + + } // c + + if (total_err < best_err[y][x]) + { + best_err[y][x] = total_err; + best_weight[y][x] = (uint8_t)weight_index; + } + + } // x + } // y + + } // weight_index + + bool any_changed = false; + for (uint32_t i = 0; i < 16; i++) + { + if (cur_results.m_best_blk.m_weights[i] != best_weight[i >> 2][i & 3]) + { + any_changed = true; + break; + } + } + + if (any_changed) + { + memcpy(cur_results.m_best_blk.m_weights, best_weight, 16); + + { + bool res = basist::astc_hdr_transcode_to_bc6h(cur_results.m_best_blk, cur_results.m_bc6h_block); + assert(res); + BASISU_NOTE_UNUSED(res); + + half_float unpacked_astc_blk_rgba[4][4][4]; + res = astc_helpers::decode_block(cur_results.m_best_blk, unpacked_astc_blk_rgba, 4, 4, astc_helpers::cDecodeModeHDR16); + assert(res); + + half_float unpacked_astc_blk_rgb[4][4][3]; + for (uint32_t y = 0; y < 4; y++) + for (uint32_t x = 0; x < 4; x++) + for (uint32_t c = 0; c < 3; c++) + unpacked_astc_blk_rgb[y][x][c] = unpacked_astc_blk_rgba[y][x][c]; + + cur_results.m_best_block_error = compute_block_error(16, pSource_block, &unpacked_astc_blk_rgb[0][0][0], coptions); + } + + if (pImproved_flag) + *pImproved_flag = true; + } + + return true; +} + +void astc_hdr_4x4_block_stats::update(const astc_hdr_4x4_pack_results& log_blk) +{ + std::lock_guard lck(m_mutex); + + m_total_blocks++; + + if (log_blk.m_improved_via_refinement_flag) + m_total_refined++; + + if (log_blk.m_is_solid) + { + m_total_solid++; + } + else + { + int best_weight_range = log_blk.m_best_blk.m_weight_ise_range; + + if (log_blk.m_best_blk.m_color_endpoint_modes[0] == 7) + { + m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 6U)]++; + + if 
(log_blk.m_best_blk.m_num_partitions == 2) + { + m_total_mode7_2part++; + + m_mode7_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 6U)]++; + m_total_2part++; + + m_weight_range_hist_7_2part[bounds_check(best_weight_range, 0, 11)]++; + + m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++; + } + else + { + m_total_mode7_1part++; + + m_weight_range_hist_7[bounds_check(best_weight_range, 0, 11)]++; + } + } + else + { + m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[0], 0U, 9U)]++; + if (log_blk.m_constrained_weights) + m_total_mode11_1part_constrained_weights++; + + if (log_blk.m_best_blk.m_num_partitions == 2) + { + m_total_mode11_2part++; + + m_mode11_submode_hist[bounds_check(log_blk.m_best_submodes[1], 0U, 9U)]++; + m_total_2part++; + + m_weight_range_hist_11_2part[bounds_check(best_weight_range, 0, 11)]++; + + m_part_hist[bounds_check(log_blk.m_best_pat_index, 0U, 32U)]++; + } + else + { + m_total_mode11_1part++; + + m_weight_range_hist_11[bounds_check(best_weight_range, 0, 11)]++; + } + } + } +} + +void astc_hdr_4x4_block_stats::print() +{ + std::lock_guard lck(m_mutex); + + assert(m_total_blocks); + if (!m_total_blocks) + return; + + printf("\nLow-level ASTC Encoder Statistics:\n"); + printf("Total blocks: %u\n", m_total_blocks); + printf("Total solid: %u %3.2f%%\n", m_total_solid, (m_total_solid * 100.0f) / m_total_blocks); + printf("Total refined: %u %3.2f%%\n", m_total_refined, (m_total_refined * 100.0f) / m_total_blocks); + + printf("Total mode 11, 1 partition: %u %3.2f%%\n", m_total_mode11_1part, (m_total_mode11_1part * 100.0f) / m_total_blocks); + printf("Total mode 11, 1 partition, constrained weights: %u %3.2f%%\n", m_total_mode11_1part_constrained_weights, (m_total_mode11_1part_constrained_weights * 100.0f) / m_total_blocks); + printf("Total mode 11, 2 partition: %u %3.2f%%\n", m_total_mode11_2part, (m_total_mode11_2part * 100.0f) / m_total_blocks); + + printf("Total mode 7, 1 partition: %u %3.2f%%\n", 
m_total_mode7_1part, (m_total_mode7_1part * 100.0f) / m_total_blocks); + printf("Total mode 7, 2 partition: %u %3.2f%%\n", m_total_mode7_2part, (m_total_mode7_2part * 100.0f) / m_total_blocks); + + printf("Total 2 partitions: %u %3.2f%%\n", m_total_2part, (m_total_2part * 100.0f) / m_total_blocks); + printf("\n"); + + printf("ISE texel weight range histogram mode 11:\n"); + for (uint32_t i = 1; i <= UHDR_MODE11_LAST_ISE_RANGE; i++) + printf("%u %u\n", i, m_weight_range_hist_11[i]); + printf("\n"); + + printf("ISE texel weight range histogram mode 11, 2 partition:\n"); + for (uint32_t i = 1; i <= UHDR_MODE11_PART2_LAST_ISE_RANGE; i++) + printf("%u %u\n", i, m_weight_range_hist_11_2part[i]); + printf("\n"); + + printf("ISE texel weight range histogram mode 7:\n"); + for (uint32_t i = 1; i <= UHDR_MODE7_PART1_LAST_ISE_RANGE; i++) + printf("%u %u\n", i, m_weight_range_hist_7[i]); + printf("\n"); + + printf("ISE texel weight range histogram mode 7, 2 partition:\n"); + for (uint32_t i = 1; i <= UHDR_MODE7_PART2_LAST_ISE_RANGE; i++) + printf("%u %u\n", i, m_weight_range_hist_7_2part[i]); + printf("\n"); + + printf("Mode 11 submode histogram:\n"); + for (uint32_t i = 0; i <= MODE11_TOTAL_SUBMODES; i++) // +1 because of the extra direct encoding + printf("%u %u\n", i, m_mode11_submode_hist[i]); + printf("\n"); + + printf("Mode 7 submode histogram:\n"); + for (uint32_t i = 0; i < MODE7_TOTAL_SUBMODES; i++) + printf("%u %u\n", i, m_mode7_submode_hist[i]); + printf("\n"); + + printf("Partition pattern table usage histogram:\n"); + for (uint32_t i = 0; i < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2; i++) + printf("%u:%u ", i, m_part_hist[i]); + printf("\n\n"); +} + +} // namespace basisu + diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h b/thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.h similarity index 62% rename from thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h rename to 
thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.h index ee122ff7cee9..390520a80074 100644 --- a/thirdparty/basis_universal/encoder/basisu_astc_hdr_enc.h +++ b/thirdparty/basis_universal/encoder/basisu_uastc_hdr_4x4_enc.h @@ -1,29 +1,20 @@ -// basisu_astc_hdr_enc.h +// basisu_uastc_hdr_4x4_enc.h #pragma once #include "basisu_enc.h" #include "basisu_gpu_texture.h" #include "../transcoder/basisu_astc_helpers.h" #include "../transcoder/basisu_astc_hdr_core.h" +#include "basisu_astc_hdr_common.h" namespace basisu { - // This MUST be called before encoding any blocks. - void astc_hdr_enc_init(); - - const uint32_t MODE11_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE11_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS; - const uint32_t MODE7_PART1_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE7_PART1_LAST_ISE_RANGE = astc_helpers::BISE_16_LEVELS; - const uint32_t MODE7_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE7_PART2_LAST_ISE_RANGE = astc_helpers::BISE_8_LEVELS; - const uint32_t MODE11_PART2_FIRST_ISE_RANGE = astc_helpers::BISE_3_LEVELS, MODE11_PART2_LAST_ISE_RANGE = astc_helpers::BISE_4_LEVELS; - const uint32_t MODE11_TOTAL_SUBMODES = 8; // plus an extra hidden submode, directly encoded, for direct, so really 9 (see tables 99/100 of the ASTC spec) - const uint32_t MODE7_TOTAL_SUBMODES = 6; - - struct astc_hdr_codec_options + struct uastc_hdr_4x4_codec_options : astc_hdr_codec_base_options { float m_bc6h_err_weight; bool m_use_solid; - bool m_use_mode11; + bool m_use_mode11_part1; bool m_mode11_uber_mode; uint32_t m_first_mode11_weight_ise_range; uint32_t m_last_mode11_weight_ise_range; @@ -45,8 +36,6 @@ namespace basisu uint32_t m_first_mode11_part2_weight_ise_range; uint32_t m_last_mode11_part2_weight_ise_range; - float m_r_err_scale, m_g_err_scale; - bool m_refine_weights; uint32_t m_level; @@ -54,13 +43,10 @@ namespace basisu bool m_use_estimated_partitions; uint32_t m_max_estimated_partitions; - // If true, the ASTC HDR compressor 
is allowed to more aggressively vary weight indices for slightly higher compression in non-fastest mode. This will hurt BC6H quality, however. - bool m_allow_uber_mode; - - astc_hdr_codec_options(); + uastc_hdr_4x4_codec_options(); void init(); - + // TODO: set_quality_level() is preferred to configure the codec for transcoding purposes. static const int cMinLevel = 0; static const int cMaxLevel = 4; @@ -73,7 +59,7 @@ namespace basisu void set_quality_fastest(); }; - struct astc_hdr_pack_results + struct astc_hdr_4x4_pack_results { double m_best_block_error; double m_bc6h_block_error; // note this is not used/set by the encoder, here for convienance @@ -119,35 +105,6 @@ namespace basisu } }; - void interpolate_qlog12_colors( - const int e[2][3], - basist::half_float* pDecoded_half, - vec3F* pDecoded_float, - uint32_t n, uint32_t ise_weight_range); - - bool get_astc_hdr_mode_11_block_colors( - const uint8_t* pEndpoints, - basist::half_float* pDecoded_half, - vec3F* pDecoded_float, - uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range); - - bool get_astc_hdr_mode_7_block_colors( - const uint8_t* pEndpoints, - basist::half_float* pDecoded_half, - vec3F* pDecoded_float, - uint32_t n, uint32_t ise_weight_range, uint32_t ise_endpoint_range); - - double eval_selectors( - uint32_t num_pixels, - uint8_t* pWeights, - const basist::half_float* pBlock_pixels_half, - uint32_t num_weight_levels, - const basist::half_float* pDecoded_half, - const astc_hdr_codec_options& coptions, - uint32_t usable_selector_bitmask = UINT32_MAX); - - double compute_block_error(const basist::half_float* pOrig_block, const basist::half_float* pPacked_block, const astc_hdr_codec_options& coptions); - // Encodes a 4x4 ASTC HDR block given a 4x4 array of source block pixels/texels. // Supports solid color blocks, mode 11 (all submodes), mode 7/1 partition (all submodes), // and mode 7/2 partitions (all submodes) - 30 patterns, only the ones also in common with the BC6H format. 
@@ -164,16 +121,16 @@ namespace basisu // astc_hdr_enc_init() MUST have been called first to initialized the codec. // Input pixels are checked and cannot be NaN's, Inf's, signed, or too large (greater than MAX_HALF_FLOAT, or 65504). // Normal values and denormals are okay. - bool astc_hdr_enc_block( - const float* pRGBPixels, - const astc_hdr_codec_options& coptions, - basisu::vector &all_results); + bool astc_hdr_4x4_enc_block( + const float* pRGBPixels, const basist::half_float *pRGBPixelsHalf, + const uastc_hdr_4x4_codec_options& coptions, + basisu::vector &all_results); - bool astc_hdr_pack_results_to_block(basist::astc_blk& dst_blk, const astc_hdr_pack_results& results); + bool astc_hdr_4x4_pack_results_to_block(basist::astc_blk& dst_blk, const astc_hdr_4x4_pack_results& results); - bool astc_hdr_refine_weights(const basist::half_float* pSource_block, astc_hdr_pack_results& cur_results, const astc_hdr_codec_options& coptions, float bc6h_weight, bool* pImproved_flag); + bool astc_hdr_4x4_refine_weights(const basist::half_float* pSource_block, astc_hdr_4x4_pack_results& cur_results, const uastc_hdr_4x4_codec_options& coptions, float bc6h_weight, bool* pImproved_flag); - struct astc_hdr_block_stats + struct astc_hdr_4x4_block_stats { std::mutex m_mutex; @@ -195,7 +152,7 @@ namespace basisu uint32_t m_total_refined; - astc_hdr_block_stats() { clear(); } + astc_hdr_4x4_block_stats() { clear(); } void clear() { @@ -215,7 +172,7 @@ namespace basisu clear_obj(m_part_hist); } - void update(const astc_hdr_pack_results& log_blk); + void update(const astc_hdr_4x4_pack_results& log_blk); void print(); }; diff --git a/thirdparty/basis_universal/patches/0001-external-zstd-pr344.patch b/thirdparty/basis_universal/patches/0001-external-zstd-pr344.patch index 505d17c5f32e..f5423b047ec1 100644 --- a/thirdparty/basis_universal/patches/0001-external-zstd-pr344.patch +++ b/thirdparty/basis_universal/patches/0001-external-zstd-pr344.patch @@ -1,5 +1,5 @@ diff --git 
a/thirdparty/basis_universal/encoder/basisu_comp.cpp b/thirdparty/basis_universal/encoder/basisu_comp.cpp -index f16e75bd46..81813257cd 100644 +index 59a2a50900..e9aa20f313 100644 --- a/thirdparty/basis_universal/encoder/basisu_comp.cpp +++ b/thirdparty/basis_universal/encoder/basisu_comp.cpp @@ -33,7 +33,7 @@ @@ -12,10 +12,10 @@ index f16e75bd46..81813257cd 100644 // Set to 1 to disable the mipPadding alignment workaround (which only seems to be needed when no key-values are written at all) diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp -index ea994b0c4f..32018cd282 100644 +index 0f7ca1565f..d7bce42013 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp -@@ -164,7 +164,7 @@ +@@ -169,7 +169,7 @@ // If BASISD_SUPPORT_KTX2_ZSTD is 0, UASTC files compressed with Zstd cannot be loaded. #if BASISD_SUPPORT_KTX2_ZSTD // We only use two Zstd API's: ZSTD_decompress() and ZSTD_isError() diff --git a/thirdparty/basis_universal/patches/0002-external-jpgd.patch b/thirdparty/basis_universal/patches/0002-external-jpgd.patch index bc2a61d150ac..08a2f42d0c92 100644 --- a/thirdparty/basis_universal/patches/0002-external-jpgd.patch +++ b/thirdparty/basis_universal/patches/0002-external-jpgd.patch @@ -1,8 +1,8 @@ diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp -index 47e8981bc3..6c0ac0ad37 100644 +index b9804090b1..5987685ae7 100644 --- a/thirdparty/basis_universal/encoder/basisu_enc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp -@@ -458,7 +458,7 @@ namespace basisu +@@ -492,7 +492,7 @@ namespace basisu bool load_jpg(const char *pFilename, image& img) { int width = 0, height = 0, actual_comps = 0; @@ -11,3 +11,12 @@ index 47e8981bc3..6c0ac0ad37 100644 if (!pImage_data) return false; +@@ -512,7 +512,7 @@ namespace basisu + } + + int 
width = 0, height = 0, actual_comps = 0; +- uint8_t* pImage_data = jpgd::decompress_jpeg_image_from_memory(pBuf, (int)buf_size, &width, &height, &actual_comps, 4, jpgd::jpeg_decoder::cFlagLinearChromaFiltering); ++ uint8_t* pImage_data = jpgd::decompress_jpeg_image_from_memory(pBuf, (int)buf_size, &width, &height, &actual_comps, 4, jpgd::jpeg_decoder::cFlagBoxChromaFiltering); + if (!pImage_data) + return false; + diff --git a/thirdparty/basis_universal/patches/0003-external-tinyexr.patch b/thirdparty/basis_universal/patches/0003-external-tinyexr.patch index e5f2b8422d28..494a95494062 100644 --- a/thirdparty/basis_universal/patches/0003-external-tinyexr.patch +++ b/thirdparty/basis_universal/patches/0003-external-tinyexr.patch @@ -1,8 +1,8 @@ diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp -index 6c0ac0ad37..2bf486a028 100644 +index 7904aab91c..4d885cba16 100644 --- a/thirdparty/basis_universal/encoder/basisu_enc.cpp +++ b/thirdparty/basis_universal/encoder/basisu_enc.cpp -@@ -27,7 +27,7 @@ +@@ -29,7 +29,7 @@ #ifndef TINYEXR_USE_ZFP #define TINYEXR_USE_ZFP (1) #endif @@ -11,7 +11,7 @@ index 6c0ac0ad37..2bf486a028 100644 #ifndef MINIZ_HEADER_FILE_ONLY #define MINIZ_HEADER_FILE_ONLY -@@ -3257,7 +3257,8 @@ namespace basisu +@@ -3420,7 +3420,8 @@ namespace basisu float* out_rgba = nullptr; const char* err = nullptr; diff --git a/thirdparty/basis_universal/patches/0004-remove-tinydds-qoi.patch b/thirdparty/basis_universal/patches/0004-remove-tinydds-qoi.patch index 6a30616140b0..c62c54bee6a1 100644 --- a/thirdparty/basis_universal/patches/0004-remove-tinydds-qoi.patch +++ b/thirdparty/basis_universal/patches/0004-remove-tinydds-qoi.patch @@ -1,8 +1,8 @@ diff --git a/thirdparty/basis_universal/encoder/basisu_enc.cpp b/thirdparty/basis_universal/encoder/basisu_enc.cpp -index 2bf486a028..fff98e8301 100644 +index 4d885cba16..6c2cf0260e 100644 --- a/thirdparty/basis_universal/encoder/basisu_enc.cpp +++ 
b/thirdparty/basis_universal/encoder/basisu_enc.cpp -@@ -37,9 +37,6 @@ +@@ -39,9 +39,6 @@ #endif #include "basisu_miniz.h" @@ -12,7 +12,7 @@ index 2bf486a028..fff98e8301 100644 #if defined(_WIN32) // For QueryPerformanceCounter/QueryPerformanceFrequency #define WIN32_LEAN_AND_MEAN -@@ -408,16 +405,7 @@ namespace basisu +@@ -453,16 +450,7 @@ namespace basisu bool load_qoi(const char* pFilename, image& img) { @@ -31,7 +31,7 @@ index 2bf486a028..fff98e8301 100644 bool load_png(const uint8_t *pBuf, size_t buf_size, image &img, const char *pFilename) diff --git a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp -index 000869a533..648cfb47ae 100644 +index 339218fcf2..028ac3f314 100644 --- a/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp +++ b/thirdparty/basis_universal/encoder/basisu_gpu_texture.cpp @@ -19,9 +19,6 @@ @@ -41,10 +41,10 @@ index 000869a533..648cfb47ae 100644 -#define TINYDDS_IMPLEMENTATION -#include "3rdparty/tinydds.h" - + #define BASISU_USE_GOOGLE_ASTC_DECODER (1) + namespace basisu - { - //------------------------------------------------------------------------------------------------ -@@ -1980,207 +1977,7 @@ namespace basisu +@@ -2049,207 +2046,7 @@ namespace basisu // and cubemap, cubemap mipmapped, and cubemap array mipmapped. 
bool write_dds_file(uint8_vec &dds_data, const basisu::vector& gpu_images, bool cubemap_flag, bool use_srgb_format) { @@ -63,11 +63,11 @@ index 000869a533..648cfb47ae 100644 - assert(0); - return false; - } -- slices = gpu_images.size() / 6; +- slices = gpu_images.size_u32() / 6; - } - else - { -- slices = gpu_images.size(); +- slices = gpu_images.size_u32(); - } - - uint32_t width = 0, height = 0, total_levels = 0; @@ -185,7 +185,7 @@ index 000869a533..648cfb47ae 100644 - assert(total_levels < 32); - for (uint32_t i = 0; i < total_levels; i++) - { -- mipmap_sizes[i] = mipmaps[i].size_in_bytes(); +- mipmap_sizes[i] = mipmaps[i].size_in_bytes_u32(); - mipmap_ptrs[i] = mipmaps[i].get_ptr(); - } - @@ -253,7 +253,7 @@ index 000869a533..648cfb47ae 100644 } bool write_dds_file(const char* pFilename, const basisu::vector& gpu_images, bool cubemap_flag, bool use_srgb_format) -@@ -2201,188 +1998,6 @@ namespace basisu +@@ -2270,188 +2067,6 @@ namespace basisu bool read_uncompressed_dds_file(const char* pFilename, basisu::vector &ldr_mips, basisu::vector& hdr_mips) { diff --git a/thirdparty/basis_universal/patches/0005-windows-illegal-character.patch b/thirdparty/basis_universal/patches/0005-windows-illegal-character.patch new file mode 100644 index 000000000000..a64a051b36e4 --- /dev/null +++ b/thirdparty/basis_universal/patches/0005-windows-illegal-character.patch @@ -0,0 +1,13 @@ +diff --git a/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h b/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h +index 0d6d2ae936..8b82ad8c29 100644 +--- a/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h ++++ b/thirdparty/basis_universal/encoder/basisu_astc_hdr_6x6_enc.h +@@ -16,7 +16,7 @@ namespace astc_6x6_hdr + { + // Important: The Delta ITP colorspace error metric we use internally makes several assumptions about the nature of the HDR RGB inputs supplied to the encoder. 
+ // This encoder computes colorspace error in the ICtCp (or more accurately the delta ITP, where CT is scaled by .5 vs. ICtCp to become T) colorspace, so getting this correct is important. +- // By default the encoder assumes the input is in absolute luminance (in nits or candela per square meter, cd/mâ–“), specified as positive-only linear light RGB, using the REC 709 colorspace gamut (but NOT the sRGB transfer function, i.e. linear light). ++ // By default the encoder assumes the input is in absolute luminance (in nits or candela per square meter, cd/m^2), specified as positive-only linear light RGB, using the REC 709 colorspace gamut (but NOT the sRGB transfer function, i.e. linear light). + // If the m_rec2020_bt2100_color_gamut flag is true, the input colorspace is treated as REC 2020/BT.2100 (which is wider than 709). + // For SDR/LDR->HDR upconversion, the REC 709 sRGB input should be converted to linear light (sRGB->linear) and the resulting normalized linear RGB values scaled by either 80 or 100 nits (the luminance of a typical SDR monitor). + // SDR upconversion to normalized [0,1] (i.e. non-absolute) luminances may work but is not supported because ITP errors will not be predicted correctly. 
diff --git a/thirdparty/basis_universal/patches/0006-ambiguous-calls.patch b/thirdparty/basis_universal/patches/0006-ambiguous-calls.patch new file mode 100644 index 000000000000..99905499bd34 --- /dev/null +++ b/thirdparty/basis_universal/patches/0006-ambiguous-calls.patch @@ -0,0 +1,22 @@ +diff --git a/thirdparty/basis_universal/transcoder/basisu_containers.h b/thirdparty/basis_universal/transcoder/basisu_containers.h +index 03fae33974..7fff4c243e 100644 +--- a/thirdparty/basis_universal/transcoder/basisu_containers.h ++++ b/thirdparty/basis_universal/transcoder/basisu_containers.h +@@ -3349,7 +3349,7 @@ namespace basisu + + inline size_t hash_key(const Key& k) const + { +- assert((safe_shift_left(1ULL, (SIZE_T_BITS - m_hash_shift))) == m_values.size()); ++ assert((safe_shift_left(static_cast(1), (SIZE_T_BITS - m_hash_shift))) == m_values.size()); + + // Fibonacci hashing + if (SIZE_T_BITS == 32) +@@ -3433,7 +3433,7 @@ namespace basisu + return false; + + new_map.m_hash_shift = SIZE_T_BITS - helpers::floor_log2i((uint64_t)new_hash_size); +- assert(new_hash_size == safe_shift_left(1ULL, SIZE_T_BITS - new_map.m_hash_shift)); ++ assert(new_hash_size == safe_shift_left(static_cast(1), SIZE_T_BITS - new_map.m_hash_shift)); + + new_map.m_grow_threshold = std::numeric_limits::max(); + diff --git a/thirdparty/basis_universal/transcoder/basisu.h b/thirdparty/basis_universal/transcoder/basisu.h index 939ee79e62b9..44fb9a3007b1 100644 --- a/thirdparty/basis_universal/transcoder/basisu.h +++ b/thirdparty/basis_universal/transcoder/basisu.h @@ -20,34 +20,7 @@ #pragma warning (disable : 4201) #pragma warning (disable : 4127) // warning C4127: conditional expression is constant #pragma warning (disable : 4530) // C++ exception handler used, but unwind semantics are not enabled. - - // Slamming this off always for v1.16 because we've gotten rid of most std containers. 
- #ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL - #define BASISU_NO_ITERATOR_DEBUG_LEVEL (1) - #endif - - #ifndef BASISU_NO_ITERATOR_DEBUG_LEVEL - //#define _HAS_ITERATOR_DEBUGGING 0 - - #if defined(_DEBUG) || defined(DEBUG) - // This is madness, but we need to disable iterator debugging in debug builds or the encoder is unsable because MSVC's iterator debugging implementation is totally broken. - #ifndef _ITERATOR_DEBUG_LEVEL - #define _ITERATOR_DEBUG_LEVEL 1 - #endif - #ifndef _SECURE_SCL - #define _SECURE_SCL 1 - #endif - #else // defined(_DEBUG) || defined(DEBUG) - #ifndef _SECURE_SCL - #define _SECURE_SCL 0 - #endif - #ifndef _ITERATOR_DEBUG_LEVEL - #define _ITERATOR_DEBUG_LEVEL 0 - #endif - #endif // defined(_DEBUG) || defined(DEBUG) - - #endif // BASISU_NO_ITERATOR_DEBUG_LEVEL - + #endif // _MSC_VER #include @@ -66,6 +39,7 @@ #include #include #include +#include #include "basisu_containers.h" @@ -114,6 +88,7 @@ namespace basisu typedef basisu::vector int16_vec; typedef basisu::vector uint16_vec; typedef basisu::vector uint_vec; + typedef basisu::vector size_t_vec; typedef basisu::vector uint64_vec; typedef basisu::vector int_vec; typedef basisu::vector bool_vec; @@ -121,6 +96,16 @@ namespace basisu void enable_debug_printf(bool enabled); void debug_printf(const char *pFmt, ...); + void debug_puts(const char* p); + + template + inline void fmt_debug_printf(const char* pFmt, Args&&... args) + { + std::string res; + if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward(args))... })) + return; + debug_puts(res.c_str()); + } #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ @@ -137,16 +122,13 @@ namespace basisu #endif #endif - template inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; } - - template inline S maximum(S a, S b) { return (a > b) ? 
a : b; } - template inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); } - template inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); } - - template inline S minimum(S a, S b) { return (a < b) ? a : b; } - template inline S minimum(S a, S b, S c) { return minimum(minimum(a, b), c); } - template inline S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); } + constexpr double cPiD = 3.14159265358979323846264338327950288; + constexpr float REALLY_SMALL_FLOAT_VAL = .000000125f; + constexpr float SMALL_FLOAT_VAL = .0000125f; + constexpr float BIG_FLOAT_VAL = 1e+30f; + template inline T0 lerp(T0 a, T0 b, T1 c) { return a + (b - a) * c; } + inline float clampf(float value, float low, float high) { if (value < low) value = low; else if (value > high) value = high; return value; } inline float saturate(float value) { return clampf(value, 0, 1.0f); } inline uint8_t minimumub(uint8_t a, uint8_t b) { return (a < b) ? a : b; } @@ -159,10 +141,31 @@ namespace basisu inline float maximumf(float a, float b) { return (a > b) ? a : b; } inline int squarei(int i) { return i * i; } inline float squaref(float i) { return i * i; } + inline double squared(double i) { return i * i; } template inline T square(T a) { return a * a; } + template inline T sign(T a) { return (a < 0) ? (T)-1 : ((a == 0) ? (T)0 : (T)1); } + + inline bool equal_tol(float a, float b, float t) { return fabsf(a - b) <= ((maximum(fabsf(a), fabsf(b)) + 1.0f) * t); } + inline bool equal_tol(double a, double b, double t) { return fabs(a - b) <= ((maximum(fabs(a), fabs(b)) + 1.0f) * t); } - template inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? 
high : value); } + template + inline T prev_wrap(T i, T n) + { + T temp = i - 1; + if (temp < 0) + temp = n - 1; + return temp; + } + template + inline T next_wrap(T i, T n) + { + T temp = i + 1; + if (temp >= n) + temp = 0; + return temp; + } + inline uint32_t iabs(int32_t i) { return (i < 0) ? static_cast(-i) : static_cast(i); } inline uint64_t iabs64(int64_t i) { return (i < 0) ? static_cast(-i) : static_cast(i); } @@ -356,6 +359,7 @@ namespace basisu return *this; } +#if 0 #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Warray-bounds" @@ -414,6 +418,57 @@ namespace basisu #ifdef __GNUC__ #pragma GCC diagnostic pop #endif +#else + inline operator uint32_t() const + { + if constexpr (NumBytes == 1) + { + return m_bytes[0]; + } + else if constexpr (NumBytes == 2) + { + return (m_bytes[1] << 8U) | m_bytes[0]; + } + else if constexpr (NumBytes == 3) + { + return (m_bytes[2] << 16U) | (m_bytes[1] << 8U) | m_bytes[0]; + } + else if constexpr (NumBytes == 4) + { + return read_le_dword(m_bytes); + } + else if constexpr (NumBytes == 5) + { + uint32_t l = read_le_dword(m_bytes); + uint32_t h = m_bytes[4]; + return static_cast(l) | (static_cast(h) << 32U); + } + else if constexpr (NumBytes == 6) + { + uint32_t l = read_le_dword(m_bytes); + uint32_t h = (m_bytes[5] << 8U) | m_bytes[4]; + return static_cast(l) | (static_cast(h) << 32U); + } + else if constexpr (NumBytes == 7) + { + uint32_t l = read_le_dword(m_bytes); + uint32_t h = (m_bytes[6] << 16U) | (m_bytes[5] << 8U) | m_bytes[4]; + return static_cast(l) | (static_cast(h) << 32U); + } + else if constexpr (NumBytes == 8) + { + uint32_t l = read_le_dword(m_bytes); + uint32_t h = read_le_dword(m_bytes + 4); + return static_cast(l) | (static_cast(h) << 32U); + } + else + { + static_assert(NumBytes <= 8, "Invalid NumBytes"); + return 0; + } + } + #endif + }; enum eZero { cZero }; @@ -446,18 +501,18 @@ namespace basisu static const uint8_t g_huffman_sorted_codelength_codes[] = { 
cHuffmanSmallZeroRunCode, cHuffmanBigZeroRunCode, cHuffmanSmallRepeatCode, cHuffmanBigRepeatCode, 0, 8, 7, 9, 6, 0xA, 5, 0xB, 4, 0xC, 3, 0xD, 2, 0xE, 1, 0xF, 0x10 }; const uint32_t cHuffmanTotalSortedCodelengthCodes = sizeof(g_huffman_sorted_codelength_codes) / sizeof(g_huffman_sorted_codelength_codes[0]); - // GPU texture formats + // GPU texture formats and various uncompressed texture formats. enum class texture_format { cInvalidTextureFormat = -1, // Block-based formats - cETC1, // ETC1 - cETC1S, // ETC1 (subset: diff colors only, no subblocks) - cETC2_RGB, // ETC2 color block (basisu doesn't support ETC2 planar/T/H modes - just basic ETC1) - cETC2_RGBA, // ETC2 EAC alpha block followed by ETC2 color block - cETC2_ALPHA, // ETC2 EAC alpha block + cETC1, // ETC1 + cETC1S, // ETC1 (subset: diff colors only, no subblocks) + cETC2_RGB, // ETC2 color block (basisu doesn't support ETC2 planar/T/H modes - just basic ETC1) + cETC2_RGBA, // ETC2 EAC alpha block followed by ETC2 color block + cETC2_ALPHA, // ETC2 EAC alpha block cBC1, // DXT1 cBC3, // DXT5 (BC4/DXT5A block followed by a BC1/DXT1 block) cBC4, // DXT5A @@ -466,7 +521,8 @@ namespace basisu cBC6HUnsigned, // HDR cBC7, cASTC_LDR_4x4, // ASTC 4x4 LDR only - cASTC_HDR_4x4, // ASTC 4x4 HDR only (but may use LDR ASTC blocks internally) + cASTC_HDR_4x4, // ASTC 4x4 HDR only (but may use LDR ASTC blocks internally, although our encoders don't do this) + cASTC_HDR_6x6, // ASTC 6x6 HDR only (but may use LDR ASTC blocks internally, although our encoders don't do this) cPVRTC1_4_RGB, cPVRTC1_4_RGBA, cATC_RGB, @@ -491,8 +547,33 @@ namespace basisu cRGB_9E5 }; + inline bool is_uncompressed_texture_format(texture_format fmt) + { + switch (fmt) + { + case texture_format::cRGBA32: + case texture_format::cRGB565: + case texture_format::cBGR565: + case texture_format::cRGBA4444: + case texture_format::cABGR4444: + case texture_format::cRGBA_HALF: + case texture_format::cRGB_HALF: + case texture_format::cRGB_9E5: + return 
true; + default: + break; + } + + return false; + } + + inline bool is_block_based_texture_format(texture_format fmt) + { + return !is_uncompressed_texture_format(fmt); + } + // This is bytes per block for GPU formats, or bytes per texel for uncompressed formats. - inline uint32_t get_bytes_per_block(texture_format fmt) + inline uint32_t get_bytes_per_block_or_pixel(texture_format fmt) { switch (fmt) { @@ -534,16 +615,22 @@ namespace basisu // This is qwords per block for GPU formats, or not valid for uncompressed formats. inline uint32_t get_qwords_per_block(texture_format fmt) { - return get_bytes_per_block(fmt) >> 3; + assert(is_block_based_texture_format(fmt)); + + const uint32_t bytes_per_block = get_bytes_per_block_or_pixel(fmt); + return bytes_per_block >> 3; } inline uint32_t get_block_width(texture_format fmt) { - BASISU_NOTE_UNUSED(fmt); + assert(is_block_based_texture_format(fmt)); + switch (fmt) { case texture_format::cFXT1_RGB: return 8; + case texture_format::cASTC_HDR_6x6: + return 6; default: break; } @@ -552,20 +639,42 @@ namespace basisu inline uint32_t get_block_height(texture_format fmt) { - BASISU_NOTE_UNUSED(fmt); + assert(is_block_based_texture_format(fmt)); + + switch (fmt) + { + case texture_format::cASTC_HDR_6x6: + return 6; + default: + break; + } return 4; } inline bool is_hdr_texture_format(texture_format fmt) { - if (fmt == texture_format::cASTC_HDR_4x4) - return true; - if (fmt == texture_format::cUASTC_HDR_4x4) - return true; - if ((fmt == texture_format::cBC6HSigned) || (fmt == texture_format::cBC6HUnsigned)) + switch (fmt) + { + case texture_format::cASTC_HDR_4x4: + case texture_format::cUASTC_HDR_4x4: + case texture_format::cASTC_HDR_6x6: + case texture_format::cBC6HSigned: + case texture_format::cBC6HUnsigned: + case texture_format::cRGBA_HALF: + case texture_format::cRGB_HALF: + case texture_format::cRGB_9E5: return true; + default: + break; + } + return false; } + + inline bool is_ldr_texture_format(texture_format fmt) + { + 
return !is_hdr_texture_format(fmt); + } } // namespace basisu diff --git a/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h b/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h index 82dcd2bfe196..f17271a7144a 100644 --- a/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h +++ b/thirdparty/basis_universal/transcoder/basisu_astc_hdr_core.h @@ -35,40 +35,17 @@ namespace basist const uint32_t MAX_QLOG16 = 63487; const float MAX_QLOG16_VAL = 65504.0f; + // TODO: Should be called something like "NUM_MODE11_ENDPOINT_VALUES" const uint32_t NUM_MODE11_ENDPOINTS = 6, NUM_MODE7_ENDPOINTS = 4; - // Notes: - // qlog16_to_half(half_to_qlog16(half_val_as_int)) == half_val_as_int (is lossless) - // However, this is not lossless in the general sense. - inline half_float qlog16_to_half_slow(uint32_t qlog16) - { - assert(qlog16 <= 0xFFFF); - - int C = qlog16; - - int E = (C & 0xF800) >> 11; - int M = C & 0x7FF; - - int Mt; - if (M < 512) - Mt = 3 * M; - else if (M >= 1536) - Mt = 5 * M - 2048; - else - Mt = 4 * M - 512; - - int Cf = (E << 10) + (Mt >> 3); - return (half_float)Cf; - } - // This is not lossless - inline half_float qlog_to_half_slow(uint32_t qlog, uint32_t bits) + inline half_float qlog_to_half(uint32_t qlog, uint32_t bits) { assert((bits >= 7U) && (bits <= 16U)); assert(qlog < (1U << bits)); int C = qlog << (16 - bits); - return qlog16_to_half_slow(C); + return astc_helpers::qlog16_to_half(C); } void astc_hdr_core_init(); @@ -99,4 +76,131 @@ namespace basist bool astc_hdr_transcode_to_bc6h(const astc_blk& src_blk, bc6h_block& dst_blk); bool astc_hdr_transcode_to_bc6h(const astc_helpers::log_astc_block& log_blk, bc6h_block& dst_blk); + namespace astc_6x6_hdr + { + const uint32_t MAX_ASTC_HDR_6X6_DIM = 32768; + const int32_t REUSE_MAX_BUFFER_ROWS = 5; // 1+-(-4), so we need to buffer 5 rows total + + struct block_mode_desc + { + bool m_dp; + uint32_t m_cem; + uint32_t m_num_partitions; + uint32_t m_grid_x; + uint32_t m_grid_y; + + // the 
coding ISE ranges (which may not be valid ASTC ranges for this configuration) + uint32_t m_endpoint_ise_range; + uint32_t m_weight_ise_range; + + // the physical/output ASTC decompression ISE ranges (i.e. what the decompressor must output) + uint32_t m_transcode_endpoint_ise_range; + uint32_t m_transcode_weight_ise_range; + + uint32_t m_flags; + int m_dp_channel; + }; + + // Lack of level flag indicates level 3+ + const uint32_t BASIST_HDR_6X6_LEVEL0 = 1; + const uint32_t BASIST_HDR_6X6_LEVEL1 = 2; + const uint32_t BASIST_HDR_6X6_LEVEL2 = 4; + + const uint32_t TOTAL_BLOCK_MODE_DECS = 75; + extern const block_mode_desc g_block_mode_descs[TOTAL_BLOCK_MODE_DECS]; + + void copy_weight_grid(bool dual_plane, uint32_t grid_x, uint32_t grid_y, const uint8_t* transcode_weights, astc_helpers::log_astc_block& decomp_blk); + + enum class encoding_type + { + cInvalid = -1, + cRun = 0, + cSolid = 1, + cReuse = 2, + cBlock = 3, + cTotal + }; + + const uint32_t REUSE_XY_DELTA_BITS = 5; + const uint32_t NUM_REUSE_XY_DELTAS = 1 << REUSE_XY_DELTA_BITS; + + struct reuse_xy_delta + { + int8_t m_x, m_y; + }; + + extern const reuse_xy_delta g_reuse_xy_deltas[NUM_REUSE_XY_DELTAS]; + + const uint32_t RUN_CODE = 0b000, RUN_CODE_LEN = 3; + const uint32_t SOLID_CODE = 0b100, SOLID_CODE_LEN = 3; + const uint32_t REUSE_CODE = 0b10, REUSE_CODE_LEN = 2; + const uint32_t BLOCK_CODE = 0b1, BLOCK_CODE_LEN = 1; + + enum class endpoint_mode + { + cInvalid = -1, + + cRaw = 0, + cUseLeft, + cUseUpper, + cUseLeftDelta, + cUseUpperDelta, + + cTotal + }; + + enum class block_mode + { + cInvalid = -1, + + cBMTotalModes = TOTAL_BLOCK_MODE_DECS + }; + + const uint32_t NUM_ENDPOINT_DELTA_BITS = 5; + + const uint32_t NUM_UNIQUE_PARTITIONS2 = 521; + extern const uint32_t g_part2_unique_index_to_seed[NUM_UNIQUE_PARTITIONS2]; + + const uint32_t NUM_UNIQUE_PARTITIONS3 = 333; + extern const uint32_t g_part3_unique_index_to_seed[NUM_UNIQUE_PARTITIONS3]; + + bool decode_values(basist::bitwise_decoder& decoder, 
uint32_t total_values, uint32_t ise_range, uint8_t* pValues); + + void requantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_ise_vals, uint32_t to_ise_range); + + void requantize_ise_endpoints(uint32_t cem, uint32_t src_ise_endpoint_range, const uint8_t* pSrc_endpoints, uint32_t dst_ise_endpoint_range, uint8_t* pDst_endpoints); + + const uint32_t BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_2 = 2; + const uint32_t BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_4 = 4; + const uint32_t BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_9 = 9; + + struct fast_bc6h_params + { + uint32_t m_num_diff_endpoint_modes_to_try; + uint32_t m_max_2subset_pats_to_try; + + bool m_hq_ls; + bool m_brute_force_weight4_assignment; + + fast_bc6h_params() + { + init(); + } + + void init() + { + m_hq_ls = true; + m_num_diff_endpoint_modes_to_try = BC6H_NUM_DIFF_ENDPOINT_MODES_TO_TRY_2; + m_max_2subset_pats_to_try = 1; + m_brute_force_weight4_assignment = false; + } + }; + + void fast_encode_bc6h(const basist::half_float* pPixels, basist::bc6h_block* pBlock, const fast_bc6h_params &params); + + bool decode_6x6_hdr(const uint8_t* pComp_data, uint32_t comp_data_size, basisu::vector2D& decoded_blocks, uint32_t& width, uint32_t& height); + + } // namespace astc_6x6_hdr + } // namespace basist + diff --git a/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h b/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h index 09a234b2ae1f..1f78a702be9c 100644 --- a/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h +++ b/thirdparty/basis_universal/transcoder/basisu_astc_helpers.h @@ -15,6 +15,7 @@ namespace astc_helpers const uint32_t MIN_GRID_DIM = 2; // the minimum dimension of a block's weight grid const uint32_t MIN_BLOCK_DIM = 4, MAX_BLOCK_DIM = 12; // the valid block dimensions in texels const uint32_t MAX_GRID_WEIGHTS = 64; // a block may have a maximum of 64 weight grid values + const uint32_t NUM_MODE11_ENDPOINTS = 6, NUM_MODE7_ENDPOINTS = 4; static const 
uint32_t NUM_ASTC_BLOCK_SIZES = 14; extern const uint8_t g_astc_block_sizes[NUM_ASTC_BLOCK_SIZES][2]; @@ -108,25 +109,30 @@ namespace astc_helpers bool m_error_flag; bool m_solid_color_flag_ldr, m_solid_color_flag_hdr; - uint16_t m_solid_color[4]; + uint8_t m_user_mode; // user defined value, not used in this module + // Rest is only valid if !m_solid_color_flag_ldr && !m_solid_color_flag_hdr - uint32_t m_grid_width, m_grid_height; // weight grid dimensions, not the dimension of the block + uint8_t m_grid_width, m_grid_height; // weight grid dimensions, not the dimension of the block bool m_dual_plane; - uint32_t m_weight_ise_range; // 0-11 - uint32_t m_endpoint_ise_range; // 4-20, this is actually inferred from the size of the other config bits+weights, but this is here for checking + uint8_t m_weight_ise_range; // 0-11 + uint8_t m_endpoint_ise_range; // 4-20, this is actually inferred from the size of the other config bits+weights, but this is here for checking - uint32_t m_color_component_selector; // 0-3, 0=GBA R, 1=RBA G, 2=RGA B, 3=RGB A, only used in dual plane mode + uint8_t m_color_component_selector; // 0-3, controls which channel uses the 2nd (odd) weights, only used in dual plane mode - uint32_t m_num_partitions; // or the # of subsets, 1-4 (1-3 if dual plane mode) - uint32_t m_partition_id; // 10-bits, must be 0 if m_num_partitions==1 + uint8_t m_num_partitions; // or the # of subsets, 1-4 (1-3 if dual plane mode) + uint16_t m_partition_id; // 10-bits, must be 0 if m_num_partitions==1 - uint32_t m_color_endpoint_modes[MAX_PARTITIONS]; // each subset's CEM's + uint8_t m_color_endpoint_modes[MAX_PARTITIONS]; // each subset's CEM's - // ISE weight grid values. In dual plane mode, the order is p0,p1, p0,p1, etc. - uint8_t m_weights[MAX_GRID_WEIGHTS]; + union + { + // ISE weight grid values. In dual plane mode, the order is p0,p1, p0,p1, etc. 
+ uint8_t m_weights[MAX_GRID_WEIGHTS]; + uint16_t m_solid_color[4]; + }; // ISE endpoint values // Endpoint order examples: @@ -137,7 +143,7 @@ namespace astc_helpers // 2 subset RGB : RL0 RH0 GL0 GH0 BL0 BH0 RL1 RH1 GL1 GH1 BL1 BH1 // 2 subset RGBA : RL0 RH0 GL0 GH0 BL0 BH0 AL0 AH0 RL1 RH1 GL1 GH1 BL1 BH1 AL1 AH1 uint8_t m_endpoints[MAX_ENDPOINTS]; - + void clear() { memset(this, 0, sizeof(*this)); @@ -169,7 +175,7 @@ namespace astc_helpers inline int get_ise_sequence_bits(int count, int range) { - // See 18.22 Data Size Determination + // See 18.22 Data Size Determination - note this will be <= the # of bits actually written by encode_bise(). (It's magic.) int total_bits = g_ise_range_table[range][0] * count; total_bits += (g_ise_range_table[range][1] * 8 * count + 4) / 5; total_bits += (g_ise_range_table[range][2] * 7 * count + 2) / 3; @@ -182,16 +188,26 @@ namespace astc_helpers return (l * (64 - w) + h * w + 32) >> 6; } - void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range); + void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range, uint32_t *pStats = nullptr); + + struct pack_stats + { + uint32_t m_header_bits; + uint32_t m_endpoint_bits; + uint32_t m_weight_bits; + + inline pack_stats() { clear(); } + inline void clear() { memset(this, 0, sizeof(*this)); } + }; // Packs a logical to physical ASTC block. Note this does not validate the block's dimensions (use is_valid_block_size()), just the grid dimensions. - bool pack_astc_block(astc_block &phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range = nullptr); + bool pack_astc_block(astc_block &phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range = nullptr, pack_stats *pStats = nullptr); // Pack LDR void extent (really solid color) blocks. For LDR, pass in (val | (val << 8)) for each component. 
- void pack_void_extent_ldr(astc_block& blk, uint16_t r, uint16_t g, uint16_t b, uint16_t a); + void pack_void_extent_ldr(astc_block& blk, uint16_t r, uint16_t g, uint16_t b, uint16_t a, pack_stats *pStats = nullptr); // Pack HDR void extent (16-bit values are FP16/half floats - no NaN/Inf's) - void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah); + void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats* pStats = nullptr); // These helpers are all quite slow, but are useful for table preparation. @@ -299,6 +315,24 @@ namespace astc_helpers extern dequant_tables g_dequant_tables; void init_tables(bool init_rank_tabs); + + struct weighted_sample + { + uint8_t m_src_x; + uint8_t m_src_y; + uint8_t m_weights[2][2]; // [y][x], scaled by 16, round by adding 8 + }; + + void compute_upsample_weights( + int block_width, int block_height, + int weight_grid_width, int weight_grid_height, + weighted_sample* pWeights); // there will be block_width * block_height bilinear samples + + void upsample_weight_grid( + uint32_t bx, uint32_t by, // destination/to dimension + uint32_t wx, uint32_t wy, // source/from dimension + const uint8_t* pSrc_weights, // these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx] + uint8_t* pDst_weights); // [by][bx] // Procedurally returns the texel partition/subset index given the block coordinate and config. int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block); @@ -315,6 +349,27 @@ namespace astc_helpers half_float float_to_half(float val, bool toward_zero); float half_to_float(half_float hval); + // Notes: + // qlog16_to_half(half_to_qlog16(half_val_as_int)) == half_val_as_int (is lossless) + // However, this is not lossless in the general sense. 
+ inline half_float qlog16_to_half(int k) + { + assert((k >= 0) && (k <= 0xFFFF)); + + int E = (k & 0xF800) >> 11; + int M = k & 0x7FF; + + int Mt; + if (M < 512) + Mt = 3 * M; + else if (M >= 1536) + Mt = 5 * M - 2048; + else + Mt = 4 * M - 512; + + return (half_float)((E << 10) + (Mt >> 3)); + } + const int MAX_RGB9E5 = 0xff80; void unpack_rgb9e5(uint32_t packed, float& r, float& g, float& b); uint32_t pack_rgb9e5(float r, float g, float b); @@ -437,7 +492,7 @@ namespace astc_helpers }; // Encodes 3 values to output, usable for any range that uses quints and bits - static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n) + static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n, uint32_t* pStats) { // First extract the quints and the bits from the 3 input values int quints = 0, bits[3]; @@ -461,6 +516,9 @@ namespace astc_helpers // Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96. 
astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) | (bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3); + + if (pStats) + *pStats += n * 3 + 7; } static const uint8_t g_astc_trit_encode[243] = { 0, 1, 2, 4, 5, 6, 8, 9, 10, 16, 17, 18, 20, 21, 22, 24, 25, 26, 3, 7, 11, 19, 23, 27, 12, 13, 14, 32, 33, 34, 36, 37, 38, 40, 41, 42, 48, 49, 50, 52, 53, 54, 56, 57, 58, 35, 39, @@ -471,7 +529,7 @@ namespace astc_helpers 191, 223, 124, 125, 126 }; // Encodes 5 values to output, usable for any range that uses trits and bits - static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n) + static void astc_encode_trits(uint32_t* pOutput, const uint8_t* pValues, uint32_t& bit_pos, int n, uint32_t *pStats) { // First extract the trits and the bits from the 5 input values int trits = 0, bits[5]; @@ -494,13 +552,16 @@ namespace astc_helpers // Now interleave the 8 encoded trit bits with the bits to form the encoded output. See table 94. astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 1) << n) | (bits[1] << (2 + n)), n * 2 + 2); - + astc_set_bits(pOutput, bit_pos, astc_extract_bits(T, 2, 3) | (bits[2] << 2) | (astc_extract_bits(T, 4, 4) << (2 + n)) | (bits[3] << (3 + n)) | (astc_extract_bits(T, 5, 6) << (3 + n * 2)) | (bits[4] << (5 + n * 2)) | (astc_extract_bits(T, 7, 7) << (5 + n * 3)), n * 3 + 6); + + if (pStats) + *pStats += n * 5 + 8; } // Packs values using ASTC's BISE to output buffer. 
- void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range) + void encode_bise(uint32_t* pDst, const uint8_t* pSrc_vals, uint32_t bit_pos, int num_vals, int range, uint32_t *pStats) { uint32_t temp[5] = { 0 }; @@ -533,19 +594,23 @@ namespace astc_helpers for (int i = 0; i < limit; i++) vals[i] = pSrc_vals[group_index * group_size + i]; + // Note this always writes a group of 3 or 5 bits values, even for incomplete groups. So it can write more than needed. + // get_ise_sequence_bits() returns the # of bits that must be written for proper decoding. if (group_size == 5) - astc_encode_trits(temp, vals, bit_pos, num_bits); + astc_encode_trits(temp, vals, bit_pos, num_bits, pStats); else - astc_encode_quints(temp, vals, bit_pos, num_bits); + astc_encode_quints(temp, vals, bit_pos, num_bits, pStats); } } else { for (int i = 0; i < num_vals; i++) astc_set_bits_1_to_9(temp, bit_pos, pSrc_vals[i], num_bits); + + if (pStats) + *pStats += num_vals * num_bits; } - // TODO: Could this write too many bits on incomplete blocks? 
pDst[0] |= temp[0]; pDst[1] |= temp[1]; pDst[2] |= temp[2]; pDst[3] |= temp[3]; } @@ -652,7 +717,7 @@ namespace astc_helpers return false; } - bool pack_astc_block(astc_block& phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range) + bool pack_astc_block(astc_block& phys_block, const log_astc_block& log_block, int* pExpected_endpoint_range, pack_stats *pStats) { memset(&phys_block, 0, sizeof(phys_block)); @@ -665,12 +730,12 @@ namespace astc_helpers if (log_block.m_solid_color_flag_ldr) { - pack_void_extent_ldr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3]); + pack_void_extent_ldr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3], pStats); return true; } else if (log_block.m_solid_color_flag_hdr) { - pack_void_extent_hdr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3]); + pack_void_extent_hdr(phys_block, log_block.m_solid_color[0], log_block.m_solid_color[1], log_block.m_solid_color[2], log_block.m_solid_color[3], pStats); return true; } @@ -687,6 +752,8 @@ namespace astc_helpers if (log_block.m_color_component_selector > 3) return false; + + // TODO: sanity check grid width/height vs. block's physical width/height uint32_t config_bits = 0; if (!get_config_bits(log_block, config_bits)) @@ -694,6 +761,8 @@ namespace astc_helpers uint32_t bit_pos = 0; astc_set_bits(&phys_block.m_vals[0], bit_pos, config_bits, 11); + if (pStats) + pStats->m_header_bits += 11; const uint32_t total_grid_weights = (log_block.m_dual_plane ? 
2 : 1) * (log_block.m_grid_width * log_block.m_grid_height); const uint32_t total_weight_bits = get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range); @@ -705,6 +774,8 @@ namespace astc_helpers uint32_t total_extra_bits = 0; astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_num_partitions - 1, 2); + if (pStats) + pStats->m_header_bits += 2; if (log_block.m_num_partitions > 1) { @@ -712,12 +783,14 @@ namespace astc_helpers return false; astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_partition_id, 10); + if (pStats) + pStats->m_header_bits += 10; uint32_t highest_cem = 0, lowest_cem = UINT32_MAX; for (uint32_t j = 0; j < log_block.m_num_partitions; j++) { - highest_cem = my_max(highest_cem, log_block.m_color_endpoint_modes[j]); - lowest_cem = my_min(lowest_cem, log_block.m_color_endpoint_modes[j]); + highest_cem = my_max(highest_cem, log_block.m_color_endpoint_modes[j]); + lowest_cem = my_min(lowest_cem, log_block.m_color_endpoint_modes[j]); } if (highest_cem > 15) @@ -752,9 +825,13 @@ namespace astc_helpers uint32_t cem_bit_pos = 128 - total_weight_bits - total_extra_bits; astc_set_bits(&phys_block.m_vals[0], cem_bit_pos, encoded_cem >> 6, total_extra_bits); + if (pStats) + pStats->m_header_bits += total_extra_bits; } astc_set_bits(&phys_block.m_vals[0], bit_pos, encoded_cem & 0x3f, 6); + if (pStats) + pStats->m_header_bits += 6; } else { @@ -764,6 +841,8 @@ namespace astc_helpers return false; astc_set_bits(&phys_block.m_vals[0], bit_pos, log_block.m_color_endpoint_modes[0], 4); + if (pStats) + pStats->m_header_bits += 4; } if (log_block.m_dual_plane) @@ -775,6 +854,8 @@ namespace astc_helpers uint32_t ccs_bit_pos = 128 - (int)total_weight_bits - (int)total_extra_bits; astc_set_bits(&phys_block.m_vals[0], ccs_bit_pos, log_block.m_color_component_selector, 2); + if (pStats) + pStats->m_header_bits += 2; } const uint32_t total_config_bits = bit_pos + total_extra_bits; @@ -812,6 +893,12 @@ namespace astc_helpers return false; } + 
if (pStats) + { + pStats->m_endpoint_bits += get_ise_sequence_bits(total_cem_vals, endpoint_ise_range); + pStats->m_weight_bits += get_ise_sequence_bits(total_grid_weights, log_block.m_weight_ise_range); + } + // Pack endpoints forwards encode_bise(&phys_block.m_vals[0], log_block.m_endpoints, bit_pos, total_cem_vals, endpoint_ise_range); @@ -1210,7 +1297,7 @@ namespace astc_helpers } } - void pack_void_extent_ldr(astc_block &blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah) + void pack_void_extent_ldr(astc_block &blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats* pStats) { uint8_t* pDst = (uint8_t*)&blk.m_vals[0]; memset(pDst, 0xFF, 16); @@ -1226,10 +1313,13 @@ namespace astc_helpers pDst[13] = (uint8_t)(bh >> 8); pDst[14] = (uint8_t)ah; pDst[15] = (uint8_t)(ah >> 8); + + if (pStats) + pStats->m_header_bits += 128; } // rh-ah are half-floats - void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah) + void pack_void_extent_hdr(astc_block& blk, uint16_t rh, uint16_t gh, uint16_t bh, uint16_t ah, pack_stats *pStats) { uint8_t* pDst = (uint8_t*)&blk.m_vals[0]; memset(pDst, 0xFF, 16); @@ -1244,6 +1334,9 @@ namespace astc_helpers pDst[13] = (uint8_t)(bh >> 8); pDst[14] = (uint8_t)ah; pDst[15] = (uint8_t)(ah >> 8); + + if (pStats) + pStats->m_header_bits += 128; } bool is_cem_ldr(uint32_t mode) @@ -1323,22 +1416,17 @@ namespace astc_helpers dequant_tables g_dequant_tables; void precompute_texel_partitions_4x4(); + void precompute_texel_partitions_6x6(); void init_tables(bool init_rank_tabs) { g_dequant_tables.init(init_rank_tabs); precompute_texel_partitions_4x4(); + precompute_texel_partitions_6x6(); } - - struct weighted_sample - { - uint8_t m_src_x; - uint8_t m_src_y; - uint8_t m_weights[2][2]; // [y][x], scaled by 16, round by adding 8 - }; - - static void compute_upsample_weights( + + void compute_upsample_weights( int block_width, int block_height, int weight_grid_width, int weight_grid_height, 
weighted_sample* pWeights) // there will be block_width * block_height bilinear samples @@ -1373,7 +1461,7 @@ namespace astc_helpers } // Should be dequantized [0,64] weights - static void upsample_weight_grid( + void upsample_weight_grid( uint32_t bx, uint32_t by, // destination/to dimension uint32_t wx, uint32_t wy, // source/from dimension const uint8_t* pSrc_weights, // these are dequantized [0,64] weights, NOT ISE symbols, [wy][wx] @@ -1429,6 +1517,7 @@ namespace astc_helpers return p; } + // small_block = num_blk_pixels < 31 int compute_texel_partition(uint32_t seedIn, uint32_t xIn, uint32_t yIn, uint32_t zIn, int num_partitions, bool small_block) { assert(zIn == 0); @@ -1495,7 +1584,11 @@ namespace astc_helpers : 3; } - static uint32_t g_texel_partitions_4x4[1024][2]; + // 4x4, 2 and 3 subsets + static uint32_t g_texel_partitions_4x4[1024][2]; + + // 6x6, 2 and 3 subsets (2 subsets low 4 bits, 3 subsets high 4 bits) + static uint8_t g_texel_partitions_6x6[1024][6 * 6]; void precompute_texel_partitions_4x4() { @@ -1518,6 +1611,24 @@ namespace astc_helpers } } + void precompute_texel_partitions_6x6() + { + for (uint32_t p = 0; p < 1024; p++) + { + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + const uint32_t p2 = compute_texel_partition(p, x, y, 0, 2, false); + const uint32_t p3 = compute_texel_partition(p, x, y, 0, 3, false); + + assert((p2 <= 1) && (p3 <= 2)); + g_texel_partitions_6x6[p][x + y * 6] = (uint8_t)((p3 << 4) | p2); + } + } + } + } + static inline int get_precompute_texel_partitions_4x4(uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions) { assert(g_texel_partitions_4x4[1][0]); @@ -1529,6 +1640,17 @@ namespace astc_helpers return (g_texel_partitions_4x4[seed][num_partitions - 2] >> shift) & 3; } + static inline int get_precompute_texel_partitions_6x6(uint32_t seed, uint32_t x, uint32_t y, uint32_t num_partitions) + { + assert(g_texel_partitions_6x6[0][0]); + assert(seed < 1024); + assert((x <= 5) && (y <= 
5)); + assert((num_partitions >= 2) && (num_partitions <= 3)); + + const uint32_t shift = (num_partitions == 3) ? 4 : 0; + return (g_texel_partitions_6x6[seed][x + y * 6] >> shift) & 3; + } + void blue_contract( int r, int g, int b, int a, int &dr, int &dg, int &db, int &da) @@ -2144,25 +2266,7 @@ namespace astc_helpers x.u = m | (e << 23) | (s << 31); return x.f; } - - static inline half_float qlog16_to_half(int k) - { - assert((k >= 0) && (k <= 0xFFFF)); - - int E = (k & 0xF800) >> 11; - int M = k & 0x7FF; - - int Mt; - if (M < 512) - Mt = 3 * M; - else if (M >= 1536) - Mt = 5 * M - 2048; - else - Mt = 4 * M - 512; - - return (half_float)((E << 10) + (Mt >> 3)); - } - + // See https://registry.khronos.org/OpenGL/extensions/EXT/EXT_texture_shared_exponent.txt const int RGB9E5_EXPONENT_BITS = 5, RGB9E5_MANTISSA_BITS = 9, RGB9E5_EXP_BIAS = 15, RGB9E5_MAX_VALID_BIASED_EXP = 31; const int MAX_RGB9E5_EXP = (RGB9E5_MAX_VALID_BIASED_EXP - RGB9E5_EXP_BIAS); @@ -2514,7 +2618,8 @@ namespace astc_helpers // Decode texels const bool small_block = num_blk_pixels < 31; - const bool use_precomputed_texel_partitions = (blk_width == 4) && (blk_height == 4) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3); + const bool use_precomputed_texel_partitions_4x4 = (blk_width == 4) && (blk_height == 4) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3); + const bool use_precomputed_texel_partitions_6x6 = (blk_width == 6) && (blk_height == 6) && (log_blk.m_num_partitions >= 2) && (log_blk.m_num_partitions <= 3); const uint32_t ccs = log_blk.m_dual_plane ? log_blk.m_color_component_selector : UINT32_MAX; bool success = true; @@ -2527,9 +2632,17 @@ namespace astc_helpers for (uint32_t x = 0; x < blk_width; x++) { const uint32_t pixel_index = x + y * blk_width; - const uint32_t subset = (log_blk.m_num_partitions > 1) ? - (use_precomputed_texel_partitions ? 
get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)) - : 0; + + uint32_t subset = 0; + if (log_blk.m_num_partitions > 1) + { + if (use_precomputed_texel_partitions_4x4) + subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions); + else if (use_precomputed_texel_partitions_6x6) + subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions); + else + subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block); + } int comp[3]; @@ -2592,9 +2705,17 @@ namespace astc_helpers for (uint32_t x = 0; x < blk_width; x++) { const uint32_t pixel_index = x + y * blk_width; - const uint32_t subset = (log_blk.m_num_partitions > 1) ? - (use_precomputed_texel_partitions ? get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)) - : 0; + + uint32_t subset = 0; + if (log_blk.m_num_partitions > 1) + { + if (use_precomputed_texel_partitions_4x4) + subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions); + else if (use_precomputed_texel_partitions_6x6) + subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions); + else + subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block); + } for (uint32_t c = 0; c < 4; c++) { @@ -2653,9 +2774,16 @@ namespace astc_helpers { const uint32_t pixel_index = x + y * blk_width; - const uint32_t subset = (log_blk.m_num_partitions > 1) ? - (use_precomputed_texel_partitions ? 
get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions) : compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block)) - : 0; + uint32_t subset = 0; + if (log_blk.m_num_partitions > 1) + { + if (use_precomputed_texel_partitions_4x4) + subset = get_precompute_texel_partitions_4x4(log_blk.m_partition_id, x, y, log_blk.m_num_partitions); + else if (use_precomputed_texel_partitions_6x6) + subset = get_precompute_texel_partitions_6x6(log_blk.m_partition_id, x, y, log_blk.m_num_partitions); + else + subset = compute_texel_partition(log_blk.m_partition_id, x, y, 0, log_blk.m_num_partitions, small_block); + } if (!is_ldr_endpoints[subset]) { @@ -3235,10 +3363,10 @@ namespace astc_helpers if (p < 2) return false; - log_blk.m_grid_width = W; - log_blk.m_grid_height = H; + log_blk.m_grid_width = (uint8_t)W; + log_blk.m_grid_height = (uint8_t)H; - log_blk.m_weight_ise_range = (p - 2) + (P * BISE_10_LEVELS); + log_blk.m_weight_ise_range = (uint8_t)((p - 2) + (P * BISE_10_LEVELS)); assert(log_blk.m_weight_ise_range <= LAST_VALID_WEIGHT_ISE_RANGE); log_blk.m_dual_plane = Dp; @@ -3441,16 +3569,16 @@ namespace astc_helpers // Right before the weight bits, there may be extra CEM bits, then the 2 CCS bits if dual plane. 
- log_blk.m_num_partitions = bits.get_bits(11, 2) + 1; + log_blk.m_num_partitions = (uint8_t)(bits.get_bits(11, 2) + 1); if (log_blk.m_num_partitions == 1) - log_blk.m_color_endpoint_modes[0] = bits.get_bits(13, 4); // read CEM bits + log_blk.m_color_endpoint_modes[0] = (uint8_t)(bits.get_bits(13, 4)); // read CEM bits else { // 2 or more partitions if (log_blk.m_dual_plane && (log_blk.m_num_partitions == 4)) return false; - log_blk.m_partition_id = bits.get_bits(13, 10); + log_blk.m_partition_id = (uint16_t)bits.get_bits(13, 10); uint32_t cem_bits = bits.get_bits(23, 6); @@ -3458,7 +3586,7 @@ namespace astc_helpers { // All CEM's the same for (uint32_t i = 0; i < log_blk.m_num_partitions; i++) - log_blk.m_color_endpoint_modes[i] = cem_bits >> 2; + log_blk.m_color_endpoint_modes[i] = (uint8_t)(cem_bits >> 2); } else { @@ -3511,7 +3639,7 @@ namespace astc_helpers for (uint32_t i = 0; i < log_blk.m_num_partitions; i++) { - log_blk.m_color_endpoint_modes[i] = first_cem_index + (c[i] * 4) + m[i]; + log_blk.m_color_endpoint_modes[i] = (uint8_t)(first_cem_index + (c[i] * 4) + m[i]); assert(log_blk.m_color_endpoint_modes[i] <= 15); } } @@ -3528,7 +3656,7 @@ namespace astc_helpers return false; uint32_t ccs_bit_pos = end_of_weight_bit_ofs - total_extra_bits; - log_blk.m_color_component_selector = bits.get_bits(ccs_bit_pos, 2); + log_blk.m_color_component_selector = (uint8_t)(bits.get_bits(ccs_bit_pos, 2)); } uint32_t config_bit_pos = 11 + 2; // config+num_parts @@ -3569,7 +3697,7 @@ namespace astc_helpers if (endpoint_ise_range < (int)FIRST_VALID_ENDPOINT_ISE_RANGE) return false; - log_blk.m_endpoint_ise_range = endpoint_ise_range; + log_blk.m_endpoint_ise_range = (uint8_t)endpoint_ise_range; // Decode endpoints forwards in block decode_bise(log_blk.m_endpoint_ise_range, log_blk.m_endpoints, total_cem_vals, bits, config_bit_pos); diff --git a/thirdparty/basis_universal/transcoder/basisu_containers.h b/thirdparty/basis_universal/transcoder/basisu_containers.h index 
bfc51bb499cc..7fff4c243ea2 100644 --- a/thirdparty/basis_universal/transcoder/basisu_containers.h +++ b/thirdparty/basis_universal/transcoder/basisu_containers.h @@ -24,163 +24,183 @@ #define BASISU_FORCE_INLINE inline #endif +#define BASISU_HASHMAP_TEST 0 + namespace basisu { - enum { cInvalidIndex = -1 }; - - namespace helpers - { - inline bool is_power_of_2(uint32_t x) { return x && ((x & (x - 1U)) == 0U); } - inline bool is_power_of_2(uint64_t x) { return x && ((x & (x - 1U)) == 0U); } - template const T& minimum(const T& a, const T& b) { return (b < a) ? b : a; } - template const T& maximum(const T& a, const T& b) { return (a < b) ? b : a; } - - inline uint32_t floor_log2i(uint32_t v) - { - uint32_t l = 0; - while (v > 1U) - { - v >>= 1; - l++; - } - return l; - } - - inline uint32_t next_pow2(uint32_t val) - { - val--; - val |= val >> 16; - val |= val >> 8; - val |= val >> 4; - val |= val >> 2; - val |= val >> 1; - return val + 1; - } - - inline uint64_t next_pow2(uint64_t val) - { - val--; - val |= val >> 32; - val |= val >> 16; - val |= val >> 8; - val |= val >> 4; - val |= val >> 2; - val |= val >> 1; - return val + 1; - } - } // namespace helpers - - template - inline T* construct(T* p) - { - return new (static_cast(p)) T; - } - - template - inline T* construct(T* p, const U& init) - { - return new (static_cast(p)) T(init); - } - - template - inline void construct_array(T* p, size_t n) - { - T* q = p + n; - for (; p != q; ++p) - new (static_cast(p)) T; - } - - template - inline void construct_array(T* p, size_t n, const U& init) - { - T* q = p + n; - for (; p != q; ++p) - new (static_cast(p)) T(init); - } - - template - inline void destruct(T* p) - { - (void)p; - p->~T(); - } - - template inline void destruct_array(T* p, size_t n) - { - T* q = p + n; - for (; p != q; ++p) - p->~T(); - } - - template struct int_traits { enum { cMin = INT32_MIN, cMax = INT32_MAX, cSigned = true }; }; - - template<> struct int_traits { enum { cMin = INT8_MIN, cMax = 
INT8_MAX, cSigned = true }; }; - template<> struct int_traits { enum { cMin = INT16_MIN, cMax = INT16_MAX, cSigned = true }; }; - template<> struct int_traits { enum { cMin = INT32_MIN, cMax = INT32_MAX, cSigned = true }; }; - - template<> struct int_traits { enum { cMin = 0, cMax = UINT8_MAX, cSigned = false }; }; - template<> struct int_traits { enum { cMin = 0, cMax = UINT16_MAX, cSigned = false }; }; - template<> struct int_traits { enum { cMin = 0, cMax = UINT32_MAX, cSigned = false }; }; - - template - struct scalar_type - { - enum { cFlag = false }; - static inline void construct(T* p) { basisu::construct(p); } - static inline void construct(T* p, const T& init) { basisu::construct(p, init); } - static inline void construct_array(T* p, size_t n) { basisu::construct_array(p, n); } - static inline void destruct(T* p) { basisu::destruct(p); } - static inline void destruct_array(T* p, size_t n) { basisu::destruct_array(p, n); } - }; - - template struct scalar_type - { - enum { cFlag = true }; - static inline void construct(T** p) { memset(p, 0, sizeof(T*)); } - static inline void construct(T** p, T* init) { *p = init; } - static inline void construct_array(T** p, size_t n) { memset(p, 0, sizeof(T*) * n); } - static inline void destruct(T** p) { p; } - static inline void destruct_array(T** p, size_t n) { p, n; } - }; + enum { cInvalidIndex = -1 }; + + template inline S clamp(S value, S low, S high) { return (value < low) ? low : ((value > high) ? high : value); } + + template inline S maximum(S a, S b) { return (a > b) ? a : b; } + template inline S maximum(S a, S b, S c) { return maximum(maximum(a, b), c); } + template inline S maximum(S a, S b, S c, S d) { return maximum(maximum(maximum(a, b), c), d); } + + template inline S minimum(S a, S b) { return (a < b) ? 
a : b; } + template inline S minimum(S a, S b, S c) { return minimum(minimum(a, b), c); } + template inline S minimum(S a, S b, S c, S d) { return minimum(minimum(minimum(a, b), c), d); } + +#ifdef _MSC_VER + __declspec(noreturn) +#else + [[noreturn]] +#endif + void container_abort(const char* pMsg, ...); + + namespace helpers + { + inline bool is_power_of_2(uint32_t x) { return x && ((x & (x - 1U)) == 0U); } + inline bool is_power_of_2(uint64_t x) { return x && ((x & (x - 1U)) == 0U); } + + template const T& minimum(const T& a, const T& b) { return (b < a) ? b : a; } + template const T& maximum(const T& a, const T& b) { return (a < b) ? b : a; } + + inline uint32_t floor_log2i(uint32_t v) + { + uint32_t l = 0; + while (v > 1U) + { + v >>= 1; + l++; + } + return l; + } + + inline uint32_t floor_log2i(uint64_t v) + { + uint32_t l = 0; + while (v > 1U) + { + v >>= 1; + l++; + } + return l; + } + + inline uint32_t next_pow2(uint32_t val) + { + val--; + val |= val >> 16; + val |= val >> 8; + val |= val >> 4; + val |= val >> 2; + val |= val >> 1; + return val + 1; + } + + inline uint64_t next_pow2(uint64_t val) + { + val--; + val |= val >> 32; + val |= val >> 16; + val |= val >> 8; + val |= val >> 4; + val |= val >> 2; + val |= val >> 1; + return val + 1; + } + } // namespace helpers + + template + inline T* construct(T* p) + { + return new (static_cast(p)) T; + } + + template + inline T* construct(T* p, const U& init) + { + return new (static_cast(p)) T(init); + } + + template + inline void construct_array(T* p, size_t n) + { + T* q = p + n; + for (; p != q; ++p) + new (static_cast(p)) T; + } + + template + inline void construct_array(T* p, size_t n, const U& init) + { + T* q = p + n; + for (; p != q; ++p) + new (static_cast(p)) T(init); + } + + template + inline void destruct(T* p) + { + p->~T(); + } + + template inline void destruct_array(T* p, size_t n) + { + T* q = p + n; + for (; p != q; ++p) + p->~T(); + } + + template + struct scalar_type + { + enum { cFlag = 
false }; + static inline void construct(T* p) { basisu::construct(p); } + static inline void construct(T* p, const T& init) { basisu::construct(p, init); } + static inline void construct_array(T* p, size_t n) { basisu::construct_array(p, n); } + static inline void destruct(T* p) { basisu::destruct(p); } + static inline void destruct_array(T* p, size_t n) { basisu::destruct_array(p, n); } + }; + + template struct scalar_type + { + enum { cFlag = true }; + static inline void construct(T** p) { memset(p, 0, sizeof(T*)); } + static inline void construct(T** p, T* init) { *p = init; } + static inline void construct_array(T** p, size_t n) { memset(p, 0, sizeof(T*) * n); } + static inline void destruct(T** p) { p; } + static inline void destruct_array(T** p, size_t n) { p, n; } + }; #define BASISU_DEFINE_BUILT_IN_TYPE(X) \ - template<> struct scalar_type { \ - enum { cFlag = true }; \ - static inline void construct(X* p) { memset(p, 0, sizeof(X)); } \ - static inline void construct(X* p, const X& init) { memcpy(p, &init, sizeof(X)); } \ - static inline void construct_array(X* p, size_t n) { memset(p, 0, sizeof(X) * n); } \ - static inline void destruct(X* p) { p; } \ - static inline void destruct_array(X* p, size_t n) { p, n; } }; - - BASISU_DEFINE_BUILT_IN_TYPE(bool) - BASISU_DEFINE_BUILT_IN_TYPE(char) - BASISU_DEFINE_BUILT_IN_TYPE(unsigned char) - BASISU_DEFINE_BUILT_IN_TYPE(short) - BASISU_DEFINE_BUILT_IN_TYPE(unsigned short) - BASISU_DEFINE_BUILT_IN_TYPE(int) - BASISU_DEFINE_BUILT_IN_TYPE(unsigned int) - BASISU_DEFINE_BUILT_IN_TYPE(long) - BASISU_DEFINE_BUILT_IN_TYPE(unsigned long) + template<> struct scalar_type { \ + enum { cFlag = true }; \ + static inline void construct(X* p) { memset(p, 0, sizeof(X)); } \ + static inline void construct(X* p, const X& init) { memcpy(p, &init, sizeof(X)); } \ + static inline void construct_array(X* p, size_t n) { memset(p, 0, sizeof(X) * n); } \ + static inline void destruct(X* p) { p; } \ + static inline void destruct_array(X* p, 
size_t n) { p, n; } }; + + BASISU_DEFINE_BUILT_IN_TYPE(bool) + BASISU_DEFINE_BUILT_IN_TYPE(char) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned char) + BASISU_DEFINE_BUILT_IN_TYPE(short) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned short) + BASISU_DEFINE_BUILT_IN_TYPE(int) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned int) + BASISU_DEFINE_BUILT_IN_TYPE(long) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned long) #ifdef __GNUC__ - BASISU_DEFINE_BUILT_IN_TYPE(long long) - BASISU_DEFINE_BUILT_IN_TYPE(unsigned long long) + BASISU_DEFINE_BUILT_IN_TYPE(long long) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned long long) #else - BASISU_DEFINE_BUILT_IN_TYPE(__int64) - BASISU_DEFINE_BUILT_IN_TYPE(unsigned __int64) + BASISU_DEFINE_BUILT_IN_TYPE(__int64) + BASISU_DEFINE_BUILT_IN_TYPE(unsigned __int64) #endif - BASISU_DEFINE_BUILT_IN_TYPE(float) - BASISU_DEFINE_BUILT_IN_TYPE(double) - BASISU_DEFINE_BUILT_IN_TYPE(long double) + BASISU_DEFINE_BUILT_IN_TYPE(float) + BASISU_DEFINE_BUILT_IN_TYPE(double) + BASISU_DEFINE_BUILT_IN_TYPE(long double) #undef BASISU_DEFINE_BUILT_IN_TYPE - template - struct bitwise_movable { enum { cFlag = false }; }; + template + struct bitwise_movable { enum { cFlag = false }; }; #define BASISU_DEFINE_BITWISE_MOVABLE(Q) template<> struct bitwise_movable { enum { cFlag = true }; }; - template - struct bitwise_copyable { enum { cFlag = false }; }; + template + struct bitwise_copyable { enum { cFlag = false }; }; #define BASISU_DEFINE_BITWISE_COPYABLE(Q) template<> struct bitwise_copyable { enum { cFlag = true }; }; @@ -188,592 +208,2031 @@ namespace basisu #define BASISU_IS_SCALAR_TYPE(T) (scalar_type::cFlag) -#if !defined(BASISU_HAVE_STD_TRIVIALLY_COPYABLE) && defined(__GNUC__) && __GNUC__<5 - //#define BASISU_IS_TRIVIALLY_COPYABLE(...) __has_trivial_copy(__VA_ARGS__) - #define BASISU_IS_TRIVIALLY_COPYABLE(...) __is_trivially_copyable(__VA_ARGS__) +#if !defined(BASISU_HAVE_STD_TRIVIALLY_COPYABLE) && defined(__GNUC__) && (__GNUC__ < 5) +#define BASISU_IS_TRIVIALLY_COPYABLE(...) 
__is_trivially_copyable(__VA_ARGS__) #else - #define BASISU_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value +#define BASISU_IS_TRIVIALLY_COPYABLE(...) std::is_trivially_copyable<__VA_ARGS__>::value #endif -// TODO: clean this up -#define BASISU_IS_BITWISE_COPYABLE(T) (BASISU_IS_SCALAR_TYPE(T) || BASISU_IS_POD(T) || BASISU_IS_TRIVIALLY_COPYABLE(T) || (bitwise_copyable::cFlag)) + // TODO: clean this up, it's still confusing (copying vs. movable). +#define BASISU_IS_BITWISE_COPYABLE(T) (BASISU_IS_SCALAR_TYPE(T) || BASISU_IS_POD(T) || BASISU_IS_TRIVIALLY_COPYABLE(T) || std::is_trivial::value || (bitwise_copyable::cFlag)) #define BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(T) (BASISU_IS_BITWISE_COPYABLE(T) || (bitwise_movable::cFlag)) -#define BASISU_HAS_DESTRUCTOR(T) ((!scalar_type::cFlag) && (!__is_pod(T))) - - typedef char(&yes_t)[1]; - typedef char(&no_t)[2]; - - template yes_t class_test(int U::*); - template no_t class_test(...); - - template struct is_class - { - enum { value = (sizeof(class_test(0)) == sizeof(yes_t)) }; - }; - - template struct is_pointer - { - enum { value = false }; - }; - - template struct is_pointer - { - enum { value = true }; - }; - - struct empty_type { }; - - BASISU_DEFINE_BITWISE_COPYABLE(empty_type); - BASISU_DEFINE_BITWISE_MOVABLE(empty_type); - - template struct rel_ops - { - friend bool operator!=(const T& x, const T& y) { return (!(x == y)); } - friend bool operator> (const T& x, const T& y) { return (y < x); } - friend bool operator<=(const T& x, const T& y) { return (!(y < x)); } - friend bool operator>=(const T& x, const T& y) { return (!(x < y)); } - }; - - struct elemental_vector - { - void* m_p; - uint32_t m_size; - uint32_t m_capacity; - - typedef void (*object_mover)(void* pDst, void* pSrc, uint32_t num); - - bool increase_capacity(uint32_t min_new_capacity, bool grow_hint, uint32_t element_size, object_mover pRelocate, bool nofail); - }; - - template - class vector : public rel_ops< vector > - { - 
public: - typedef T* iterator; - typedef const T* const_iterator; - typedef T value_type; - typedef T& reference; - typedef const T& const_reference; - typedef T* pointer; - typedef const T* const_pointer; - - inline vector() : - m_p(NULL), - m_size(0), - m_capacity(0) - { - } - - inline vector(uint32_t n, const T& init) : - m_p(NULL), - m_size(0), - m_capacity(0) - { - increase_capacity(n, false); - construct_array(m_p, n, init); - m_size = n; - } - - inline vector(const vector& other) : - m_p(NULL), - m_size(0), - m_capacity(0) - { - increase_capacity(other.m_size, false); - - m_size = other.m_size; - - if (BASISU_IS_BITWISE_COPYABLE(T)) - { +#define BASISU_HAS_DESTRUCTOR(T) ((!scalar_type::cFlag) && (!__is_pod(T)) && (!std::is_trivially_destructible::value)) + + typedef char(&yes_t)[1]; + typedef char(&no_t)[2]; + + template yes_t class_test(int U::*); + template no_t class_test(...); + + template struct is_class + { + enum { value = (sizeof(class_test(0)) == sizeof(yes_t)) }; + }; + + template struct is_pointer + { + enum { value = false }; + }; + + template struct is_pointer + { + enum { value = true }; + }; + + struct empty_type { }; + + BASISU_DEFINE_BITWISE_COPYABLE(empty_type); + BASISU_DEFINE_BITWISE_MOVABLE(empty_type); + + template struct rel_ops + { + friend bool operator!=(const T& x, const T& y) { return (!(x == y)); } + friend bool operator> (const T& x, const T& y) { return (y < x); } + friend bool operator<=(const T& x, const T& y) { return (!(y < x)); } + friend bool operator>=(const T& x, const T& y) { return (!(x < y)); } + }; + + struct elemental_vector + { + void* m_p; + size_t m_size; + size_t m_capacity; + + typedef void (*object_mover)(void* pDst, void* pSrc, size_t num); + + bool increase_capacity(size_t min_new_capacity, bool grow_hint, size_t element_size, object_mover pRelocate, bool nofail); + }; + + // Returns true if a+b would overflow a size_t. 
+ inline bool add_overflow_check(size_t a, size_t b) + { + size_t c = a + b; + return c < a; + } + + // Returns false on overflow, true if OK. + template + inline bool can_fit_into_size_t(T val) + { + static_assert(std::is_integral::value, "T must be an integral type"); + + return (val >= 0) && (static_cast(val) == val); + } + + // Returns true if a*b would overflow a size_t. + inline bool mul_overflow_check(size_t a, size_t b) + { + // Avoid the division on 32-bit platforms + if (sizeof(size_t) == sizeof(uint32_t)) + return !can_fit_into_size_t(static_cast(a) * b); + else + return b && (a > (SIZE_MAX / b)); + } + + template + class writable_span; + + template + class readable_span + { + public: + using value_type = T; + using size_type = size_t; + using const_pointer = const T*; + using const_reference = const T&; + using const_iterator = const T*; + + inline readable_span() : + m_p(nullptr), + m_size(0) + { + } + + inline readable_span(const writable_span& other); + inline readable_span& operator= (const writable_span& rhs); + + inline readable_span(const_pointer p, size_t n) + { + set(p, n); + } + + inline readable_span(const_pointer s, const_pointer e) + { + set(s, e); + } + + inline readable_span(const readable_span& other) : + m_p(other.m_p), + m_size(other.m_size) + { + assert(!m_size || m_p); + } + + inline readable_span(readable_span&& other) : + m_p(other.m_p), + m_size(other.m_size) + { + assert(!m_size || m_p); + + other.m_p = nullptr; + other.m_size = 0; + } + + template + inline readable_span(const T(&arr)[N]) : + m_p(arr), + m_size(N) + { + } + + template + inline readable_span& set(const T(&arr)[N]) + { + m_p = arr; + m_size = N; + return *this; + } + + inline readable_span& set(const_pointer p, size_t n) + { + if (!p && n) + { + assert(0); + m_p = nullptr; + m_size = 0; + } + else + { + m_p = p; + m_size = n; + } + + return *this; + } + + inline readable_span& set(const_pointer s, const_pointer e) + { + if ((e < s) || (!s && e)) + { + assert(0); + 
m_p = nullptr; + m_size = 0; + } + else + { + m_p = s; + m_size = e - s; + } + + return *this; + } + + inline bool operator== (const readable_span& rhs) const + { + return (m_p == rhs.m_p) && (m_size == rhs.m_size); + } + + inline bool operator!= (const readable_span& rhs) const + { + return (m_p != rhs.m_p) || (m_size != rhs.m_size); + } + + // only true if the region is totally inside the span + inline bool is_inside_ptr(const_pointer p, size_t n) const + { + if (!is_valid()) + { + assert(0); + return false; + } + + if (!p) + { + assert(!n); + return false; + } + + return (p >= m_p) && ((p + n) <= end()); + } + + inline bool is_inside(size_t ofs, size_t size) const + { + if (add_overflow_check(ofs, size)) + { + assert(0); + return false; + } + + if (!is_valid()) + { + assert(0); + return false; + } + + if ((ofs + size) > m_size) + return false; + + return true; + } + + inline readable_span subspan(size_t ofs, size_t n) const + { + if (!is_valid()) + { + assert(0); + return readable_span((const_pointer)nullptr, (size_t)0); + } + + if (add_overflow_check(ofs, n)) + { + assert(0); + return readable_span((const_pointer)nullptr, (size_t)0); + } + + if ((ofs + n) > m_size) + { + assert(0); + return readable_span((const_pointer)nullptr, (size_t)0); + } + + return readable_span(m_p + ofs, n); + } + + void clear() + { + m_p = nullptr; + m_size = 0; + } + + inline bool empty() const { return !m_size; } + + // true if the span is non-nullptr and is not empty + inline bool is_valid() const { return m_p && m_size; } + + inline bool is_nullptr() const { return m_p == nullptr; } + + inline size_t size() const { return m_size; } + inline size_t size_in_bytes() const { assert(can_fit_into_size_t((uint64_t)m_size * sizeof(T))); return m_size * sizeof(T); } + + inline const_pointer get_ptr() const { return m_p; } + + inline const_iterator begin() const { return m_p; } + inline const_iterator end() const { assert(m_p || !m_size); return m_p + m_size; } + + inline const_iterator 
cbegin() const { return m_p; } + inline const_iterator cend() const { assert(m_p || !m_size); return m_p + m_size; } + + inline const_reference front() const + { + if (!(m_p && m_size)) + container_abort("readable_span invalid\n"); + + return m_p[0]; + } + + inline const_reference back() const + { + if (!(m_p && m_size)) + container_abort("readable_span invalid\n"); + + return m_p[m_size - 1]; + } + + inline readable_span& operator= (const readable_span& rhs) + { + m_p = rhs.m_p; + m_size = rhs.m_size; + return *this; + } + + inline readable_span& operator= (readable_span&& rhs) + { + if (this != &rhs) + { + m_p = rhs.m_p; + m_size = rhs.m_size; + rhs.m_p = nullptr; + rhs.m_size = 0; + } + + return *this; + } + + inline const_reference operator* () const + { + if (!(m_p && m_size)) + container_abort("readable_span invalid\n"); + + return *m_p; + } + + inline const_pointer operator-> () const + { + if (!(m_p && m_size)) + container_abort("readable_span invalid\n"); + + return m_p; + } + + inline readable_span& remove_prefix(size_t n) + { + if ((!m_p) || (n > m_size)) + { + assert(0); + return *this; + } + + m_p += n; + m_size -= n; + return *this; + } + + inline readable_span& remove_suffix(size_t n) + { + if ((!m_p) || (n > m_size)) + { + assert(0); + return *this; + } + + m_size -= n; + return *this; + } + + inline readable_span& enlarge(size_t n) + { + if (!m_p) + { + assert(0); + return *this; + } + + if (add_overflow_check(m_size, n)) + { + assert(0); + return *this; + } + + m_size += n; + return *this; + } + + bool copy_from(size_t src_ofs, size_t src_size, T* pDst, size_t dst_ofs) const + { + if (!src_size) + return true; + + if (!pDst) + { + assert(0); + return false; + } + + if (!is_inside(src_ofs, src_size)) + { + assert(0); + return false; + } + + const_pointer pS = m_p + src_ofs; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { + const uint64_t num_bytes = (uint64_t)src_size * sizeof(T); + + if (!can_fit_into_size_t(num_bytes)) + { + assert(0); + return false; 
+ } + + memcpy(pDst, pS, (size_t)num_bytes); + } + else + { + T* pD = pDst + dst_ofs; + T* pDst_end = pD + src_size; + + while (pD != pDst_end) + *pD++ = *pS++; + } + + return true; + } + + inline const_reference operator[] (size_t idx) const + { + if ((!is_valid()) || (idx >= m_size)) + container_abort("readable_span: invalid span or index\n"); + + return m_p[idx]; + } + + inline uint16_t read_le16(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint16_t))) + { + assert(0); + return false; + } + + const uint8_t a = (uint8_t)m_p[ofs]; + const uint8_t b = (uint8_t)m_p[ofs + 1]; + return a | (b << 8u); + } + + template + inline R read_val(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(R))) + { + assert(0); + return (R)0; + } + + return *reinterpret_cast(&m_p[ofs]); + } + + inline uint16_t read_be16(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint16_t))) + { + assert(0); + return 0; + } + + const uint8_t b = (uint8_t)m_p[ofs]; + const uint8_t a = (uint8_t)m_p[ofs + 1]; + return a | (b << 8u); + } + + inline uint32_t read_le32(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint32_t))) + { + assert(0); + return 0; + } + + const uint8_t a = (uint8_t)m_p[ofs]; + const uint8_t b = (uint8_t)m_p[ofs + 1]; + const uint8_t c = (uint8_t)m_p[ofs + 2]; + const uint8_t d = (uint8_t)m_p[ofs + 3]; + return a | (b << 8u) | (c << 16u) | (d << 24u); + } + + inline uint32_t read_be32(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint32_t))) + { + assert(0); + return 0; + } + + const uint8_t d = (uint8_t)m_p[ofs]; + const uint8_t c = (uint8_t)m_p[ofs + 1]; + const uint8_t b = (uint8_t)m_p[ofs + 2]; + const uint8_t a = (uint8_t)m_p[ofs + 3]; + return a | (b << 8u) | (c << 16u) | (d 
<< 24u); + } + + inline uint64_t read_le64(size_t ofs) const + { + if (!add_overflow_check(ofs, sizeof(uint64_t))) + { + assert(0); + return 0; + } + const uint64_t l = read_le32(ofs); + const uint64_t h = read_le32(ofs + sizeof(uint32_t)); + return l | (h << 32u); + } + + inline uint64_t read_be64(size_t ofs) const + { + if (!add_overflow_check(ofs, sizeof(uint64_t))) + { + assert(0); + return 0; + } + const uint64_t h = read_be32(ofs); + const uint64_t l = read_be32(ofs + sizeof(uint32_t)); + return l | (h << 32u); + } + + private: + const_pointer m_p; + size_t m_size; + }; + + template + class writable_span + { + friend readable_span; + + public: + using value_type = T; + using size_type = size_t; + using const_pointer = const T*; + using const_reference = const T&; + using const_iterator = const T*; + using pointer = T*; + using reference = T&; + using iterator = T*; + + inline writable_span() : + m_p(nullptr), + m_size(0) + { + } + + inline writable_span(T* p, size_t n) + { + set(p, n); + } + + inline writable_span(T* s, T* e) + { + set(s, e); + } + + inline writable_span(const writable_span& other) : + m_p(other.m_p), + m_size(other.m_size) + { + assert(!m_size || m_p); + } + + inline writable_span(writable_span&& other) : + m_p(other.m_p), + m_size(other.m_size) + { + assert(!m_size || m_p); + + other.m_p = nullptr; + other.m_size = 0; + } + + template + inline writable_span(T(&arr)[N]) : + m_p(arr), + m_size(N) + { + } + + readable_span get_readable_span() const + { + return readable_span(m_p, m_size); + } + + template + inline writable_span& set(T(&arr)[N]) + { + m_p = arr; + m_size = N; + return *this; + } + + inline writable_span& set(T* p, size_t n) + { + if (!p && n) + { + assert(0); + m_p = nullptr; + m_size = 0; + } + else + { + m_p = p; + m_size = n; + } + + return *this; + } + + inline writable_span& set(T* s, T* e) + { + if ((e < s) || (!s && e)) + { + assert(0); + m_p = nullptr; + m_size = 0; + } + else + { + m_p = s; + m_size = e - s; + } + + 
return *this; + } + + inline bool operator== (const writable_span& rhs) const + { + return (m_p == rhs.m_p) && (m_size == rhs.m_size); + } + + inline bool operator== (const readable_span& rhs) const + { + return (m_p == rhs.m_p) && (m_size == rhs.m_size); + } + + inline bool operator!= (const writable_span& rhs) const + { + return (m_p != rhs.m_p) || (m_size != rhs.m_size); + } + + inline bool operator!= (const readable_span& rhs) const + { + return (m_p != rhs.m_p) || (m_size != rhs.m_size); + } + + // only true if the region is totally inside the span + inline bool is_inside_ptr(const_pointer p, size_t n) const + { + if (!is_valid()) + { + assert(0); + return false; + } + + if (!p) + { + assert(!n); + return false; + } + + return (p >= m_p) && ((p + n) <= end()); + } + + inline bool is_inside(size_t ofs, size_t size) const + { + if (add_overflow_check(ofs, size)) + { + assert(0); + return false; + } + + if (!is_valid()) + { + assert(0); + return false; + } + + if ((ofs + size) > m_size) + return false; + + return true; + } + + inline writable_span subspan(size_t ofs, size_t n) const + { + if (!is_valid()) + { + assert(0); + return writable_span((T*)nullptr, (size_t)0); + } + + if (add_overflow_check(ofs, n)) + { + assert(0); + return writable_span((T*)nullptr, (size_t)0); + } + + if ((ofs + n) > m_size) + { + assert(0); + return writable_span((T*)nullptr, (size_t)0); + } + + return writable_span(m_p + ofs, n); + } + + void clear() + { + m_p = nullptr; + m_size = 0; + } + + inline bool empty() const { return !m_size; } + + // true if the span is non-nullptr and is not empty + inline bool is_valid() const { return m_p && m_size; } + + inline bool is_nullptr() const { return m_p == nullptr; } + + inline size_t size() const { return m_size; } + inline size_t size_in_bytes() const { assert(can_fit_into_size_t((uint64_t)m_size * sizeof(T))); return m_size * sizeof(T); } + + inline T* get_ptr() const { return m_p; } + + inline iterator begin() const { return m_p; } + 
inline iterator end() const { assert(m_p || !m_size); return m_p + m_size; } + + inline const_iterator cbegin() const { return m_p; } + inline const_iterator cend() const { assert(m_p || !m_size); return m_p + m_size; } + + inline T& front() const + { + if (!(m_p && m_size)) + container_abort("writable_span invalid\n"); + + return m_p[0]; + } + + inline T& back() const + { + if (!(m_p && m_size)) + container_abort("writable_span invalid\n"); + + return m_p[m_size - 1]; + } + + inline writable_span& operator= (const writable_span& rhs) + { + m_p = rhs.m_p; + m_size = rhs.m_size; + return *this; + } + + inline writable_span& operator= (writable_span&& rhs) + { + if (this != &rhs) + { + m_p = rhs.m_p; + m_size = rhs.m_size; + rhs.m_p = nullptr; + rhs.m_size = 0; + } + + return *this; + } + + inline T& operator* () const + { + if (!(m_p && m_size)) + container_abort("writable_span invalid\n"); + + return *m_p; + } + + inline T* operator-> () const + { + if (!(m_p && m_size)) + container_abort("writable_span invalid\n"); + + return m_p; + } + + inline bool set_all(size_t ofs, size_t size, const_reference val) + { + if (!size) + return true; + + if (!is_inside(ofs, size)) + { + assert(0); + return false; + } + + T* pDst = m_p + ofs; + + if ((sizeof(T) == sizeof(uint8_t)) && (BASISU_IS_BITWISE_COPYABLE(T))) + { + memset(pDst, (int)((uint8_t)val), size); + } + else + { + + T* pDst_end = pDst + size; + + while (pDst != pDst_end) + *pDst++ = val; + } + + return true; + } + + inline bool set_all(const_reference val) + { + return set_all(0, m_size, val); + } + + inline writable_span& remove_prefix(size_t n) + { + if ((!m_p) || (n > m_size)) + { + assert(0); + return *this; + } + + m_p += n; + m_size -= n; + return *this; + } + + inline writable_span& remove_suffix(size_t n) + { + if ((!m_p) || (n > m_size)) + { + assert(0); + return *this; + } + + m_size -= n; + return *this; + } + + inline writable_span& enlarge(size_t n) + { + if (!m_p) + { + assert(0); + return *this; + } + 
+ if (add_overflow_check(m_size, n)) + { + assert(0); + return *this; + } + + m_size += n; + return *this; + } + + // copy from this span to the destination ptr + bool copy_from(size_t src_ofs, size_t src_size, T* pDst, size_t dst_ofs) const + { + if (!src_size) + return true; + + if (!pDst) + { + assert(0); + return false; + } + + if (!is_inside(src_ofs, src_size)) + { + assert(0); + return false; + } + + const_pointer pS = m_p + src_ofs; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { + const uint64_t num_bytes = (uint64_t)src_size * sizeof(T); + + if (!can_fit_into_size_t(num_bytes)) + { + assert(0); + return false; + } + + memcpy(pDst, pS, (size_t)num_bytes); + } + else + { + T* pD = pDst + dst_ofs; + T* pDst_end = pD + src_size; + + while (pD != pDst_end) + *pD++ = *pS++; + } + + return true; + } + + // copy from the source ptr into this span + bool copy_into(const_pointer pSrc, size_t src_ofs, size_t src_size, size_t dst_ofs) const + { + if (!src_size) + return true; + + if (!pSrc) + { + assert(0); + return false; + } + + if (add_overflow_check(src_ofs, src_size) || add_overflow_check(dst_ofs, src_size)) + { + assert(0); + return false; + } + + if (!is_valid()) + { + assert(0); + return false; + } + + if (!is_inside(dst_ofs, src_size)) + { + assert(0); + return false; + } + + const_pointer pS = pSrc + src_ofs; + T* pD = m_p + dst_ofs; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { + const uint64_t num_bytes = (uint64_t)src_size * sizeof(T); + + if (!can_fit_into_size_t(num_bytes)) + { + assert(0); + return false; + } + + memcpy(pD, pS, (size_t)num_bytes); + } + else + { + T* pDst_end = pD + src_size; + + while (pD != pDst_end) + *pD++ = *pS++; + } + + return true; + } + + // copy from a source span into this span + bool copy_into(const readable_span& src, size_t src_ofs, size_t src_size, size_t dst_ofs) const + { + if (!src.is_inside(src_ofs, src_size)) + { + assert(0); + return false; + } + + return copy_into(src.get_ptr(), src_ofs, src_size, dst_ofs); + } + + // 
copy from a source span into this span + bool copy_into(const writable_span& src, size_t src_ofs, size_t src_size, size_t dst_ofs) const + { + if (!src.is_inside(src_ofs, src_size)) + { + assert(0); + return false; + } + + return copy_into(src.get_ptr(), src_ofs, src_size, dst_ofs); + } + + inline T& operator[] (size_t idx) const + { + if ((!is_valid()) || (idx >= m_size)) + container_abort("writable_span: invalid span or index\n"); + + return m_p[idx]; + } + + template + inline R read_val(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(R))) + { + assert(0); + return (R)0; + } + + return *reinterpret_cast(&m_p[ofs]); + } + + template + inline bool write_val(size_t ofs, R val) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(R))) + { + assert(0); + return false; + } + + *reinterpret_cast(&m_p[ofs]) = val; + return true; + } + + inline bool write_le16(size_t ofs, uint16_t val) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint16_t))) + { + assert(0); + return false; + } + + m_p[ofs] = (uint8_t)val; + m_p[ofs + 1] = (uint8_t)(val >> 8u); + return true; + } + + inline bool write_be16(size_t ofs, uint16_t val) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint16_t))) + { + assert(0); + return false; + } + + m_p[ofs + 1] = (uint8_t)val; + m_p[ofs] = (uint8_t)(val >> 8u); + return true; + } + + inline bool write_le32(size_t ofs, uint32_t val) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint32_t))) + { + assert(0); + return false; + } + + m_p[ofs] = (uint8_t)val; + m_p[ofs + 1] = (uint8_t)(val >> 8u); + m_p[ofs + 2] = (uint8_t)(val >> 16u); + m_p[ofs + 3] = (uint8_t)(val >> 24u); + return true; + } + + inline bool write_be32(size_t ofs, uint32_t val) const + { + static_assert(sizeof(T) == 1, "T must be byte 
size"); + + if (!is_inside(ofs, sizeof(uint32_t))) + { + assert(0); + return false; + } + + m_p[ofs + 3] = (uint8_t)val; + m_p[ofs + 2] = (uint8_t)(val >> 8u); + m_p[ofs + 1] = (uint8_t)(val >> 16u); + m_p[ofs] = (uint8_t)(val >> 24u); + return true; + } + + inline bool write_le64(size_t ofs, uint64_t val) const + { + if (!add_overflow_check(ofs, sizeof(uint64_t))) + { + assert(0); + return false; + } + + return write_le32(ofs, (uint32_t)val) && write_le32(ofs + sizeof(uint32_t), (uint32_t)(val >> 32u)); + } + + inline bool write_be64(size_t ofs, uint64_t val) const + { + if (!add_overflow_check(ofs, sizeof(uint64_t))) + { + assert(0); + return false; + } + + return write_be32(ofs + sizeof(uint32_t), (uint32_t)val) && write_be32(ofs, (uint32_t)(val >> 32u)); + } + + inline uint16_t read_le16(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint16_t))) + { + assert(0); + return 0; + } + + const uint8_t a = (uint8_t)m_p[ofs]; + const uint8_t b = (uint8_t)m_p[ofs + 1]; + return a | (b << 8u); + } + + inline uint16_t read_be16(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint16_t))) + { + assert(0); + return 0; + } + + const uint8_t b = (uint8_t)m_p[ofs]; + const uint8_t a = (uint8_t)m_p[ofs + 1]; + return a | (b << 8u); + } + + inline uint32_t read_le32(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint32_t))) + { + assert(0); + return 0; + } + + const uint8_t a = (uint8_t)m_p[ofs]; + const uint8_t b = (uint8_t)m_p[ofs + 1]; + const uint8_t c = (uint8_t)m_p[ofs + 2]; + const uint8_t d = (uint8_t)m_p[ofs + 3]; + return a | (b << 8u) | (c << 16u) | (d << 24u); + } + + inline uint32_t read_be32(size_t ofs) const + { + static_assert(sizeof(T) == 1, "T must be byte size"); + + if (!is_inside(ofs, sizeof(uint32_t))) + { + assert(0); + return 0; + } + + const uint8_t d = (uint8_t)m_p[ofs]; + 
const uint8_t c = (uint8_t)m_p[ofs + 1]; + const uint8_t b = (uint8_t)m_p[ofs + 2]; + const uint8_t a = (uint8_t)m_p[ofs + 3]; + return a | (b << 8u) | (c << 16u) | (d << 24u); + } + + inline uint64_t read_le64(size_t ofs) const + { + if (!add_overflow_check(ofs, sizeof(uint64_t))) + { + assert(0); + return 0; + } + const uint64_t l = read_le32(ofs); + const uint64_t h = read_le32(ofs + sizeof(uint32_t)); + return l | (h << 32u); + } + + inline uint64_t read_be64(size_t ofs) const + { + if (!add_overflow_check(ofs, sizeof(uint64_t))) + { + assert(0); + return 0; + } + const uint64_t h = read_be32(ofs); + const uint64_t l = read_be32(ofs + sizeof(uint32_t)); + return l | (h << 32u); + } + + private: + T* m_p; + size_t m_size; + }; + + template + inline readable_span::readable_span(const writable_span& other) : + m_p(other.m_p), + m_size(other.m_size) + { + } + + template + inline readable_span& readable_span::operator= (const writable_span& rhs) + { + m_p = rhs.m_p; + m_size = rhs.m_size; + return *this; + } + + template + inline bool span_copy(const writable_span& dst, const readable_span& src) + { + return dst.copy_into(src, 0, src.size(), 0); + } + + template + inline bool span_copy(const writable_span& dst, const writable_span& src) + { + return dst.copy_into(src, 0, src.size(), 0); + } + + template + inline bool span_copy(const writable_span& dst, size_t dst_ofs, const writable_span& src, size_t src_ofs, size_t len) + { + return dst.copy_into(src, src_ofs, len, dst_ofs); + } + + template + inline bool span_copy(const writable_span& dst, size_t dst_ofs, const readable_span& src, size_t src_ofs, size_t len) + { + return dst.copy_into(src, src_ofs, len, dst_ofs); + } + + template + class vector : public rel_ops< vector > + { + public: + typedef T* iterator; + typedef const T* const_iterator; + typedef T value_type; + typedef T& reference; + typedef const T& const_reference; + typedef T* pointer; + typedef const T* const_pointer; + + inline vector() : + 
m_p(nullptr), + m_size(0), + m_capacity(0) + { + } + + inline vector(size_t n, const T& init) : + m_p(nullptr), + m_size(0), + m_capacity(0) + { + increase_capacity(n, false); + construct_array(m_p, n, init); + m_size = n; + } + + inline vector(vector&& other) : + m_p(other.m_p), + m_size(other.m_size), + m_capacity(other.m_capacity) + { + other.m_p = nullptr; + other.m_size = 0; + other.m_capacity = 0; + } + + inline vector(const vector& other) : + m_p(nullptr), + m_size(0), + m_capacity(0) + { + increase_capacity(other.m_size, false); + + m_size = other.m_size; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { + #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wclass-memaccess" #endif #endif - if ((m_p) && (other.m_p)) - memcpy(m_p, other.m_p, m_size * sizeof(T)); + if ((m_p) && (other.m_p)) + { + memcpy(m_p, other.m_p, m_size * sizeof(T)); + } #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic pop #endif #endif - } - else - { - T* pDst = m_p; - const T* pSrc = other.m_p; - for (uint32_t i = m_size; i > 0; i--) - construct(pDst++, *pSrc++); - } - } - - inline explicit vector(size_t size) : - m_p(NULL), - m_size(0), - m_capacity(0) - { - resize(size); - } - - inline ~vector() - { - if (m_p) - { - scalar_type::destruct_array(m_p, m_size); - free(m_p); - } - } - - inline vector& operator= (const vector& other) - { - if (this == &other) - return *this; - - if (m_capacity >= other.m_size) - resize(0); - else - { - clear(); - increase_capacity(other.m_size, false); - } - - if (BASISU_IS_BITWISE_COPYABLE(T)) - { + } + else + { + T* pDst = m_p; + const T* pSrc = other.m_p; + for (size_t i = m_size; i > 0; i--) + construct(pDst++, *pSrc++); + } + } + + inline explicit vector(size_t size) : + m_p(nullptr), + m_size(0), + m_capacity(0) + { + resize(size); + } + + inline explicit vector(std::initializer_list init_list) : + m_p(nullptr), + m_size(0), + m_capacity(0) + { + resize(init_list.size()); + + size_t idx = 0; + 
for (const T& elem : init_list) + m_p[idx++] = elem; + + assert(idx == m_size); + } + + inline vector(const readable_span& rs) : + m_p(nullptr), + m_size(0), + m_capacity(0) + { + set(rs); + } + + inline vector(const writable_span& ws) : + m_p(nullptr), + m_size(0), + m_capacity(0) + { + set(ws); + } + + // Set contents of vector to contents of the readable span + bool set(const readable_span& rs) + { + if (!rs.is_valid()) + { + assert(0); + return false; + } + + const size_t new_size = rs.size(); + + // Could call resize(), but it'll redundantly construct trivial types. + if (m_size != new_size) + { + if (new_size < m_size) + { + if (BASISU_HAS_DESTRUCTOR(T)) + { + scalar_type::destruct_array(m_p + new_size, m_size - new_size); + } + } + else + { + if (new_size > m_capacity) + { + if (!increase_capacity(new_size, false, true)) + return false; + } + } + + // Don't bother constructing trivial types, because we're going to memcpy() over them anyway. + if (!BASISU_IS_BITWISE_COPYABLE(T)) + { + scalar_type::construct_array(m_p + m_size, new_size - m_size); + } + + m_size = new_size; + } + + if (!rs.copy_from(0, rs.size(), m_p, 0)) + { + assert(0); + return false; + } + + return true; + } + + // Set contents of vector to contents of the writable span + inline bool set(const writable_span& ws) + { + return set(ws.get_readable_span()); + } + + inline ~vector() + { + if (m_p) + { + if (BASISU_HAS_DESTRUCTOR(T)) + { + scalar_type::destruct_array(m_p, m_size); + } + + free(m_p); + } + } + + inline vector& operator= (const vector& other) + { + if (this == &other) + return *this; + + if (m_capacity >= other.m_size) + resize(0); + else + { + clear(); + increase_capacity(other.m_size, false); + } + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wclass-memaccess" #endif #endif - if ((m_p) && (other.m_p)) - memcpy(m_p, other.m_p, other.m_size * sizeof(T)); + if ((m_p) && (other.m_p)) + 
memcpy(m_p, other.m_p, other.m_size * sizeof(T)); #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic pop #endif #endif - } - else - { - T* pDst = m_p; - const T* pSrc = other.m_p; - for (uint32_t i = other.m_size; i > 0; i--) - construct(pDst++, *pSrc++); - } - - m_size = other.m_size; - - return *this; - } - - BASISU_FORCE_INLINE const T* begin() const { return m_p; } - BASISU_FORCE_INLINE T* begin() { return m_p; } - - BASISU_FORCE_INLINE const T* end() const { return m_p + m_size; } - BASISU_FORCE_INLINE T* end() { return m_p + m_size; } - - BASISU_FORCE_INLINE bool empty() const { return !m_size; } - BASISU_FORCE_INLINE uint32_t size() const { return m_size; } - BASISU_FORCE_INLINE uint32_t size_in_bytes() const { return m_size * sizeof(T); } - BASISU_FORCE_INLINE uint32_t capacity() const { return m_capacity; } - - // operator[] will assert on out of range indices, but in final builds there is (and will never be) any range checking on this method. - //BASISU_FORCE_INLINE const T& operator[] (uint32_t i) const { assert(i < m_size); return m_p[i]; } - //BASISU_FORCE_INLINE T& operator[] (uint32_t i) { assert(i < m_size); return m_p[i]; } - + } + else + { + T* pDst = m_p; + const T* pSrc = other.m_p; + for (size_t i = other.m_size; i > 0; i--) + construct(pDst++, *pSrc++); + } + + m_size = other.m_size; + + return *this; + } + + inline vector& operator= (vector&& rhs) + { + if (this != &rhs) + { + clear(); + + m_p = rhs.m_p; + m_size = rhs.m_size; + m_capacity = rhs.m_capacity; + + rhs.m_p = nullptr; + rhs.m_size = 0; + rhs.m_capacity = 0; + } + return *this; + } + + BASISU_FORCE_INLINE const T* begin() const { return m_p; } + BASISU_FORCE_INLINE T* begin() { return m_p; } + + BASISU_FORCE_INLINE const T* end() const { return m_p + m_size; } + BASISU_FORCE_INLINE T* end() { return m_p + m_size; } + + BASISU_FORCE_INLINE bool empty() const { return !m_size; } + + BASISU_FORCE_INLINE size_t size() const { return m_size; } + BASISU_FORCE_INLINE uint32_t 
size_u32() const { assert(m_size <= UINT32_MAX); return static_cast(m_size); } + + BASISU_FORCE_INLINE size_t size_in_bytes() const { return m_size * sizeof(T); } + BASISU_FORCE_INLINE uint32_t size_in_bytes_u32() const { assert((m_size * sizeof(T)) <= UINT32_MAX); return static_cast(m_size * sizeof(T)); } + + BASISU_FORCE_INLINE size_t capacity() const { return m_capacity; } + #if !BASISU_VECTOR_FORCE_CHECKING - BASISU_FORCE_INLINE const T& operator[] (size_t i) const { assert(i < m_size); return m_p[i]; } - BASISU_FORCE_INLINE T& operator[] (size_t i) { assert(i < m_size); return m_p[i]; } + BASISU_FORCE_INLINE const T& operator[] (size_t i) const { assert(i < m_size); return m_p[i]; } + BASISU_FORCE_INLINE T& operator[] (size_t i) { assert(i < m_size); return m_p[i]; } #else - BASISU_FORCE_INLINE const T& operator[] (size_t i) const - { - if (i >= m_size) - { - fprintf(stderr, "operator[] invalid index: %u, max entries %u, type size %u\n", (uint32_t)i, m_size, (uint32_t)sizeof(T)); - abort(); - } - return m_p[i]; - } - BASISU_FORCE_INLINE T& operator[] (size_t i) - { - if (i >= m_size) - { - fprintf(stderr, "operator[] invalid index: %u, max entries %u, type size %u\n", (uint32_t)i, m_size, (uint32_t)sizeof(T)); - abort(); - } - return m_p[i]; - } + BASISU_FORCE_INLINE const T& operator[] (size_t i) const + { + if (i >= m_size) + container_abort("vector::operator[] invalid index: %zu, max entries %u, type size %zu\n", i, m_size, sizeof(T)); + + return m_p[i]; + } + BASISU_FORCE_INLINE T& operator[] (size_t i) + { + if (i >= m_size) + container_abort("vector::operator[] invalid index: %zu, max entries %u, type size %zu\n", i, m_size, sizeof(T)); + + return m_p[i]; + } #endif - // at() always includes range checking, even in final builds, unlike operator []. - // The first element is returned if the index is out of range. - BASISU_FORCE_INLINE const T& at(size_t i) const { assert(i < m_size); return (i >= m_size) ? 
m_p[0] : m_p[i]; } - BASISU_FORCE_INLINE T& at(size_t i) { assert(i < m_size); return (i >= m_size) ? m_p[0] : m_p[i]; } - -#if !BASISU_VECTOR_FORCE_CHECKING - BASISU_FORCE_INLINE const T& front() const { assert(m_size); return m_p[0]; } - BASISU_FORCE_INLINE T& front() { assert(m_size); return m_p[0]; } + // at() always includes range checking, even in final builds, unlike operator []. + BASISU_FORCE_INLINE const T& at(size_t i) const + { + if (i >= m_size) + container_abort("vector::at() invalid index: %zu, max entries %u, type size %zu\n", i, m_size, sizeof(T)); - BASISU_FORCE_INLINE const T& back() const { assert(m_size); return m_p[m_size - 1]; } - BASISU_FORCE_INLINE T& back() { assert(m_size); return m_p[m_size - 1]; } -#else - BASISU_FORCE_INLINE const T& front() const - { - if (!m_size) - { - fprintf(stderr, "front: vector is empty, type size %u\n", (uint32_t)sizeof(T)); - abort(); - } - return m_p[0]; - } - BASISU_FORCE_INLINE T& front() - { - if (!m_size) - { - fprintf(stderr, "front: vector is empty, type size %u\n", (uint32_t)sizeof(T)); - abort(); - } - return m_p[0]; - } - - BASISU_FORCE_INLINE const T& back() const - { - if(!m_size) - { - fprintf(stderr, "back: vector is empty, type size %u\n", (uint32_t)sizeof(T)); - abort(); - } - return m_p[m_size - 1]; - } - BASISU_FORCE_INLINE T& back() - { - if (!m_size) - { - fprintf(stderr, "back: vector is empty, type size %u\n", (uint32_t)sizeof(T)); - abort(); - } - return m_p[m_size - 1]; - } -#endif + return m_p[i]; + } + BASISU_FORCE_INLINE T& at(size_t i) + { + if (i >= m_size) + container_abort("vector::at() invalid index: %zu, max entries %u, type size %zu\n", i, m_size, sizeof(T)); - BASISU_FORCE_INLINE const T* get_ptr() const { return m_p; } - BASISU_FORCE_INLINE T* get_ptr() { return m_p; } - - BASISU_FORCE_INLINE const T* data() const { return m_p; } - BASISU_FORCE_INLINE T* data() { return m_p; } - - // clear() sets the container to empty, then frees the allocated block. 
- inline void clear() - { - if (m_p) - { - scalar_type::destruct_array(m_p, m_size); - free(m_p); - m_p = NULL; - m_size = 0; - m_capacity = 0; - } - } - - inline void clear_no_destruction() - { - if (m_p) - { - free(m_p); - m_p = NULL; - m_size = 0; - m_capacity = 0; - } - } - - inline void reserve(size_t new_capacity_size_t) - { - if (new_capacity_size_t > UINT32_MAX) - { - assert(0); - return; - } - - uint32_t new_capacity = (uint32_t)new_capacity_size_t; - - if (new_capacity > m_capacity) - increase_capacity(new_capacity, false); - else if (new_capacity < m_capacity) - { - // Must work around the lack of a "decrease_capacity()" method. - // This case is rare enough in practice that it's probably not worth implementing an optimized in-place resize. - vector tmp; - tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false); - tmp = *this; - swap(tmp); - } - } - - inline bool try_reserve(size_t new_capacity_size_t) - { - if (new_capacity_size_t > UINT32_MAX) - { - assert(0); - return false; - } - - uint32_t new_capacity = (uint32_t)new_capacity_size_t; - - if (new_capacity > m_capacity) - { - if (!increase_capacity(new_capacity, false, true)) - return false; - } - else if (new_capacity < m_capacity) - { - // Must work around the lack of a "decrease_capacity()" method. - // This case is rare enough in practice that it's probably not worth implementing an optimized in-place resize. - vector tmp; - if (!tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false, true)) - return false; - tmp = *this; - swap(tmp); - } - - return true; - } - - // resize(0) sets the container to empty, but does not free the allocated block. 
- inline void resize(size_t new_size_size_t, bool grow_hint = false) - { - if (new_size_size_t > UINT32_MAX) - { - assert(0); - return; - } - - uint32_t new_size = (uint32_t)new_size_size_t; - - if (m_size != new_size) - { - if (new_size < m_size) - scalar_type::destruct_array(m_p + new_size, m_size - new_size); - else - { - if (new_size > m_capacity) - increase_capacity(new_size, (new_size == (m_size + 1)) || grow_hint); - - scalar_type::construct_array(m_p + m_size, new_size - m_size); - } + return m_p[i]; + } - m_size = new_size; - } - } - - inline bool try_resize(size_t new_size_size_t, bool grow_hint = false) - { - if (new_size_size_t > UINT32_MAX) - { - assert(0); - return false; - } - - uint32_t new_size = (uint32_t)new_size_size_t; - - if (m_size != new_size) - { - if (new_size < m_size) - scalar_type::destruct_array(m_p + new_size, m_size - new_size); - else - { - if (new_size > m_capacity) - { - if (!increase_capacity(new_size, (new_size == (m_size + 1)) || grow_hint, true)) - return false; - } - - scalar_type::construct_array(m_p + m_size, new_size - m_size); - } +#if !BASISU_VECTOR_FORCE_CHECKING + BASISU_FORCE_INLINE const T& front() const { assert(m_size); return m_p[0]; } + BASISU_FORCE_INLINE T& front() { assert(m_size); return m_p[0]; } - m_size = new_size; - } - - return true; - } - - // If size >= capacity/2, reset() sets the container's size to 0 but doesn't free the allocated block (because the container may be similarly loaded in the future). - // Otherwise it blows away the allocated block. 
See http://www.codercorner.com/blog/?p=494 - inline void reset() - { - if (m_size >= (m_capacity >> 1)) - resize(0); - else - clear(); - } - - inline T* enlarge(uint32_t i) - { - uint32_t cur_size = m_size; - resize(cur_size + i, true); - return get_ptr() + cur_size; - } - - inline T* try_enlarge(uint32_t i) - { - uint32_t cur_size = m_size; - if (!try_resize(cur_size + i, true)) - return NULL; - return get_ptr() + cur_size; - } - - BASISU_FORCE_INLINE void push_back(const T& obj) - { - assert(!m_p || (&obj < m_p) || (&obj >= (m_p + m_size))); - - if (m_size >= m_capacity) - increase_capacity(m_size + 1, true); - - scalar_type::construct(m_p + m_size, obj); - m_size++; - } - - inline bool try_push_back(const T& obj) - { - assert(!m_p || (&obj < m_p) || (&obj >= (m_p + m_size))); - - if (m_size >= m_capacity) - { - if (!increase_capacity(m_size + 1, true, true)) - return false; - } - - scalar_type::construct(m_p + m_size, obj); - m_size++; - - return true; - } - - inline void push_back_value(T obj) - { - if (m_size >= m_capacity) - increase_capacity(m_size + 1, true); - - scalar_type::construct(m_p + m_size, obj); - m_size++; - } - - inline void pop_back() - { - assert(m_size); - - if (m_size) - { - m_size--; - scalar_type::destruct(&m_p[m_size]); - } - } - - inline void insert(uint32_t index, const T* p, uint32_t n) - { - assert(index <= m_size); - if (!n) - return; - - const uint32_t orig_size = m_size; - resize(m_size + n, true); - - const uint32_t num_to_move = orig_size - index; - - if (BASISU_IS_BITWISE_COPYABLE(T)) - { - // This overwrites the destination object bits, but bitwise copyable means we don't need to worry about destruction. 
- memmove(m_p + index + n, m_p + index, sizeof(T) * num_to_move); - } - else - { - const T* pSrc = m_p + orig_size - 1; - T* pDst = const_cast(pSrc) + n; - - for (uint32_t i = 0; i < num_to_move; i++) - { - assert((pDst - m_p) < (int)m_size); - *pDst-- = *pSrc--; - } - } - - T* pDst = m_p + index; - - if (BASISU_IS_BITWISE_COPYABLE(T)) - { - // This copies in the new bits, overwriting the existing objects, which is OK for copyable types that don't need destruction. - memcpy(pDst, p, sizeof(T) * n); - } - else - { - for (uint32_t i = 0; i < n; i++) - { - assert((pDst - m_p) < (int)m_size); - *pDst++ = *p++; - } - } - } - - inline void insert(T* p, const T& obj) - { - int64_t ofs = p - begin(); - if ((ofs < 0) || (ofs > UINT32_MAX)) - { - assert(0); - return; - } - - insert((uint32_t)ofs, &obj, 1); - } - - // push_front() isn't going to be very fast - it's only here for usability. - inline void push_front(const T& obj) - { - insert(0, &obj, 1); - } - - vector& append(const vector& other) - { - if (other.m_size) - insert(m_size, &other[0], other.m_size); - return *this; - } - - vector& append(const T* p, uint32_t n) - { - if (n) - insert(m_size, p, n); - return *this; - } - - inline void erase(uint32_t start, uint32_t n) - { - assert((start + n) <= m_size); - if ((start + n) > m_size) - return; - - if (!n) - return; - - const uint32_t num_to_move = m_size - (start + n); - - T* pDst = m_p + start; - - const T* pSrc = m_p + start + n; - - if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(T)) - { - // This test is overly cautious. - if ((!BASISU_IS_BITWISE_COPYABLE(T)) || (BASISU_HAS_DESTRUCTOR(T))) - { - // Type has been marked explictly as bitwise movable, which means we can move them around but they may need to be destructed. - // First destroy the erased objects. 
- scalar_type::destruct_array(pDst, n); - } + BASISU_FORCE_INLINE const T& back() const { assert(m_size); return m_p[m_size - 1]; } + BASISU_FORCE_INLINE T& back() { assert(m_size); return m_p[m_size - 1]; } +#else + BASISU_FORCE_INLINE const T& front() const + { + if (!m_size) + container_abort("front: vector is empty, type size %zu\n", sizeof(T)); + + return m_p[0]; + } + BASISU_FORCE_INLINE T& front() + { + if (!m_size) + container_abort("front: vector is empty, type size %zu\n", sizeof(T)); + + return m_p[0]; + } + + BASISU_FORCE_INLINE const T& back() const + { + if (!m_size) + container_abort("back: vector is empty, type size %zu\n", sizeof(T)); + + return m_p[m_size - 1]; + } + BASISU_FORCE_INLINE T& back() + { + if (!m_size) + container_abort("back: vector is empty, type size %zu\n", sizeof(T)); + + return m_p[m_size - 1]; + } +#endif - // Copy "down" the objects to preserve, filling in the empty slots. + BASISU_FORCE_INLINE const T* get_ptr() const { return m_p; } + BASISU_FORCE_INLINE T* get_ptr() { return m_p; } + + BASISU_FORCE_INLINE const T* data() const { return m_p; } + BASISU_FORCE_INLINE T* data() { return m_p; } + + // clear() sets the container to empty, then frees the allocated block. + inline void clear() + { + if (m_p) + { + if (BASISU_HAS_DESTRUCTOR(T)) + { + scalar_type::destruct_array(m_p, m_size); + } + + free(m_p); + + m_p = nullptr; + m_size = 0; + m_capacity = 0; + } + } + + inline void clear_no_destruction() + { + if (m_p) + { + free(m_p); + m_p = nullptr; + m_size = 0; + m_capacity = 0; + } + } + + inline void reserve(size_t new_capacity) + { + if (!try_reserve(new_capacity)) + container_abort("vector:reserve: try_reserve failed!\n"); + } + + inline bool try_reserve(size_t new_capacity) + { + if (new_capacity > m_capacity) + { + if (!increase_capacity(new_capacity, false, true)) + return false; + } + else if (new_capacity < m_capacity) + { + // Must work around the lack of a "decrease_capacity()" method. 
+ // This case is rare enough in practice that it's probably not worth implementing an optimized in-place resize. + vector tmp; + if (!tmp.increase_capacity(helpers::maximum(m_size, new_capacity), false, true)) + return false; + + tmp = *this; + swap(tmp); + } + + return true; + } + + // try_resize(0) sets the container to empty, but does not free the allocated block. + inline bool try_resize(size_t new_size, bool grow_hint = false) + { + if (m_size != new_size) + { + if (new_size < m_size) + { + if (BASISU_HAS_DESTRUCTOR(T)) + { + scalar_type::destruct_array(m_p + new_size, m_size - new_size); + } + } + else + { + if (new_size > m_capacity) + { + if (!increase_capacity(new_size, (new_size == (m_size + 1)) || grow_hint, true)) + return false; + } + + scalar_type::construct_array(m_p + m_size, new_size - m_size); + } + + m_size = new_size; + } + + return true; + } + + // resize(0) sets the container to empty, but does not free the allocated block. + inline void resize(size_t new_size, bool grow_hint = false) + { + if (!try_resize(new_size, grow_hint)) + container_abort("vector::resize failed, new size %zu\n", new_size); + } + + // If size >= capacity/2, reset() sets the container's size to 0 but doesn't free the allocated block (because the container may be similarly loaded in the future). + // Otherwise it blows away the allocated block. 
See http://www.codercorner.com/blog/?p=494 + inline void reset() + { + if (m_size >= (m_capacity >> 1)) + resize(0); + else + clear(); + } + + inline T* try_enlarge(size_t i) + { + size_t cur_size = m_size; + + if (add_overflow_check(cur_size, i)) + return nullptr; + + if (!try_resize(cur_size + i, true)) + return nullptr; + + return get_ptr() + cur_size; + } + + inline T* enlarge(size_t i) + { + T* p = try_enlarge(i); + if (!p) + container_abort("vector::enlarge failed, amount %zu!\n", i); + return p; + } + + BASISU_FORCE_INLINE void push_back(const T& obj) + { + assert(!m_p || (&obj < m_p) || (&obj >= (m_p + m_size))); + + if (m_size >= m_capacity) + { + if (add_overflow_check(m_size, 1)) + container_abort("vector::push_back: vector too large\n"); + + increase_capacity(m_size + 1, true); + } + + scalar_type::construct(m_p + m_size, obj); + m_size++; + } + + BASISU_FORCE_INLINE void push_back_value(T&& obj) + { + assert(!m_p || (&obj < m_p) || (&obj >= (m_p + m_size))); + + if (m_size >= m_capacity) + { + if (add_overflow_check(m_size, 1)) + container_abort("vector::push_back_value: vector too large\n"); + + increase_capacity(m_size + 1, true); + } + + new ((void*)(m_p + m_size)) T(std::move(obj)); + m_size++; + } + + inline bool try_push_back(const T& obj) + { + assert(!m_p || (&obj < m_p) || (&obj >= (m_p + m_size))); + + if (m_size >= m_capacity) + { + if (add_overflow_check(m_size, 1)) + return false; + + if (!increase_capacity(m_size + 1, true, true)) + return false; + } + + scalar_type::construct(m_p + m_size, obj); + m_size++; + + return true; + } + + inline bool try_push_back(T&& obj) + { + assert(!m_p || (&obj < m_p) || (&obj >= (m_p + m_size))); + + if (m_size >= m_capacity) + { + if (add_overflow_check(m_size, 1)) + return false; + + if (!increase_capacity(m_size + 1, true, true)) + return false; + } + + new ((void*)(m_p + m_size)) T(std::move(obj)); + m_size++; + + return true; + } + + // obj is explictly passed in by value, not ref + inline void 
push_back_value(T obj) + { + if (m_size >= m_capacity) + { + if (add_overflow_check(m_size, 1)) + container_abort("vector::push_back_value: vector too large\n"); + + increase_capacity(m_size + 1, true); + } + + scalar_type::construct(m_p + m_size, obj); + m_size++; + } + + // obj is explictly passed in by value, not ref + inline bool try_push_back_value(T obj) + { + if (m_size >= m_capacity) + { + if (add_overflow_check(m_size, 1)) + return false; + + if (!increase_capacity(m_size + 1, true, true)) + return false; + } + + scalar_type::construct(m_p + m_size, obj); + m_size++; + + return true; + } + + template + BASISU_FORCE_INLINE void emplace_back(Args&&... args) + { + if (m_size >= m_capacity) + { + if (add_overflow_check(m_size, 1)) + container_abort("vector::enlarge: vector too large\n"); + + increase_capacity(m_size + 1, true); + } + + new ((void*)(m_p + m_size)) T(std::forward(args)...); // perfect forwarding + m_size++; + } + + template + BASISU_FORCE_INLINE bool try_emplace_back(Args&&... args) + { + if (m_size >= m_capacity) + { + if (add_overflow_check(m_size, 1)) + return false; + + if (!increase_capacity(m_size + 1, true, true)) + return false; + } + + new ((void*)(m_p + m_size)) T(std::forward(args)...); // perfect forwarding + m_size++; + + return true; + } + + inline void pop_back() + { + assert(m_size); + + if (m_size) + { + m_size--; + scalar_type::destruct(&m_p[m_size]); + } + } + + inline bool try_insert(size_t index, const T* p, size_t n) + { + assert(index <= m_size); + + if (index > m_size) + return false; + + if (!n) + return true; + + const size_t orig_size = m_size; + + if (add_overflow_check(m_size, n)) + return false; + + if (!try_resize(m_size + n, true)) + return false; + + const size_t num_to_move = orig_size - index; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { + // This overwrites the destination object bits, but bitwise copyable means we don't need to worry about destruction. 
+ memmove(m_p + index + n, m_p + index, sizeof(T) * num_to_move); + } + else + { + const T* pSrc = m_p + orig_size - 1; + T* pDst = const_cast(pSrc) + n; + + for (size_t i = 0; i < num_to_move; i++) + { + assert((uint64_t)(pDst - m_p) < (uint64_t)m_size); + + *pDst = std::move(*pSrc); + pDst--; + pSrc--; + } + } + + T* pDst = m_p + index; + + if (BASISU_IS_BITWISE_COPYABLE(T)) + { + // This copies in the new bits, overwriting the existing objects, which is OK for copyable types that don't need destruction. + memcpy(pDst, p, sizeof(T) * n); + } + else + { + for (size_t i = 0; i < n; i++) + { + assert((uint64_t)(pDst - m_p) < (uint64_t)m_size); + *pDst++ = *p++; + } + } + + return true; + } + + inline void insert(size_t index, const T* p, size_t n) + { + if (!try_insert(index, p, n)) + container_abort("vector::insert() failed!\n"); + } + + inline bool try_insert(T* p, const T& obj) + { + if (p < begin()) + { + assert(0); + return false; + } + + uint64_t ofs = p - begin(); + + if (ofs > m_size) + { + assert(0); + return false; + } + + if ((size_t)ofs != ofs) + { + assert(0); + return false; + } + + return try_insert((size_t)ofs, &obj, 1); + } + + inline void insert(T* p, const T& obj) + { + if (!try_insert(p, obj)) + container_abort("vector::insert() failed!\n"); + } + + // push_front() isn't going to be very fast - it's only here for usability. 
+ inline void push_front(const T& obj) + { + insert(0, &obj, 1); + } + + inline bool try_push_front(const T& obj) + { + return try_insert(0, &obj, 1); + } + + vector& append(const vector& other) + { + if (other.m_size) + insert(m_size, &other[0], other.m_size); + return *this; + } + + bool try_append(const vector& other) + { + if (other.m_size) + return try_insert(m_size, &other[0], other.m_size); + + return true; + } + + vector& append(const T* p, size_t n) + { + if (n) + insert(m_size, p, n); + return *this; + } + + bool try_append(const T* p, size_t n) + { + if (n) + return try_insert(m_size, p, n); + + return true; + } + + inline bool erase(size_t start, size_t n) + { + if (add_overflow_check(start, n)) + { + assert(0); + return false; + } + + assert((start + n) <= m_size); + + if ((start + n) > m_size) + { + assert(0); + return false; + } + + if (!n) + return true; + + const size_t num_to_move = m_size - (start + n); + + T* pDst = m_p + start; + + const T* pSrc = m_p + start + n; + + if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(T)) + { + // This test is overly cautious. + if ((!BASISU_IS_BITWISE_COPYABLE(T)) || (BASISU_HAS_DESTRUCTOR(T))) + { + // Type has been marked explictly as bitwise movable, which means we can move them around but they may need to be destructed. + // First destroy the erased objects. + scalar_type::destruct_array(pDst, n); + } + + // Copy "down" the objects to preserve, filling in the empty slots. #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ @@ -782,1254 +2241,1962 @@ namespace basisu #endif #endif - memmove(pDst, pSrc, num_to_move * sizeof(T)); + memmove(pDst, pSrc, num_to_move * sizeof(T)); #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic pop #endif #endif - } - else - { - // Type is not bitwise copyable or movable. - // Move them down one at a time by using the equals operator, and destroying anything that's left over at the end. 
- T* pDst_end = pDst + num_to_move; - while (pDst != pDst_end) - *pDst++ = *pSrc++; - - scalar_type::destruct_array(pDst_end, n); - } - - m_size -= n; - } - - inline void erase(uint32_t index) - { - erase(index, 1); - } - - inline void erase(T* p) - { - assert((p >= m_p) && (p < (m_p + m_size))); - erase(static_cast(p - m_p)); - } - - inline void erase(T *pFirst, T *pEnd) - { - assert(pFirst <= pEnd); - assert(pFirst >= begin() && pFirst <= end()); - assert(pEnd >= begin() && pEnd <= end()); - - int64_t ofs = pFirst - begin(); - if ((ofs < 0) || (ofs > UINT32_MAX)) - { - assert(0); - return; - } - - int64_t n = pEnd - pFirst; - if ((n < 0) || (n > UINT32_MAX)) - { - assert(0); - return; - } - - erase((uint32_t)ofs, (uint32_t)n); - } - - void erase_unordered(uint32_t index) - { - assert(index < m_size); - - if ((index + 1) < m_size) - (*this)[index] = back(); - - pop_back(); - } - - inline bool operator== (const vector& rhs) const - { - if (m_size != rhs.m_size) - return false; - else if (m_size) - { - if (scalar_type::cFlag) - return memcmp(m_p, rhs.m_p, sizeof(T) * m_size) == 0; - else - { - const T* pSrc = m_p; - const T* pDst = rhs.m_p; - for (uint32_t i = m_size; i; i--) - if (!(*pSrc++ == *pDst++)) - return false; - } - } - - return true; - } - - inline bool operator< (const vector& rhs) const - { - const uint32_t min_size = helpers::minimum(m_size, rhs.m_size); - - const T* pSrc = m_p; - const T* pSrc_end = m_p + min_size; - const T* pDst = rhs.m_p; - - while ((pSrc < pSrc_end) && (*pSrc == *pDst)) - { - pSrc++; - pDst++; - } - - if (pSrc < pSrc_end) - return *pSrc < *pDst; - - return m_size < rhs.m_size; - } - - inline void swap(vector& other) - { - std::swap(m_p, other.m_p); - std::swap(m_size, other.m_size); - std::swap(m_capacity, other.m_capacity); - } - - inline void sort() - { - std::sort(begin(), end()); - } - - inline void unique() - { - if (!empty()) - { - sort(); - - resize(std::unique(begin(), end()) - begin()); - } - } - - inline void reverse() - 
{ - uint32_t j = m_size >> 1; - for (uint32_t i = 0; i < j; i++) - std::swap(m_p[i], m_p[m_size - 1 - i]); - } - - inline int find(const T& key) const - { - const T* p = m_p; - const T* p_end = m_p + m_size; - - uint32_t index = 0; - - while (p != p_end) - { - if (key == *p) - return index; - - p++; - index++; - } - - return cInvalidIndex; - } - - inline int find_sorted(const T& key) const - { - if (m_size) - { - // Uniform binary search - Knuth Algorithm 6.2.1 U, unrolled twice. - int i = ((m_size + 1) >> 1) - 1; - int m = m_size; - - for (; ; ) - { - assert(i >= 0 && i < (int)m_size); - const T* pKey_i = m_p + i; - int cmp = key < *pKey_i; -#if defined(_DEBUG) || defined(DEBUG) - int cmp2 = *pKey_i < key; - assert((cmp != cmp2) || (key == *pKey_i)); -#endif - if ((!cmp) && (key == *pKey_i)) return i; - m >>= 1; - if (!m) break; - cmp = -cmp; - i += (((m + 1) >> 1) ^ cmp) - cmp; - if (i < 0) - break; - - assert(i >= 0 && i < (int)m_size); - pKey_i = m_p + i; - cmp = key < *pKey_i; -#if defined(_DEBUG) || defined(DEBUG) - cmp2 = *pKey_i < key; - assert((cmp != cmp2) || (key == *pKey_i)); -#endif - if ((!cmp) && (key == *pKey_i)) return i; - m >>= 1; - if (!m) break; - cmp = -cmp; - i += (((m + 1) >> 1) ^ cmp) - cmp; - if (i < 0) - break; - } - } - - return cInvalidIndex; - } - - template - inline int find_sorted(const T& key, Q less_than) const - { - if (m_size) - { - // Uniform binary search - Knuth Algorithm 6.2.1 U, unrolled twice. 
- int i = ((m_size + 1) >> 1) - 1; - int m = m_size; - - for (; ; ) - { - assert(i >= 0 && i < (int)m_size); - const T* pKey_i = m_p + i; - int cmp = less_than(key, *pKey_i); - if ((!cmp) && (!less_than(*pKey_i, key))) return i; - m >>= 1; - if (!m) break; - cmp = -cmp; - i += (((m + 1) >> 1) ^ cmp) - cmp; - if (i < 0) - break; - - assert(i >= 0 && i < (int)m_size); - pKey_i = m_p + i; - cmp = less_than(key, *pKey_i); - if ((!cmp) && (!less_than(*pKey_i, key))) return i; - m >>= 1; - if (!m) break; - cmp = -cmp; - i += (((m + 1) >> 1) ^ cmp) - cmp; - if (i < 0) - break; - } - } - - return cInvalidIndex; - } - - inline uint32_t count_occurences(const T& key) const - { - uint32_t c = 0; - - const T* p = m_p; - const T* p_end = m_p + m_size; - - while (p != p_end) - { - if (key == *p) - c++; - - p++; - } - - return c; - } - - inline void set_all(const T& o) - { - if ((sizeof(T) == 1) && (scalar_type::cFlag)) - { + } + else + { + // Type is not bitwise copyable or movable. + // Move them down one at a time by using the equals operator, and destroying anything that's left over at the end. 
+ T* pDst_end = pDst + num_to_move; + + while (pDst != pDst_end) + { + *pDst = std::move(*pSrc); + + ++pDst; + ++pSrc; + } + + scalar_type::destruct_array(pDst_end, n); + } + + m_size -= n; + + return true; + } + + inline bool erase_index(size_t index) + { + return erase(index, 1); + } + + inline bool erase(T* p) + { + assert((p >= m_p) && (p < (m_p + m_size))); + + if (p < m_p) + return false; + + return erase_index(static_cast(p - m_p)); + } + + inline bool erase(T* pFirst, T* pEnd) + { + assert(pFirst <= pEnd); + assert(pFirst >= begin() && pFirst <= end()); + assert(pEnd >= begin() && pEnd <= end()); + + if ((pFirst < begin()) || (pEnd < pFirst)) + { + assert(0); + return false; + } + + uint64_t ofs = pFirst - begin(); + if ((size_t)ofs != ofs) + { + assert(0); + return false; + } + + uint64_t n = pEnd - pFirst; + if ((size_t)n != n) + { + assert(0); + return false; + } + + return erase((size_t)ofs, (size_t)n); + } + + bool erase_unordered(size_t index) + { + if (index >= m_size) + { + assert(0); + return false; + } + + if ((index + 1) < m_size) + { + (*this)[index] = std::move(back()); + } + + pop_back(); + return true; + } + + inline bool operator== (const vector& rhs) const + { + if (m_size != rhs.m_size) + return false; + else if (m_size) + { + if (scalar_type::cFlag) + return memcmp(m_p, rhs.m_p, sizeof(T) * m_size) == 0; + else + { + const T* pSrc = m_p; + const T* pDst = rhs.m_p; + for (size_t i = m_size; i; i--) + if (!(*pSrc++ == *pDst++)) + return false; + } + } + + return true; + } + + inline bool operator< (const vector& rhs) const + { + const size_t min_size = helpers::minimum(m_size, rhs.m_size); + + const T* pSrc = m_p; + const T* pSrc_end = m_p + min_size; + const T* pDst = rhs.m_p; + + while ((pSrc < pSrc_end) && (*pSrc == *pDst)) + { + pSrc++; + pDst++; + } + + if (pSrc < pSrc_end) + return *pSrc < *pDst; + + return m_size < rhs.m_size; + } + + inline void swap(vector& other) + { + std::swap(m_p, other.m_p); + std::swap(m_size, other.m_size); 
+ std::swap(m_capacity, other.m_capacity); + } + + inline void sort() + { + std::sort(begin(), end()); + } + + inline void unique() + { + if (!empty()) + { + sort(); + + resize(std::unique(begin(), end()) - begin()); + } + } + + inline void reverse() + { + const size_t j = m_size >> 1; + + for (size_t i = 0; i < j; i++) + std::swap(m_p[i], m_p[m_size - 1 - i]); + } + + inline bool find(const T& key, size_t &idx) const + { + idx = 0; + + const T* p = m_p; + const T* p_end = m_p + m_size; + + size_t index = 0; + + while (p != p_end) + { + if (key == *p) + { + idx = index; + return true; + } + + p++; + index++; + } + + return false; + } + + inline bool find_sorted(const T& key, size_t& idx) const + { + idx = 0; + + if (!m_size) + return false; + + // Inclusive range + size_t low = 0, high = m_size - 1; + + while (low <= high) + { + size_t mid = (size_t)(((uint64_t)low + (uint64_t)high) >> 1); + + const T* pTrial_key = m_p + mid; + + // Sanity check comparison operator + assert(!((*pTrial_key < key) && (key < *pTrial_key))); + + if (*pTrial_key < key) + { + if (add_overflow_check(mid, 1)) + break; + + low = mid + 1; + } + else if (key < *pTrial_key) + { + if (!mid) + break; + + high = mid - 1; + } + else + { + idx = mid; + return true; + } + } + + return false; + } + + inline size_t count_occurences(const T& key) const + { + size_t c = 0; + + const T* p = m_p; + const T* p_end = m_p + m_size; + + while (p != p_end) + { + if (key == *p) + c++; + + p++; + } + + return c; + } + + inline void set_all(const T& o) + { + if ((sizeof(T) == 1) && (scalar_type::cFlag)) + { #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wclass-memaccess" #endif #endif - memset(m_p, *reinterpret_cast(&o), m_size); + memset(m_p, *reinterpret_cast(&o), m_size); #ifndef __EMSCRIPTEN__ #ifdef __GNUC__ #pragma GCC diagnostic pop #endif #endif - } - else - { - T* pDst = m_p; - T* pDst_end = pDst + m_size; - while (pDst != pDst_end) - *pDst++ = o; - } 
- } - - // Caller assumes ownership of the heap block associated with the container. Container is cleared. - inline void* assume_ownership() - { - T* p = m_p; - m_p = NULL; - m_size = 0; - m_capacity = 0; - return p; - } - - // Caller is granting ownership of the indicated heap block. - // Block must have size constructed elements, and have enough room for capacity elements. - // The block must have been allocated using malloc(). - // Important: This method is used in Basis Universal. If you change how this container allocates memory, you'll need to change any users of this method. - inline bool grant_ownership(T* p, uint32_t size, uint32_t capacity) - { - // To prevent the caller from obviously shooting themselves in the foot. - if (((p + capacity) > m_p) && (p < (m_p + m_capacity))) - { - // Can grant ownership of a block inside the container itself! - assert(0); - return false; - } - - if (size > capacity) - { - assert(0); - return false; - } - - if (!p) - { - if (capacity) - { - assert(0); - return false; - } - } - else if (!capacity) - { - assert(0); - return false; - } - - clear(); - m_p = p; - m_size = size; - m_capacity = capacity; - return true; - } - - private: - T* m_p; - uint32_t m_size; - uint32_t m_capacity; - - template struct is_vector { enum { cFlag = false }; }; - template struct is_vector< vector > { enum { cFlag = true }; }; - - static void object_mover(void* pDst_void, void* pSrc_void, uint32_t num) - { - T* pSrc = static_cast(pSrc_void); - T* const pSrc_end = pSrc + num; - T* pDst = static_cast(pDst_void); - - while (pSrc != pSrc_end) - { - // placement new - new (static_cast(pDst)) T(*pSrc); - pSrc->~T(); - ++pSrc; - ++pDst; - } - } - - inline bool increase_capacity(uint32_t min_new_capacity, bool grow_hint, bool nofail = false) - { - return reinterpret_cast(this)->increase_capacity( - min_new_capacity, grow_hint, sizeof(T), - (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(T) || (is_vector::cFlag)) ? 
NULL : object_mover, nofail); - } - }; - - template struct bitwise_movable< vector > { enum { cFlag = true }; }; - - // Hash map - - template - struct hasher - { - inline size_t operator() (const T& key) const { return static_cast(key); } - }; - - template - struct equal_to - { - inline bool operator()(const T& a, const T& b) const { return a == b; } - }; - - // Important: The Hasher and Equals objects must be bitwise movable! - template, typename Equals = equal_to > - class hash_map - { - public: - class iterator; - class const_iterator; - - private: - friend class iterator; - friend class const_iterator; - - enum state - { - cStateInvalid = 0, - cStateValid = 1 - }; - - enum - { - cMinHashSize = 4U - }; - - public: - typedef hash_map hash_map_type; - typedef std::pair value_type; - typedef Key key_type; - typedef Value referent_type; - typedef Hasher hasher_type; - typedef Equals equals_type; - - hash_map() : - m_hash_shift(32), m_num_valid(0), m_grow_threshold(0) - { - } - - hash_map(const hash_map& other) : - m_values(other.m_values), - m_hash_shift(other.m_hash_shift), - m_hasher(other.m_hasher), - m_equals(other.m_equals), - m_num_valid(other.m_num_valid), - m_grow_threshold(other.m_grow_threshold) - { - } - - hash_map& operator= (const hash_map& other) - { - if (this == &other) - return *this; - - clear(); - - m_values = other.m_values; - m_hash_shift = other.m_hash_shift; - m_num_valid = other.m_num_valid; - m_grow_threshold = other.m_grow_threshold; - m_hasher = other.m_hasher; - m_equals = other.m_equals; - - return *this; - } - - inline ~hash_map() - { - clear(); - } - - const Equals& get_equals() const { return m_equals; } - Equals& get_equals() { return m_equals; } - - void set_equals(const Equals& equals) { m_equals = equals; } - - const Hasher& get_hasher() const { return m_hasher; } - Hasher& get_hasher() { return m_hasher; } - - void set_hasher(const Hasher& hasher) { m_hasher = hasher; } - - inline void clear() - { - if (!m_values.empty()) - { - 
if (BASISU_HAS_DESTRUCTOR(Key) || BASISU_HAS_DESTRUCTOR(Value)) - { - node* p = &get_node(0); - node* p_end = p + m_values.size(); - - uint32_t num_remaining = m_num_valid; - while (p != p_end) - { - if (p->state) - { - destruct_value_type(p); - num_remaining--; - if (!num_remaining) - break; - } - - p++; - } - } - - m_values.clear_no_destruction(); - - m_hash_shift = 32; - m_num_valid = 0; - m_grow_threshold = 0; - } - } - - inline void reset() - { - if (!m_num_valid) - return; - - if (BASISU_HAS_DESTRUCTOR(Key) || BASISU_HAS_DESTRUCTOR(Value)) - { - node* p = &get_node(0); - node* p_end = p + m_values.size(); - - uint32_t num_remaining = m_num_valid; - while (p != p_end) - { - if (p->state) - { - destruct_value_type(p); - p->state = cStateInvalid; - - num_remaining--; - if (!num_remaining) - break; - } - - p++; - } - } - else if (sizeof(node) <= 32) - { - memset(&m_values[0], 0, m_values.size_in_bytes()); - } - else - { - node* p = &get_node(0); - node* p_end = p + m_values.size(); - - uint32_t num_remaining = m_num_valid; - while (p != p_end) - { - if (p->state) - { - p->state = cStateInvalid; - - num_remaining--; - if (!num_remaining) - break; - } - - p++; - } - } - - m_num_valid = 0; - } - - inline uint32_t size() - { - return m_num_valid; - } - - inline uint32_t get_table_size() - { - return m_values.size(); - } - - inline bool empty() - { - return !m_num_valid; - } - - inline void reserve(uint32_t new_capacity) - { - uint64_t new_hash_size = helpers::maximum(1U, new_capacity); - - new_hash_size = new_hash_size * 2ULL; - - if (!helpers::is_power_of_2(new_hash_size)) - new_hash_size = helpers::next_pow2(new_hash_size); - - new_hash_size = helpers::maximum(cMinHashSize, new_hash_size); - - new_hash_size = helpers::minimum(0x80000000UL, new_hash_size); - - if (new_hash_size > m_values.size()) - rehash((uint32_t)new_hash_size); - } - - class iterator - { - friend class hash_map; - friend class hash_map::const_iterator; - - public: - inline iterator() : 
m_pTable(NULL), m_index(0) { } - inline iterator(hash_map_type& table, uint32_t index) : m_pTable(&table), m_index(index) { } - inline iterator(const iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } - - inline iterator& operator= (const iterator& other) - { - m_pTable = other.m_pTable; - m_index = other.m_index; - return *this; - } - - // post-increment - inline iterator operator++(int) - { - iterator result(*this); - ++*this; - return result; - } - - // pre-increment - inline iterator& operator++() - { - probe(); - return *this; - } - - inline value_type& operator*() const { return *get_cur(); } - inline value_type* operator->() const { return get_cur(); } - - inline bool operator == (const iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } - inline bool operator != (const iterator& b) const { return !(*this == b); } - inline bool operator == (const const_iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } - inline bool operator != (const const_iterator& b) const { return !(*this == b); } - - private: - hash_map_type* m_pTable; - uint32_t m_index; - - inline value_type* get_cur() const - { - assert(m_pTable && (m_index < m_pTable->m_values.size())); - assert(m_pTable->get_node_state(m_index) == cStateValid); - - return &m_pTable->get_node(m_index); - } - - inline void probe() - { - assert(m_pTable); - m_index = m_pTable->find_next(m_index); - } - }; - - class const_iterator - { - friend class hash_map; - friend class hash_map::iterator; - - public: - inline const_iterator() : m_pTable(NULL), m_index(0) { } - inline const_iterator(const hash_map_type& table, uint32_t index) : m_pTable(&table), m_index(index) { } - inline const_iterator(const iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } - inline const_iterator(const const_iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } - - inline const_iterator& operator= (const const_iterator& 
other) - { - m_pTable = other.m_pTable; - m_index = other.m_index; - return *this; - } - - inline const_iterator& operator= (const iterator& other) - { - m_pTable = other.m_pTable; - m_index = other.m_index; - return *this; - } - - // post-increment - inline const_iterator operator++(int) - { - const_iterator result(*this); - ++*this; - return result; - } - - // pre-increment - inline const_iterator& operator++() - { - probe(); - return *this; - } - - inline const value_type& operator*() const { return *get_cur(); } - inline const value_type* operator->() const { return get_cur(); } - - inline bool operator == (const const_iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } - inline bool operator != (const const_iterator& b) const { return !(*this == b); } - inline bool operator == (const iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } - inline bool operator != (const iterator& b) const { return !(*this == b); } - - private: - const hash_map_type* m_pTable; - uint32_t m_index; - - inline const value_type* get_cur() const - { - assert(m_pTable && (m_index < m_pTable->m_values.size())); - assert(m_pTable->get_node_state(m_index) == cStateValid); - - return &m_pTable->get_node(m_index); - } - - inline void probe() - { - assert(m_pTable); - m_index = m_pTable->find_next(m_index); - } - }; - - inline const_iterator begin() const - { - if (!m_num_valid) - return end(); - - return const_iterator(*this, find_next(UINT32_MAX)); - } - - inline const_iterator end() const - { - return const_iterator(*this, m_values.size()); - } - - inline iterator begin() - { - if (!m_num_valid) - return end(); - - return iterator(*this, find_next(UINT32_MAX)); - } - - inline iterator end() - { - return iterator(*this, m_values.size()); - } - - // insert_result.first will always point to inserted key/value (or the already existing key/value). 
- // insert_resutt.second will be true if a new key/value was inserted, or false if the key already existed (in which case first will point to the already existing value). - typedef std::pair insert_result; - - inline insert_result insert(const Key& k, const Value& v = Value()) - { - insert_result result; - if (!insert_no_grow(result, k, v)) - { - grow(); - - // This must succeed. - if (!insert_no_grow(result, k, v)) - { - fprintf(stderr, "insert() failed"); - abort(); - } - } - - return result; - } - - inline insert_result insert(const value_type& v) - { - return insert(v.first, v.second); - } - - inline const_iterator find(const Key& k) const - { - return const_iterator(*this, find_index(k)); - } - - inline iterator find(const Key& k) - { - return iterator(*this, find_index(k)); - } - - inline bool erase(const Key& k) - { - uint32_t i = find_index(k); - - if (i >= m_values.size()) - return false; - - node* pDst = &get_node(i); - destruct_value_type(pDst); - pDst->state = cStateInvalid; - - m_num_valid--; - - for (; ; ) - { - uint32_t r, j = i; - - node* pSrc = pDst; - - do - { - if (!i) - { - i = m_values.size() - 1; - pSrc = &get_node(i); - } - else - { - i--; - pSrc--; - } - - if (!pSrc->state) - return true; - - r = hash_key(pSrc->first); - - } while ((i <= r && r < j) || (r < j && j < i) || (j < i && i <= r)); - - move_node(pDst, pSrc); - - pDst = pSrc; - } - } - - inline void swap(hash_map_type& other) - { - m_values.swap(other.m_values); - std::swap(m_hash_shift, other.m_hash_shift); - std::swap(m_num_valid, other.m_num_valid); - std::swap(m_grow_threshold, other.m_grow_threshold); - std::swap(m_hasher, other.m_hasher); - std::swap(m_equals, other.m_equals); - } - - private: - struct node : public value_type - { - uint8_t state; - }; - - static inline void construct_value_type(value_type* pDst, const Key& k, const Value& v) - { - if (BASISU_IS_BITWISE_COPYABLE(Key)) - memcpy(&pDst->first, &k, sizeof(Key)); - else - scalar_type::construct(&pDst->first, k); - 
- if (BASISU_IS_BITWISE_COPYABLE(Value)) - memcpy(&pDst->second, &v, sizeof(Value)); - else - scalar_type::construct(&pDst->second, v); - } - - static inline void construct_value_type(value_type* pDst, const value_type* pSrc) - { - if ((BASISU_IS_BITWISE_COPYABLE(Key)) && (BASISU_IS_BITWISE_COPYABLE(Value))) - { - memcpy(pDst, pSrc, sizeof(value_type)); - } - else - { - if (BASISU_IS_BITWISE_COPYABLE(Key)) - memcpy(&pDst->first, &pSrc->first, sizeof(Key)); - else - scalar_type::construct(&pDst->first, pSrc->first); - - if (BASISU_IS_BITWISE_COPYABLE(Value)) - memcpy(&pDst->second, &pSrc->second, sizeof(Value)); - else - scalar_type::construct(&pDst->second, pSrc->second); - } - } - - static inline void destruct_value_type(value_type* p) - { - scalar_type::destruct(&p->first); - scalar_type::destruct(&p->second); - } - - // Moves *pSrc to *pDst efficiently. - // pDst should NOT be constructed on entry. - static inline void move_node(node* pDst, node* pSrc, bool update_src_state = true) - { - assert(!pDst->state); - - if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Key) && BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Value)) - { - memcpy(pDst, pSrc, sizeof(node)); - } - else - { - if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Key)) - memcpy(&pDst->first, &pSrc->first, sizeof(Key)); - else - { - scalar_type::construct(&pDst->first, pSrc->first); - scalar_type::destruct(&pSrc->first); - } - - if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Value)) - memcpy(&pDst->second, &pSrc->second, sizeof(Value)); - else - { - scalar_type::construct(&pDst->second, pSrc->second); - scalar_type::destruct(&pSrc->second); - } - - pDst->state = cStateValid; - } - - if (update_src_state) - pSrc->state = cStateInvalid; - } - - struct raw_node - { - inline raw_node() - { - node* p = reinterpret_cast(this); - p->state = cStateInvalid; - } - - inline ~raw_node() - { - node* p = reinterpret_cast(this); - if (p->state) - hash_map_type::destruct_value_type(p); - } - - inline raw_node(const raw_node& other) - { - 
node* pDst = reinterpret_cast(this); - const node* pSrc = reinterpret_cast(&other); - - if (pSrc->state) - { - hash_map_type::construct_value_type(pDst, pSrc); - pDst->state = cStateValid; - } - else - pDst->state = cStateInvalid; - } - - inline raw_node& operator= (const raw_node& rhs) - { - if (this == &rhs) - return *this; - - node* pDst = reinterpret_cast(this); - const node* pSrc = reinterpret_cast(&rhs); - - if (pSrc->state) - { - if (pDst->state) - { - pDst->first = pSrc->first; - pDst->second = pSrc->second; - } - else - { - hash_map_type::construct_value_type(pDst, pSrc); - pDst->state = cStateValid; - } - } - else if (pDst->state) - { - hash_map_type::destruct_value_type(pDst); - pDst->state = cStateInvalid; - } - - return *this; - } - - uint8_t m_bits[sizeof(node)]; - }; - - typedef basisu::vector node_vector; - - node_vector m_values; - uint32_t m_hash_shift; - - Hasher m_hasher; - Equals m_equals; - - uint32_t m_num_valid; - - uint32_t m_grow_threshold; - - inline uint32_t hash_key(const Key& k) const - { - assert((1U << (32U - m_hash_shift)) == m_values.size()); - - uint32_t hash = static_cast(m_hasher(k)); - - // Fibonacci hashing - hash = (2654435769U * hash) >> m_hash_shift; - - assert(hash < m_values.size()); - return hash; - } - - inline const node& get_node(uint32_t index) const - { - return *reinterpret_cast(&m_values[index]); - } - - inline node& get_node(uint32_t index) - { - return *reinterpret_cast(&m_values[index]); - } - - inline state get_node_state(uint32_t index) const - { - return static_cast(get_node(index).state); - } - - inline void set_node_state(uint32_t index, bool valid) - { - get_node(index).state = valid; - } - - inline void grow() - { - uint64_t n = m_values.size() * 3ULL; // was * 2 - - if (!helpers::is_power_of_2(n)) - n = helpers::next_pow2(n); - - if (n > 0x80000000UL) - n = 0x80000000UL; - - rehash(helpers::maximum(cMinHashSize, (uint32_t)n)); - } - - inline void rehash(uint32_t new_hash_size) - { - assert(new_hash_size 
>= m_num_valid); - assert(helpers::is_power_of_2(new_hash_size)); - - if ((new_hash_size < m_num_valid) || (new_hash_size == m_values.size())) - return; - - hash_map new_map; - new_map.m_values.resize(new_hash_size); - new_map.m_hash_shift = 32U - helpers::floor_log2i(new_hash_size); - assert(new_hash_size == (1U << (32U - new_map.m_hash_shift))); - new_map.m_grow_threshold = UINT_MAX; - - node* pNode = reinterpret_cast(m_values.begin()); - node* pNode_end = pNode + m_values.size(); - - while (pNode != pNode_end) - { - if (pNode->state) - { - new_map.move_into(pNode); - - if (new_map.m_num_valid == m_num_valid) - break; - } - - pNode++; - } - - new_map.m_grow_threshold = (new_hash_size + 1U) >> 1U; - - m_values.clear_no_destruction(); - m_hash_shift = 32; - - swap(new_map); - } - - inline uint32_t find_next(uint32_t index) const - { - index++; - - if (index >= m_values.size()) - return index; - - const node* pNode = &get_node(index); - - for (; ; ) - { - if (pNode->state) - break; - - if (++index >= m_values.size()) - break; - - pNode++; - } - - return index; - } - - inline uint32_t find_index(const Key& k) const - { - if (m_num_valid) - { - uint32_t index = hash_key(k); - const node* pNode = &get_node(index); - - if (pNode->state) - { - if (m_equals(pNode->first, k)) - return index; + } + else + { + T* pDst = m_p; + T* pDst_end = pDst + m_size; + while (pDst != pDst_end) + *pDst++ = o; + } + } + + // Caller assumes ownership of the heap block associated with the container. Container is cleared. + // Caller must use free() on the returned pointer. + inline void* assume_ownership() + { + T* p = m_p; + m_p = nullptr; + m_size = 0; + m_capacity = 0; + return p; + } + + // Caller is granting ownership of the indicated heap block. + // Block must have size constructed elements, and have enough room for capacity elements. + // The block must have been allocated using malloc(). + // Important: This method is used in Basis Universal. 
If you change how this container allocates memory, you'll need to change any users of this method. + inline bool grant_ownership(T* p, size_t size, size_t capacity) + { + // To prevent the caller from obviously shooting themselves in the foot. + if (((p + capacity) > m_p) && (p < (m_p + m_capacity))) + { + // Can grant ownership of a block inside the container itself! + assert(0); + return false; + } + + if (size > capacity) + { + assert(0); + return false; + } + + if (!p) + { + if (capacity) + { + assert(0); + return false; + } + } + else if (!capacity) + { + assert(0); + return false; + } + + clear(); + m_p = p; + m_size = size; + m_capacity = capacity; + return true; + } + + readable_span get_readable_span() const + { + return readable_span(m_p, m_size); + } + + writable_span get_writable_span() + { + return writable_span(m_p, m_size); + } + + private: + T* m_p; + size_t m_size; // the number of constructed objects + size_t m_capacity; // the size of the allocation + + template struct is_vector { enum { cFlag = false }; }; + template struct is_vector< vector > { enum { cFlag = true }; }; + + static void object_mover(void* pDst_void, void* pSrc_void, size_t num) + { + T* pSrc = static_cast(pSrc_void); + T* const pSrc_end = pSrc + num; + T* pDst = static_cast(pDst_void); + + while (pSrc != pSrc_end) + { + new ((void*)(pDst)) T(std::move(*pSrc)); + scalar_type::destruct(pSrc); + + ++pSrc; + ++pDst; + } + } + + inline bool increase_capacity(size_t min_new_capacity, bool grow_hint, bool nofail = false) + { + return reinterpret_cast(this)->increase_capacity( + min_new_capacity, grow_hint, sizeof(T), + (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(T) || (is_vector::cFlag)) ? nullptr : object_mover, nofail); + } + }; + + template struct bitwise_movable< vector > { enum { cFlag = true }; }; + + // Hash map + // rg TODO 9/8/2024: I've upgraded this class to support 64-bit size_t, and it needs a lot more testing. 
+ + const uint32_t SIZE_T_BITS = sizeof(size_t) * 8U; + + inline uint32_t safe_shift_left(uint32_t v, uint32_t l) + { + return (l < 32U) ? (v << l) : 0; + } + + inline uint64_t safe_shift_left(uint64_t v, uint32_t l) + { + return (l < 64U) ? (v << l) : 0; + } + + template + struct hasher + { + inline size_t operator() (const T& key) const { return static_cast(key); } + }; + + template + struct equal_to + { + inline bool operator()(const T& a, const T& b) const { return a == b; } + }; + + // Important: The Hasher and Equals objects must be bitwise movable! + template, typename Equals = equal_to > + class hash_map + { + public: + class iterator; + class const_iterator; + + private: + friend class iterator; + friend class const_iterator; + + enum state + { + cStateInvalid = 0, + cStateValid = 1 + }; + + enum + { + cMinHashSize = 4U + }; + + public: + typedef hash_map hash_map_type; + typedef std::pair value_type; + typedef Key key_type; + typedef Value referent_type; + typedef Hasher hasher_type; + typedef Equals equals_type; + + hash_map() : + m_num_valid(0), + m_grow_threshold(0), + m_hash_shift(SIZE_T_BITS) + { + static_assert((SIZE_T_BITS == 32) || (SIZE_T_BITS == 64), "SIZE_T_BITS must be 32 or 64"); + } + + hash_map(const hash_map& other) : + m_values(other.m_values), + m_num_valid(other.m_num_valid), + m_grow_threshold(other.m_grow_threshold), + m_hash_shift(other.m_hash_shift), + m_hasher(other.m_hasher), + m_equals(other.m_equals) + { + static_assert((SIZE_T_BITS == 32) || (SIZE_T_BITS == 64), "SIZE_T_BITS must be 32 or 64"); + } + + hash_map(hash_map&& other) : + m_values(std::move(other.m_values)), + m_num_valid(other.m_num_valid), + m_grow_threshold(other.m_grow_threshold), + m_hash_shift(other.m_hash_shift), + m_hasher(std::move(other.m_hasher)), + m_equals(std::move(other.m_equals)) + { + static_assert((SIZE_T_BITS == 32) || (SIZE_T_BITS == 64), "SIZE_T_BITS must be 32 or 64"); + + other.m_hash_shift = SIZE_T_BITS; + other.m_num_valid = 0; + 
other.m_grow_threshold = 0; + } + + hash_map& operator= (const hash_map& other) + { + if (this == &other) + return *this; + + clear(); + + m_values = other.m_values; + m_hash_shift = other.m_hash_shift; + m_num_valid = other.m_num_valid; + m_grow_threshold = other.m_grow_threshold; + m_hasher = other.m_hasher; + m_equals = other.m_equals; + + return *this; + } + + hash_map& operator= (hash_map&& other) + { + if (this == &other) + return *this; + + clear(); + + m_values = std::move(other.m_values); + m_hash_shift = other.m_hash_shift; + m_num_valid = other.m_num_valid; + m_grow_threshold = other.m_grow_threshold; + m_hasher = std::move(other.m_hasher); + m_equals = std::move(other.m_equals); + + other.m_hash_shift = SIZE_T_BITS; + other.m_num_valid = 0; + other.m_grow_threshold = 0; + + return *this; + } + + inline ~hash_map() + { + clear(); + } + + inline const Equals& get_equals() const { return m_equals; } + inline Equals& get_equals() { return m_equals; } + inline void set_equals(const Equals& equals) { m_equals = equals; } + + inline const Hasher& get_hasher() const { return m_hasher; } + inline Hasher& get_hasher() { return m_hasher; } + inline void set_hasher(const Hasher& hasher) { m_hasher = hasher; } + + inline void clear() + { + if (m_values.empty()) + return; + + if (BASISU_HAS_DESTRUCTOR(Key) || BASISU_HAS_DESTRUCTOR(Value)) + { + node* p = &get_node(0); + node* p_end = p + m_values.size(); + + size_t num_remaining = m_num_valid; + while (p != p_end) + { + if (p->state) + { + destruct_value_type(p); + num_remaining--; + if (!num_remaining) + break; + } + + p++; + } + } + + m_values.clear_no_destruction(); + + m_hash_shift = SIZE_T_BITS; + m_num_valid = 0; + m_grow_threshold = 0; + } + + inline void reset() + { + if (!m_num_valid) + return; + + if (BASISU_HAS_DESTRUCTOR(Key) || BASISU_HAS_DESTRUCTOR(Value)) + { + node* p = &get_node(0); + node* p_end = p + m_values.size(); + + size_t num_remaining = m_num_valid; + while (p != p_end) + { + if (p->state) + 
{ + destruct_value_type(p); + p->state = cStateInvalid; + + num_remaining--; + if (!num_remaining) + break; + } + + p++; + } + } + else if (sizeof(node) <= 16) + { + memset(&m_values[0], 0, m_values.size_in_bytes()); + } + else + { + node* p = &get_node(0); + node* p_end = p + m_values.size(); + + size_t num_remaining = m_num_valid; + while (p != p_end) + { + if (p->state) + { + p->state = cStateInvalid; + + num_remaining--; + if (!num_remaining) + break; + } + + p++; + } + } + + m_num_valid = 0; + } + + inline size_t size() + { + return m_num_valid; + } + + inline size_t get_table_size() + { + return m_values.size(); + } + + inline bool empty() + { + return !m_num_valid; + } + + inline bool reserve(size_t new_capacity) + { + if (!new_capacity) + return true; + + uint64_t new_hash_size = new_capacity; + + new_hash_size = new_hash_size * 2ULL; + + if (!helpers::is_power_of_2(new_hash_size)) + new_hash_size = helpers::next_pow2(new_hash_size); + + new_hash_size = helpers::maximum(cMinHashSize, new_hash_size); + + if (!can_fit_into_size_t(new_hash_size)) + { + assert(0); + return false; + } + + assert(new_hash_size >= new_capacity); + + if (new_hash_size <= m_values.size()) + return true; + + return rehash((size_t)new_hash_size); + } + + class iterator + { + friend class hash_map; + friend class hash_map::const_iterator; + + public: + inline iterator() : m_pTable(nullptr), m_index(0) { } + inline iterator(hash_map_type& table, size_t index) : m_pTable(&table), m_index(index) { } + inline iterator(const iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } + + inline iterator& operator= (const iterator& other) + { + m_pTable = other.m_pTable; + m_index = other.m_index; + return *this; + } + + // post-increment + inline iterator operator++(int) + { + iterator result(*this); + ++*this; + return result; + } + + // pre-increment + inline iterator& operator++() + { + probe(); + return *this; + } + + inline value_type& operator*() const { return *get_cur(); 
} + inline value_type* operator->() const { return get_cur(); } + + inline bool operator == (const iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } + inline bool operator != (const iterator& b) const { return !(*this == b); } + inline bool operator == (const const_iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } + inline bool operator != (const const_iterator& b) const { return !(*this == b); } + + private: + hash_map_type* m_pTable; + size_t m_index; + + inline value_type* get_cur() const + { + assert(m_pTable && (m_index < m_pTable->m_values.size())); + assert(m_pTable->get_node_state(m_index) == cStateValid); + + return &m_pTable->get_node(m_index); + } + + inline void probe() + { + assert(m_pTable); + m_index = m_pTable->find_next(m_index); + } + }; + + class const_iterator + { + friend class hash_map; + friend class hash_map::iterator; + + public: + inline const_iterator() : m_pTable(nullptr), m_index(0) { } + inline const_iterator(const hash_map_type& table, size_t index) : m_pTable(&table), m_index(index) { } + inline const_iterator(const iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } + inline const_iterator(const const_iterator& other) : m_pTable(other.m_pTable), m_index(other.m_index) { } + + inline const_iterator& operator= (const const_iterator& other) + { + m_pTable = other.m_pTable; + m_index = other.m_index; + return *this; + } + + inline const_iterator& operator= (const iterator& other) + { + m_pTable = other.m_pTable; + m_index = other.m_index; + return *this; + } + + // post-increment + inline const_iterator operator++(int) + { + const_iterator result(*this); + ++*this; + return result; + } + + // pre-increment + inline const_iterator& operator++() + { + probe(); + return *this; + } + + inline const value_type& operator*() const { return *get_cur(); } + inline const value_type* operator->() const { return get_cur(); } + + inline bool operator == (const 
const_iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } + inline bool operator != (const const_iterator& b) const { return !(*this == b); } + inline bool operator == (const iterator& b) const { return (m_pTable == b.m_pTable) && (m_index == b.m_index); } + inline bool operator != (const iterator& b) const { return !(*this == b); } + + private: + const hash_map_type* m_pTable; + size_t m_index; + + inline const value_type* get_cur() const + { + assert(m_pTable && (m_index < m_pTable->m_values.size())); + assert(m_pTable->get_node_state(m_index) == cStateValid); + + return &m_pTable->get_node(m_index); + } + + inline void probe() + { + assert(m_pTable); + m_index = m_pTable->find_next(m_index); + } + }; + + inline const_iterator begin() const + { + if (!m_num_valid) + return end(); + + return const_iterator(*this, find_next(std::numeric_limits::max())); + } + + inline const_iterator end() const + { + return const_iterator(*this, m_values.size()); + } + + inline iterator begin() + { + if (!m_num_valid) + return end(); + + return iterator(*this, find_next(std::numeric_limits::max())); + } + + inline iterator end() + { + return iterator(*this, m_values.size()); + } + + // insert_result.first will always point to inserted key/value (or the already existing key/value). + // insert_result.second will be true if a new key/value was inserted, or false if the key already existed (in which case first will point to the already existing value). + typedef std::pair insert_result; + + inline insert_result insert(const Key& k, const Value& v = Value()) + { + insert_result result; + if (!insert_no_grow(result, k, v)) + { + if (!try_grow()) + container_abort("hash_map::try_grow() failed"); + + // This must succeed. 
+ if (!insert_no_grow(result, k, v)) + container_abort("hash_map::insert() failed"); + } + + return result; + } + + inline bool try_insert(insert_result& result, const Key& k, const Value& v = Value()) + { + if (!insert_no_grow(result, k, v)) + { + if (!try_grow()) + return false; + + if (!insert_no_grow(result, k, v)) + return false; + } + + return true; + } + + inline insert_result insert(Key&& k, Value&& v = Value()) + { + insert_result result; + if (!insert_no_grow_move(result, std::move(k), std::move(v))) + { + if (!try_grow()) + container_abort("hash_map::try_grow() failed"); + + // This must succeed. + if (!insert_no_grow_move(result, std::move(k), std::move(v))) + container_abort("hash_map::insert() failed"); + } + + return result; + } + + inline bool try_insert(insert_result& result, Key&& k, Value&& v = Value()) + { + if (!insert_no_grow_move(result, std::move(k), std::move(v))) + { + if (!try_grow()) + return false; + + if (!insert_no_grow_move(result, std::move(k), std::move(v))) + return false; + } + + return true; + } + + inline insert_result insert(const value_type& v) + { + return insert(v.first, v.second); + } + + inline bool try_insert(insert_result& result, const value_type& v) + { + return try_insert(result, v.first, v.second); + } + + inline insert_result insert(value_type&& v) + { + return insert(std::move(v.first), std::move(v.second)); + } + + inline bool try_insert(insert_result& result, value_type&& v) + { + return try_insert(result, std::move(v.first), std::move(v.second)); + } + + inline const_iterator find(const Key& k) const + { + return const_iterator(*this, find_index(k)); + } + + inline iterator find(const Key& k) + { + return iterator(*this, find_index(k)); + } + + inline bool contains(const Key& k) const + { + const size_t idx = find_index(k); + return idx != m_values.size(); + } + + inline bool erase(const Key& k) + { + size_t i = find_index(k); + + if (i >= m_values.size()) + return false; + + node* pDst = &get_node(i); + 
destruct_value_type(pDst); + pDst->state = cStateInvalid; + + m_num_valid--; + + for (; ; ) + { + size_t r, j = i; + + node* pSrc = pDst; + + do + { + if (!i) + { + i = m_values.size() - 1; + pSrc = &get_node(i); + } + else + { + i--; + pSrc--; + } + + if (!pSrc->state) + return true; + + r = hash_key(pSrc->first); + + } while ((i <= r && r < j) || (r < j && j < i) || (j < i && i <= r)); + + move_node(pDst, pSrc); + + pDst = pSrc; + } + } + + inline void swap(hash_map_type& other) + { + m_values.swap(other.m_values); + std::swap(m_hash_shift, other.m_hash_shift); + std::swap(m_num_valid, other.m_num_valid); + std::swap(m_grow_threshold, other.m_grow_threshold); + std::swap(m_hasher, other.m_hasher); + std::swap(m_equals, other.m_equals); + } + + private: + struct node : public value_type + { + uint8_t state; + }; + + static inline void construct_value_type(value_type* pDst, const Key& k, const Value& v) + { + if (BASISU_IS_BITWISE_COPYABLE(Key)) + memcpy(&pDst->first, &k, sizeof(Key)); + else + scalar_type::construct(&pDst->first, k); + + if (BASISU_IS_BITWISE_COPYABLE(Value)) + memcpy(&pDst->second, &v, sizeof(Value)); + else + scalar_type::construct(&pDst->second, v); + } + + static inline void construct_value_type(value_type* pDst, const value_type* pSrc) + { + if ((BASISU_IS_BITWISE_COPYABLE(Key)) && (BASISU_IS_BITWISE_COPYABLE(Value))) + { + memcpy(pDst, pSrc, sizeof(value_type)); + } + else + { + if (BASISU_IS_BITWISE_COPYABLE(Key)) + memcpy(&pDst->first, &pSrc->first, sizeof(Key)); + else + scalar_type::construct(&pDst->first, pSrc->first); + + if (BASISU_IS_BITWISE_COPYABLE(Value)) + memcpy(&pDst->second, &pSrc->second, sizeof(Value)); + else + scalar_type::construct(&pDst->second, pSrc->second); + } + } + + static inline void destruct_value_type(value_type* p) + { + scalar_type::destruct(&p->first); + scalar_type::destruct(&p->second); + } + + // Moves nodes *pSrc to *pDst efficiently from one hashmap to another. 
+ // pDst should NOT be constructed on entry. + static inline void move_node(node* pDst, node* pSrc, bool update_src_state = true) + { + assert(!pDst->state); + + if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Key) && BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Value)) + { + memcpy(pDst, pSrc, sizeof(node)); + + assert(pDst->state == cStateValid); + } + else + { + if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Key)) + memcpy(&pDst->first, &pSrc->first, sizeof(Key)); + else + { + new ((void*)&pDst->first) Key(std::move(pSrc->first)); + scalar_type::destruct(&pSrc->first); + } + + if (BASISU_IS_BITWISE_COPYABLE_OR_MOVABLE(Value)) + memcpy(&pDst->second, &pSrc->second, sizeof(Value)); + else + { + new ((void*)&pDst->second) Value(std::move(pSrc->second)); + scalar_type::destruct(&pSrc->second); + } + + pDst->state = cStateValid; + } + + if (update_src_state) + pSrc->state = cStateInvalid; + } + + struct raw_node + { + inline raw_node() + { + node* p = reinterpret_cast(this); + p->state = cStateInvalid; + } + + // In practice, this should never be called (right?). We manage destruction ourselves. 
+ inline ~raw_node() + { + node* p = reinterpret_cast(this); + if (p->state) + hash_map_type::destruct_value_type(p); + } + + inline raw_node(const raw_node& other) + { + node* pDst = reinterpret_cast(this); + const node* pSrc = reinterpret_cast(&other); + + if (pSrc->state) + { + hash_map_type::construct_value_type(pDst, pSrc); + pDst->state = cStateValid; + } + else + pDst->state = cStateInvalid; + } + + inline raw_node& operator= (const raw_node& rhs) + { + if (this == &rhs) + return *this; + + node* pDst = reinterpret_cast(this); + const node* pSrc = reinterpret_cast(&rhs); + + if (pSrc->state) + { + if (pDst->state) + { + pDst->first = pSrc->first; + pDst->second = pSrc->second; + } + else + { + hash_map_type::construct_value_type(pDst, pSrc); + pDst->state = cStateValid; + } + } + else if (pDst->state) + { + hash_map_type::destruct_value_type(pDst); + pDst->state = cStateInvalid; + } + + return *this; + } + + uint8_t m_bits[sizeof(node)]; + }; + + typedef basisu::vector node_vector; + + node_vector m_values; + + size_t m_num_valid; + size_t m_grow_threshold; + + uint32_t m_hash_shift; + + Hasher m_hasher; + Equals m_equals; + + inline size_t hash_key(const Key& k) const + { + assert((safe_shift_left(static_cast(1), (SIZE_T_BITS - m_hash_shift))) == m_values.size()); + + // Fibonacci hashing + if (SIZE_T_BITS == 32) + { + assert(m_hash_shift != 32); + + uint32_t hash = static_cast(m_hasher(k)); + hash = (2654435769U * hash) >> m_hash_shift; + + assert(hash < m_values.size()); + return (size_t)hash; + } + else + { + assert(m_hash_shift != 64); + + uint64_t hash = static_cast(m_hasher(k)); + hash = (0x9E3779B97F4A7C15ULL * hash) >> m_hash_shift; + + assert(hash < m_values.size()); + return (size_t)hash; + } + } + + inline const node& get_node(size_t index) const + { + return *reinterpret_cast(&m_values[index]); + } + + inline node& get_node(size_t index) + { + return *reinterpret_cast(&m_values[index]); + } + + inline state get_node_state(size_t index) const + { 
+ return static_cast(get_node(index).state); + } + + inline void set_node_state(size_t index, bool valid) + { + get_node(index).state = valid; + } + + inline bool try_grow() + { + uint64_t n = m_values.size() * 2ULL; + + if (!helpers::is_power_of_2(n)) + n = helpers::next_pow2(n); + + if (!can_fit_into_size_t(n)) + { + assert(0); + return false; + } + + return rehash(helpers::maximum(cMinHashSize, (size_t)n)); + } + + // new_hash_size must be a power of 2. + inline bool rehash(size_t new_hash_size) + { + if (!helpers::is_power_of_2((uint64_t)new_hash_size)) + { + assert(0); + return false; + } + + if (new_hash_size < m_num_valid) + { + assert(0); + return false; + } + + if (new_hash_size == m_values.size()) + return true; - const uint32_t orig_index = index; + hash_map new_map; + if (!new_map.m_values.try_resize(new_hash_size)) + return false; - for (; ; ) - { - if (!index) - { - index = m_values.size() - 1; - pNode = &get_node(index); - } - else - { - index--; - pNode--; - } + new_map.m_hash_shift = SIZE_T_BITS - helpers::floor_log2i((uint64_t)new_hash_size); + assert(new_hash_size == safe_shift_left(static_cast(1), SIZE_T_BITS - new_map.m_hash_shift)); - if (index == orig_index) - break; + new_map.m_grow_threshold = std::numeric_limits::max(); + + node* pNode = reinterpret_cast(m_values.begin()); + node* pNode_end = pNode + m_values.size(); + + while (pNode != pNode_end) + { + if (pNode->state) + { + new_map.move_into(pNode); + + if (new_map.m_num_valid == m_num_valid) + break; + } + + pNode++; + } + + new_map.m_grow_threshold = new_hash_size >> 1U; + if (new_hash_size & 1) + new_map.m_grow_threshold++; + + m_values.clear_no_destruction(); + m_hash_shift = SIZE_T_BITS; + + swap(new_map); + + return true; + } + + inline size_t find_next(size_t index) const + { + index++; + + if (index >= m_values.size()) + return index; + + const node* pNode = &get_node(index); + + for (; ; ) + { + if (pNode->state) + break; + + if (++index >= m_values.size()) + break; + + 
pNode++; + } + + return index; + } + + inline size_t find_index(const Key& k) const + { + if (m_num_valid) + { + size_t index = hash_key(k); + const node* pNode = &get_node(index); + + if (pNode->state) + { + if (m_equals(pNode->first, k)) + return index; + + const size_t orig_index = index; + + for (; ; ) + { + if (!index) + { + index = m_values.size() - 1; + pNode = &get_node(index); + } + else + { + index--; + pNode--; + } + + if (index == orig_index) + break; + + if (!pNode->state) + break; + + if (m_equals(pNode->first, k)) + return index; + } + } + } + + return m_values.size(); + } + + inline bool insert_no_grow(insert_result& result, const Key& k, const Value& v) + { + if (!m_values.size()) + return false; + + size_t index = hash_key(k); + node* pNode = &get_node(index); + + if (pNode->state) + { + if (m_equals(pNode->first, k)) + { + result.first = iterator(*this, index); + result.second = false; + return true; + } + + const size_t orig_index = index; + + for (; ; ) + { + if (!index) + { + index = m_values.size() - 1; + pNode = &get_node(index); + } + else + { + index--; + pNode--; + } + + if (orig_index == index) + return false; + + if (!pNode->state) + break; + + if (m_equals(pNode->first, k)) + { + result.first = iterator(*this, index); + result.second = false; + return true; + } + } + } + + if (m_num_valid >= m_grow_threshold) + return false; + + construct_value_type(pNode, k, v); + + pNode->state = cStateValid; + + m_num_valid++; + assert(m_num_valid <= m_values.size()); + + result.first = iterator(*this, index); + result.second = true; + + return true; + } + + // Move user supplied key/value into a node. + static inline void move_value_type(value_type* pDst, Key&& k, Value&& v) + { + // Not checking for is MOVABLE because the caller could later destruct k and/or v (what state do we set them to?) 
+ if (BASISU_IS_BITWISE_COPYABLE(Key)) + { + memcpy(&pDst->first, &k, sizeof(Key)); + } + else + { + new ((void*)&pDst->first) Key(std::move(k)); + // No destruction - user will do that (we don't own k). + } + + if (BASISU_IS_BITWISE_COPYABLE(Value)) + { + memcpy(&pDst->second, &v, sizeof(Value)); + } + else + { + new ((void*)&pDst->second) Value(std::move(v)); + // No destruction - user will do that (we don't own v). + } + } + + // Insert user provided k/v, by moving, into the current hash table + inline bool insert_no_grow_move(insert_result& result, Key&& k, Value&& v) + { + if (!m_values.size()) + return false; + + size_t index = hash_key(k); + node* pNode = &get_node(index); + + if (pNode->state) + { + if (m_equals(pNode->first, k)) + { + result.first = iterator(*this, index); + result.second = false; + return true; + } + + const size_t orig_index = index; + + for (; ; ) + { + if (!index) + { + index = m_values.size() - 1; + pNode = &get_node(index); + } + else + { + index--; + pNode--; + } + + if (orig_index == index) + return false; + + if (!pNode->state) + break; + + if (m_equals(pNode->first, k)) + { + result.first = iterator(*this, index); + result.second = false; + return true; + } + } + } + + if (m_num_valid >= m_grow_threshold) + return false; + + move_value_type(pNode, std::move(k), std::move(v)); + + pNode->state = cStateValid; + + m_num_valid++; + assert(m_num_valid <= m_values.size()); + + result.first = iterator(*this, index); + result.second = true; + + return true; + } + + // Insert pNode by moving into the current hash table + inline void move_into(node* pNode) + { + size_t index = hash_key(pNode->first); + node* pDst_node = &get_node(index); + + if (pDst_node->state) + { + const size_t orig_index = index; + + for (; ; ) + { + if (!index) + { + index = m_values.size() - 1; + pDst_node = &get_node(index); + } + else + { + index--; + pDst_node--; + } + + if (index == orig_index) + { + assert(false); + return; + } + + if (!pDst_node->state) + 
break; + } + } + + // No need to update the source node's state (it's going away) + move_node(pDst_node, pNode, false); + + m_num_valid++; + } + }; + + template + struct bitwise_movable< hash_map > { enum { cFlag = true }; }; - if (!pNode->state) - break; - - if (m_equals(pNode->first, k)) - return index; - } - } - } - - return m_values.size(); - } - - inline bool insert_no_grow(insert_result& result, const Key& k, const Value& v = Value()) - { - if (!m_values.size()) - return false; - - uint32_t index = hash_key(k); - node* pNode = &get_node(index); - - if (pNode->state) - { - if (m_equals(pNode->first, k)) - { - result.first = iterator(*this, index); - result.second = false; - return true; - } - - const uint32_t orig_index = index; - - for (; ; ) - { - if (!index) - { - index = m_values.size() - 1; - pNode = &get_node(index); - } - else - { - index--; - pNode--; - } - - if (orig_index == index) - return false; - - if (!pNode->state) - break; - - if (m_equals(pNode->first, k)) - { - result.first = iterator(*this, index); - result.second = false; - return true; - } - } - } - - if (m_num_valid >= m_grow_threshold) - return false; - - construct_value_type(pNode, k, v); - - pNode->state = cStateValid; - - m_num_valid++; - assert(m_num_valid <= m_values.size()); - - result.first = iterator(*this, index); - result.second = true; - - return true; - } - - inline void move_into(node* pNode) - { - uint32_t index = hash_key(pNode->first); - node* pDst_node = &get_node(index); - - if (pDst_node->state) - { - const uint32_t orig_index = index; - - for (; ; ) - { - if (!index) - { - index = m_values.size() - 1; - pDst_node = &get_node(index); - } - else - { - index--; - pDst_node--; - } - - if (index == orig_index) - { - assert(false); - return; - } - - if (!pDst_node->state) - break; - } - } - - move_node(pDst_node, pNode, false); +#if BASISU_HASHMAP_TEST + extern void hash_map_test(); +#endif - m_num_valid++; - } - }; + // String formatting + inline std::string 
string_format(const char* pFmt, ...) + { + char buf[2048]; - template - struct bitwise_movable< hash_map > { enum { cFlag = true }; }; - -#if BASISU_HASHMAP_TEST - extern void hash_map_test(); + va_list args; + va_start(args, pFmt); +#ifdef _WIN32 + vsprintf_s(buf, sizeof(buf), pFmt, args); +#else + vsnprintf(buf, sizeof(buf), pFmt, args); +#endif + va_end(args); + + return std::string(buf); + } + + enum class variant_type + { + cInvalid, + cI32, cU32, + cI64, cU64, + cFlt, cDbl, cBool, + cStrPtr, cStdStr + }; + + struct fmt_variant + { + union + { + int32_t m_i32; + uint32_t m_u32; + int64_t m_i64; + uint64_t m_u64; + float m_flt; + double m_dbl; + bool m_bool; + const char* m_pStr; + }; + + std::string m_str; + + variant_type m_type; + + inline fmt_variant() : + m_u64(0), + m_type(variant_type::cInvalid) + { + } + + inline fmt_variant(const fmt_variant& other) : + m_u64(other.m_u64), + m_str(other.m_str), + m_type(other.m_type) + { + } + + inline fmt_variant(fmt_variant&& other) : + m_u64(other.m_u64), + m_str(std::move(other.m_str)), + m_type(other.m_type) + { + other.m_type = variant_type::cInvalid; + other.m_u64 = 0; + } + + inline fmt_variant& operator= (fmt_variant&& other) + { + if (this == &other) + return *this; + + m_type = other.m_type; + m_u64 = other.m_u64; + m_str = std::move(other.m_str); + + other.m_type = variant_type::cInvalid; + other.m_u64 = 0; + + return *this; + } + + inline fmt_variant& operator= (const fmt_variant& rhs) + { + if (this == &rhs) + return *this; + + m_u64 = rhs.m_u64; + m_type = rhs.m_type; + m_str = rhs.m_str; + + return *this; + } + + inline fmt_variant(int32_t v) : m_i32(v), m_type(variant_type::cI32) { } + inline fmt_variant(uint32_t v) : m_u32(v), m_type(variant_type::cU32) { } + inline fmt_variant(int64_t v) : m_i64(v), m_type(variant_type::cI64) { } + inline fmt_variant(uint64_t v) : m_u64(v), m_type(variant_type::cU64) { } +#ifdef _MSC_VER + inline fmt_variant(unsigned long v) : m_u64(v), m_type(variant_type::cU64) {} 
+ inline fmt_variant(long v) : m_i64(v), m_type(variant_type::cI64) {} #endif - + inline fmt_variant(float v) : m_flt(v), m_type(variant_type::cFlt) { } + inline fmt_variant(double v) : m_dbl(v), m_type(variant_type::cDbl) { } + inline fmt_variant(const char* pStr) : m_pStr(pStr), m_type(variant_type::cStrPtr) { } + inline fmt_variant(const std::string& str) : m_u64(0), m_str(str), m_type(variant_type::cStdStr) { } + inline fmt_variant(bool val) : m_bool(val), m_type(variant_type::cBool) { } + + bool to_string(std::string& res, std::string& fmt) const; + }; + + typedef basisu::vector fmt_variant_vec; + + bool fmt_variants(std::string& res, const char* pFmt, const fmt_variant_vec& variants); + + template + inline bool fmt_string(std::string& res, const char* pFmt, Args&&... args) + { + return fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward(args))... }); + } + + template + inline std::string fmt_string(const char* pFmt, Args&&... args) + { + std::string res; + fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward(args))... }); + return res; + } + + template + inline int fmt_printf(const char* pFmt, Args&&... args) + { + std::string res; + if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward(args))... })) + return EOF; + + return fputs(res.c_str(), stdout); + } + + template + inline int fmt_fprintf(FILE* pFile, const char* pFmt, Args&&... args) + { + std::string res; + if (!fmt_variants(res, pFmt, fmt_variant_vec{ fmt_variant(std::forward(args))... })) + return EOF; + + return fputs(res.c_str(), pFile); + } + + // fixed_array - zero initialized by default, operator[] is always bounds checked. 
+ template + class fixed_array + { + static_assert(N >= 1, "fixed_array size must be at least 1"); + + public: + using value_type = T; + using size_type = std::size_t; + using difference_type = std::ptrdiff_t; + using reference = T&; + using const_reference = const T&; + using pointer = T*; + using const_pointer = const T*; + using iterator = T*; + using const_iterator = const T*; + + T m_data[N]; + + BASISU_FORCE_INLINE fixed_array() + { + initialize_array(); + } + + BASISU_FORCE_INLINE fixed_array(std::initializer_list list) + { + assert(list.size() <= N); + + std::size_t copy_size = std::min(list.size(), N); + std::copy_n(list.begin(), copy_size, m_data); // Copy up to min(list.size(), N) + + if (list.size() < N) + { + // Initialize the rest of the array + std::fill(m_data + copy_size, m_data + N, T{}); + } + } + + BASISU_FORCE_INLINE T& operator[](std::size_t index) + { + if (index >= N) + container_abort("fixed_array: Index out of bounds."); + return m_data[index]; + } + + BASISU_FORCE_INLINE const T& operator[](std::size_t index) const + { + if (index >= N) + container_abort("fixed_array: Index out of bounds."); + return m_data[index]; + } + + BASISU_FORCE_INLINE T* begin() { return m_data; } + BASISU_FORCE_INLINE const T* begin() const { return m_data; } + + BASISU_FORCE_INLINE T* end() { return m_data + N; } + BASISU_FORCE_INLINE const T* end() const { return m_data + N; } + + BASISU_FORCE_INLINE const T* data() const { return m_data; } + BASISU_FORCE_INLINE T* data() { return m_data; } + + BASISU_FORCE_INLINE const T& front() const { return m_data[0]; } + BASISU_FORCE_INLINE T& front() { return m_data[0]; } + + BASISU_FORCE_INLINE const T& back() const { return m_data[N - 1]; } + BASISU_FORCE_INLINE T& back() { return m_data[N - 1]; } + + BASISU_FORCE_INLINE constexpr std::size_t size() const { return N; } + + BASISU_FORCE_INLINE void clear() + { + initialize_array(); // Reinitialize the array + } + + BASISU_FORCE_INLINE void set_all(const T& value) + { + 
std::fill(m_data, m_data + N, value); + } + + BASISU_FORCE_INLINE readable_span get_readable_span() const + { + return readable_span(m_data, N); + } + + BASISU_FORCE_INLINE writable_span get_writable_span() + { + return writable_span(m_data, N); + } + + private: + BASISU_FORCE_INLINE void initialize_array() + { + if constexpr (std::is_integral::value || std::is_floating_point::value) + memset(m_data, 0, sizeof(m_data)); + else + std::fill(m_data, m_data + N, T{}); + } + + BASISU_FORCE_INLINE T& access_element(std::size_t index) + { + if (index >= N) + container_abort("fixed_array: Index out of bounds."); + return m_data[index]; + } + + BASISU_FORCE_INLINE const T& access_element(std::size_t index) const + { + if (index >= N) + container_abort("fixed_array: Index out of bounds."); + return m_data[index]; + } + }; + + // 2D array + + template + class vector2D + { + typedef basisu::vector vec_type; + + uint32_t m_width, m_height; + vec_type m_values; + + public: + vector2D() : + m_width(0), + m_height(0) + { + } + + vector2D(uint32_t w, uint32_t h) : + m_width(0), + m_height(0) + { + resize(w, h); + } + + vector2D(const vector2D& other) + { + *this = other; + } + + vector2D(vector2D&& other) : + m_width(0), + m_height(0) + { + *this = std::move(other); + } + + vector2D& operator= (const vector2D& other) + { + if (this != &other) + { + m_width = other.m_width; + m_height = other.m_height; + m_values = other.m_values; + } + return *this; + } + + vector2D& operator= (vector2D&& other) + { + if (this != &other) + { + m_width = other.m_width; + m_height = other.m_height; + m_values = std::move(other.m_values); + + other.m_width = 0; + other.m_height = 0; + } + return *this; + } + + inline bool operator== (const vector2D& rhs) const + { + return (m_width == rhs.m_width) && (m_height == rhs.m_height) && (m_values == rhs.m_values); + } + + inline size_t size_in_bytes() const { return m_values.size_in_bytes(); } + + inline uint32_t get_width() const { return m_width; } + 
inline uint32_t get_height() const { return m_height; } + + inline const T& operator() (uint32_t x, uint32_t y) const { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; } + inline T& operator() (uint32_t x, uint32_t y) { assert(x < m_width && y < m_height); return m_values[x + y * m_width]; } + + inline size_t size() const { return m_values.size(); } + + inline const T& operator[] (uint32_t i) const { return m_values[i]; } + inline T& operator[] (uint32_t i) { return m_values[i]; } + + inline const T& at_clamped(int x, int y) const { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + inline T& at_clamped(int x, int y) { return (*this)(clamp(x, 0, m_width - 1), clamp(y, 0, m_height - 1)); } + + void clear() + { + m_width = 0; + m_height = 0; + m_values.clear(); + } + + void set_all(const T& val) + { + vector_set_all(m_values, val); + } + + inline const T* get_ptr() const { return m_values.data(); } + inline T* get_ptr() { return m_values.data(); } + + vector2D& resize(uint32_t new_width, uint32_t new_height) + { + if ((m_width == new_width) && (m_height == new_height)) + return *this; + + const uint64_t total_vals = (uint64_t)new_width * new_height; + + if (!can_fit_into_size_t(total_vals)) + { + // What can we do? + assert(0); + return *this; + } + + vec_type oldVals((size_t)total_vals); + oldVals.swap(m_values); + + const uint32_t w = minimum(m_width, new_width); + const uint32_t h = minimum(m_height, new_height); + + if ((w) && (h)) + { + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++) + m_values[x + y * new_width] = oldVals[x + y * m_width]; + } + + m_width = new_width; + m_height = new_height; + + return *this; + } + + bool try_resize(uint32_t new_width, uint32_t new_height) + { + if ((m_width == new_width) && (m_height == new_height)) + return true; + + const uint64_t total_vals = (uint64_t)new_width * new_height; + + if (!can_fit_into_size_t(total_vals)) + { + // What can we do? 
+ assert(0); + return false; + } + + vec_type oldVals; + if (!oldVals.try_resize((size_t)total_vals)) + return false; + + oldVals.swap(m_values); + + const uint32_t w = minimum(m_width, new_width); + const uint32_t h = minimum(m_height, new_height); + + if ((w) && (h)) + { + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++) + m_values[x + y * new_width] = oldVals[x + y * m_width]; + } + + m_width = new_width; + m_height = new_height; + + return true; + } + + const vector2D& extract_block_clamped(T* pDst, uint32_t src_x, uint32_t src_y, uint32_t w, uint32_t h) const + { + // HACK HACK + if (((src_x + w) > m_width) || ((src_y + h) > m_height)) + { + // Slower clamping case + for (uint32_t y = 0; y < h; y++) + for (uint32_t x = 0; x < w; x++) + *pDst++ = at_clamped(src_x + x, src_y + y); + } + else + { + const T* pSrc = &m_values[src_x + src_y * m_width]; + + for (uint32_t y = 0; y < h; y++) + { + memcpy(pDst, pSrc, w * sizeof(T)); + pSrc += m_width; + pDst += w; + } + } + + return *this; + } + }; + } // namespace basisu namespace std { - template - inline void swap(basisu::vector& a, basisu::vector& b) - { - a.swap(b); - } - - template - inline void swap(basisu::hash_map& a, basisu::hash_map& b) - { - a.swap(b); - } + template + inline void swap(basisu::vector& a, basisu::vector& b) + { + a.swap(b); + } + + template + inline void swap(basisu::hash_map& a, basisu::hash_map& b) + { + a.swap(b); + } } // namespace std diff --git a/thirdparty/basis_universal/transcoder/basisu_containers_impl.h b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h index 60c0b3d89f91..d4d3eb23bc08 100644 --- a/thirdparty/basis_universal/transcoder/basisu_containers_impl.h +++ b/thirdparty/basis_universal/transcoder/basisu_containers_impl.h @@ -7,308 +7,806 @@ namespace basisu { - bool elemental_vector::increase_capacity(uint32_t min_new_capacity, bool grow_hint, uint32_t element_size, object_mover pMover, bool nofail) - { - assert(m_size <= m_capacity); - - if 
(sizeof(void *) == sizeof(uint64_t)) - assert(min_new_capacity < (0x400000000ULL / element_size)); - else - assert(min_new_capacity < (0x7FFF0000U / element_size)); - - if (m_capacity >= min_new_capacity) - return true; - - uint64_t new_capacity_u64 = min_new_capacity; - if ((grow_hint) && (!helpers::is_power_of_2(new_capacity_u64))) - new_capacity_u64 = helpers::next_pow2(new_capacity_u64); - - size_t new_capacity = (size_t)new_capacity_u64; - if (new_capacity != new_capacity_u64) - { - if (nofail) - return false; - fprintf(stderr, "elemental_vector::increase_capacity: vector too large\n"); - abort(); - } - - const uint64_t desired_size_u64 = (uint64_t)element_size * new_capacity; - - const size_t desired_size = (size_t)desired_size_u64; - if (desired_size_u64 != desired_size) - { - if (nofail) - return false; - fprintf(stderr, "elemental_vector::increase_capacity: vector too large\n"); - abort(); - } - - size_t actual_size = 0; - if (!pMover) - { - void* new_p = realloc(m_p, desired_size); - if (!new_p) - { - if (nofail) - return false; - - char buf[256]; - snprintf(buf, sizeof(buf), "elemental_vector::increase_capacity: realloc() failed allocating %zu bytes", desired_size); - fprintf(stderr, "%s", buf); - abort(); - } + // A container operation has internally panicked in an unrecoverable way. + // Either an allocation has failed, or a range or consistency check has failed. +#ifdef _MSC_VER + __declspec(noreturn) +#else + [[noreturn]] +#endif + void container_abort(const char* pMsg, ...) 
+ { + assert(0); + + va_list args; + va_start(args, pMsg); + + char buf[1024] = {}; + +#ifdef _MSC_VER + vsprintf_s(buf, sizeof(buf), pMsg, args); +#else + vsnprintf(buf, sizeof(buf), pMsg, args); +#endif + va_end(args); + + fputs(buf, stderr); + + std::terminate(); + } + + bool elemental_vector::increase_capacity(size_t min_new_capacity, bool grow_hint, size_t element_size, object_mover pMover, bool nofail_flag) + { + assert(m_size <= m_capacity); + assert(min_new_capacity >= m_size); + assert(element_size); + + // Basic sanity check min_new_capacity + if (!can_fit_into_size_t((uint64_t)min_new_capacity * element_size)) + { + assert(0); + + if (nofail_flag) + return false; + + container_abort("elemental_vector::increase_capacity: requesting too many elements\n"); + } + + // Check for sane library limits + if (sizeof(void*) == sizeof(uint64_t)) + { + // 16 GB + assert(min_new_capacity < (0x400000000ULL / element_size)); + } + else + { + // ~1.99 GB + assert(min_new_capacity < (0x7FFF0000U / element_size)); + } + + // If vector is already large enough just return. 
+ if (m_capacity >= min_new_capacity) + return true; + + uint64_t new_capacity_u64 = min_new_capacity; + + if ((grow_hint) && (!helpers::is_power_of_2(new_capacity_u64))) + { + new_capacity_u64 = helpers::next_pow2(new_capacity_u64); + + if (!can_fit_into_size_t(new_capacity_u64)) + { + assert(0); + + if (nofail_flag) + return false; + + container_abort("elemental_vector::increase_capacity: vector too large\n"); + } + } + + const uint64_t desired_size_u64 = element_size * new_capacity_u64; + + if (!can_fit_into_size_t(desired_size_u64)) + { + assert(0); + + if (nofail_flag) + return false; + + container_abort("elemental_vector::increase_capacity: vector too large\n"); + } + + const size_t desired_size = static_cast(desired_size_u64); + + size_t actual_size = 0; + BASISU_NOTE_UNUSED(actual_size); + + if (!pMover) + { + void* new_p = realloc(m_p, desired_size); + if (!new_p) + { + assert(0); + + if (nofail_flag) + return false; + + container_abort("elemental_vector::increase_capacity: realloc() failed allocating %zu bytes", desired_size); + } #if BASISU_VECTOR_DETERMINISTIC - actual_size = desired_size; + actual_size = desired_size; #elif defined(_MSC_VER) - actual_size = _msize(new_p); + actual_size = _msize(new_p); #elif HAS_MALLOC_USABLE_SIZE - actual_size = malloc_usable_size(new_p); + actual_size = malloc_usable_size(new_p); #else - actual_size = desired_size; + actual_size = desired_size; #endif - m_p = new_p; - } - else - { - void* new_p = malloc(desired_size); - if (!new_p) - { - if (nofail) - return false; - - char buf[256]; - snprintf(buf, sizeof(buf), "elemental_vector::increase_capacity: malloc() failed allocating %zu bytes", desired_size); - fprintf(stderr, "%s", buf); - abort(); - } + m_p = new_p; + } + else + { + void* new_p = malloc(desired_size); + if (!new_p) + { + assert(0); + if (nofail_flag) + return false; + + container_abort("elemental_vector::increase_capacity: malloc() failed allocating %zu bytes", desired_size); + } #if 
BASISU_VECTOR_DETERMINISTIC - actual_size = desired_size; + actual_size = desired_size; #elif defined(_MSC_VER) - actual_size = _msize(new_p); + actual_size = _msize(new_p); #elif HAS_MALLOC_USABLE_SIZE - actual_size = malloc_usable_size(new_p); + actual_size = malloc_usable_size(new_p); #else - actual_size = desired_size; + actual_size = desired_size; #endif - (*pMover)(new_p, m_p, m_size); + (*pMover)(new_p, m_p, m_size); - if (m_p) - free(m_p); - - m_p = new_p; - } + if (m_p) + free(m_p); - if (actual_size > desired_size) - m_capacity = static_cast(actual_size / element_size); - else - m_capacity = static_cast(new_capacity); + m_p = new_p; + } - return true; - } +#if BASISU_VECTOR_DETERMINISTIC + m_capacity = static_cast(new_capacity_u64); +#else + if (actual_size > desired_size) + m_capacity = static_cast(actual_size / element_size); + else + m_capacity = static_cast(new_capacity_u64); +#endif + + return true; + } #if BASISU_HASHMAP_TEST #define HASHMAP_TEST_VERIFY(c) do { if (!(c)) handle_hashmap_test_verify_failure(__LINE__); } while(0) - static void handle_hashmap_test_verify_failure(int line) - { - fprintf(stderr, "HASHMAP_TEST_VERIFY() faild on line %i\n", line); - abort(); - } - - class counted_obj - { - public: - counted_obj(uint32_t v = 0) : - m_val(v) - { - m_count++; - } - - counted_obj(const counted_obj& obj) : - m_val(obj.m_val) - { - m_count++; - } - - ~counted_obj() - { - assert(m_count > 0); - m_count--; - } - - static uint32_t m_count; - - uint32_t m_val; - - operator size_t() const { return m_val; } - - bool operator== (const counted_obj& rhs) const { return m_val == rhs.m_val; } - bool operator== (const uint32_t rhs) const { return m_val == rhs; } - - }; - - uint32_t counted_obj::m_count; - - static uint32_t urand32() - { - uint32_t a = rand(); - uint32_t b = rand() << 15; - uint32_t c = rand() << (32 - 15); - return a ^ b ^ c; - } - - static int irand32(int l, int h) - { - assert(l < h); - if (l >= h) - return l; - - uint32_t range = 
static_cast(h - l); - - uint32_t rnd = urand32(); - - uint32_t rnd_range = static_cast((((uint64_t)range) * ((uint64_t)rnd)) >> 32U); - - int result = l + rnd_range; - assert((result >= l) && (result < h)); - return result; - } - - void hash_map_test() - { - { - basisu::hash_map k; - basisu::hash_map l; - std::swap(k, l); - - k.begin(); - k.end(); - k.clear(); - k.empty(); - k.erase(0); - k.insert(0, 1); - k.find(0); - k.get_equals(); - k.get_hasher(); - k.get_table_size(); - k.reset(); - k.reserve(1); - k = l; - k.set_equals(l.get_equals()); - k.set_hasher(l.get_hasher()); - k.get_table_size(); - } - - uint32_t seed = 0; - for (; ; ) - { - seed++; - - typedef basisu::hash_map my_hash_map; - my_hash_map m; - - const uint32_t n = irand32(0, 100000); - - printf("%u\n", n); - - srand(seed); // r1.seed(seed); - - basisu::vector q; - - uint32_t count = 0; - for (uint32_t i = 0; i < n; i++) - { - uint32_t v = urand32() & 0x7FFFFFFF; - my_hash_map::insert_result res = m.insert(counted_obj(v), counted_obj(v ^ 0xdeadbeef)); - if (res.second) - { - count++; - q.push_back(v); - } - } - - HASHMAP_TEST_VERIFY(m.size() == count); - - srand(seed); - - my_hash_map cm(m); - m.clear(); - m = cm; - cm.reset(); - - for (uint32_t i = 0; i < n; i++) - { - uint32_t v = urand32() & 0x7FFFFFFF; - my_hash_map::const_iterator it = m.find(counted_obj(v)); - HASHMAP_TEST_VERIFY(it != m.end()); - HASHMAP_TEST_VERIFY(it->first == v); - HASHMAP_TEST_VERIFY(it->second == (v ^ 0xdeadbeef)); - } - - for (uint32_t t = 0; t < 2; t++) - { - const uint32_t nd = irand32(1, q.size() + 1); - for (uint32_t i = 0; i < nd; i++) - { - uint32_t p = irand32(0, q.size()); - - int k = q[p]; - if (k >= 0) - { - q[p] = -k - 1; - - bool s = m.erase(counted_obj(k)); - HASHMAP_TEST_VERIFY(s); - } - } - - typedef basisu::hash_map uint_hash_set; - uint_hash_set s; - - for (uint32_t i = 0; i < q.size(); i++) - { - int v = q[i]; - - if (v >= 0) - { - my_hash_map::const_iterator it = m.find(counted_obj(v)); - 
HASHMAP_TEST_VERIFY(it != m.end()); - HASHMAP_TEST_VERIFY(it->first == (uint32_t)v); - HASHMAP_TEST_VERIFY(it->second == ((uint32_t)v ^ 0xdeadbeef)); - - s.insert(v); - } - else - { - my_hash_map::const_iterator it = m.find(counted_obj(-v - 1)); - HASHMAP_TEST_VERIFY(it == m.end()); - } - } - - uint32_t found_count = 0; - for (my_hash_map::const_iterator it = m.begin(); it != m.end(); ++it) - { - HASHMAP_TEST_VERIFY(it->second == ((uint32_t)it->first ^ 0xdeadbeef)); - - uint_hash_set::const_iterator fit(s.find((uint32_t)it->first)); - HASHMAP_TEST_VERIFY(fit != s.end()); - - HASHMAP_TEST_VERIFY(fit->first == it->first); - - found_count++; - } - - HASHMAP_TEST_VERIFY(found_count == s.size()); - } - - HASHMAP_TEST_VERIFY(counted_obj::m_count == m.size() * 2); - } - } + static void handle_hashmap_test_verify_failure(int line) + { + container_abort("HASHMAP_TEST_VERIFY() faild on line %i\n", line); + } + + class counted_obj + { + public: + counted_obj(uint32_t v = 0) : + m_val(v) + { + m_count++; + } + + counted_obj(const counted_obj& obj) : + m_val(obj.m_val) + { + if (m_val != UINT64_MAX) + m_count++; + } + + counted_obj(counted_obj&& obj) : + m_val(obj.m_val) + { + obj.m_val = UINT64_MAX; + } + + counted_obj& operator= (counted_obj&& rhs) + { + if (this != &rhs) + { + m_val = rhs.m_val; + rhs.m_val = UINT64_MAX; + } + return *this; + } + + ~counted_obj() + { + if (m_val != UINT64_MAX) + { + assert(m_count > 0); + m_count--; + } + } + + static uint32_t m_count; + + uint64_t m_val; + + operator size_t() const { return (size_t)m_val; } + + bool operator== (const counted_obj& rhs) const { return m_val == rhs.m_val; } + bool operator== (const uint32_t rhs) const { return m_val == rhs; } + + }; + + uint32_t counted_obj::m_count; + + static uint32_t urand32() + { + uint32_t a = rand(); + uint32_t b = rand() << 15; + uint32_t c = rand() << (32 - 15); + return a ^ b ^ c; + } + + static int irand32(int l, int h) + { + assert(l < h); + if (l >= h) + return l; + + uint32_t 
range = static_cast(h - l); + + uint32_t rnd = urand32(); + + uint32_t rnd_range = static_cast((((uint64_t)range) * ((uint64_t)rnd)) >> 32U); + + int result = l + rnd_range; + assert((result >= l) && (result < h)); + return result; + } + + void hash_map_test() + { + { + basisu::hash_map s; + uint_vec k; + + for (uint32_t i = 0; i < 1000000; i++) + { + s.insert(i); + k.push_back(i); + } + + for (uint32_t i = 0; i < k.size(); i++) + { + uint32_t r = rand() ^ (rand() << 15); + + uint32_t j = i + (r % (k.size() - i)); + + std::swap(k[i], k[j]); + } + + basisu::hash_map s1(s); + + for (uint32_t i = 0; i < 1000000; i++) + { + auto res = s.find(i); + HASHMAP_TEST_VERIFY(res != s.end()); + HASHMAP_TEST_VERIFY(res->first == i); + s.erase(i); + } + + for (uint32_t it = 0; it < 1000000; it++) + { + uint32_t i = k[it]; + + auto res = s1.find(i); + HASHMAP_TEST_VERIFY(res != s.end()); + HASHMAP_TEST_VERIFY(res->first == i); + s1.erase(i); + } + + for (uint32_t i = 0; i < 1000000; i++) + { + auto res = s.find(i); + HASHMAP_TEST_VERIFY(res == s.end()); + + auto res1 = s1.find(i); + HASHMAP_TEST_VERIFY(res1 == s1.end()); + } + + HASHMAP_TEST_VERIFY(s.empty()); + HASHMAP_TEST_VERIFY(s1.empty()); + } + + { + typedef basisu::hash_map< uint32_t, basisu::vector > hm; + hm q; + + basisu::vector a, b; + a.push_back(1); + b.push_back(2); + b.push_back(3); + + basisu::vector c(b); + + hm::insert_result ir; + q.try_insert(ir, 1, std::move(a)); + q.try_insert(ir, 2, std::move(b)); + q.try_insert(ir, std::make_pair(3, c)); + } + + { + typedef basisu::hash_map my_hash_map; + my_hash_map m; + counted_obj a, b; + m.insert(std::move(a), std::move(b)); + } + + { + basisu::hash_map k; + basisu::hash_map l; + std::swap(k, l); + + k.begin(); + k.end(); + k.clear(); + k.empty(); + k.erase(0); + k.insert(0, 1); + k.find(0); + k.get_equals(); + k.get_hasher(); + k.get_table_size(); + k.reset(); + k.reserve(1); + k = l; + k.set_equals(l.get_equals()); + k.set_hasher(l.get_hasher()); + k.get_table_size(); 
+ } + + uint32_t seed = 0; + for (; ; ) + { + seed++; + + typedef basisu::hash_map my_hash_map; + my_hash_map m; + + const uint32_t n = irand32(1, 100000); + + printf("%u\n", n); + + srand(seed); // r1.seed(seed); + + basisu::vector q; + + uint32_t count = 0; + for (uint32_t i = 0; i < n; i++) + { + uint32_t v = urand32() & 0x7FFFFFFF; + my_hash_map::insert_result res = m.insert(counted_obj(v), counted_obj(v ^ 0xdeadbeef)); + if (res.second) + { + count++; + q.push_back(v); + } + } + + HASHMAP_TEST_VERIFY(m.size() == count); + + srand(seed); + + my_hash_map cm(m); + m.clear(); + m = cm; + cm.reset(); + + for (uint32_t i = 0; i < n; i++) + { + uint32_t v = urand32() & 0x7FFFFFFF; + my_hash_map::const_iterator it = m.find(counted_obj(v)); + HASHMAP_TEST_VERIFY(it != m.end()); + HASHMAP_TEST_VERIFY(it->first == v); + HASHMAP_TEST_VERIFY(it->second == (v ^ 0xdeadbeef)); + } + + for (uint32_t t = 0; t < 2; t++) + { + const uint32_t nd = irand32(1, q.size_u32() + 1); + for (uint32_t i = 0; i < nd; i++) + { + uint32_t p = irand32(0, q.size_u32()); + + int k = q[p]; + if (k >= 0) + { + q[p] = -k - 1; + + bool s = m.erase(counted_obj(k)); + HASHMAP_TEST_VERIFY(s); + } + } + + typedef basisu::hash_map uint_hash_set; + uint_hash_set s; + + for (uint32_t i = 0; i < q.size(); i++) + { + int v = q[i]; + + if (v >= 0) + { + my_hash_map::const_iterator it = m.find(counted_obj(v)); + HASHMAP_TEST_VERIFY(it != m.end()); + HASHMAP_TEST_VERIFY(it->first == (uint32_t)v); + HASHMAP_TEST_VERIFY(it->second == ((uint32_t)v ^ 0xdeadbeef)); + + s.insert(v); + } + else + { + my_hash_map::const_iterator it = m.find(counted_obj(-v - 1)); + HASHMAP_TEST_VERIFY(it == m.end()); + } + } + + uint32_t found_count = 0; + for (my_hash_map::const_iterator it = m.begin(); it != m.end(); ++it) + { + HASHMAP_TEST_VERIFY(it->second == ((uint32_t)it->first ^ 0xdeadbeef)); + + uint_hash_set::const_iterator fit(s.find((uint32_t)it->first)); + HASHMAP_TEST_VERIFY(fit != s.end()); + + 
HASHMAP_TEST_VERIFY(fit->first == it->first); + + found_count++; + } + + HASHMAP_TEST_VERIFY(found_count == s.size()); + } + + HASHMAP_TEST_VERIFY(counted_obj::m_count == m.size() * 2); + } + } #endif // BASISU_HASHMAP_TEST + // String formatting + + bool fmt_variant::to_string(std::string& res, std::string& fmt) const + { + res.resize(0); + + // Scan for allowed formatting characters. + for (size_t i = 0; i < fmt.size(); i++) + { + const char c = fmt[i]; + + if (isdigit(c) || (c == '.') || (c == ' ') || (c == '#') || (c == '+') || (c == '-')) + continue; + + if (isalpha(c)) + { + if ((i + 1) == fmt.size()) + continue; + } + + return false; + } + + if (fmt.size() && (fmt.back() == 'c')) + { + if ((m_type == variant_type::cI32) || (m_type == variant_type::cU32)) + { + if (m_u32 > 255) + return false; + + // Explictly allowing caller to pass in a char of 0, which is ignored. + if (m_u32) + res.push_back((uint8_t)m_u32); + return true; + } + else + return false; + } + + switch (m_type) + { + case variant_type::cInvalid: + { + return false; + } + case variant_type::cI32: + { + if (fmt.size()) + { + int e = fmt.back(); + if (isalpha(e)) + { + if ((e != 'x') && (e != 'X') && (e != 'i') && (e != 'd') && (e != 'u')) + return false; + } + else + { + fmt += "i"; + } + + res = string_format((std::string("%") + fmt).c_str(), m_i32); + } + else + { + res = string_format("%i", m_i32); + } + break; + } + case variant_type::cU32: + { + if (fmt.size()) + { + int e = fmt.back(); + if (isalpha(e)) + { + if ((e != 'x') && (e != 'X') && (e != 'i') && (e != 'd') && (e != 'u')) + return false; + } + else + { + fmt += "u"; + } + + res = string_format((std::string("%") + fmt).c_str(), m_u32); + } + else + { + res = string_format("%u", m_u32); + } + break; + } + case variant_type::cI64: + { + if (fmt.size()) + { + int e = fmt.back(); + if (isalpha(e)) + { + if (e == 'x') + { + fmt.pop_back(); + fmt += PRIx64; + } + else if (e == 'X') + { + fmt.pop_back(); + fmt += PRIX64; + } + else + 
return false; + } + else + { + fmt += PRId64; + } + + res = string_format((std::string("%") + fmt).c_str(), m_i64); + } + else + { + res = string_format("%" PRId64, m_i64); + } + break; + } + case variant_type::cU64: + { + if (fmt.size()) + { + int e = fmt.back(); + if (isalpha(e)) + { + if (e == 'x') + { + fmt.pop_back(); + fmt += PRIx64; + } + else if (e == 'X') + { + fmt.pop_back(); + fmt += PRIX64; + } + else + return false; + } + else + { + fmt += PRIu64; + } + + res = string_format((std::string("%") + fmt).c_str(), m_u64); + } + else + { + res = string_format("%" PRIu64, m_u64); + } + break; + } + case variant_type::cFlt: + { + if (fmt.size()) + { + int e = fmt.back(); + if (isalpha(e)) + { + if ((e != 'f') && (e != 'g') && (e != 'e') && (e != 'E')) + return false; + } + else + { + fmt += "f"; + } + + res = string_format((std::string("%") + fmt).c_str(), m_flt); + } + else + { + res = string_format("%f", m_flt); + } + break; + } + case variant_type::cDbl: + { + if (fmt.size()) + { + int e = fmt.back(); + if (isalpha(e)) + { + if ((e != 'f') && (e != 'g') && (e != 'e') && (e != 'E')) + return false; + } + else + { + fmt += "f"; + } + + res = string_format((std::string("%") + fmt).c_str(), m_dbl); + } + else + { + res = string_format("%f", m_dbl); + } + break; + } + case variant_type::cStrPtr: + { + if (fmt.size()) + return false; + if (!m_pStr) + return false; + res = m_pStr; + break; + } + case variant_type::cBool: + { + if (fmt.size()) + return false; + res = m_bool ? 
"true" : "false"; + break; + } + case variant_type::cStdStr: + { + if (fmt.size()) + return false; + res = m_str; + break; + } + default: + { + return false; + } + } + + return true; + } + + bool fmt_variants(std::string& res, const char* pFmt, const fmt_variant_vec& variants) + { + res.resize(0); + + // Must specify a format string + if (!pFmt) + { + assert(0); + return false; + } + + // Check format string's length + const size_t fmt_len = strlen(pFmt); + if (!fmt_len) + { + if (variants.size()) + { + assert(0); + return false; + } + return true; + } + + // Wildly estimate output length + res.reserve(fmt_len + 32); + + std::string var_fmt; + var_fmt.reserve(16); + + std::string tmp; + tmp.reserve(16); + + size_t variant_index = 0; + bool inside_brackets = false; + const char* p = pFmt; + + while (*p) + { + const uint8_t c = *p++; + + if (inside_brackets) + { + if (c == '}') + { + inside_brackets = false; + + if (variant_index >= variants.size()) + { + assert(0); + return false; + } + + if (!variants[variant_index].to_string(tmp, var_fmt)) + { + assert(0); + return false; + } + + res += tmp; + + variant_index++; + } + else + { + // Check for forbidden formatting characters. 
+ if ((c == '*') || (c == 'n') || (c == '%')) + { + assert(0); + return false; + } + + var_fmt.push_back(c); + } + } + else if (c == '{') + { + // Check for escaped '{' + if (*p == '{') + { + res.push_back((char)c); + p++; + } + else + { + inside_brackets = true; + var_fmt.resize(0); + } + } + else + { + res.push_back((char)c); + } + } + + if (inside_brackets) + { + assert(0); + return false; + } + + if (variant_index != variants.size()) + { + assert(0); + return false; + } + + return true; + } + } // namespace basisu diff --git a/thirdparty/basis_universal/transcoder/basisu_file_headers.h b/thirdparty/basis_universal/transcoder/basisu_file_headers.h index d29e3feb0340..5c1606625ad1 100644 --- a/thirdparty/basis_universal/transcoder/basisu_file_headers.h +++ b/thirdparty/basis_universal/transcoder/basisu_file_headers.h @@ -38,7 +38,7 @@ namespace basist basisu::packed_uint<2> m_orig_width; // The original image width (may not be a multiple of 4 pixels) basisu::packed_uint<2> m_orig_height; // The original image height (may not be a multiple of 4 pixels) - basisu::packed_uint<2> m_num_blocks_x; // The slice's block X dimensions. Each block is 4x4 pixels. The slice's pixel resolution may or may not be a power of 2. + basisu::packed_uint<2> m_num_blocks_x; // The slice's block X dimensions. Each block is 4x4 or 6x6 pixels. The slice's pixel resolution may or may not be a power of 2. basisu::packed_uint<2> m_num_blocks_y; // The slice's block Y dimensions. 
basisu::packed_uint<4> m_file_ofs; // Offset from the start of the file to the start of the slice's data @@ -72,9 +72,9 @@ namespace basist // We do make sure the various constraints are followed (2DArray/cubemap/videoframes/volume implies that each image has the same resolution and # of mipmap levels, etc., cubemap implies that the # of image slices is a multiple of 6) enum basis_texture_type { - cBASISTexType2D = 0, // An arbitrary array of 2D RGB or RGBA images with optional mipmaps, array size = # images, each image may have a different resolution and # of mipmap levels + cBASISTexType2D = 0, // An arbitrary array of 2D RGB or RGBA images with optional mipmaps, array size = # images, each image may have a different resolution and # of mipmap levels cBASISTexType2DArray = 1, // An array of 2D RGB or RGBA images with optional mipmaps, array size = # images, each image has the same resolution and mipmap levels - cBASISTexTypeCubemapArray = 2, // an array of cubemap levels, total # of images must be divisable by 6, in X+, X-, Y+, Y-, Z+, Z- order, with optional mipmaps + cBASISTexTypeCubemapArray = 2, // an array of cubemap levels, total # of images must be divisable by 6, in X+, X-, Y+, Y-, Z+, Z- order, with optional mipmaps cBASISTexTypeVideoFrames = 3, // An array of 2D video frames, with optional mipmaps, # frames = # images, each image has the same resolution and # of mipmap levels cBASISTexTypeVolume = 4, // A 3D texture with optional mipmaps, Z dimension = # images, each image has the same resolution and # of mipmap levels @@ -90,7 +90,10 @@ namespace basist { cETC1S = 0, cUASTC4x4 = 1, - cUASTC_HDR_4x4 = 2 + cUASTC_HDR_4x4 = 2, + cASTC_HDR_6x6 = 3, + cASTC_HDR_6x6_INTERMEDIATE = 4, + cTotalFormats }; struct basis_file_header diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp index 32018cd282d9..d7bce420133f 100644 --- 
a/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder.cpp @@ -14,7 +14,6 @@ // limitations under the License. #include "basisu_transcoder.h" -#include #include "basisu_containers_impl.h" #define BASISU_ASTC_HELPERS_IMPLEMENTATION @@ -22,6 +21,12 @@ #include "basisu_astc_hdr_core.h" +#include + +#if defined(_MSC_VER) + #include // For __popcnt intrinsic +#endif + #ifndef BASISD_IS_BIG_ENDIAN // TODO: This doesn't work on OSX. How can this be so difficult? //#if defined(__BIG_ENDIAN__) || defined(_BIG_ENDIAN) || defined(BIG_ENDIAN) @@ -150,7 +155,7 @@ #define BASISD_WRITE_NEW_BC7_MODE5_TABLES 0 #define BASISD_WRITE_NEW_DXT1_TABLES 0 -#define BASISD_WRITE_NEW_ETC2_EAC_A8_TABLES 0 +#define BASISD_WRITE_NEW_ETC2_EAC_A8_TABLES 0 #define BASISD_WRITE_NEW_ASTC_TABLES 0 #define BASISD_WRITE_NEW_ATC_TABLES 0 #define BASISD_WRITE_NEW_ETC2_EAC_R11_TABLES 0 @@ -168,6 +173,10 @@ #endif #endif +#if BASISD_SUPPORT_UASTC_HDR +using namespace basist::astc_6x6_hdr; +#endif + namespace basisu { bool g_debug_printf; @@ -190,11 +199,22 @@ namespace basisu va_end(args); } } + + void debug_puts(const char* p) + { +#if BASISU_FORCE_DEVEL_MESSAGES + g_debug_printf = true; +#endif + if (g_debug_printf) + { + //puts(p); + printf("%s", p); + } + } } // namespace basisu namespace basist { - #if BASISD_ENABLE_DEBUG_FLAGS static uint32_t g_debug_flags = 0; #endif @@ -226,6 +246,73 @@ namespace basist static inline float saturate(float value) { return clampf(value, 0, 1.0f); } static inline uint8_t mul_8(uint32_t v, uint32_t q) { v = v * q + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + static inline int mul_8bit(int a, int b) { int t = a * b + 128; return (t + (t >> 8)) >> 8; } + static inline int lerp_8bit(int a, int b, int s) { assert(a >= 0 && a <= 255); assert(b >= 0 && b <= 255); assert(s >= 0 && s <= 255); return a + mul_8bit(b - a, s); } + + struct vec2F + { + float c[2]; + + inline vec2F() {} + + inline vec2F(float s) 
{ c[0] = s; c[1] = s; } + inline vec2F(float x, float y) { c[0] = x; c[1] = y; } + + inline void set(float x, float y) { c[0] = x; c[1] = y; } + + inline float dot(const vec2F& o) const { return (c[0] * o.c[0]) + (c[1] * o.c[1]); } + + inline float operator[] (uint32_t index) const { assert(index < 2); return c[index]; } + inline float& operator[] (uint32_t index) { assert(index < 2); return c[index]; } + + inline vec2F& clamp(float l, float h) + { + c[0] = basisu::clamp(c[0], l, h); + c[1] = basisu::clamp(c[1], l, h); + return *this; + } + + static vec2F lerp(const vec2F& a, const vec2F& b, float s) + { + vec2F res; + for (uint32_t i = 0; i < 2; i++) + res[i] = basisu::lerp(a[i], b[i], s); + return res; + } + }; + + struct vec3F + { + float c[3]; + + inline vec3F() {} + + inline vec3F(float s) { c[0] = s; c[1] = s; c[2] = s; } + inline vec3F(float x, float y, float z) { c[0] = x; c[1] = y; c[2] = z; } + + inline void set(float x, float y, float z) { c[0] = x; c[1] = y; c[2] = z; } + + inline float dot(const vec3F& o) const { return (c[0] * o.c[0]) + (c[1] * o.c[1]) + (c[2] * o.c[2]); } + + inline float operator[] (uint32_t index) const { assert(index < 3); return c[index]; } + inline float &operator[] (uint32_t index) { assert(index < 3); return c[index]; } + + inline vec3F& clamp(float l, float h) + { + c[0] = basisu::clamp(c[0], l, h); + c[1] = basisu::clamp(c[1], l, h); + c[2] = basisu::clamp(c[2], l, h); + return *this; + } + + static vec3F lerp(const vec3F& a, const vec3F& b, float s) + { + vec3F res; + for (uint32_t i = 0; i < 3; i++) + res[i] = basisu::lerp(a[i], b[i], s); + return res; + } + }; uint16_t crc16(const void* r, size_t size, uint16_t crc) { @@ -241,6 +328,16 @@ namespace basist return static_cast(~crc); } + + struct vec4F + { + float c[4]; + + inline void set(float x, float y, float z, float w) { c[0] = x; c[1] = y; c[2] = z; c[3] = w; } + + float operator[] (uint32_t index) const { assert(index < 4); return c[index]; } + float& operator[] 
(uint32_t index) { assert(index < 4); return c[index]; } + }; enum etc_constants { @@ -1245,7 +1342,6 @@ namespace basist } #endif - #if BASISD_SUPPORT_UASTC || BASISD_SUPPORT_ETC2_EAC_A8 || BASISD_SUPPORT_ETC2_EAC_RG11 static const int8_t g_eac_modifier_table[16][8] = { @@ -1910,6 +2006,21 @@ namespace basist void uastc_init(); #endif +#if BASISD_SUPPORT_UASTC_HDR + namespace astc_6x6_hdr + { + static void init_quantize_tables(); + static void fast_encode_bc6h_init(); + } +#endif + +#if BASISD_SUPPORT_BC7_MODE5 + namespace bc7_mode_5_encoder + { + void encode_bc7_mode5_init(); + } +#endif + static bool g_transcoder_initialized; // Library global initialization. Requires ~9 milliseconds when compiled and executed natively on a Core i7 2.2 GHz. @@ -2045,8 +2156,14 @@ namespace basist #if BASISD_SUPPORT_UASTC_HDR bc6h_enc_init(); + astc_6x6_hdr::init_quantize_tables(); + fast_encode_bc6h_init(); #endif - + +#if BASISD_SUPPORT_BC7_MODE5 + bc7_mode_5_encoder::encode_bc7_mode5_init(); +#endif + g_transcoder_initialized = true; } @@ -4443,6 +4560,198 @@ namespace basist set_block_bits((uint8_t*)pDst, output_bits, 31, 97); } + + static inline vec3F rgb_to_ycocg(const vec3F& rgb) + { + return vec3F(rgb.dot(vec3F(0.25f, 0.5f, 0.25f)), rgb.dot(vec3F(0.5f, 0.0f, -0.5f)), rgb.dot(vec3F(-0.25f, 0.5f, -0.25f))); + } + + static inline vec2F rgb_to_cocg(const vec3F& rgb) + { + return vec2F(rgb.dot(vec3F(0.5f, 0.0f, -0.5f)), rgb.dot(vec3F(-0.25f, 0.5f, -0.25f))); + } + + static inline vec3F ycocg_to_rgb(const vec3F& ycocg) + { + return vec3F(ycocg.dot(vec3F(1.0f, 1.0f, -1.0f)), ycocg.dot(vec3F(1.0f, 0.0f, 1.0f)), ycocg.dot(vec3F(1.0f, -1.0f, -1.0f))); + } + + static inline vec3F color32_to_vec3F(const color32& c) + { + return vec3F(c.r, c.g, c.b); + } + + static inline vec3F color5_to_ycocg(const endpoint& e) + { + const int r = (e.m_color5[0] << 3) | (e.m_color5[0] >> 2); + const int g = (e.m_color5[1] << 3) | (e.m_color5[1] >> 2); + const int b = (e.m_color5[2] << 3) | 
(e.m_color5[2] >> 2); + return rgb_to_ycocg(vec3F((float)r, (float)g, (float)b)); + } + + static inline vec2F color5_to_cocg(const endpoint& e) + { + const int r = (e.m_color5[0] << 3) | (e.m_color5[0] >> 2); + const int g = (e.m_color5[1] << 3) | (e.m_color5[1] >> 2); + const int b = (e.m_color5[2] << 3) | (e.m_color5[2] >> 2); + return rgb_to_cocg(vec3F((float)r, (float)g, (float)b)); + } + + static inline uint32_t bc7_7_to_8(uint32_t v) + { + assert(v < 128); + return (v << 1) | (v >> 6); + } + + static inline uint32_t bc7_interp2(uint32_t l, uint32_t h, uint32_t w) + { + assert(w < 4); + return (l * (64 - basist::g_bc7_weights2[w]) + h * basist::g_bc7_weights2[w] + 32) >> 6; + } + + static inline vec2F get_endpoint_cocg_clamped(int bx, int by, const basisu::vector2D& decoded_endpoints, const endpoint* pEndpoints) + { + const uint32_t endpoint_index = decoded_endpoints.at_clamped(bx, by); + return color5_to_cocg(pEndpoints[endpoint_index]); + } + + static void chroma_filter_bc7_mode5(const basisu::vector2D& decoded_endpoints, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t output_row_pitch_in_blocks_or_pixels, const endpoint *pEndpoints) + { + const bool hq_bc7_mode_5_encoder_mode = false; + + const int CHROMA_THRESH = 10; + + uint32_t total_filtered_blocks = 0; + + for (int by = 0; by < (int)num_blocks_y; by++) + { + for (int bx = 0; bx < (int)num_blocks_x; bx++) + { + vec2F center_cocg(color5_to_cocg(pEndpoints[decoded_endpoints(bx, by)])); + + //bool filter_flag = false; + for (int dy = -1; dy <= 1; dy++) + { + const int oy = by + dy; + if ((oy < 0) || (oy >= (int)num_blocks_y)) + continue; + + for (int dx = -1; dx <= 1; dx++) + { + if ((dx | dy) == 0) + continue; + + const int ox = bx + dx; + if ((ox < 0) || (ox >= (int)num_blocks_x)) + continue; + + vec2F nearby_cocg(color5_to_cocg(pEndpoints[decoded_endpoints(ox, oy)])); + + float delta_co = fabsf(nearby_cocg[0] - center_cocg[0]); + float delta_cg = fabsf(nearby_cocg[1] - 
center_cocg[1]); + + if ((delta_co > CHROMA_THRESH) || (delta_cg > CHROMA_THRESH)) + { + //filter_flag = true; + goto do_filter; + } + + } // dx + } // dy + + continue; + + do_filter:; + + total_filtered_blocks++; + + bc7_mode_5* pDst_block = (bc7_mode_5*)(static_cast(pDst_blocks) + (bx + by * output_row_pitch_in_blocks_or_pixels) * sizeof(bc7_mode_5)); + + //memset(pDst_block, 0x80, 16); + + int lr = bc7_7_to_8(pDst_block->m_lo.m_r0); + int lg = bc7_7_to_8(pDst_block->m_lo.m_g0); + int lb = bc7_7_to_8(pDst_block->m_lo.m_b0); + + int hr = bc7_7_to_8(pDst_block->m_lo.m_r1); + int hg = bc7_7_to_8(pDst_block->m_lo.m_g1); + int hb = bc7_7_to_8(pDst_block->m_lo.m_b1); + + float y_vals[4]; + for (uint32_t i = 0; i < 4; i++) + { + int cr = bc7_interp2(lr, hr, i); + int cg = bc7_interp2(lg, hg, i); + int cb = bc7_interp2(lb, hb, i); + y_vals[i] = (float)cr * .25f + (float)cg * .5f + (float)cb * .25f; + } // i + + uint64_t sel_bits = pDst_block->m_hi_bits >> 2; + + float block_y_vals[16]; // [y][x] + float y_sum = 0.0f, y_sum_sq = 0.0f; + + for (uint32_t i = 0; i < 16; i++) + { + uint32_t sel = sel_bits & (i ? 3 : 1); + sel_bits >>= (i ? 2 : 1); + float y = y_vals[sel]; + block_y_vals[i] = y; + y_sum += y; + y_sum_sq += y * y; + + } // i + + const float S = 1.0f / 16.0f; + float y_var = (y_sum_sq * S) - basisu::squaref(y_sum * S); + + // Don't bother if the block is too smooth. 
+ const float Y_VAR_SKIP_THRESH = 3.0f; + if (y_var < Y_VAR_SKIP_THRESH) + continue; + + color32 block_to_pack[16]; + + for (int bpy = 0; bpy < 4; bpy++) + { + const int uby = by + ((bpy - 2) >> 2); + + for (int bpx = 0; bpx < 4; bpx++) + { + const float fx = ((float)((bpx + 2) & 3) + .5f) * (1.0f / 4.0f); + const float fy = ((float)((bpy + 2) & 3) + .5f) * (1.0f / 4.0f); + + const int ubx = bx + ((bpx - 2) >> 2); + + vec2F a(get_endpoint_cocg_clamped(ubx, uby, decoded_endpoints, pEndpoints)); + vec2F b(get_endpoint_cocg_clamped(ubx + 1, uby, decoded_endpoints, pEndpoints)); + vec2F c(get_endpoint_cocg_clamped(ubx, uby + 1, decoded_endpoints, pEndpoints)); + vec2F d(get_endpoint_cocg_clamped(ubx + 1, uby + 1, decoded_endpoints, pEndpoints)); + + assert((fx >= 0) && (fx <= 1.0f) && (fy >= 0) && (fy <= 1.0f)); + + // TODO: Could merge this into 4 muls on each corner by weights + vec2F ab = vec2F::lerp(a, b, fx); + vec2F cd = vec2F::lerp(c, d, fx); + vec2F f = vec2F::lerp(ab, cd, fy); + + vec3F final_ycocg(block_y_vals[bpx + bpy * 4], f[0], f[1]); + + vec3F final_conv(ycocg_to_rgb(final_ycocg)); + final_conv.clamp(0.0f, 255.0f); + + block_to_pack[bpx + bpy * 4].set_noclamp_rgba((int)(.5f + final_conv[0]), (int)(.5f + final_conv[1]), (int)(.5f + final_conv[2]), 255); + + } // x + } // y + + bc7_mode_5_encoder::encode_bc7_mode_5_block(pDst_block, block_to_pack, hq_bc7_mode_5_encoder_mode); + + } // bx + } // by + + //basisu::fmt_printf("Chroma thresh: {}, Total blocks to filter: {} out of {} {}\n", CHROMA_THRESH, total_filtered_blocks, num_blocks_x * num_blocks_y, (float)total_filtered_blocks * 100.0f / (num_blocks_x * num_blocks_y)); + } #endif // BASISD_SUPPORT_BC7_MODE5 #if BASISD_SUPPORT_ETC2_EAC_A8 || BASISD_SUPPORT_UASTC @@ -6927,9 +7236,7 @@ namespace basist pBlock->m_modulation[3] = (uint8_t)sels3; } } - - typedef struct { float c[4]; } vec4F; - + static inline vec4F* vec4F_set_scalar(vec4F* pV, float x) { pV->c[0] = x; pV->c[1] = x; pV->c[2] = x; pV->c[3] = x; 
return pV; } static inline vec4F* vec4F_set(vec4F* pV, float x, float y, float z, float w) { pV->c[0] = x; pV->c[1] = y; pV->c[2] = z; pV->c[3] = w; return pV; } static inline vec4F* vec4F_saturate_in_place(vec4F* pV) { pV->c[0] = saturate(pV->c[0]); pV->c[1] = saturate(pV->c[1]); pV->c[2] = saturate(pV->c[2]); pV->c[3] = saturate(pV->c[3]); return pV; } @@ -7171,7 +7478,7 @@ namespace basist } // See if any of the block colors got clamped - if so the principle axis got distorted (it's no longer just the ETC1S luma axis). // To keep quality up we need to use full 4D PCA in this case. - else if ((block_cols[low_selector].c[0] == 0) || (block_cols[high_selector].c[0] == 255) || + else if ((block_cols[low_selector].c[0] == 0) || (block_cols[high_selector].c[0] == 255) || (block_cols[low_selector].c[1] == 0) || (block_cols[high_selector].c[1] == 255) || (block_cols[low_selector].c[2] == 0) || (block_cols[high_selector].c[2] == 255) || (block_cols[alpha_selectors.m_lo_selector].c[3] == 0) || (block_cols[alpha_selectors.m_hi_selector].c[3] == 255)) @@ -7536,114 +7843,494 @@ namespace basist #endif // BASISD_SUPPORT_PVRTC2 //------------------------------------------------------------------------------------------------ + + // BC7 mode 5 RGB encoder - basisu_lowlevel_etc1s_transcoder::basisu_lowlevel_etc1s_transcoder() : - m_pGlobal_codebook(nullptr), - m_selector_history_buf_size(0) - { - } +#if BASISD_SUPPORT_BC7_MODE5 + namespace bc7_mode_5_encoder + { + static float g_mode5_rgba_midpoints[128]; - bool basisu_lowlevel_etc1s_transcoder::decode_palettes( - uint32_t num_endpoints, const uint8_t* pEndpoints_data, uint32_t endpoints_data_size, - uint32_t num_selectors, const uint8_t* pSelectors_data, uint32_t selectors_data_size) - { - if (m_pGlobal_codebook) + void encode_bc7_mode5_init() { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 11\n"); - return false; - } - bitwise_decoder sym_codec; + // Mode 5 endpoint midpoints + for (uint32_t i = 
0; i < 128; i++) + { + uint32_t vl = (i << 1); + vl |= (vl >> 7); + float lo = vl / 255.0f; - huffman_decoding_table color5_delta_model0, color5_delta_model1, color5_delta_model2, inten_delta_model; + uint32_t vh = basisu::minimumi(127, i + 1) << 1; + vh |= (vh >> 7); + float hi = vh / 255.0f; - if (!sym_codec.init(pEndpoints_data, endpoints_data_size)) - { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 0\n"); - return false; + if (i == 127) + g_mode5_rgba_midpoints[i] = 1e+15f; + else + g_mode5_rgba_midpoints[i] = (lo + hi) / 2.0f; + } } - if (!sym_codec.read_huffman_table(color5_delta_model0)) + static inline uint32_t from_7(uint32_t v) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 1\n"); - return false; + assert(v < 128); + return (v << 1) | (v >> 6); } - if (!sym_codec.read_huffman_table(color5_delta_model1)) + static inline int to_7(float c) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 1a\n"); - return false; + assert((c >= 0) && (c <= 1.0f)); + + int vl = (int)(c * 127.0f); + vl += (c > g_mode5_rgba_midpoints[vl]); + return clampi(vl, 0, 127); } - if (!sym_codec.read_huffman_table(color5_delta_model2)) + static inline int to_7(int c8) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2a\n"); - return false; + assert((c8 >= 0) && (c8 <= 255)); + + float c = (float)c8 * (1.0f / 255.0f); + + int vl = (int)(c * 127.0f); + vl += (c > g_mode5_rgba_midpoints[vl]); + return clampi(vl, 0, 127); } - if (!sym_codec.read_huffman_table(inten_delta_model)) + // This is usable with ASTC as well, which uses the same 2-bit interpolation weights. 
+ static inline uint32_t bc7_interp2(uint32_t l, uint32_t h, uint32_t w) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2b\n"); - return false; + assert(w < 4); + return (l * (64 - basist::g_bc7_weights2[w]) + h * basist::g_bc7_weights2[w] + 32) >> 6; } - if (!color5_delta_model0.is_valid() || !color5_delta_model1.is_valid() || !color5_delta_model2.is_valid() || !inten_delta_model.is_valid()) + static void eval_weights( + const color32 *pPixels, uint8_t* pWeights, + int lr, int lg, int lb, + int hr, int hg, int hb) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2b\n"); - return false; - } + lr = from_7(lr); lg = from_7(lg); lb = from_7(lb); + hr = from_7(hr); hg = from_7(hg); hb = from_7(hb); - const bool endpoints_are_grayscale = sym_codec.get_bits(1) != 0; + int cr[4], cg[4], cb[4]; + for (uint32_t i = 0; i < 4; i++) + { + cr[i] = (uint8_t)bc7_interp2(lr, hr, i); + cg[i] = (uint8_t)bc7_interp2(lg, hg, i); + cb[i] = (uint8_t)bc7_interp2(lb, hb, i); + } - m_local_endpoints.resize(num_endpoints); +#if 0 + for (uint32_t i = 0; i < 16; i++) + { + const int pr = pPixels[i].r, pg = pPixels[i].g, pb = pPixels[i].b; - color32 prev_color5(16, 16, 16, 0); - uint32_t prev_inten = 0; + uint32_t best_err = UINT32_MAX; + uint32_t best_idx = 0; + for (uint32_t j = 0; j < 4; j++) + { + uint32_t e = square(pr - cr[j]) + square(pg - cg[j]) + square(pb - cb[j]); + if (e < best_err) + { + best_err = e; + best_idx = j; + } - for (uint32_t i = 0; i < num_endpoints; i++) - { - uint32_t inten_delta = sym_codec.decode_huffman(inten_delta_model); - m_local_endpoints[i].m_inten5 = static_cast((inten_delta + prev_inten) & 7); - prev_inten = m_local_endpoints[i].m_inten5; + pWeights[i] = (uint8_t)best_idx; + } + } // i +#else + int ar = cr[3] - cr[0], ag = cg[3] - cg[0], ab = cb[3] - cb[0]; - for (uint32_t c = 0; c < (endpoints_are_grayscale ? 
1U : 3U); c++) - { - int delta; - if (prev_color5[c] <= basist::COLOR5_PAL0_PREV_HI) - delta = sym_codec.decode_huffman(color5_delta_model0); - else if (prev_color5[c] <= basist::COLOR5_PAL1_PREV_HI) - delta = sym_codec.decode_huffman(color5_delta_model1); - else - delta = sym_codec.decode_huffman(color5_delta_model2); + int dots[4]; + for (uint32_t i = 0; i < 4; i++) + dots[i] = (int)cr[i] * ar + (int)cg[i] * ag + (int)cb[i] * ab; - int v = (prev_color5[c] + delta) & 31; + // seems very rare in LDR, so rare that it doesn't matter + //assert(dots[0] <= dots[1]); + //assert(dots[1] <= dots[2]); + //assert(dots[2] <= dots[3]); - m_local_endpoints[i].m_color5[c] = static_cast(v); + int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; - prev_color5[c] = static_cast(v); - } + ar *= 2; ag *= 2; ab *= 2; - if (endpoints_are_grayscale) + for (uint32_t i = 0; i < 16; i += 4) { - m_local_endpoints[i].m_color5[1] = m_local_endpoints[i].m_color5[0]; - m_local_endpoints[i].m_color5[2] = m_local_endpoints[i].m_color5[0]; + const int d0 = pPixels[i + 0].r * ar + pPixels[i + 0].g * ag + pPixels[i + 0].b * ab; + const int d1 = pPixels[i + 1].r * ar + pPixels[i + 1].g * ag + pPixels[i + 1].b * ab; + const int d2 = pPixels[i + 2].r * ar + pPixels[i + 2].g * ag + pPixels[i + 2].b * ab; + const int d3 = pPixels[i + 3].r * ar + pPixels[i + 3].g * ag + pPixels[i + 3].b * ab; + + pWeights[i + 0] = (d0 > t0) + (d0 >= t1) + (d0 >= t2); + pWeights[i + 1] = (d1 > t0) + (d1 >= t1) + (d1 >= t2); + pWeights[i + 2] = (d2 > t0) + (d2 >= t1) + (d2 >= t2); + pWeights[i + 3] = (d3 > t0) + (d3 >= t1) + (d3 >= t2); } +#endif } - sym_codec.stop(); - - m_local_selectors.resize(num_selectors); - - if (!sym_codec.init(pSelectors_data, selectors_data_size)) + static void pack_bc7_mode5_rgb_block( + bc7_mode_5* pDst_block, + int lr, int lg, int lb, int hr, int hg, int hb, + const uint8_t* pWeights) { - BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 5\n"); - 
return false; - } + assert((lr >= 0) && (lr <= 127)); + assert((lg >= 0) && (lg <= 127)); + assert((lb >= 0) && (lb <= 127)); + assert((hr >= 0) && (hr <= 127)); + assert((hg >= 0) && (hg <= 127)); + assert((hb >= 0) && (hb <= 127)); - basist::huffman_decoding_table delta_selector_pal_model; + pDst_block->m_lo_bits = 0; - const bool used_global_selector_cb = (sym_codec.get_bits(1) == 1); + uint8_t weight_inv = 0; + if (pWeights[0] & 2) + { + std::swap(lr, hr); + std::swap(lg, hg); + std::swap(lb, hb); + weight_inv = 3; + } + assert((pWeights[0] ^ weight_inv) <= 1); - if (used_global_selector_cb) + pDst_block->m_lo.m_mode = 32; + pDst_block->m_lo.m_r0 = lr; + pDst_block->m_lo.m_r1 = hr; + pDst_block->m_lo.m_g0 = lg; + pDst_block->m_lo.m_g1 = hg; + pDst_block->m_lo.m_b0 = lb; + pDst_block->m_lo.m_b1 = hb; + + pDst_block->m_lo.m_a0 = 255; + pDst_block->m_lo.m_a1_0 = 63; + + uint64_t sel_bits = 3; + uint32_t cur_ofs = 2; + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 3); + sel_bits |= ((uint64_t)(weight_inv ^ pWeights[i])) << cur_ofs; + cur_ofs += (i ? 2 : 1); + } + + pDst_block->m_hi_bits = sel_bits; + } + + // This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w)) + // where w is [0,1/3,2/3,1]. 9 is the perfect multiplier. 
+ static const uint32_t g_weight_vals4[4] = { 0x000009, 0x010204, 0x040201, 0x090000 }; + + static inline bool compute_least_squares_endpoints4_rgb( + const color32 *pColors, const uint8_t* pSelectors, + int& lr, int& lg, int& lb, int& hr, int& hg, int& hb, + int total_r, int total_g, int total_b) + { + uint32_t uq00_r = 0, uq00_g = 0, uq00_b = 0; + uint32_t weight_accum = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint8_t r = pColors[i].r, g = pColors[i].g, b = pColors[i].b; + const uint8_t sel = pSelectors[i]; + + weight_accum += g_weight_vals4[sel]; + uq00_r += sel * r; + uq00_g += sel * g; + uq00_b += sel * b; + } + + int q10_r = total_r * 3 - uq00_r; + int q10_g = total_g * 3 - uq00_g; + int q10_b = total_b * 3 - uq00_b; + + float z00 = (float)((weight_accum >> 16) & 0xFF); + float z10 = (float)((weight_accum >> 8) & 0xFF); + float z11 = (float)(weight_accum & 0xFF); + float z01 = z10; + + float det = z00 * z11 - z01 * z10; + if (fabs(det) < 1e-8f) + return false; + + det = (3.0f / 255.0f) / det; + + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; + + float fhr = basisu::clamp(iz00 * (float)uq00_r + iz01 * q10_r, 0.0f, 1.0f); + float flr = basisu::clamp(iz10 * (float)uq00_r + iz11 * q10_r, 0.0f, 1.0f); + + float fhg = basisu::clamp(iz00 * (float)uq00_g + iz01 * q10_g, 0.0f, 1.0f); + float flg = basisu::clamp(iz10 * (float)uq00_g + iz11 * q10_g, 0.0f, 1.0f); + + float fhb = basisu::clamp(iz00 * (float)uq00_b + iz01 * q10_b, 0.0f, 1.0f); + float flb = basisu::clamp(iz10 * (float)uq00_b + iz11 * q10_b, 0.0f, 1.0f); + + lr = to_7(flr); lg = to_7(flg); lb = to_7(flb); + hr = to_7(fhr); hg = to_7(fhg); hb = to_7(fhb); + + return true; + } + + void encode_bc7_mode_5_block(void* pDst_block, color32* pPixels, bool hq_mode) + { + assert(g_mode5_rgba_midpoints[1]); + + int total_r = 0, total_g = 0, total_b = 0; + + int min_r = 255, min_g = 255, min_b = 255; + int max_r = 0, max_g = 0, max_b = 0; + + 
for (uint32_t i = 0; i < 16; i++) + { + int r = pPixels[i].r, g = pPixels[i].g, b = pPixels[i].b; + + total_r += r; total_g += g; total_b += b; + + min_r = basisu::minimum(min_r, r); min_g = basisu::minimum(min_g, g); min_b = basisu::minimum(min_b, b); + max_r = basisu::maximum(max_r, r); max_g = basisu::maximum(max_g, g); max_b = basisu::maximum(max_b, b); + } + + if ((min_r == max_r) && (min_g == max_g) && (min_b == max_b)) + { + const int lr = g_bc7_m5_equals_1[min_r].m_lo, lg = g_bc7_m5_equals_1[min_g].m_lo, lb = g_bc7_m5_equals_1[min_b].m_lo; + const int hr = g_bc7_m5_equals_1[min_r].m_hi, hg = g_bc7_m5_equals_1[min_g].m_hi, hb = g_bc7_m5_equals_1[min_b].m_hi; + uint8_t solid_weights[16]; + memset(solid_weights, 1, 16); + pack_bc7_mode5_rgb_block((bc7_mode_5*)pDst_block, lr, lg, lb, hr, hg, hb, solid_weights); + return; + } + + int mean_r = (total_r + 8) >> 4, mean_g = (total_g + 8) >> 4, mean_b = (total_b + 8) >> 4; + + // covar rows are: + // 0, 1, 2 + // 1, 3, 4 + // 2, 4, 5 + int icov[6] = { 0, 0, 0, 0, 0, 0 }; + + for (uint32_t i = 0; i < 16; i++) + { + int r = (int)pPixels[i].r - mean_r; + int g = (int)pPixels[i].g - mean_g; + int b = (int)pPixels[i].b - mean_b; + icov[0] += r * r; icov[1] += r * g; icov[2] += r * b; + icov[3] += g * g; icov[4] += g * b; + icov[5] += b * b; + } + + int block_max_var = basisu::maximum(icov[0], icov[3], icov[5]); // not divided by 16, i.e. 
scaled by 16 + + // TODO: Tune this + const int32_t SIMPLE_BLOCK_THRESH = 10 * 16; + + if ((!hq_mode) && (block_max_var < SIMPLE_BLOCK_THRESH)) + { + const int L = 16, H = 239; + + int lr = to_7(lerp_8bit(min_r, max_r, L)); + int lg = to_7(lerp_8bit(min_g, max_g, L)); + int lb = to_7(lerp_8bit(min_b, max_b, L)); + + int hr = to_7(lerp_8bit(min_r, max_r, H)); + int hg = to_7(lerp_8bit(min_g, max_g, H)); + int hb = to_7(lerp_8bit(min_b, max_b, H)); + + uint8_t cur_weights[16]; + eval_weights(pPixels, cur_weights, lr, lg, lb, hr, hg, hb); + + pack_bc7_mode5_rgb_block((bc7_mode_5*)pDst_block, lr, lg, lb, hr, hg, hb, cur_weights); + return; + } + + float cov[6]; + for (uint32_t i = 0; i < 6; i++) + cov[i] = (float)icov[i]; + + const float sc = 1.0f / (float)block_max_var; + const float wx = sc * cov[0], wy = sc * cov[3], wz = sc * cov[5]; + + const float alt_xr = cov[0] * wx + cov[1] * wy + cov[2] * wz; + const float alt_xg = cov[1] * wx + cov[3] * wy + cov[4] * wz; + const float alt_xb = cov[2] * wx + cov[4] * wy + cov[5] * wz; + + int saxis_r = 306, saxis_g = 601, saxis_b = 117; + + float k = basisu::maximum(fabsf(alt_xr), fabsf(alt_xg), fabsf(alt_xb)); + if (fabs(k) >= basisu::SMALL_FLOAT_VAL) + { + float m = 2048.0f / k; + saxis_r = (int)(alt_xr * m); + saxis_g = (int)(alt_xg * m); + saxis_b = (int)(alt_xb * m); + } + + saxis_r = (int)((uint32_t)saxis_r << 4U); + saxis_g = (int)((uint32_t)saxis_g << 4U); + saxis_b = (int)((uint32_t)saxis_b << 4U); + + int low_dot = INT_MAX, high_dot = INT_MIN; + + for (uint32_t i = 0; i < 16; i += 4) + { + int dot0 = ((pPixels[i].r * saxis_r + pPixels[i].g * saxis_g + pPixels[i].b * saxis_b) & ~0xF) + i; + int dot1 = ((pPixels[i + 1].r * saxis_r + pPixels[i + 1].g * saxis_g + pPixels[i + 1].b * saxis_b) & ~0xF) + i + 1; + int dot2 = ((pPixels[i + 2].r * saxis_r + pPixels[i + 2].g * saxis_g + pPixels[i + 2].b * saxis_b) & ~0xF) + i + 2; + int dot3 = ((pPixels[i + 3].r * saxis_r + pPixels[i + 3].g * saxis_g + pPixels[i + 3].b * 
saxis_b) & ~0xF) + i + 3; + + int min_d01 = basisu::minimum(dot0, dot1); + int max_d01 = basisu::maximum(dot0, dot1); + + int min_d23 = basisu::minimum(dot2, dot3); + int max_d23 = basisu::maximum(dot2, dot3); + + int min_d = basisu::minimum(min_d01, min_d23); + int max_d = basisu::maximum(max_d01, max_d23); + + low_dot = basisu::minimum(low_dot, min_d); + high_dot = basisu::maximum(high_dot, max_d); + } + int low_c = low_dot & 15; + int high_c = high_dot & 15; + + int lr = to_7(pPixels[low_c].r), lg = to_7(pPixels[low_c].g), lb = to_7(pPixels[low_c].b); + int hr = to_7(pPixels[high_c].r), hg = to_7(pPixels[high_c].g), hb = to_7(pPixels[high_c].b); + + uint8_t cur_weights[16]; + eval_weights(pPixels, cur_weights, lr, lg, lb, hr, hg, hb); + + if (compute_least_squares_endpoints4_rgb( + pPixels, cur_weights, + lr, lg, lb, hr, hg, hb, + total_r, total_g, total_b)) + { + eval_weights(pPixels, cur_weights, lr, lg, lb, hr, hg, hb); + } + +#if 0 + lr = 0; lg = 0; lb = 0; + hr = 0; hg = 0; hb = 0; +#endif + + pack_bc7_mode5_rgb_block((bc7_mode_5*)pDst_block, lr, lg, lb, hr, hg, hb, cur_weights); + } + + } // namespace bc7_mode_5_encoder + +#endif // BASISD_SUPPORT_BC7_MODE5 + + //------------------------------------------------------------------------------------------------ + + basisu_lowlevel_etc1s_transcoder::basisu_lowlevel_etc1s_transcoder() : + m_pGlobal_codebook(nullptr), + m_selector_history_buf_size(0) + { + } + + bool basisu_lowlevel_etc1s_transcoder::decode_palettes( + uint32_t num_endpoints, const uint8_t* pEndpoints_data, uint32_t endpoints_data_size, + uint32_t num_selectors, const uint8_t* pSelectors_data, uint32_t selectors_data_size) + { + if (m_pGlobal_codebook) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 11\n"); + return false; + } + bitwise_decoder sym_codec; + + huffman_decoding_table color5_delta_model0, color5_delta_model1, color5_delta_model2, inten_delta_model; + + if (!sym_codec.init(pEndpoints_data, 
endpoints_data_size)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 0\n"); + return false; + } + + if (!sym_codec.read_huffman_table(color5_delta_model0)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 1\n"); + return false; + } + + if (!sym_codec.read_huffman_table(color5_delta_model1)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 1a\n"); + return false; + } + + if (!sym_codec.read_huffman_table(color5_delta_model2)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2a\n"); + return false; + } + + if (!sym_codec.read_huffman_table(inten_delta_model)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2b\n"); + return false; + } + + if (!color5_delta_model0.is_valid() || !color5_delta_model1.is_valid() || !color5_delta_model2.is_valid() || !inten_delta_model.is_valid()) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 2b\n"); + return false; + } + + const bool endpoints_are_grayscale = sym_codec.get_bits(1) != 0; + + m_local_endpoints.resize(num_endpoints); + + color32 prev_color5(16, 16, 16, 0); + uint32_t prev_inten = 0; + + for (uint32_t i = 0; i < num_endpoints; i++) + { + uint32_t inten_delta = sym_codec.decode_huffman(inten_delta_model); + m_local_endpoints[i].m_inten5 = static_cast((inten_delta + prev_inten) & 7); + prev_inten = m_local_endpoints[i].m_inten5; + + for (uint32_t c = 0; c < (endpoints_are_grayscale ? 
1U : 3U); c++) + { + int delta; + if (prev_color5[c] <= basist::COLOR5_PAL0_PREV_HI) + delta = sym_codec.decode_huffman(color5_delta_model0); + else if (prev_color5[c] <= basist::COLOR5_PAL1_PREV_HI) + delta = sym_codec.decode_huffman(color5_delta_model1); + else + delta = sym_codec.decode_huffman(color5_delta_model2); + + int v = (prev_color5[c] + delta) & 31; + + m_local_endpoints[i].m_color5[c] = static_cast(v); + + prev_color5[c] = static_cast(v); + } + + if (endpoints_are_grayscale) + { + m_local_endpoints[i].m_color5[1] = m_local_endpoints[i].m_color5[0]; + m_local_endpoints[i].m_color5[2] = m_local_endpoints[i].m_color5[0]; + } + } + + sym_codec.stop(); + + m_local_selectors.resize(num_selectors); + + if (!sym_codec.init(pSelectors_data, selectors_data_size)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: fail 5\n"); + return false; + } + + basist::huffman_decoding_table delta_selector_pal_model; + + const bool used_global_selector_cb = (sym_codec.get_bits(1) == 1); + + if (used_global_selector_cb) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::decode_palettes: global selector codebooks are unsupported\n"); return false; @@ -7799,7 +8486,7 @@ namespace basist bool basisu_lowlevel_etc1s_transcoder::transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const bool is_video, const bool is_alpha_slice, const uint32_t level_index, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, - basisu_transcoder_state* pState, bool transcode_alpha, void *pAlpha_blocks, uint32_t output_rows_in_pixels) + basisu_transcoder_state* pState, bool transcode_alpha, void *pAlpha_blocks, uint32_t output_rows_in_pixels, uint32_t decode_flags) { // 'pDst_blocks' unused when disabling *all* hardware transcode options // 
(and 'bc1_allow_threecolor_blocks' when disabling DXT) @@ -7878,6 +8565,7 @@ namespace basist block.set_diff_bit(true); + // Important: This MUST be freed before this function returns. void* pPVRTC_work_mem = nullptr; uint32_t* pPVRTC_endpoints = nullptr; if ((fmt == block_format::cPVRTC1_4_RGB) || (fmt == block_format::cPVRTC1_4_RGBA)) @@ -7888,7 +8576,7 @@ namespace basist BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: malloc failed\n"); return false; } - pPVRTC_endpoints = (uint32_t*) & ((decoder_etc_block*)pPVRTC_work_mem)[num_blocks_x * num_blocks_y]; + pPVRTC_endpoints = (uint32_t*)&((decoder_etc_block*)pPVRTC_work_mem)[num_blocks_x * num_blocks_y]; } if (pState->m_block_endpoint_preds[0].size() < num_blocks_x) @@ -7906,12 +8594,35 @@ namespace basist if (!endpoints.size() || !selectors.size()) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: global codebooks must be unpacked first\n"); + + if (pPVRTC_work_mem) + free(pPVRTC_work_mem); + return false; } const uint32_t SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX = (uint32_t)selectors.size(); const uint32_t SELECTOR_HISTORY_BUF_RLE_SYMBOL_INDEX = m_selector_history_buf_size + SELECTOR_HISTORY_BUF_FIRST_SYMBOL_INDEX; +#if BASISD_SUPPORT_BC7_MODE5 + const bool bc7_chroma_filtering = ((decode_flags & cDecodeFlagsNoETC1SChromaFiltering) == 0) && + ((fmt == block_format::cBC7_M5_COLOR) || (fmt == block_format::cBC7)); + + basisu::vector2D decoded_endpoints; + if (bc7_chroma_filtering) + { + if (!decoded_endpoints.try_resize(num_blocks_x, num_blocks_y)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: allocation failed\n"); + + if (pPVRTC_work_mem) + free(pPVRTC_work_mem); + + return false; + } + } +#endif + for (uint32_t block_y = 0; block_y < num_blocks_y; block_y++) { const uint32_t cur_block_endpoint_pred_array = block_y & 1; @@ -8245,6 +8956,12 @@ namespace basist case block_format::cBC7_M5_COLOR: { #if BASISD_SUPPORT_BC7_MODE5 + if 
(bc7_chroma_filtering) + { + assert(endpoint_index <= UINT16_MAX); + decoded_endpoints(block_x, block_y) = (uint16_t)endpoint_index; + } + void* pDst_block = static_cast(pDst_blocks) + (block_x + block_y * output_row_pitch_in_blocks_or_pixels) * output_block_or_pixel_stride_in_bytes; convert_etc1s_to_bc7_m5_color(pDst_block, pEndpoints, pSelector); #else @@ -8599,11 +9316,15 @@ namespace basist } // block_x - } // block-y + } // block_y if (endpoint_pred_repeat_count != 0) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_slice: endpoint_pred_repeat_count != 0. The file is corrupted or this is a bug\n"); + + if (pPVRTC_work_mem) + free(pPVRTC_work_mem); + return false; } @@ -8617,19 +9338,29 @@ namespace basist fixup_pvrtc1_4_modulation_rgba((decoder_etc_block*)pPVRTC_work_mem, pPVRTC_endpoints, pDst_blocks, num_blocks_x, num_blocks_y, pAlpha_blocks, &endpoints[0], &selectors[0]); #endif // BASISD_SUPPORT_PVRTC1 +#if BASISD_SUPPORT_BC7_MODE5 + if (bc7_chroma_filtering) + { + chroma_filter_bc7_mode5(decoded_endpoints, pDst_blocks, num_blocks_x, num_blocks_y, output_row_pitch_in_blocks_or_pixels, &endpoints[0]); + } +#endif + if (pPVRTC_work_mem) free(pPVRTC_work_mem); return true; } - bool basis_validate_output_buffer_size(transcoder_texture_format target_format, + bool basis_validate_output_buffer_size( + basis_tex_format source_format, + transcoder_texture_format target_format, uint32_t output_blocks_buf_size_in_blocks_or_pixels, uint32_t orig_width, uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, - uint32_t output_rows_in_pixels, - uint32_t total_slice_blocks) + uint32_t output_rows_in_pixels) { + BASISU_NOTE_UNUSED(source_format); + if (basis_transcoder_format_is_uncompressed(target_format)) { // Assume the output buffer is orig_width by orig_height @@ -8646,38 +9377,79 @@ namespace basist return false; } } - else if (target_format == transcoder_texture_format::cTFFXT1_RGB) - { - const uint32_t num_blocks_fxt1_x = (orig_width 
+ 7) / 8; - const uint32_t num_blocks_fxt1_y = (orig_height + 3) / 4; - const uint32_t total_blocks_fxt1 = num_blocks_fxt1_x * num_blocks_fxt1_y; - - if (output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1) - { - BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1\n"); - return false; - } - } else { - if (output_blocks_buf_size_in_blocks_or_pixels < total_slice_blocks) + const uint32_t dst_block_width = basis_get_block_width(target_format); + const uint32_t dst_block_height = basis_get_block_height(target_format); + //const uint32_t bytes_per_block = basis_get_bytes_per_block_or_pixel(target_format); + + // Take into account the destination format's block width/height. + const uint32_t num_dst_blocks_x = (orig_width + dst_block_width - 1) / dst_block_width; + const uint32_t num_dst_blocks_y = (orig_height + dst_block_height - 1) / dst_block_height; + const uint32_t total_dst_blocks = num_dst_blocks_x * num_dst_blocks_y; + + assert(total_dst_blocks); + + // Note this only computes the # of blocks we will write during transcoding, but for PVRTC1 OpenGL may require more for very small textures. + // basis_compute_transcoded_image_size_in_bytes() may return larger buffers. 
+ if (output_blocks_buf_size_in_blocks_or_pixels < total_dst_blocks) { - BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels < transcode_image\n"); + BASISU_DEVEL_ERROR("basis_validate_output_buffer_size: output_blocks_buf_size_in_blocks_or_pixels is too small\n"); return false; } } + return true; } + + uint32_t basis_compute_transcoded_image_size_in_bytes(transcoder_texture_format target_format, uint32_t orig_width, uint32_t orig_height) + { + assert(orig_width && orig_height); - bool basisu_lowlevel_etc1s_transcoder::transcode_image( - transcoder_texture_format target_format, - void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, - const uint8_t* pCompressed_data, uint32_t compressed_data_length, - uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, - uint32_t rgb_offset, uint32_t rgb_length, uint32_t alpha_offset, uint32_t alpha_length, - uint32_t decode_flags, - bool basis_file_has_alpha_slices, - bool is_video, + const uint32_t dst_block_width = basis_get_block_width(target_format); + const uint32_t dst_block_height = basis_get_block_height(target_format); + + if (basis_transcoder_format_is_uncompressed(target_format)) + { + // Uncompressed formats are just plain raster images. + const uint32_t bytes_per_pixel = basis_get_uncompressed_bytes_per_pixel(target_format); + const uint32_t bytes_per_line = orig_width * bytes_per_pixel; + const uint32_t bytes_per_slice = bytes_per_line * orig_height; + return bytes_per_slice; + } + + // Compressed formats are 2D arrays of blocks. + const uint32_t bytes_per_block = basis_get_bytes_per_block_or_pixel(target_format); + + if ((target_format == transcoder_texture_format::cTFPVRTC1_4_RGB) || (target_format == transcoder_texture_format::cTFPVRTC1_4_RGBA)) + { + // For PVRTC1, Basis only writes (or requires) total_blocks * bytes_per_block. 
But GL requires extra padding for very small textures: + // https://www.khronos.org/registry/OpenGL/extensions/IMG/IMG_texture_compression_pvrtc.txt + const uint32_t width = (orig_width + 3) & ~3; + const uint32_t height = (orig_height + 3) & ~3; + const uint32_t size_in_bytes = (std::max(8U, width) * std::max(8U, height) * 4 + 7) / 8; + return size_in_bytes; + } + + // Take into account the destination format's block width/height. + const uint32_t num_dst_blocks_x = (orig_width + dst_block_width - 1) / dst_block_width; + const uint32_t num_dst_blocks_y = (orig_height + dst_block_height - 1) / dst_block_height; + const uint32_t total_dst_blocks = num_dst_blocks_x * num_dst_blocks_y; + + assert(total_dst_blocks); + + return total_dst_blocks * bytes_per_block; + } + + bool basisu_lowlevel_etc1s_transcoder::transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t rgb_offset, uint32_t rgb_length, uint32_t alpha_offset, uint32_t alpha_length, + uint32_t decode_flags, + bool basis_file_has_alpha_slices, + bool is_video, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state* pState, uint32_t output_rows_in_pixels) @@ -8720,8 +9492,8 @@ namespace basist const bool transcode_alpha_data_to_opaque_formats = (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0; const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format); const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; - - if (!basis_validate_output_buffer_size(target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels, total_slice_blocks)) + + if 
(!basis_validate_output_buffer_size(basis_tex_format::cETC1S, target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels)) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: output buffer size too small\n"); return false; @@ -8746,7 +9518,7 @@ namespace basist case transcoder_texture_format::cTFETC1_RGB: { //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC1, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cETC1, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cETC1, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { @@ -8761,7 +9533,7 @@ namespace basist return false; #else // status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC1, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cBC1, bytes_per_block_or_pixel, true, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cBC1, bytes_per_block_or_pixel, true, is_video, 
is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC1 failed\n"); @@ -8776,7 +9548,7 @@ namespace basist return false; #else //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC4 failed\n"); @@ -8792,7 +9564,7 @@ namespace basist #else // output_row_pitch_in_blocks_or_pixels is actually ignored because we're transcoding to PVRTC1. (Print a dev warning if it's != 0?) 
//status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cPVRTC1_4_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cPVRTC1_4_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to PVRTC1 4 RGB failed\n"); @@ -8814,7 +9586,7 @@ namespace basist // First transcode alpha data to temp buffer //status = transcode_slice(pData, data_size, slice_index + 1, &temp_block_indices[0], total_slice_blocks, block_format::cIndices, sizeof(uint32_t), decode_flags, pSlice_descs[slice_index].m_num_blocks_x, pState); - status = transcode_slice(&temp_block_indices[0], num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, sizeof(uint32_t), false, is_video, true, level_index, orig_width, orig_height, num_blocks_x, pState, false, nullptr, 0); + status = transcode_slice(&temp_block_indices[0], num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, sizeof(uint32_t), false, is_video, true, level_index, orig_width, orig_height, num_blocks_x, pState, false, nullptr, 0, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to PVRTC1 4 RGBA failed (0)\n"); @@ -8823,7 +9595,7 
@@ namespace basist { // output_row_pitch_in_blocks_or_pixels is actually ignored because we're transcoding to PVRTC1. (Print a dev warning if it's != 0?) //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGBA, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState, &temp_block_indices[0]); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC1_4_RGBA, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, &temp_block_indices[0], 0); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC1_4_RGBA, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, &temp_block_indices[0], 0, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to PVRTC1 4 RGBA failed (1)\n"); @@ -8845,13 +9617,13 @@ namespace basist // First transcode the color slice. The cBC7_M5_COLOR transcoder will output opaque mode 5 blocks. 
//status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7_M5_COLOR, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC7_M5_COLOR, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC7_M5_COLOR, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if ((status) && (basis_file_has_alpha_slices)) { // Now transcode the alpha slice. The cBC7_M5_ALPHA transcoder will now change the opaque mode 5 blocks to blocks with alpha. 
//status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7_M5_ALPHA, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC7_M5_ALPHA, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC7_M5_ALPHA, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); } if (!status) @@ -8874,7 +9646,7 @@ namespace basist { // First decode the alpha data //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_A8, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cETC2_EAC_A8, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cETC2_EAC_A8, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); } else { @@ -8887,7 +9659,7 @@ namespace basist { // Now decode the color data //status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, 
output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC1, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cETC1, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cETC1, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC2 RGB failed\n"); @@ -8915,7 +9687,7 @@ namespace basist if (basis_file_has_alpha_slices) { //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); } else { @@ -8927,7 +9699,7 @@ namespace basist { // Now decode the color data. Forbid 3 color blocks, which aren't allowed in BC3. 
//status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC1, 16, decode_flags | cDecodeFlagsBC1ForbidThreeColorBlocks, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC1, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC1, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC3 RGB failed\n"); @@ -8955,14 +9727,14 @@ namespace basist // Decode the R data (actually the green channel of the color data slice in the basis file) //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, 
decode_flags); if (status) { if (basis_file_has_alpha_slices) { // Decode the G data (actually the green channel of the alpha data slice in the basis file) //status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to BC5 1 failed\n"); @@ -8993,18 +9765,18 @@ namespace basist { // First decode the alpha data to the output (we're using the output texture as a temp buffer here). 
//status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cIndices, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (status) { // Now decode the color data and transcode to ASTC. The transcoder function will read the alpha selector data from the output texture as it converts and // transcode both the alpha and color data at the same time to ASTC. 
//status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cASTC_4x4, 16, decode_flags | cDecodeFlagsOutputHasAlphaIndices, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cASTC_4x4, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, true, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cASTC_4x4, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, true, nullptr, output_rows_in_pixels, decode_flags); } } else //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cASTC_4x4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cASTC_4x4, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cASTC_4x4, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { @@ -9021,7 +9793,7 @@ namespace basist return false; #else //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cATC_RGB, bytes_per_block_or_pixel, 
decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cATC_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cATC_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ATC_RGB failed\n"); @@ -9044,7 +9816,7 @@ namespace basist if (basis_file_has_alpha_slices) { //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC4, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cBC4, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); } else { @@ -9055,7 +9827,7 @@ namespace basist if (status) { //status = transcode_slice(pData, data_size, slice_index, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cATC_RGB, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - 
status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cATC_RGB, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cATC_RGB, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ATC RGB failed\n"); @@ -9075,7 +9847,7 @@ namespace basist return false; #else //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to cPVRTC2_4_RGB failed\n"); @@ -9093,7 +9865,7 @@ namespace basist { // First decode the alpha data to the output (we're using the output texture as a temp buffer here). 
//status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cIndices, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cIndices, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to failed\n"); @@ -9102,12 +9874,12 @@ namespace basist { // Now decode the color data and transcode to PVRTC2 RGBA. 
//status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGBA, bytes_per_block_or_pixel, decode_flags | cDecodeFlagsOutputHasAlphaIndices, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC2_4_RGBA, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, true, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC2_4_RGBA, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, true, nullptr, output_rows_in_pixels, decode_flags); } } else //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cPVRTC2_4_RGB, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { @@ -9124,14 +9896,14 @@ namespace basist // First decode the alpha data if (basis_file_has_alpha_slices) //status = transcode_slice(pData, data_size, slice_index + 1, 
pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cA32, sizeof(uint32_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cA32, sizeof(uint32_t), false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cA32, sizeof(uint32_t), false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); else status = true; if (status) { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, basis_file_has_alpha_slices ? block_format::cRGB32 : block_format::cRGBA32, sizeof(uint32_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, basis_file_has_alpha_slices ? block_format::cRGB32 : block_format::cRGBA32, sizeof(uint32_t), false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, basis_file_has_alpha_slices ? 
block_format::cRGB32 : block_format::cRGBA32, sizeof(uint32_t), false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to RGBA32 RGB failed\n"); @@ -9150,7 +9922,7 @@ namespace basist // Raw 16bpp pixels, decoded in the usual raster order (NOT block order) into an image in memory. //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, (fmt == transcoder_texture_format::cTFRGB565) ? block_format::cRGB565 : block_format::cBGR565, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, (target_format == transcoder_texture_format::cTFRGB565) ? block_format::cRGB565 : block_format::cBGR565, sizeof(uint16_t), false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, (target_format == transcoder_texture_format::cTFRGB565) ? 
block_format::cRGB565 : block_format::cBGR565, sizeof(uint16_t), false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to RGB565 RGB failed\n"); @@ -9165,14 +9937,14 @@ namespace basist // First decode the alpha data if (basis_file_has_alpha_slices) //status = transcode_slice(pData, data_size, slice_index + 1, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGBA4444_ALPHA, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cRGBA4444_ALPHA, sizeof(uint16_t), false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cRGBA4444_ALPHA, sizeof(uint16_t), false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); else status = true; if (status) { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, basis_file_has_alpha_slices ? block_format::cRGBA4444_COLOR : block_format::cRGBA4444_COLOR_OPAQUE, sizeof(uint16_t), decode_flags, output_row_pitch_in_blocks_or_pixels, pState, nullptr, output_rows_in_pixels); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, basis_file_has_alpha_slices ? 
block_format::cRGBA4444_COLOR : block_format::cRGBA4444_COLOR_OPAQUE, sizeof(uint16_t), false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, basis_file_has_alpha_slices ? block_format::cRGBA4444_COLOR : block_format::cRGBA4444_COLOR_OPAQUE, sizeof(uint16_t), false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to RGBA4444 RGB failed\n"); @@ -9192,7 +9964,7 @@ namespace basist return false; #else //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cFXT1_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cFXT1_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cFXT1_RGB, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to FXT1_RGB failed\n"); @@ -9207,7 +9979,7 @@ namespace basist return false; #else //status = transcode_slice(pData, data_size, slice_index_to_decode, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, 
block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pData, data_len, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, is_alpha_slice, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC2_EAC_R11 failed\n"); @@ -9228,7 +10000,7 @@ namespace basist { // First decode the alpha data to G //status = transcode_slice(pData, data_size, slice_index + 1, (uint8_t*)pOutput_blocks + 8, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice((uint8_t *)pOutput_blocks + 8, num_blocks_x, num_blocks_y, pCompressed_data + alpha_offset, alpha_length, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, true, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); } else { @@ -9240,7 +10012,7 @@ namespace basist { // Now decode the color data to R //status = transcode_slice(pData, data_size, slice_index, 
pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_EAC_R11, 16, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); - status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + rgb_offset, rgb_length, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, is_video, false, level_index, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, false, nullptr, output_rows_in_pixels, decode_flags); if (!status) { BASISU_DEVEL_ERROR("basisu_lowlevel_etc1s_transcoder::transcode_image: transcode_slice() to ETC2_EAC_R11 R failed\n"); @@ -9267,11 +10039,11 @@ namespace basist //------------------------------------------------------------------------------------------------ - basisu_lowlevel_uastc_transcoder::basisu_lowlevel_uastc_transcoder() + basisu_lowlevel_uastc_ldr_4x4_transcoder::basisu_lowlevel_uastc_ldr_4x4_transcoder() { } - bool basisu_lowlevel_uastc_transcoder::transcode_slice( + bool basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_slice( void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, @@ -9283,7 +10055,7 @@ namespace basist assert(g_transcoder_initialized); if (!g_transcoder_initialized) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_slice: Transcoder not globally initialized.\n"); + 
BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_slice: Transcoder not globally initialized.\n"); return false; } @@ -9312,7 +10084,7 @@ namespace basist uint32_t total_expected_block_bytes = sizeof(uastc_block) * total_blocks; if (image_data_size < total_expected_block_bytes) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_slice: image_data_size < total_expected_block_bytes The file is corrupted or this is a bug.\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_slice: image_data_size < total_expected_block_bytes The file is corrupted or this is a bug.\n"); return false; } @@ -9504,7 +10276,7 @@ namespace basist if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_slice: Transcoder failed to unpack a UASTC block - this is a bug, or the data was corrupted\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_slice: Transcoder failed to unpack a UASTC block - this is a bug, or the data was corrupted\n"); return false; } @@ -9515,7 +10287,7 @@ namespace basist return true; #else - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_slice: UASTC is unsupported\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_slice: UASTC is unsupported\n"); BASISU_NOTE_UNUSED(decode_flags); BASISU_NOTE_UNUSED(channel0); @@ -9534,7 +10306,7 @@ namespace basist #endif } - bool basisu_lowlevel_uastc_transcoder::transcode_image( + bool basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image( transcoder_texture_format target_format, void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, const uint8_t* pCompressed_data, uint32_t compressed_data_length, @@ -9553,7 +10325,7 @@ namespace basist if (((uint64_t)slice_offset + slice_length) > (uint64_t)compressed_data_length) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: source data buffer too small\n"); + 
BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: source data buffer too small\n"); return false; } @@ -9562,7 +10334,7 @@ namespace basist if ((!basisu::is_pow2(num_blocks_x * 4)) || (!basisu::is_pow2(num_blocks_y * 4))) { // PVRTC1 only supports power of 2 dimensions - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: PVRTC1 only supports power of 2 dimensions\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: PVRTC1 only supports power of 2 dimensions\n"); return false; } } @@ -9575,11 +10347,11 @@ namespace basist const bool transcode_alpha_data_to_opaque_formats = (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0; const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format); - const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; + //const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; - if (!basis_validate_output_buffer_size(target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels, total_slice_blocks)) + if (!basis_validate_output_buffer_size(basis_tex_format::cUASTC4x4, target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: output buffer size too small\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: output buffer size too small\n"); return false; } @@ -9592,11 +10364,11 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC1, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, 
slice_length, block_format::cETC1, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to ETC1 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to ETC1 failed\n"); } break; } @@ -9604,10 +10376,10 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cETC2_RGBA, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cETC2_RGBA, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to ETC2 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to ETC2 failed\n"); } break; } @@ -9616,10 +10388,10 @@ namespace basist // TODO: ETC1S allows BC1 from alpha channel. That doesn't seem actually useful, though. 
//status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC1, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC1, - bytes_per_block_or_pixel, true, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + bytes_per_block_or_pixel, true, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC1 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to BC1 failed\n"); } break; } @@ -9627,10 +10399,10 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC3, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC3, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC3 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to BC3 failed\n"); } break; } @@ -9641,10 +10413,10 @@ namespace basist // 
((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 3 : 0); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC4, bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, - ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 3 : 0); + ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 3 : 0, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC4 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to BC4 failed\n"); } break; } @@ -9655,10 +10427,10 @@ namespace basist // 0, 3); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC5, bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, - 0, 3); + 0, 3, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC5 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to BC5 failed\n"); } break; } @@ -9667,10 +10439,10 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBC7, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC7, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, 
output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, -1, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to BC7 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to BC7 failed\n"); } break; } @@ -9678,10 +10450,10 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGB, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cPVRTC1_4_RGB, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, -1, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to PVRTC1 RGB 4bpp failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to PVRTC1 RGB 4bpp failed\n"); } break; } @@ -9689,10 +10461,10 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cPVRTC1_4_RGBA, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cPVRTC1_4_RGBA, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, 
output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, -1, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to PVRTC1 RGBA 4bpp failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to PVRTC1 RGBA 4bpp failed\n"); } break; } @@ -9700,32 +10472,32 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cASTC_4x4, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cASTC_4x4, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, -1, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to ASTC 4x4 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to ASTC 4x4 failed\n"); } break; } case transcoder_texture_format::cTFATC_RGB: case transcoder_texture_format::cTFATC_RGBA: { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: UASTC->ATC currently unsupported\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: UASTC->ATC currently unsupported\n"); return false; } case transcoder_texture_format::cTFFXT1_RGB: { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: UASTC->FXT1 currently unsupported\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: UASTC->FXT1 currently unsupported\n"); return false; } case 
transcoder_texture_format::cTFPVRTC2_4_RGB: { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: UASTC->PVRTC2 currently unsupported\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: UASTC->PVRTC2 currently unsupported\n"); return false; } case transcoder_texture_format::cTFPVRTC2_4_RGBA: { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: UASTC->PVRTC2 currently unsupported\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: UASTC->PVRTC2 currently unsupported\n"); return false; } case transcoder_texture_format::cTFETC2_EAC_R11: @@ -9735,10 +10507,10 @@ namespace basist // ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 3 : 0); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cETC2_EAC_R11, bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, - ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 3 : 0); + ((has_alpha) && (transcode_alpha_data_to_opaque_formats)) ? 
3 : 0, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to EAC R11 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to EAC R11 failed\n"); } break; } @@ -9749,10 +10521,10 @@ namespace basist // 0, 3); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cETC2_EAC_RG11, bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, - 0, 3); + 0, 3, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_basisu_lowlevel_uastc_transcodertranscoder::transcode_image: transcode_slice() to EAC RG11 failed\n"); + BASISU_DEVEL_ERROR("basisu_basisu_lowlevel_uastc_ldr_4x4_transcodertranscoder::transcode_image: transcode_slice() to EAC RG11 failed\n"); } break; } @@ -9760,10 +10532,10 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGBA32, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA32, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, -1, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to RGBA32 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to RGBA32 failed\n"); } break; } @@ -9771,10 +10543,10 @@ namespace basist { //status = 
transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGB565, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB565, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, -1, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to RGB565 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to RGB565 failed\n"); } break; } @@ -9782,10 +10554,10 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cBGR565, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBGR565, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, -1, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to RGB565 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to RGB565 failed\n"); } break; } @@ -9793,17 +10565,17 @@ namespace basist { //status = transcode_slice(pData, data_size, slice_index, 
pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, block_format::cRGBA4444, bytes_per_block_or_pixel, decode_flags, output_row_pitch_in_blocks_or_pixels, pState); status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA4444, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, -1, -1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: transcode_slice() to RGBA4444 failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: transcode_slice() to RGBA4444 failed\n"); } break; } default: { assert(0); - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_transcoder::transcode_image: Invalid format\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_ldr_4x4_transcoder::transcode_image: Invalid format\n"); break; } } @@ -9812,12 +10584,13 @@ namespace basist } //------------------------------------------------------------------------------------------------ + // UASTC HDR 4x4 - basisu_lowlevel_uastc_hdr_transcoder::basisu_lowlevel_uastc_hdr_transcoder() + basisu_lowlevel_uastc_hdr_4x4_transcoder::basisu_lowlevel_uastc_hdr_4x4_transcoder() { } - bool basisu_lowlevel_uastc_hdr_transcoder::transcode_slice( + bool basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_slice( void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, @@ -9829,11 +10602,13 @@ namespace basist BASISU_NOTE_UNUSED(channel0); 
BASISU_NOTE_UNUSED(channel1); BASISU_NOTE_UNUSED(decode_flags); + BASISU_NOTE_UNUSED(orig_width); + BASISU_NOTE_UNUSED(orig_height); assert(g_transcoder_initialized); if (!g_transcoder_initialized) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: Transcoder not globally initialized.\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_slice: Transcoder not globally initialized.\n"); return false; } @@ -9857,7 +10632,7 @@ namespace basist uint32_t total_expected_block_bytes = sizeof(astc_blk) * total_blocks; if (image_data_size < total_expected_block_bytes) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: image_data_size < total_expected_block_bytes The file is corrupted or this is a bug.\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_slice: image_data_size < total_expected_block_bytes The file is corrupted or this is a bug.\n"); return false; } @@ -9878,7 +10653,7 @@ namespace basist case block_format::cUASTC_HDR_4x4: case block_format::cASTC_HDR_4x4: { - // Nothing to do, UASTC HDR is just ASTC. + // Nothing to do, UASTC HDR 4x4 is just ASTC. 
memcpy(pDst_block, pSource_block, sizeof(uastc_block)); status = true; break; @@ -9993,7 +10768,8 @@ namespace basist if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: Transcoder failed to unpack a UASTC HDR block - this is a bug, or the data was corrupted\n"); return false; + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_slice: Transcoder failed to unpack a UASTC HDR block - this is a bug, or the data was corrupted\n"); + return false; } } // block_x @@ -10002,7 +10778,7 @@ namespace basist return true; #else - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_slice: UASTC_HDR is unsupported\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_slice: UASTC_HDR is unsupported\n"); BASISU_NOTE_UNUSED(decode_flags); BASISU_NOTE_UNUSED(channel0); @@ -10021,7 +10797,7 @@ namespace basist #endif } - bool basisu_lowlevel_uastc_hdr_transcoder::transcode_image( + bool basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image( transcoder_texture_format target_format, void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, const uint8_t* pCompressed_data, uint32_t compressed_data_length, @@ -10041,16 +10817,16 @@ namespace basist if (((uint64_t)slice_offset + slice_length) > (uint64_t)compressed_data_length) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: source data buffer too small\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image: source data buffer too small\n"); return false; } const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format); - const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; + //const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; - if (!basis_validate_output_buffer_size(target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, 
output_rows_in_pixels, total_slice_blocks)) + if (!basis_validate_output_buffer_size(basis_tex_format::cUASTC_HDR_4x4, target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels)) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: output buffer size too small\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image: output buffer size too small\n"); return false; } @@ -10061,58 +10837,58 @@ namespace basist case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: { status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cASTC_HDR_4x4, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to ASTC_HDR failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image: transcode_slice() to ASTC_HDR failed\n"); } break; } case transcoder_texture_format::cTFBC6H: { status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC6H, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to BC6H 
failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image: transcode_slice() to BC6H failed\n"); } break; } case transcoder_texture_format::cTFRGB_HALF: { status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_HALF, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGB_HALF failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image: transcode_slice() to RGB_HALF failed\n"); } break; } case transcoder_texture_format::cTFRGBA_HALF: { status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA_HALF, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); } break; } case transcoder_texture_format::cTFRGB_9E5: { status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_9E5, - bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, 
output_rows_in_pixels); + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); if (!status) { - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); } break; } default: { assert(0); - BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_transcoder::transcode_image: Invalid format\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_uastc_hdr_4x4_transcoder::transcode_image: Invalid format\n"); break; } } @@ -10121,4282 +10897,7052 @@ namespace basist } //------------------------------------------------------------------------------------------------ - - basisu_transcoder::basisu_transcoder() : - m_ready_to_transcode(false) - { - } + // ASTC 6x6 HDR - bool basisu_transcoder::validate_file_checksums(const void* pData, uint32_t data_size, bool full_validation) const + basisu_lowlevel_astc_hdr_6x6_transcoder::basisu_lowlevel_astc_hdr_6x6_transcoder() { - if (!validate_header(pData, data_size)) - return false; - - const basis_file_header* pHeader = reinterpret_cast(pData); - -#if !BASISU_NO_HEADER_OR_DATA_CRC16_CHECKS - if (crc16(&pHeader->m_data_size, sizeof(basis_file_header) - BASISU_OFFSETOF(basis_file_header, m_data_size), 0) != pHeader->m_header_crc16) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header CRC check failed\n"); - return false; - } - - if (full_validation) - { - if (crc16(reinterpret_cast(pData) + sizeof(basis_file_header), pHeader->m_data_size, 0) != pHeader->m_data_crc16) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: data CRC check failed\n"); - return false; - } - } -#endif - - return true; } - bool basisu_transcoder::validate_header_quick(const void* pData, uint32_t data_size) const + // num_blocks_x/num_blocks_y are source 
6x6 blocks + bool basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice( + void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, + const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, uint32_t output_rows_in_pixels, int channel0, int channel1, uint32_t decode_flags) { - if (data_size <= sizeof(basis_file_header)) - return false; - - const basis_file_header* pHeader = reinterpret_cast(pData); + BASISU_NOTE_UNUSED(pState); + BASISU_NOTE_UNUSED(bc1_allow_threecolor_blocks); + BASISU_NOTE_UNUSED(has_alpha); + BASISU_NOTE_UNUSED(channel0); + BASISU_NOTE_UNUSED(channel1); + BASISU_NOTE_UNUSED(decode_flags); + BASISU_NOTE_UNUSED(orig_width); + BASISU_NOTE_UNUSED(orig_height); - if ((pHeader->m_sig != basis_file_header::cBASISSigValue) || (pHeader->m_ver != BASISD_SUPPORTED_BASIS_VERSION) || (pHeader->m_header_size != sizeof(basis_file_header))) + assert(g_transcoder_initialized); + if (!g_transcoder_initialized) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header has an invalid signature, or file version is unsupported\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice: Transcoder not globally initialized.\n"); return false; } - uint32_t expected_file_size = sizeof(basis_file_header) + pHeader->m_data_size; - if (data_size < expected_file_size) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: source buffer is too small\n"); - return false; - } +#if BASISD_SUPPORT_UASTC_HDR + const uint32_t total_src_blocks = num_blocks_x * num_blocks_y; - if ((!pHeader->m_total_slices) || (!pHeader->m_total_images)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: header is invalid\n"); - return false; - } + const uint32_t output_block_width = 
get_block_width(fmt); + //const uint32_t output_block_height = get_block_height(fmt); - if ((pHeader->m_slice_desc_file_ofs >= data_size) || - ((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices)) - ) + if (!output_row_pitch_in_blocks_or_pixels) { - BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: passed in buffer is too small or data is corrupted\n"); - return false; + if (basis_block_format_is_uncompressed(fmt)) + output_row_pitch_in_blocks_or_pixels = orig_width; + else + output_row_pitch_in_blocks_or_pixels = (orig_width + output_block_width - 1) / output_block_width; } - return true; - } - - bool basisu_transcoder::validate_header(const void* pData, uint32_t data_size) const - { - if (data_size <= sizeof(basis_file_header)) + if (basis_block_format_is_uncompressed(fmt)) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: input source buffer is too small\n"); - return false; + if (!output_rows_in_pixels) + output_rows_in_pixels = orig_height; } - const basis_file_header* pHeader = reinterpret_cast(pData); - - if ((pHeader->m_sig != basis_file_header::cBASISSigValue) || (pHeader->m_ver != BASISD_SUPPORTED_BASIS_VERSION) || (pHeader->m_header_size != sizeof(basis_file_header))) + uint32_t total_expected_block_bytes = sizeof(astc_blk) * total_src_blocks; + if (image_data_size < total_expected_block_bytes) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header has an invalid signature, or file version is unsupported\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice: image_data_size < total_expected_block_bytes The file is corrupted or this is a bug.\n"); return false; } - uint32_t expected_file_size = sizeof(basis_file_header) + pHeader->m_data_size; - if (data_size < expected_file_size) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: input source buffer is too small, or header is corrupted\n"); - return false; - } + const astc_blk* 
pSource_block = reinterpret_cast(pImage_data); - if ((!pHeader->m_total_images) || (!pHeader->m_total_slices)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (total images or slices are 0)\n"); - return false; - } + bool status = false; - if (pHeader->m_total_images > pHeader->m_total_slices) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (too many images)\n"); - return false; - } + half_float unpacked_blocks[12][12][3]; // [y][x][c] - if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + assert(((orig_width + 5) / 6) == num_blocks_x); + assert(((orig_height + 5) / 6) == num_blocks_y); + + if (fmt == block_format::cBC6H) { - if (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) + const uint32_t num_dst_blocks_x = (orig_width + 3) / 4; + const uint32_t num_dst_blocks_y = (orig_height + 3) / 4; + + if (!output_row_pitch_in_blocks_or_pixels) { - if (pHeader->m_total_slices & 1) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid alpha .basis file\n"); - return false; - } + output_row_pitch_in_blocks_or_pixels = num_dst_blocks_x; } - - // This flag dates back to pre-Basis Universal, when .basis supported full ETC1 too. 
- if ((pHeader->m_flags & cBASISHeaderFlagETC1S) == 0) + else if (output_row_pitch_in_blocks_or_pixels < num_dst_blocks_x) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: Invalid .basis file (ETC1S check)\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice: output_row_pitch_in_blocks_or_pixels is too low\n"); return false; } - } - else - { - if ((pHeader->m_flags & cBASISHeaderFlagETC1S) != 0) + + if (output_block_or_pixel_stride_in_bytes != sizeof(bc6h_block)) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: Invalid .basis file (ETC1S check)\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice: invalid output_block_or_pixel_stride_in_bytes\n"); return false; } - } - - if ((pHeader->m_slice_desc_file_ofs >= data_size) || - ((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices)) - ) - { - BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: passed in buffer is too small or data is corrupted\n"); - return false; - } - - return true; - } - basis_texture_type basisu_transcoder::get_texture_type(const void* pData, uint32_t data_size) const - { - if (!validate_header_quick(pData, data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_texture_type: header validation failed\n"); - return cBASISTexType2DArray; - } + fast_bc6h_params bc6h_enc_params; + const bool hq_flag = (decode_flags & cDecodeFlagsHighQuality) != 0; + bc6h_enc_params.m_max_2subset_pats_to_try = hq_flag ? 
1 : 0; + + for (uint32_t src_block_y = 0; src_block_y < num_blocks_y; src_block_y += 2) + { + const uint32_t num_inner_blocks_y = basisu::minimum(2, num_blocks_y - src_block_y); - const basis_file_header* pHeader = static_cast(pData); + for (uint32_t src_block_x = 0; src_block_x < num_blocks_x; src_block_x += 2) + { + const uint32_t num_inner_blocks_x = basisu::minimum(2, num_blocks_x - src_block_x); - basis_texture_type btt = static_cast(static_cast(pHeader->m_tex_type)); + for (uint32_t iy = 0; iy < num_inner_blocks_y; iy++) + { + for (uint32_t ix = 0; ix < num_inner_blocks_x; ix++) + { + const astc_blk* pS = pSource_block + (src_block_y + iy) * num_blocks_x + (src_block_x + ix); - if (btt >= cBASISTexTypeTotal) - { - BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: header's texture type field is invalid\n"); - return cBASISTexType2DArray; - } + half_float blk_texels[6][6][4]; + + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pS, log_blk, 6, 6); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice: Transcoder failed to unpack a ASTC HDR block - this is a bug, or the data was corrupted\n"); + return false; + } + + status = astc_helpers::decode_block(log_blk, blk_texels, 6, 6, astc_helpers::cDecodeModeHDR16); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice: Transcoder failed to unpack a ASTC HDR block - this is a bug, or the data was corrupted\n"); + return false; + } - return btt; - } + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + unpacked_blocks[iy * 6 + y][ix * 6 + x][0] = blk_texels[y][x][0]; + unpacked_blocks[iy * 6 + y][ix * 6 + x][1] = blk_texels[y][x][1]; + unpacked_blocks[iy * 6 + y][ix * 6 + x][2] = blk_texels[y][x][2]; + + } // x + } // y - bool basisu_transcoder::get_userdata(const void* pData, uint32_t data_size, uint32_t& userdata0, uint32_t& userdata1) const - { - if 
(!validate_header_quick(pData, data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_userdata: header validation failed\n"); - return false; - } + } // ix - const basis_file_header* pHeader = static_cast(pData); + } // iy + + const uint32_t dst_x = src_block_x * 6; + assert((dst_x & 3) == 0); + const uint32_t dst_block_x = dst_x >> 2; - userdata0 = pHeader->m_userdata0; - userdata1 = pHeader->m_userdata1; - return true; - } + const uint32_t dst_y = src_block_y * 6; + assert((dst_y & 3) == 0); + const uint32_t dst_block_y = dst_y >> 2; - uint32_t basisu_transcoder::get_total_images(const void* pData, uint32_t data_size) const - { - if (!validate_header_quick(pData, data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header validation failed\n"); - return 0; - } + const uint32_t num_inner_dst_blocks_x = basisu::minimum(3, num_dst_blocks_x - dst_block_x); + const uint32_t num_inner_dst_blocks_y = basisu::minimum(3, num_dst_blocks_y - dst_block_y); - const basis_file_header* pHeader = static_cast(pData); + for (uint32_t dy = 0; dy < num_inner_dst_blocks_y; dy++) + { + for (uint32_t dx = 0; dx < num_inner_dst_blocks_x; dx++) + { + bc6h_block* pDst_block = (bc6h_block*)pDst_blocks + (dst_block_x + dx) + (dst_block_y + dy) * output_row_pitch_in_blocks_or_pixels; - return pHeader->m_total_images; - } + half_float src_pixels[4][4][3]; // [y][x][c] - basis_tex_format basisu_transcoder::get_tex_format(const void* pData, uint32_t data_size) const - { - if (!validate_header_quick(pData, data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header validation failed\n"); - return basis_tex_format::cETC1S; - } + for (uint32_t y = 0; y < 4; y++) + { + const uint32_t src_pixel_y = basisu::minimum(dy * 4 + y, num_inner_blocks_y * 6 - 1); - const basis_file_header* pHeader = static_cast(pData); + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t src_pixel_x = basisu::minimum(dx * 4 + x, num_inner_blocks_x * 6 - 1); - return 
(basis_tex_format)(uint32_t)pHeader->m_tex_format; - } + assert((src_pixel_y < 12) && (src_pixel_x < 12)); - bool basisu_transcoder::get_image_info(const void* pData, uint32_t data_size, basisu_image_info& image_info, uint32_t image_index) const - { - if (!validate_header_quick(pData, data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: header validation failed\n"); - return false; - } + src_pixels[y][x][0] = unpacked_blocks[src_pixel_y][src_pixel_x][0]; + src_pixels[y][x][1] = unpacked_blocks[src_pixel_y][src_pixel_x][1]; + src_pixels[y][x][2] = unpacked_blocks[src_pixel_y][src_pixel_x][2]; + + } // x + } // y + + astc_6x6_hdr::fast_encode_bc6h(&src_pixels[0][0][0], pDst_block, bc6h_enc_params); - int slice_index = find_first_slice_index(pData, data_size, image_index, 0); - if (slice_index < 0) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid slice index\n"); - return false; - } + } // dx + } // dy - const basis_file_header* pHeader = static_cast(pData); + } // block_x - if (image_index >= pHeader->m_total_images) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid image_index\n"); - return false; + } // block_y + + status = true; } + else + { + for (uint32_t block_y = 0; block_y < num_blocks_y; ++block_y) + { + void* pDst_block = (uint8_t*)pDst_blocks + block_y * output_row_pitch_in_blocks_or_pixels * output_block_or_pixel_stride_in_bytes; - const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); + for (uint32_t block_x = 0; block_x < num_blocks_x; ++block_x, ++pSource_block, pDst_block = (uint8_t*)pDst_block + output_block_or_pixel_stride_in_bytes) + { + switch (fmt) + { + case block_format::cASTC_HDR_6x6: + { + // Nothing to do, ASTC HDR 6x6 is just ASTC. 
+ // TODO: Optimize this copy + memcpy(pDst_block, pSource_block, sizeof(astc_helpers::astc_block)); + status = true; + break; + } + case block_format::cRGB_9E5: + { + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 6, 6); + if (status) + { + uint32_t* pDst_pixels = reinterpret_cast( + static_cast(pDst_blocks) + (block_x * 6 + block_y * 6 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint32_t) + ); - uint32_t total_levels = 1; - for (uint32_t i = slice_index + 1; i < pHeader->m_total_slices; i++) - if (pSlice_descs[i].m_image_index == image_index) - total_levels = basisu::maximum(total_levels, pSlice_descs[i].m_level_index + 1); - else - break; + uint32_t blk_texels[6][6]; - if (total_levels > 16) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid image_index\n"); - return false; - } + status = astc_helpers::decode_block(log_blk, blk_texels, 6, 6, astc_helpers::cDecodeModeRGB9E5); - const basis_slice_desc& slice_desc = pSlice_descs[slice_index]; + if (status) + { + const uint32_t max_x = basisu::minimum(6, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 6); + const uint32_t max_y = basisu::minimum(6, (int)output_rows_in_pixels - (int)block_y * 6); - image_info.m_image_index = image_index; - image_info.m_total_levels = total_levels; - - image_info.m_alpha_flag = false; + for (uint32_t y = 0; y < max_y; y++) + { + memcpy(pDst_pixels, &blk_texels[y][0], sizeof(uint32_t) * max_x); - // For ETC1S, if anything has alpha all images have alpha. For UASTC, we only report alpha when the image actually has alpha. 
- if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) - image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; - else - image_info.m_alpha_flag = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; + pDst_pixels += output_row_pitch_in_blocks_or_pixels; + } // y + } + } - image_info.m_iframe_flag = (slice_desc.m_flags & cSliceDescFlagsFrameIsIFrame) != 0; + break; + } + case block_format::cRGBA_HALF: + { + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 6, 6); + if (status) + { + half_float* pDst_pixels = reinterpret_cast( + static_cast(pDst_blocks) + (block_x * 6 + block_y * 6 * output_row_pitch_in_blocks_or_pixels) * sizeof(half_float) * 4 + ); - image_info.m_width = slice_desc.m_num_blocks_x * 4; - image_info.m_height = slice_desc.m_num_blocks_y * 4; - image_info.m_orig_width = slice_desc.m_orig_width; - image_info.m_orig_height = slice_desc.m_orig_height; - image_info.m_num_blocks_x = slice_desc.m_num_blocks_x; - image_info.m_num_blocks_y = slice_desc.m_num_blocks_y; - image_info.m_total_blocks = image_info.m_num_blocks_x * image_info.m_num_blocks_y; - image_info.m_first_slice_index = slice_index; + half_float blk_texels[6][6][4]; + status = astc_helpers::decode_block(log_blk, blk_texels, 6, 6, astc_helpers::cDecodeModeHDR16); + + if (status) + { + const uint32_t max_x = basisu::minimum(6, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 6); + const uint32_t max_y = basisu::minimum(6, (int)output_rows_in_pixels - (int)block_y * 6); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + pDst_pixels[0 + 4 * x] = blk_texels[y][x][0]; + pDst_pixels[1 + 4 * x] = blk_texels[y][x][1]; + pDst_pixels[2 + 4 * x] = blk_texels[y][x][2]; + pDst_pixels[3 + 4 * x] = blk_texels[y][x][3]; + } // x + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * 4; + } // y + } + } + + break; + } + case block_format::cRGB_HALF: + { + 
astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 6, 6); + if (status) + { + half_float* pDst_pixels = + reinterpret_cast(static_cast(pDst_blocks) + (block_x * 6 + block_y * 6 * output_row_pitch_in_blocks_or_pixels) * sizeof(half_float) * 3); + + half_float blk_texels[6][6][4]; + status = astc_helpers::decode_block(log_blk, blk_texels, 6, 6, astc_helpers::cDecodeModeHDR16); + if (status) + { + const uint32_t max_x = basisu::minimum(6, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 6); + const uint32_t max_y = basisu::minimum(6, (int)output_rows_in_pixels - (int)block_y * 6); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + pDst_pixels[0 + 3 * x] = blk_texels[y][x][0]; + pDst_pixels[1 + 3 * x] = blk_texels[y][x][1]; + pDst_pixels[2 + 3 * x] = blk_texels[y][x][2]; + } // x + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * 3; + } // y + } + } + + break; + } + default: + assert(0); + break; + + } + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice: Transcoder failed to unpack a ASTC HDR block - this is a bug, or the data was corrupted\n"); + return false; + } + + } // block_x + + } // block_y + } return true; +#else + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_slice: ASTC HDR is unsupported\n"); + + BASISU_NOTE_UNUSED(decode_flags); + BASISU_NOTE_UNUSED(channel0); + BASISU_NOTE_UNUSED(channel1); + BASISU_NOTE_UNUSED(output_rows_in_pixels); + BASISU_NOTE_UNUSED(output_row_pitch_in_blocks_or_pixels); + BASISU_NOTE_UNUSED(output_block_or_pixel_stride_in_bytes); + BASISU_NOTE_UNUSED(fmt); + BASISU_NOTE_UNUSED(image_data_size); + BASISU_NOTE_UNUSED(pImage_data); + BASISU_NOTE_UNUSED(num_blocks_x); + BASISU_NOTE_UNUSED(num_blocks_y); + BASISU_NOTE_UNUSED(pDst_blocks); + + return false; +#endif } - uint32_t basisu_transcoder::get_total_image_levels(const void* pData, uint32_t data_size, 
uint32_t image_index) const + bool basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t slice_offset, uint32_t slice_length, + uint32_t decode_flags, + bool has_alpha, + bool is_video, + uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, + uint32_t output_rows_in_pixels, + int channel0, int channel1) { - if (!validate_header_quick(pData, data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: header validation failed\n"); - return false; - } + BASISU_NOTE_UNUSED(is_video); + BASISU_NOTE_UNUSED(level_index); + BASISU_NOTE_UNUSED(decode_flags); - int slice_index = find_first_slice_index(pData, data_size, image_index, 0); - if (slice_index < 0) + if (((uint64_t)slice_offset + slice_length) > (uint64_t)compressed_data_length) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: failed finding slice\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image: source data buffer too small\n"); return false; } - const basis_file_header* pHeader = static_cast(pData); + const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format); + //const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; - if (image_index >= pHeader->m_total_images) + if (!basis_validate_output_buffer_size(basis_tex_format::cASTC_HDR_6x6, target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels)) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: invalid image_index\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image: output 
buffer size too small\n"); return false; } - const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); + bool status = false; - uint32_t total_levels = 1; - for (uint32_t i = slice_index + 1; i < pHeader->m_total_slices; i++) - if (pSlice_descs[i].m_image_index == image_index) - total_levels = basisu::maximum(total_levels, pSlice_descs[i].m_level_index + 1); - else - break; + switch (target_format) + { + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: + { + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cASTC_HDR_6x6, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); - const uint32_t cMaxSupportedLevels = 16; - if (total_levels > cMaxSupportedLevels) + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image: transcode_slice() to ASTC_HDR failed\n"); + } + break; + } + case transcoder_texture_format::cTFBC6H: { - BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: invalid image levels!\n"); - return false; + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC6H, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image: transcode_slice() to BC6H failed\n"); + } + break; } - - return total_levels; - } - - bool basisu_transcoder::get_image_level_desc(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, uint32_t& orig_width, uint32_t& orig_height, uint32_t& total_blocks) const - { - if (!validate_header_quick(pData, data_size)) + case 
transcoder_texture_format::cTFRGB_HALF: { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: header validation failed\n"); - return false; + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_HALF, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image: transcode_slice() to RGB_HALF failed\n"); + } + break; } - - int slice_index = find_first_slice_index(pData, data_size, image_index, level_index); - if (slice_index < 0) + case transcoder_texture_format::cTFRGBA_HALF: { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: failed finding slice\n"); - return false; + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA_HALF, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); + } + break; } - - const basis_file_header* pHeader = static_cast(pData); - - if (image_index >= pHeader->m_total_images) + case transcoder_texture_format::cTFRGB_9E5: { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: invalid image_index\n"); - return false; + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_9E5, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); + if (!status) + { + 
BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); + } + break; + } + default: + { + assert(0); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_transcoder::transcode_image: Invalid format\n"); + break; + } } - const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); - - const basis_slice_desc& slice_desc = pSlice_descs[slice_index]; + return status; + } - orig_width = slice_desc.m_orig_width; - orig_height = slice_desc.m_orig_height; - total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; + //------------------------------------------------------------------------------------------------ + // ASTC 6x6 HDR intermediate - return true; + basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder() + { } - bool basisu_transcoder::get_image_level_info(const void* pData, uint32_t data_size, basisu_image_level_info& image_info, uint32_t image_index, uint32_t level_index) const + // num_blocks_x/num_blocks_y are source 6x6 blocks + bool basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice( + void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, + const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, uint32_t output_rows_in_pixels, int channel0, int channel1, uint32_t decode_flags) { - if (!validate_header_quick(pData, data_size)) + BASISU_NOTE_UNUSED(pState); + BASISU_NOTE_UNUSED(bc1_allow_threecolor_blocks); + BASISU_NOTE_UNUSED(has_alpha); + BASISU_NOTE_UNUSED(channel0); + BASISU_NOTE_UNUSED(channel1); + BASISU_NOTE_UNUSED(decode_flags); + BASISU_NOTE_UNUSED(orig_width); + BASISU_NOTE_UNUSED(orig_height); + 
+ assert(g_transcoder_initialized); + if (!g_transcoder_initialized) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: validate_file_checksums failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: Transcoder not globally initialized.\n"); return false; } - int slice_index = find_first_slice_index(pData, data_size, image_index, level_index); - if (slice_index < 0) +#if BASISD_SUPPORT_UASTC_HDR + + // TODO: Optimize this + + basisu::vector2D decoded_blocks; + uint32_t dec_width = 0, dec_height = 0; + bool dec_status = astc_6x6_hdr::decode_6x6_hdr(pImage_data, image_data_size, decoded_blocks, dec_width, dec_height); + if (!dec_status) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: failed finding slice\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: decode_6x6_hdr() failed.\n"); return false; } - const basis_file_header* pHeader = static_cast(pData); - - if (image_index >= pHeader->m_total_images) + if ((dec_width != orig_width) || (dec_height != orig_height) || + (decoded_blocks.get_width() != num_blocks_x) || (decoded_blocks.get_height() != num_blocks_y)) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: invalid image_index\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: unexpected decoded width/height\n"); return false; } - const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); + //const uint32_t total_src_blocks = num_blocks_x * num_blocks_y; - const basis_slice_desc& slice_desc = pSlice_descs[slice_index]; + const uint32_t output_block_width = get_block_width(fmt); + //const uint32_t output_block_height = get_block_height(fmt); - image_info.m_image_index = image_index; - image_info.m_level_index = level_index; - - // For ETC1S, if anything has alpha all images have alpha. 
For UASTC, we only report alpha when the image actually has alpha. - if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) - image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; - else - image_info.m_alpha_flag = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; - - image_info.m_iframe_flag = (slice_desc.m_flags & cSliceDescFlagsFrameIsIFrame) != 0; - image_info.m_width = slice_desc.m_num_blocks_x * 4; - image_info.m_height = slice_desc.m_num_blocks_y * 4; - image_info.m_orig_width = slice_desc.m_orig_width; - image_info.m_orig_height = slice_desc.m_orig_height; - image_info.m_num_blocks_x = slice_desc.m_num_blocks_x; - image_info.m_num_blocks_y = slice_desc.m_num_blocks_y; - image_info.m_total_blocks = image_info.m_num_blocks_x * image_info.m_num_blocks_y; - image_info.m_first_slice_index = slice_index; - - image_info.m_rgb_file_ofs = slice_desc.m_file_ofs; - image_info.m_rgb_file_len = slice_desc.m_file_size; - image_info.m_alpha_file_ofs = 0; - image_info.m_alpha_file_len = 0; - - if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + if (!output_row_pitch_in_blocks_or_pixels) { - if (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) - { - assert((slice_index + 1) < (int)pHeader->m_total_slices); - image_info.m_alpha_file_ofs = pSlice_descs[slice_index + 1].m_file_ofs; - image_info.m_alpha_file_len = pSlice_descs[slice_index + 1].m_file_size; - } + if (basis_block_format_is_uncompressed(fmt)) + output_row_pitch_in_blocks_or_pixels = orig_width; + else + output_row_pitch_in_blocks_or_pixels = (orig_width + output_block_width - 1) / output_block_width; } - return true; - } - - bool basisu_transcoder::get_file_info(const void* pData, uint32_t data_size, basisu_file_info& file_info) const - { - if (!validate_file_checksums(pData, data_size, false)) + if (basis_block_format_is_uncompressed(fmt)) { - BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: validate_file_checksums failed\n"); - return false; + if 
(!output_rows_in_pixels) + output_rows_in_pixels = orig_height; } - const basis_file_header* pHeader = static_cast(pData); - const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); - - file_info.m_version = pHeader->m_ver; - - file_info.m_total_header_size = sizeof(basis_file_header) + pHeader->m_total_slices * sizeof(basis_slice_desc); + const astc_blk* pSource_block = (const astc_blk *)decoded_blocks.get_ptr(); - file_info.m_total_selectors = pHeader->m_total_selectors; - file_info.m_selector_codebook_ofs = pHeader->m_selector_cb_file_ofs; - file_info.m_selector_codebook_size = pHeader->m_selector_cb_file_size; + bool status = false; - file_info.m_total_endpoints = pHeader->m_total_endpoints; - file_info.m_endpoint_codebook_ofs = pHeader->m_endpoint_cb_file_ofs; - file_info.m_endpoint_codebook_size = pHeader->m_endpoint_cb_file_size; + half_float unpacked_blocks[12][12][3]; // [y][x][c] - file_info.m_tables_ofs = pHeader->m_tables_file_ofs; - file_info.m_tables_size = pHeader->m_tables_file_size; + assert(((orig_width + 5) / 6) == num_blocks_x); + assert(((orig_height + 5) / 6) == num_blocks_y); - file_info.m_tex_format = static_cast(static_cast(pHeader->m_tex_format)); + if (fmt == block_format::cBC6H) + { + const uint32_t num_dst_blocks_x = (orig_width + 3) / 4; + const uint32_t num_dst_blocks_y = (orig_height + 3) / 4; - file_info.m_etc1s = (pHeader->m_tex_format == (int)basis_tex_format::cETC1S); - - file_info.m_y_flipped = (pHeader->m_flags & cBASISHeaderFlagYFlipped) != 0; - file_info.m_has_alpha_slices = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; + if (!output_row_pitch_in_blocks_or_pixels) + { + output_row_pitch_in_blocks_or_pixels = num_dst_blocks_x; + } + else if (output_row_pitch_in_blocks_or_pixels < num_dst_blocks_x) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: output_row_pitch_in_blocks_or_pixels is too low\n"); + return false; + 
} - const uint32_t total_slices = pHeader->m_total_slices; + if (output_block_or_pixel_stride_in_bytes != sizeof(bc6h_block)) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: invalid output_block_or_pixel_stride_in_bytes\n"); + return false; + } - file_info.m_slice_info.resize(total_slices); + fast_bc6h_params bc6h_enc_params; + const bool hq_flag = (decode_flags & cDecodeFlagsHighQuality) != 0; + bc6h_enc_params.m_max_2subset_pats_to_try = hq_flag ? 1 : 0; + + for (uint32_t src_block_y = 0; src_block_y < num_blocks_y; src_block_y += 2) + { + const uint32_t num_inner_blocks_y = basisu::minimum(2, num_blocks_y - src_block_y); - file_info.m_slices_size = 0; + for (uint32_t src_block_x = 0; src_block_x < num_blocks_x; src_block_x += 2) + { + const uint32_t num_inner_blocks_x = basisu::minimum(2, num_blocks_x - src_block_x); - file_info.m_tex_type = static_cast(static_cast(pHeader->m_tex_type)); + for (uint32_t iy = 0; iy < num_inner_blocks_y; iy++) + { + for (uint32_t ix = 0; ix < num_inner_blocks_x; ix++) + { + const astc_blk* pS = pSource_block + (src_block_y + iy) * num_blocks_x + (src_block_x + ix); - if (file_info.m_tex_type > cBASISTexTypeTotal) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: invalid texture type, file is corrupted\n"); - return false; - } + half_float blk_texels[6][6][4]; - file_info.m_us_per_frame = pHeader->m_us_per_frame; - file_info.m_userdata0 = pHeader->m_userdata0; - file_info.m_userdata1 = pHeader->m_userdata1; + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pS, log_blk, 6, 6); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: Transcoder failed to unpack a ASTC HDR block - this is a bug, or the data was corrupted\n"); + return false; + } - file_info.m_image_mipmap_levels.resize(0); - file_info.m_image_mipmap_levels.resize(pHeader->m_total_images); + status = 
astc_helpers::decode_block(log_blk, blk_texels, 6, 6, astc_helpers::cDecodeModeHDR16); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: Transcoder failed to unpack a ASTC HDR block - this is a bug, or the data was corrupted\n"); + return false; + } - file_info.m_total_images = pHeader->m_total_images; + for (uint32_t y = 0; y < 6; y++) + { + for (uint32_t x = 0; x < 6; x++) + { + unpacked_blocks[iy * 6 + y][ix * 6 + x][0] = blk_texels[y][x][0]; + unpacked_blocks[iy * 6 + y][ix * 6 + x][1] = blk_texels[y][x][1]; + unpacked_blocks[iy * 6 + y][ix * 6 + x][2] = blk_texels[y][x][2]; + } // x + } // y - for (uint32_t i = 0; i < total_slices; i++) - { - file_info.m_slices_size += pSlice_descs[i].m_file_size; + } // ix - basisu_slice_info& slice_info = file_info.m_slice_info[i]; + } // iy - slice_info.m_orig_width = pSlice_descs[i].m_orig_width; - slice_info.m_orig_height = pSlice_descs[i].m_orig_height; - slice_info.m_width = pSlice_descs[i].m_num_blocks_x * 4; - slice_info.m_height = pSlice_descs[i].m_num_blocks_y * 4; - slice_info.m_num_blocks_x = pSlice_descs[i].m_num_blocks_x; - slice_info.m_num_blocks_y = pSlice_descs[i].m_num_blocks_y; - slice_info.m_total_blocks = slice_info.m_num_blocks_x * slice_info.m_num_blocks_y; - slice_info.m_compressed_size = pSlice_descs[i].m_file_size; - slice_info.m_slice_index = i; - slice_info.m_image_index = pSlice_descs[i].m_image_index; - slice_info.m_level_index = pSlice_descs[i].m_level_index; - slice_info.m_unpacked_slice_crc16 = pSlice_descs[i].m_slice_data_crc16; - slice_info.m_alpha_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsHasAlpha) != 0; - slice_info.m_iframe_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsFrameIsIFrame) != 0; + const uint32_t dst_x = src_block_x * 6; + assert((dst_x & 3) == 0); + const uint32_t dst_block_x = dst_x >> 2; - if (pSlice_descs[i].m_image_index >= pHeader->m_total_images) - { - 
BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: slice desc's image index is invalid\n"); - return false; - } + const uint32_t dst_y = src_block_y * 6; + assert((dst_y & 3) == 0); + const uint32_t dst_block_y = dst_y >> 2; - file_info.m_image_mipmap_levels[pSlice_descs[i].m_image_index] = basisu::maximum(file_info.m_image_mipmap_levels[pSlice_descs[i].m_image_index], pSlice_descs[i].m_level_index + 1); + const uint32_t num_inner_dst_blocks_x = basisu::minimum(3, num_dst_blocks_x - dst_block_x); + const uint32_t num_inner_dst_blocks_y = basisu::minimum(3, num_dst_blocks_y - dst_block_y); - if (file_info.m_image_mipmap_levels[pSlice_descs[i].m_image_index] > 16) - { - BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: slice mipmap level is invalid\n"); - return false; - } - } + for (uint32_t dy = 0; dy < num_inner_dst_blocks_y; dy++) + { + for (uint32_t dx = 0; dx < num_inner_dst_blocks_x; dx++) + { + bc6h_block* pDst_block = (bc6h_block*)pDst_blocks + (dst_block_x + dx) + (dst_block_y + dy) * output_row_pitch_in_blocks_or_pixels; - return true; - } - - bool basisu_transcoder::start_transcoding(const void* pData, uint32_t data_size) - { - if (!validate_header_quick(pData, data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: header validation failed\n"); - return false; - } + half_float src_pixels[4][4][3]; // [y][x][c] - const basis_file_header* pHeader = reinterpret_cast(pData); - const uint8_t* pDataU8 = static_cast(pData); + for (uint32_t y = 0; y < 4; y++) + { + const uint32_t src_pixel_y = basisu::minimum(dy * 4 + y, num_inner_blocks_y * 6 - 1); - if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) - { - if (m_lowlevel_etc1s_decoder.m_local_endpoints.size()) - { - m_lowlevel_etc1s_decoder.clear(); - } + for (uint32_t x = 0; x < 4; x++) + { + const uint32_t src_pixel_x = basisu::minimum(dx * 4 + x, num_inner_blocks_x * 6 - 1); - if (pHeader->m_flags & cBASISHeaderFlagUsesGlobalCodebook) - { - if 
(!m_lowlevel_etc1s_decoder.get_global_codebooks()) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: File uses global codebooks, but set_global_codebooks() has not been called\n"); - return false; - } - if (!m_lowlevel_etc1s_decoder.get_global_codebooks()->get_endpoints().size()) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: Global codebooks must be unpacked first by calling start_transcoding()\n"); - return false; - } - if ((m_lowlevel_etc1s_decoder.get_global_codebooks()->get_endpoints().size() != pHeader->m_total_endpoints) || - (m_lowlevel_etc1s_decoder.get_global_codebooks()->get_selectors().size() != pHeader->m_total_selectors)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: Global codebook size mismatch (wrong codebooks for file).\n"); - return false; - } - if (!pHeader->m_tables_file_size) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted (2)\n"); - return false; - } - if (pHeader->m_tables_file_ofs > data_size) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (4)\n"); - return false; - } - if (pHeader->m_tables_file_size > (data_size - pHeader->m_tables_file_ofs)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (5)\n"); - return false; - } - } - else - { - if (!pHeader->m_endpoint_cb_file_size || !pHeader->m_selector_cb_file_size || !pHeader->m_tables_file_size) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted (0)\n"); - return false; - } + assert((src_pixel_y < 12) && (src_pixel_x < 12)); - if ((pHeader->m_endpoint_cb_file_ofs > data_size) || (pHeader->m_selector_cb_file_ofs > data_size) || (pHeader->m_tables_file_ofs > data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (1)\n"); - return false; - } + src_pixels[y][x][0] = 
unpacked_blocks[src_pixel_y][src_pixel_x][0]; + src_pixels[y][x][1] = unpacked_blocks[src_pixel_y][src_pixel_x][1]; + src_pixels[y][x][2] = unpacked_blocks[src_pixel_y][src_pixel_x][2]; - if (pHeader->m_endpoint_cb_file_size > (data_size - pHeader->m_endpoint_cb_file_ofs)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (2)\n"); - return false; - } + } // x + } // y + + astc_6x6_hdr::fast_encode_bc6h(&src_pixels[0][0][0], pDst_block, bc6h_enc_params); - if (pHeader->m_selector_cb_file_size > (data_size - pHeader->m_selector_cb_file_ofs)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (3)\n"); - return false; - } + } // dx + } // dy - if (pHeader->m_tables_file_size > (data_size - pHeader->m_tables_file_ofs)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (3)\n"); - return false; - } + } // block_x - if (!m_lowlevel_etc1s_decoder.decode_palettes( - pHeader->m_total_endpoints, pDataU8 + pHeader->m_endpoint_cb_file_ofs, pHeader->m_endpoint_cb_file_size, - pHeader->m_total_selectors, pDataU8 + pHeader->m_selector_cb_file_ofs, pHeader->m_selector_cb_file_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: decode_palettes failed\n"); - return false; - } - } + } // block_y - if (!m_lowlevel_etc1s_decoder.decode_tables(pDataU8 + pHeader->m_tables_file_ofs, pHeader->m_tables_file_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: decode_tables failed\n"); - return false; - } + status = true; } else { - // Nothing special to do for UASTC/UASTC HDR. 
- if (m_lowlevel_etc1s_decoder.m_local_endpoints.size()) + for (uint32_t block_y = 0; block_y < num_blocks_y; ++block_y) { - m_lowlevel_etc1s_decoder.clear(); - } - } - - m_ready_to_transcode = true; + void* pDst_block = (uint8_t*)pDst_blocks + block_y * output_row_pitch_in_blocks_or_pixels * output_block_or_pixel_stride_in_bytes; - return true; - } + for (uint32_t block_x = 0; block_x < num_blocks_x; ++block_x, ++pSource_block, pDst_block = (uint8_t*)pDst_block + output_block_or_pixel_stride_in_bytes) + { + switch (fmt) + { + case block_format::cASTC_HDR_6x6: + { + // Nothing to do, ASTC HDR 6x6 is just ASTC. + // TODO: Optimize this copy + memcpy(pDst_block, pSource_block, sizeof(astc_helpers::astc_block)); + status = true; + break; + } + case block_format::cRGB_9E5: + { + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 6, 6); + if (status) + { + uint32_t* pDst_pixels = reinterpret_cast( + static_cast(pDst_blocks) + (block_x * 6 + block_y * 6 * output_row_pitch_in_blocks_or_pixels) * sizeof(uint32_t) + ); - bool basisu_transcoder::stop_transcoding() - { - m_lowlevel_etc1s_decoder.clear(); + uint32_t blk_texels[6][6]; - m_ready_to_transcode = false; - - return true; - } + status = astc_helpers::decode_block(log_blk, blk_texels, 6, 6, astc_helpers::cDecodeModeRGB9E5); - bool basisu_transcoder::transcode_slice(const void* pData, uint32_t data_size, uint32_t slice_index, void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, block_format fmt, - uint32_t output_block_or_pixel_stride_in_bytes, uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state* pState, void *pAlpha_blocks, uint32_t output_rows_in_pixels, int channel0, int channel1) const - { - if (!m_ready_to_transcode) - { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: must call start_transcoding first\n"); - return false; - } + if (status) + { + const uint32_t max_x = basisu::minimum(6, 
(int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 6); + const uint32_t max_y = basisu::minimum(6, (int)output_rows_in_pixels - (int)block_y * 6); - if (decode_flags & cDecodeFlagsPVRTCDecodeToNextPow2) - { - // TODO: Not yet supported - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: cDecodeFlagsPVRTCDecodeToNextPow2 currently unsupported\n"); - return false; + for (uint32_t y = 0; y < max_y; y++) + { + memcpy(pDst_pixels, &blk_texels[y][0], sizeof(uint32_t) * max_x); + + pDst_pixels += output_row_pitch_in_blocks_or_pixels; + } // y + } + } + + break; + } + case block_format::cRGBA_HALF: + { + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 6, 6); + if (status) + { + half_float* pDst_pixels = reinterpret_cast( + static_cast(pDst_blocks) + (block_x * 6 + block_y * 6 * output_row_pitch_in_blocks_or_pixels) * sizeof(half_float) * 4 + ); + + half_float blk_texels[6][6][4]; + status = astc_helpers::decode_block(log_blk, blk_texels, 6, 6, astc_helpers::cDecodeModeHDR16); + + if (status) + { + const uint32_t max_x = basisu::minimum(6, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 6); + const uint32_t max_y = basisu::minimum(6, (int)output_rows_in_pixels - (int)block_y * 6); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + pDst_pixels[0 + 4 * x] = blk_texels[y][x][0]; + pDst_pixels[1 + 4 * x] = blk_texels[y][x][1]; + pDst_pixels[2 + 4 * x] = blk_texels[y][x][2]; + pDst_pixels[3 + 4 * x] = blk_texels[y][x][3]; + } // x + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * 4; + } // y + } + } + + break; + } + case block_format::cRGB_HALF: + { + astc_helpers::log_astc_block log_blk; + status = astc_helpers::unpack_block(pSource_block, log_blk, 6, 6); + if (status) + { + half_float* pDst_pixels = + reinterpret_cast(static_cast(pDst_blocks) + (block_x * 6 + block_y * 6 * output_row_pitch_in_blocks_or_pixels) * sizeof(half_float) * 3); + + half_float 
blk_texels[6][6][4]; + status = astc_helpers::decode_block(log_blk, blk_texels, 6, 6, astc_helpers::cDecodeModeHDR16); + if (status) + { + const uint32_t max_x = basisu::minimum(6, (int)output_row_pitch_in_blocks_or_pixels - (int)block_x * 6); + const uint32_t max_y = basisu::minimum(6, (int)output_rows_in_pixels - (int)block_y * 6); + + for (uint32_t y = 0; y < max_y; y++) + { + for (uint32_t x = 0; x < max_x; x++) + { + pDst_pixels[0 + 3 * x] = blk_texels[y][x][0]; + pDst_pixels[1 + 3 * x] = blk_texels[y][x][1]; + pDst_pixels[2 + 3 * x] = blk_texels[y][x][2]; + } // x + + pDst_pixels += output_row_pitch_in_blocks_or_pixels * 3; + } // y + } + } + + break; + } + default: + assert(0); + break; + + } + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: Transcoder failed to unpack a ASTC HDR block - this is a bug, or the data was corrupted\n"); + return false; + } + + } // block_x + + } // block_y } - if (!validate_header_quick(pData, data_size)) + return true; +#else + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_slice: ASTC HDR is unsupported\n"); + + BASISU_NOTE_UNUSED(decode_flags); + BASISU_NOTE_UNUSED(channel0); + BASISU_NOTE_UNUSED(channel1); + BASISU_NOTE_UNUSED(output_rows_in_pixels); + BASISU_NOTE_UNUSED(output_row_pitch_in_blocks_or_pixels); + BASISU_NOTE_UNUSED(output_block_or_pixel_stride_in_bytes); + BASISU_NOTE_UNUSED(fmt); + BASISU_NOTE_UNUSED(image_data_size); + BASISU_NOTE_UNUSED(pImage_data); + BASISU_NOTE_UNUSED(num_blocks_x); + BASISU_NOTE_UNUSED(num_blocks_y); + BASISU_NOTE_UNUSED(pDst_blocks); + + return false; +#endif + } + + bool basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t 
orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t slice_offset, uint32_t slice_length, + uint32_t decode_flags, + bool has_alpha, + bool is_video, + uint32_t output_row_pitch_in_blocks_or_pixels, + basisu_transcoder_state* pState, + uint32_t output_rows_in_pixels, + int channel0, int channel1) + { + BASISU_NOTE_UNUSED(is_video); + BASISU_NOTE_UNUSED(level_index); + BASISU_NOTE_UNUSED(decode_flags); + + if (((uint64_t)slice_offset + slice_length) > (uint64_t)compressed_data_length) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: header validation failed\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image: source data buffer too small\n"); return false; } - const basis_file_header* pHeader = reinterpret_cast(pData); - - const uint8_t* pDataU8 = static_cast(pData); + const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(target_format); + //const uint32_t total_slice_blocks = num_blocks_x * num_blocks_y; - if (slice_index >= pHeader->m_total_slices) + if (!basis_validate_output_buffer_size(basis_tex_format::cASTC_HDR_6x6, target_format, output_blocks_buf_size_in_blocks_or_pixels, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, output_rows_in_pixels)) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: slice_index >= pHeader->m_total_slices\n"); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image: output buffer size too small\n"); return false; } - const basis_slice_desc& slice_desc = reinterpret_cast(pDataU8 + pHeader->m_slice_desc_file_ofs)[slice_index]; + bool status = false; - uint32_t total_4x4_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; - - if (basis_block_format_is_uncompressed(fmt)) + switch (target_format) { - // Assume the output buffer is orig_width by orig_height - if (!output_row_pitch_in_blocks_or_pixels) - output_row_pitch_in_blocks_or_pixels = slice_desc.m_orig_width; - - if 
(!output_rows_in_pixels) - output_rows_in_pixels = slice_desc.m_orig_height; + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: + { + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cASTC_HDR_6x6, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); - // Now make sure the output buffer is large enough, or we'll overwrite memory. - if (output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)) + if (!status) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)\n"); - return false; + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image: transcode_slice() to ASTC_HDR failed\n"); } + break; } - else if (fmt == block_format::cFXT1_RGB) + case transcoder_texture_format::cTFBC6H: { - const uint32_t num_blocks_fxt1_x = (slice_desc.m_orig_width + 7) / 8; - const uint32_t num_blocks_fxt1_y = (slice_desc.m_orig_height + 3) / 4; - const uint32_t total_blocks_fxt1 = num_blocks_fxt1_x * num_blocks_fxt1_y; - - if (output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1) + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cBC6H, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); + if (!status) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1\n"); - return false; + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image: transcode_slice() to BC6H failed\n"); } + 
break; } - else + case transcoder_texture_format::cTFRGB_HALF: { - if (output_blocks_buf_size_in_blocks_or_pixels < total_4x4_blocks) + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_HALF, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); + if (!status) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: output_blocks_buf_size_in_blocks_or_pixels < total_blocks\n"); - return false; + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image: transcode_slice() to RGB_HALF failed\n"); } + break; } - - if (fmt != block_format::cETC1) + case transcoder_texture_format::cTFRGBA_HALF: { - if ((fmt == block_format::cPVRTC1_4_RGB) || (fmt == block_format::cPVRTC1_4_RGBA)) + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGBA_HALF, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1, decode_flags); + if (!status) { - if ((!basisu::is_pow2(slice_desc.m_num_blocks_x * 4)) || (!basisu::is_pow2(slice_desc.m_num_blocks_y * 4))) - { - // PVRTC1 only supports power of 2 dimensions - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: PVRTC1 only supports power of 2 dimensions\n"); - return false; - } + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); } + break; } - - if (slice_desc.m_file_ofs > data_size) - { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: invalid slice_desc.m_file_ofs, or passed in buffer too small\n"); - return false; - } - - const uint32_t data_size_left = data_size - slice_desc.m_file_ofs; - if (data_size_left < 
slice_desc.m_file_size) - { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: invalid slice_desc.m_file_size, or passed in buffer too small\n"); - return false; - } - - if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC_HDR_4x4) + case transcoder_texture_format::cTFRGB_9E5: { - return m_lowlevel_uastc_hdr_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, - pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, - fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, - output_rows_in_pixels, channel0, channel1, decode_flags); + status = transcode_slice(pOutput_blocks, num_blocks_x, num_blocks_y, pCompressed_data + slice_offset, slice_length, block_format::cRGB_9E5, + bytes_per_block_or_pixel, false, has_alpha, orig_width, orig_height, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels, channel0, channel1 , decode_flags); + if (!status) + { + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image: transcode_slice() to RGBA_HALF failed\n"); + } + break; } - else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + default: { - return m_lowlevel_uastc_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, - pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, - fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, - output_rows_in_pixels, channel0, channel1, decode_flags); + assert(0); + BASISU_DEVEL_ERROR("basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder::transcode_image: Invalid format\n"); + break; } - else - { - return m_lowlevel_etc1s_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, - pDataU8 + slice_desc.m_file_ofs, 
slice_desc.m_file_size, - fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, - (decode_flags & cDecodeFlagsOutputHasAlphaIndices) != 0, pAlpha_blocks, output_rows_in_pixels); } + + return status; } - int basisu_transcoder::find_first_slice_index(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index) const + //------------------------------------------------------------------------------------------------ + + basisu_transcoder::basisu_transcoder() : + m_ready_to_transcode(false) { - BASISU_NOTE_UNUSED(data_size); + } + + bool basisu_transcoder::validate_file_checksums(const void* pData, uint32_t data_size, bool full_validation) const + { + if (!validate_header(pData, data_size)) + return false; const basis_file_header* pHeader = reinterpret_cast(pData); - const uint8_t* pDataU8 = static_cast(pData); - // For very large basis files this search could be painful - // TODO: Binary search this - for (uint32_t slice_iter = 0; slice_iter < pHeader->m_total_slices; slice_iter++) +#if !BASISU_NO_HEADER_OR_DATA_CRC16_CHECKS + if (crc16(&pHeader->m_data_size, sizeof(basis_file_header) - BASISU_OFFSETOF(basis_file_header, m_data_size), 0) != pHeader->m_header_crc16) { - const basis_slice_desc& slice_desc = reinterpret_cast(pDataU8 + pHeader->m_slice_desc_file_ofs)[slice_iter]; - if ((slice_desc.m_image_index == image_index) && (slice_desc.m_level_index == level_index)) - return slice_iter; + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header CRC check failed\n"); + return false; } - BASISU_DEVEL_ERROR("basisu_transcoder::find_first_slice_index: didn't find slice\n"); + if (full_validation) + { + if (crc16(reinterpret_cast(pData) + sizeof(basis_file_header), pHeader->m_data_size, 0) != pHeader->m_data_crc16) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: data CRC check failed\n"); + return false; + } + 
} +#endif - return -1; + return true; } - int basisu_transcoder::find_slice(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const + bool basisu_transcoder::validate_header_quick(const void* pData, uint32_t data_size) const { - if (!validate_header_quick(pData, data_size)) - { - BASISU_DEVEL_ERROR("basisu_transcoder::find_slice: header validation failed\n"); + if (data_size <= sizeof(basis_file_header)) return false; - } const basis_file_header* pHeader = reinterpret_cast(pData); - const uint8_t* pDataU8 = static_cast(pData); - const basis_slice_desc* pSlice_descs = reinterpret_cast(pDataU8 + pHeader->m_slice_desc_file_ofs); - // For very large basis files this search could be painful - // TODO: Binary search this - for (uint32_t slice_iter = 0; slice_iter < pHeader->m_total_slices; slice_iter++) + if ((pHeader->m_sig != basis_file_header::cBASISSigValue) || (pHeader->m_ver != BASISD_SUPPORTED_BASIS_VERSION) || (pHeader->m_header_size != sizeof(basis_file_header))) { - const basis_slice_desc& slice_desc = pSlice_descs[slice_iter]; - if ((slice_desc.m_image_index == image_index) && (slice_desc.m_level_index == level_index)) - { - if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) - { - const bool slice_alpha = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; - if (slice_alpha == alpha_data) - return slice_iter; - } - else - { - return slice_iter; - } - } + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header has an invalid signature, or file version is unsupported\n"); + return false; } - BASISU_DEVEL_ERROR("basisu_transcoder::find_slice: didn't find slice\n"); - - return -1; - } - - void basisu_transcoder::write_opaque_alpha_blocks( - uint32_t num_blocks_x, uint32_t num_blocks_y, - void* pOutput_blocks, block_format fmt, - uint32_t block_stride_in_bytes, uint32_t output_row_pitch_in_blocks_or_pixels) - { - // 'num_blocks_y', 'pOutput_blocks' & 'block_stride_in_bytes' unused - // when 
disabling BASISD_SUPPORT_ETC2_EAC_A8 *and* BASISD_SUPPORT_DXT5A - BASISU_NOTE_UNUSED(num_blocks_y); - BASISU_NOTE_UNUSED(pOutput_blocks); - BASISU_NOTE_UNUSED(block_stride_in_bytes); - - if (!output_row_pitch_in_blocks_or_pixels) - output_row_pitch_in_blocks_or_pixels = num_blocks_x; - - if ((fmt == block_format::cETC2_EAC_A8) || (fmt == block_format::cETC2_EAC_R11)) + uint32_t expected_file_size = sizeof(basis_file_header) + pHeader->m_data_size; + if (data_size < expected_file_size) { -#if BASISD_SUPPORT_ETC2_EAC_A8 - eac_block blk; - blk.m_base = 255; - blk.m_multiplier = 1; - blk.m_table = 13; - - // Selectors are all 4's - memcpy(&blk.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); - - for (uint32_t y = 0; y < num_blocks_y; y++) - { - uint32_t dst_ofs = y * output_row_pitch_in_blocks_or_pixels * block_stride_in_bytes; - for (uint32_t x = 0; x < num_blocks_x; x++) - { - memcpy((uint8_t*)pOutput_blocks + dst_ofs, &blk, sizeof(blk)); - dst_ofs += block_stride_in_bytes; - } - } -#endif + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: source buffer is too small\n"); + return false; } - else if (fmt == block_format::cBC4) + + if ((!pHeader->m_total_slices) || (!pHeader->m_total_images)) { -#if BASISD_SUPPORT_DXT5A - dxt5a_block blk; - blk.m_endpoints[0] = 255; - blk.m_endpoints[1] = 255; - memset(blk.m_selectors, 0, sizeof(blk.m_selectors)); + BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: header is invalid\n"); + return false; + } - for (uint32_t y = 0; y < num_blocks_y; y++) - { - uint32_t dst_ofs = y * output_row_pitch_in_blocks_or_pixels * block_stride_in_bytes; - for (uint32_t x = 0; x < num_blocks_x; x++) - { - memcpy((uint8_t*)pOutput_blocks + dst_ofs, &blk, sizeof(blk)); - dst_ofs += block_stride_in_bytes; - } - } -#endif + if ((pHeader->m_slice_desc_file_ofs >= data_size) || + ((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices)) + ) + { + 
BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: passed in buffer is too small or data is corrupted\n"); + return false; } + + return true; } - bool basisu_transcoder::transcode_image_level( - const void* pData, uint32_t data_size, - uint32_t image_index, uint32_t level_index, - void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, - transcoder_texture_format fmt, - uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state *pState, uint32_t output_rows_in_pixels) const + bool basisu_transcoder::validate_header(const void* pData, uint32_t data_size) const { - const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(fmt); - - if (!m_ready_to_transcode) + if (data_size <= sizeof(basis_file_header)) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: must call start_transcoding() first\n"); + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: input source buffer is too small\n"); return false; } - //const bool transcode_alpha_data_to_opaque_formats = (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0; + const basis_file_header* pHeader = reinterpret_cast(pData); - if (decode_flags & cDecodeFlagsPVRTCDecodeToNextPow2) + if ((pHeader->m_sig != basis_file_header::cBASISSigValue) || (pHeader->m_ver != BASISD_SUPPORTED_BASIS_VERSION) || (pHeader->m_header_size != sizeof(basis_file_header))) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: cDecodeFlagsPVRTCDecodeToNextPow2 currently unsupported\n"); - // TODO: Not yet supported + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header has an invalid signature, or file version is unsupported\n"); return false; } - if (!validate_header_quick(pData, data_size)) + uint32_t expected_file_size = sizeof(basis_file_header) + pHeader->m_data_size; + if (data_size < expected_file_size) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: header validation failed\n"); + 
BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: input source buffer is too small, or header is corrupted\n"); return false; } - const basis_file_header* pHeader = reinterpret_cast(pData); - - const uint8_t* pDataU8 = static_cast(pData); - - const basis_slice_desc* pSlice_descs = reinterpret_cast(pDataU8 + pHeader->m_slice_desc_file_ofs); - - const bool basis_file_has_alpha_slices = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; - - int slice_index = find_first_slice_index(pData, data_size, image_index, level_index); - if (slice_index < 0) + if ((!pHeader->m_total_images) || (!pHeader->m_total_slices)) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: failed finding slice index\n"); - // Unable to find the requested image/level + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (total images or slices are 0)\n"); return false; } - if ((fmt == transcoder_texture_format::cTFPVRTC1_4_RGBA) && (!basis_file_has_alpha_slices)) + if (pHeader->m_total_images > pHeader->m_total_slices) { - // Switch to PVRTC1 RGB if the input doesn't have alpha. - fmt = transcoder_texture_format::cTFPVRTC1_4_RGB; + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid basis file (too many images)\n"); + return false; } - + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) { - if (pSlice_descs[slice_index].m_flags & cSliceDescFlagsHasAlpha) - { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has out of order alpha slice\n"); - - // The first slice shouldn't have alpha data in a properly formed basis file - return false; - } - - if (basis_file_has_alpha_slices) + if (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) { - // The alpha data should immediately follow the color data, and have the same resolution. 
- if ((slice_index + 1U) >= pHeader->m_total_slices) - { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has missing alpha slice\n"); - // basis file is missing the alpha slice - return false; - } - - // Basic sanity checks - if ((pSlice_descs[slice_index + 1].m_flags & cSliceDescFlagsHasAlpha) == 0) - { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has missing alpha slice (flag check)\n"); - // This slice should have alpha data - return false; - } - - if ((pSlice_descs[slice_index].m_num_blocks_x != pSlice_descs[slice_index + 1].m_num_blocks_x) || (pSlice_descs[slice_index].m_num_blocks_y != pSlice_descs[slice_index + 1].m_num_blocks_y)) + if (pHeader->m_total_slices & 1) { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file slice dimensions bad\n"); - // Alpha slice should have been the same res as the color slice + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: invalid alpha .basis file\n"); return false; } } + + // This flag dates back to pre-Basis Universal, when .basis supported full ETC1 too. + if ((pHeader->m_flags & cBASISHeaderFlagETC1S) == 0) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: Invalid .basis file (ETC1S check)\n"); + return false; + } } - - bool status = false; - - const uint32_t total_slice_blocks = pSlice_descs[slice_index].m_num_blocks_x * pSlice_descs[slice_index].m_num_blocks_y; - - if (((fmt == transcoder_texture_format::cTFPVRTC1_4_RGB) || (fmt == transcoder_texture_format::cTFPVRTC1_4_RGBA)) && (output_blocks_buf_size_in_blocks_or_pixels > total_slice_blocks)) + else { - // The transcoder doesn't write beyond total_slice_blocks, so we need to clear the rest ourselves. - // For GL usage, PVRTC1 4bpp image size is (max(width, 8)* max(height, 8) * 4 + 7) / 8. - // However, for KTX and internally in Basis this formula isn't used, it's just ((width+3)/4) * ((height+3)/4) * bytes_per_block_or_pixel. 
This is all the transcoder actually writes to memory. - memset(static_cast(pOutput_blocks) + total_slice_blocks * bytes_per_block_or_pixel, 0, (output_blocks_buf_size_in_blocks_or_pixels - total_slice_blocks) * bytes_per_block_or_pixel); + if ((pHeader->m_flags & cBASISHeaderFlagETC1S) != 0) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: Invalid .basis file (ETC1S check)\n"); + return false; + } } - if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC_HDR_4x4) + if ((pHeader->m_slice_desc_file_ofs >= data_size) || + ((data_size - pHeader->m_slice_desc_file_ofs) < (sizeof(basis_slice_desc) * pHeader->m_total_slices)) + ) { - const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; - - // Use the container independent image transcode method. - status = m_lowlevel_uastc_hdr_decoder.transcode_image(fmt, - pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, - (const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, - pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, - decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: passed in buffer is too small or data is corrupted\n"); + return false; } - else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + + return true; + } + + basis_texture_type basisu_transcoder::get_texture_type(const void* pData, uint32_t data_size) const + { + if (!validate_header_quick(pData, data_size)) { - const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; + BASISU_DEVEL_ERROR("basisu_transcoder::get_texture_type: header validation failed\n"); + return cBASISTexType2DArray; + } - // Use the container independent image transcode method. 
- status = m_lowlevel_uastc_decoder.transcode_image(fmt, - pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, - (const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, - pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, - decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + const basis_file_header* pHeader = static_cast(pData); + + basis_texture_type btt = static_cast(static_cast(pHeader->m_tex_type)); + + if (btt >= cBASISTexTypeTotal) + { + BASISU_DEVEL_ERROR("basisu_transcoder::validate_header_quick: header's texture type field is invalid\n"); + return cBASISTexType2DArray; } - else + + return btt; + } + + bool basisu_transcoder::get_userdata(const void* pData, uint32_t data_size, uint32_t& userdata0, uint32_t& userdata1) const + { + if (!validate_header_quick(pData, data_size)) { - // ETC1S - const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; - const basis_slice_desc* pAlpha_slice_desc = basis_file_has_alpha_slices ? &pSlice_descs[slice_index + 1] : nullptr; + BASISU_DEVEL_ERROR("basisu_transcoder::get_userdata: header validation failed\n"); + return false; + } - assert((pSlice_desc->m_flags & cSliceDescFlagsHasAlpha) == 0); + const basis_file_header* pHeader = static_cast(pData); - if (pAlpha_slice_desc) - { - // Basic sanity checks - assert((pAlpha_slice_desc->m_flags & cSliceDescFlagsHasAlpha) != 0); - assert(pSlice_desc->m_num_blocks_x == pAlpha_slice_desc->m_num_blocks_x); - assert(pSlice_desc->m_num_blocks_y == pAlpha_slice_desc->m_num_blocks_y); - assert(pSlice_desc->m_level_index == pAlpha_slice_desc->m_level_index); - } + userdata0 = pHeader->m_userdata0; + userdata1 = pHeader->m_userdata1; + return true; + } - // Use the container independent image transcode method. 
- status = m_lowlevel_etc1s_decoder.transcode_image(fmt, - pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, - (const uint8_t *)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, - pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, - (pAlpha_slice_desc != nullptr) ? (uint32_t)pAlpha_slice_desc->m_file_ofs : 0U, (pAlpha_slice_desc != nullptr) ? (uint32_t)pAlpha_slice_desc->m_file_size : 0U, - decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + uint32_t basisu_transcoder::get_total_images(const void* pData, uint32_t data_size) const + { + if (!validate_header_quick(pData, data_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_images: header validation failed\n"); + return 0; + } - } // if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) - - if (!status) - { - BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: Returning false\n"); - } - else - { - //BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: Returning true\n"); - } + const basis_file_header* pHeader = static_cast(pData); - return status; + return pHeader->m_total_images; } - uint32_t basis_get_bytes_per_block_or_pixel(transcoder_texture_format fmt) + basis_tex_format basisu_transcoder::get_basis_tex_format(const void* pData, uint32_t data_size) const { - switch (fmt) + if (!validate_header_quick(pData, data_size)) { - case transcoder_texture_format::cTFETC1_RGB: - case transcoder_texture_format::cTFBC1_RGB: - case transcoder_texture_format::cTFBC4_R: - case transcoder_texture_format::cTFPVRTC1_4_RGB: - case transcoder_texture_format::cTFPVRTC1_4_RGBA: - case transcoder_texture_format::cTFATC_RGB: - case transcoder_texture_format::cTFPVRTC2_4_RGB: - case transcoder_texture_format::cTFPVRTC2_4_RGBA: - case 
transcoder_texture_format::cTFETC2_EAC_R11: - return 8; - case transcoder_texture_format::cTFBC7_RGBA: - case transcoder_texture_format::cTFBC7_ALT: - case transcoder_texture_format::cTFBC6H: - case transcoder_texture_format::cTFETC2_RGBA: - case transcoder_texture_format::cTFBC3_RGBA: - case transcoder_texture_format::cTFBC5_RG: - case transcoder_texture_format::cTFASTC_4x4_RGBA: - case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: - case transcoder_texture_format::cTFATC_RGBA: - case transcoder_texture_format::cTFFXT1_RGB: - case transcoder_texture_format::cTFETC2_EAC_RG11: - return 16; - case transcoder_texture_format::cTFRGBA32: - case transcoder_texture_format::cTFRGB_9E5: - return sizeof(uint32_t); - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: - case transcoder_texture_format::cTFRGBA4444: - return sizeof(uint16_t); - case transcoder_texture_format::cTFRGB_HALF: - return sizeof(half_float) * 3; - case transcoder_texture_format::cTFRGBA_HALF: - return sizeof(half_float) * 4; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_basis_tex_format: header validation failed\n"); + return basis_tex_format::cETC1S; } - return 0; + + const basis_file_header* pHeader = static_cast(pData); + + return (basis_tex_format)(uint32_t)pHeader->m_tex_format; } - const char* basis_get_format_name(transcoder_texture_format fmt) + bool basisu_transcoder::get_image_info(const void* pData, uint32_t data_size, basisu_image_info& image_info, uint32_t image_index) const { - switch (fmt) + if (!validate_header_quick(pData, data_size)) { - case transcoder_texture_format::cTFETC1_RGB: return "ETC1_RGB"; - case transcoder_texture_format::cTFBC1_RGB: return "BC1_RGB"; - case transcoder_texture_format::cTFBC4_R: return "BC4_R"; - case transcoder_texture_format::cTFPVRTC1_4_RGB: return "PVRTC1_4_RGB"; - case 
transcoder_texture_format::cTFPVRTC1_4_RGBA: return "PVRTC1_4_RGBA"; - case transcoder_texture_format::cTFBC7_RGBA: return "BC7_RGBA"; - case transcoder_texture_format::cTFBC7_ALT: return "BC7_RGBA"; - case transcoder_texture_format::cTFETC2_RGBA: return "ETC2_RGBA"; - case transcoder_texture_format::cTFBC3_RGBA: return "BC3_RGBA"; - case transcoder_texture_format::cTFBC5_RG: return "BC5_RG"; - case transcoder_texture_format::cTFASTC_4x4_RGBA: return "ASTC_RGBA"; - case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: return "ASTC_HDR_RGBA"; - case transcoder_texture_format::cTFATC_RGB: return "ATC_RGB"; - case transcoder_texture_format::cTFATC_RGBA: return "ATC_RGBA"; - case transcoder_texture_format::cTFRGBA32: return "RGBA32"; - case transcoder_texture_format::cTFRGB565: return "RGB565"; - case transcoder_texture_format::cTFBGR565: return "BGR565"; - case transcoder_texture_format::cTFRGBA4444: return "RGBA4444"; - case transcoder_texture_format::cTFRGBA_HALF: return "RGBA_HALF"; - case transcoder_texture_format::cTFRGB_9E5: return "RGB_9E5"; - case transcoder_texture_format::cTFRGB_HALF: return "RGB_HALF"; - case transcoder_texture_format::cTFFXT1_RGB: return "FXT1_RGB"; - case transcoder_texture_format::cTFPVRTC2_4_RGB: return "PVRTC2_4_RGB"; - case transcoder_texture_format::cTFPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; - case transcoder_texture_format::cTFETC2_EAC_R11: return "ETC2_EAC_R11"; - case transcoder_texture_format::cTFETC2_EAC_RG11: return "ETC2_EAC_RG11"; - case transcoder_texture_format::cTFBC6H: return "BC6H"; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: header validation failed\n"); + return false; } - return ""; - } - const char* basis_get_block_format_name(block_format fmt) - { - switch (fmt) + int slice_index = find_first_slice_index(pData, data_size, image_index, 0); + if (slice_index < 0) { - case block_format::cETC1: return 
"ETC1"; - case block_format::cBC1: return "BC1"; - case block_format::cPVRTC1_4_RGB: return "PVRTC1_4_RGB"; - case block_format::cPVRTC1_4_RGBA: return "PVRTC1_4_RGBA"; - case block_format::cBC7: return "BC7"; - case block_format::cETC2_RGBA: return "ETC2_RGBA"; - case block_format::cBC3: return "BC3"; - case block_format::cASTC_4x4: return "ASTC_4x4"; - case block_format::cATC_RGB: return "ATC_RGB"; - case block_format::cRGBA32: return "RGBA32"; - case block_format::cRGB565: return "RGB565"; - case block_format::cBGR565: return "BGR565"; - case block_format::cRGBA4444: return "RGBA4444"; - case block_format::cRGBA_HALF: return "RGBA_HALF"; - case block_format::cRGB_HALF: return "RGB_HALF"; - case block_format::cRGB_9E5: return "RGB_9E5"; - case block_format::cUASTC_4x4: return "UASTC_4x4"; - case block_format::cUASTC_HDR_4x4: return "UASTC_HDR_4x4"; - case block_format::cBC6H: return "BC6H"; - case block_format::cASTC_HDR_4x4: return "ASTC_HDR_4x4"; - case block_format::cFXT1_RGB: return "FXT1_RGB"; - case block_format::cPVRTC2_4_RGB: return "PVRTC2_4_RGB"; - case block_format::cPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; - case block_format::cETC2_EAC_R11: return "ETC2_EAC_R11"; - case block_format::cETC2_EAC_RG11: return "ETC2_EAC_RG11"; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid slice index\n"); + return false; } - return ""; - } - const char* basis_get_texture_type_name(basis_texture_type tex_type) - { - switch (tex_type) + const basis_file_header* pHeader = static_cast(pData); + + if (image_index >= pHeader->m_total_images) { - case cBASISTexType2D: return "2D"; - case cBASISTexType2DArray: return "2D array"; - case cBASISTexTypeCubemapArray: return "cubemap array"; - case cBASISTexTypeVideoFrames: return "video"; - case cBASISTexTypeVolume: return "3D"; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_texture_type_name: Invalid 
tex_type\n"); - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid image_index\n"); + return false; } - return ""; - } - bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt) - { - // TODO: Technically ASTC_HDR does support alpha, but UASTC_HDR doesn't yet support it. Unsure what to do here. - switch (fmt) + const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); + + uint32_t total_levels = 1; + for (uint32_t i = slice_index + 1; i < pHeader->m_total_slices; i++) + if (pSlice_descs[i].m_image_index == image_index) + total_levels = basisu::maximum(total_levels, pSlice_descs[i].m_level_index + 1); + else + break; + + if (total_levels > 16) { - case transcoder_texture_format::cTFETC2_RGBA: - case transcoder_texture_format::cTFBC3_RGBA: - case transcoder_texture_format::cTFASTC_4x4_RGBA: - case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: - case transcoder_texture_format::cTFBC7_RGBA: - case transcoder_texture_format::cTFBC7_ALT: - case transcoder_texture_format::cTFPVRTC1_4_RGBA: - case transcoder_texture_format::cTFPVRTC2_4_RGBA: - case transcoder_texture_format::cTFATC_RGBA: - case transcoder_texture_format::cTFRGBA32: - case transcoder_texture_format::cTFRGBA4444: - case transcoder_texture_format::cTFRGBA_HALF: - return true; - default: - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_info: invalid image_index\n"); + return false; } - return false; + + const basis_slice_desc& slice_desc = pSlice_descs[slice_index]; + + image_info.m_image_index = image_index; + image_info.m_total_levels = total_levels; + + image_info.m_alpha_flag = false; + + // For ETC1S, if anything has alpha all images have alpha. For UASTC, we only report alpha when the image actually has alpha. 
+ if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; + else + image_info.m_alpha_flag = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; + + image_info.m_iframe_flag = (slice_desc.m_flags & cSliceDescFlagsFrameIsIFrame) != 0; + + const uint32_t block_width = basis_tex_format_get_block_width((basis_tex_format)((uint32_t)pHeader->m_tex_format)); + const uint32_t block_height = basis_tex_format_get_block_height((basis_tex_format)((uint32_t)pHeader->m_tex_format)); + + image_info.m_width = slice_desc.m_num_blocks_x * block_width; + image_info.m_height = slice_desc.m_num_blocks_y * block_height; + image_info.m_orig_width = slice_desc.m_orig_width; + image_info.m_orig_height = slice_desc.m_orig_height; + image_info.m_num_blocks_x = slice_desc.m_num_blocks_x; + image_info.m_num_blocks_y = slice_desc.m_num_blocks_y; + image_info.m_block_width = block_width; + image_info.m_block_height = block_height; + image_info.m_total_blocks = image_info.m_num_blocks_x * image_info.m_num_blocks_y; + image_info.m_first_slice_index = slice_index; + + return true; } - bool basis_transcoder_format_is_hdr(transcoder_texture_format fmt) + uint32_t basisu_transcoder::get_total_image_levels(const void* pData, uint32_t data_size, uint32_t image_index) const { - switch (fmt) + if (!validate_header_quick(pData, data_size)) { - case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: - case transcoder_texture_format::cTFBC6H: - case transcoder_texture_format::cTFRGBA_HALF: - case transcoder_texture_format::cTFRGB_HALF: - case transcoder_texture_format::cTFRGB_9E5: - return true; - default: - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: header validation failed\n"); + return false; } - return false; - } - basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt) - { - switch (fmt) + int slice_index = find_first_slice_index(pData, data_size, 
image_index, 0); + if (slice_index < 0) { - case transcoder_texture_format::cTFETC1_RGB: return basisu::texture_format::cETC1; - case transcoder_texture_format::cTFBC1_RGB: return basisu::texture_format::cBC1; - case transcoder_texture_format::cTFBC4_R: return basisu::texture_format::cBC4; - case transcoder_texture_format::cTFPVRTC1_4_RGB: return basisu::texture_format::cPVRTC1_4_RGB; - case transcoder_texture_format::cTFPVRTC1_4_RGBA: return basisu::texture_format::cPVRTC1_4_RGBA; - case transcoder_texture_format::cTFBC7_RGBA: return basisu::texture_format::cBC7; - case transcoder_texture_format::cTFBC7_ALT: return basisu::texture_format::cBC7; - case transcoder_texture_format::cTFETC2_RGBA: return basisu::texture_format::cETC2_RGBA; - case transcoder_texture_format::cTFBC3_RGBA: return basisu::texture_format::cBC3; - case transcoder_texture_format::cTFBC5_RG: return basisu::texture_format::cBC5; - case transcoder_texture_format::cTFASTC_4x4_RGBA: return basisu::texture_format::cASTC_LDR_4x4; - case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: return basisu::texture_format::cASTC_HDR_4x4; - case transcoder_texture_format::cTFBC6H: return basisu::texture_format::cBC6HUnsigned; - case transcoder_texture_format::cTFATC_RGB: return basisu::texture_format::cATC_RGB; - case transcoder_texture_format::cTFATC_RGBA: return basisu::texture_format::cATC_RGBA_INTERPOLATED_ALPHA; - case transcoder_texture_format::cTFRGBA32: return basisu::texture_format::cRGBA32; - case transcoder_texture_format::cTFRGB565: return basisu::texture_format::cRGB565; - case transcoder_texture_format::cTFBGR565: return basisu::texture_format::cBGR565; - case transcoder_texture_format::cTFRGBA4444: return basisu::texture_format::cRGBA4444; - case transcoder_texture_format::cTFRGBA_HALF: return basisu::texture_format::cRGBA_HALF; - case transcoder_texture_format::cTFRGB_9E5: return basisu::texture_format::cRGB_9E5; - case transcoder_texture_format::cTFRGB_HALF: return 
basisu::texture_format::cRGB_HALF; - case transcoder_texture_format::cTFFXT1_RGB: return basisu::texture_format::cFXT1_RGB; - case transcoder_texture_format::cTFPVRTC2_4_RGB: return basisu::texture_format::cPVRTC2_4_RGBA; - case transcoder_texture_format::cTFPVRTC2_4_RGBA: return basisu::texture_format::cPVRTC2_4_RGBA; - case transcoder_texture_format::cTFETC2_EAC_R11: return basisu::texture_format::cETC2_R11_EAC; - case transcoder_texture_format::cTFETC2_EAC_RG11: return basisu::texture_format::cETC2_RG11_EAC; - default: - assert(0); - BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: failed finding slice\n"); + return false; } - return basisu::texture_format::cInvalidTextureFormat; - } - bool basis_transcoder_format_is_uncompressed(transcoder_texture_format tex_type) - { - switch (tex_type) + const basis_file_header* pHeader = static_cast(pData); + + if (image_index >= pHeader->m_total_images) { - case transcoder_texture_format::cTFRGBA32: - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: - case transcoder_texture_format::cTFRGBA4444: - case transcoder_texture_format::cTFRGB_HALF: - case transcoder_texture_format::cTFRGBA_HALF: - case transcoder_texture_format::cTFRGB_9E5: - return true; - default: - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: invalid image_index\n"); + return false; } - return false; - } - bool basis_block_format_is_uncompressed(block_format blk_fmt) - { - switch (blk_fmt) + const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); + + uint32_t total_levels = 1; + for (uint32_t i = slice_index + 1; i < pHeader->m_total_slices; i++) + if (pSlice_descs[i].m_image_index == image_index) + total_levels = basisu::maximum(total_levels, pSlice_descs[i].m_level_index + 1); + else + break; + + const uint32_t cMaxSupportedLevels = 16; + if 
(total_levels > cMaxSupportedLevels) { - case block_format::cRGB32: - case block_format::cRGBA32: - case block_format::cA32: - case block_format::cRGB565: - case block_format::cBGR565: - case block_format::cRGBA4444: - case block_format::cRGBA4444_COLOR: - case block_format::cRGBA4444_ALPHA: - case block_format::cRGBA4444_COLOR_OPAQUE: - case block_format::cRGBA_HALF: - case block_format::cRGB_HALF: - case block_format::cRGB_9E5: - return true; - default: - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_total_image_levels: invalid image levels!\n"); + return false; } - return false; + + return total_levels; } - - uint32_t basis_get_uncompressed_bytes_per_pixel(transcoder_texture_format fmt) + + bool basisu_transcoder::get_image_level_desc(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, uint32_t& orig_width, uint32_t& orig_height, uint32_t& total_blocks) const { - switch (fmt) + if (!validate_header_quick(pData, data_size)) { - case transcoder_texture_format::cTFRGBA32: - case transcoder_texture_format::cTFRGB_9E5: - return sizeof(uint32_t); - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: - case transcoder_texture_format::cTFRGBA4444: - return sizeof(uint16_t); - case transcoder_texture_format::cTFRGB_HALF: - return sizeof(half_float) * 3; - case transcoder_texture_format::cTFRGBA_HALF: - return sizeof(half_float) * 4; - default: - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: header validation failed\n"); + return false; } - return 0; - } - - uint32_t basis_get_block_width(transcoder_texture_format tex_type) - { - switch (tex_type) + + int slice_index = find_first_slice_index(pData, data_size, image_index, level_index); + if (slice_index < 0) { - case transcoder_texture_format::cTFFXT1_RGB: - return 8; - default: - break; + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: failed finding slice\n"); + return false; } - return 4; - } - uint32_t 
basis_get_block_height(transcoder_texture_format tex_type) - { - BASISU_NOTE_UNUSED(tex_type); - return 4; + const basis_file_header* pHeader = static_cast(pData); + + if (image_index >= pHeader->m_total_images) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_desc: invalid image_index\n"); + return false; + } + + const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); + + const basis_slice_desc& slice_desc = pSlice_descs[slice_index]; + + orig_width = slice_desc.m_orig_width; + orig_height = slice_desc.m_orig_height; + total_blocks = slice_desc.m_num_blocks_x * slice_desc.m_num_blocks_y; + + return true; } - - bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt) + + bool basisu_transcoder::get_image_level_info(const void* pData, uint32_t data_size, basisu_image_level_info& image_info, uint32_t image_index, uint32_t level_index) const { - if (fmt == basis_tex_format::cUASTC_HDR_4x4) + if (!validate_header_quick(pData, data_size)) { - // UASTC HDR -#if BASISD_SUPPORT_UASTC_HDR - switch (tex_type) - { - case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: - case transcoder_texture_format::cTFBC6H: - case transcoder_texture_format::cTFRGBA_HALF: - case transcoder_texture_format::cTFRGB_HALF: - case transcoder_texture_format::cTFRGB_9E5: - return true; - default: - break; - } -#endif + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: validate_file_checksums failed\n"); + return false; } - else if (fmt == basis_tex_format::cUASTC4x4) + + int slice_index = find_first_slice_index(pData, data_size, image_index, level_index); + if (slice_index < 0) { - // UASTC LDR -#if BASISD_SUPPORT_UASTC - switch (tex_type) - { - // These niche formats aren't currently supported for UASTC - everything else is. 
- case transcoder_texture_format::cTFPVRTC2_4_RGB: - case transcoder_texture_format::cTFPVRTC2_4_RGBA: - case transcoder_texture_format::cTFATC_RGB: - case transcoder_texture_format::cTFATC_RGBA: - case transcoder_texture_format::cTFFXT1_RGB: - // UASTC LDR doesn't support transcoding to HDR formats - case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: - case transcoder_texture_format::cTFBC6H: - case transcoder_texture_format::cTFRGBA_HALF: - case transcoder_texture_format::cTFRGB_HALF: - case transcoder_texture_format::cTFRGB_9E5: - return false; - default: - return true; - } -#endif + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: failed finding slice\n"); + return false; + } + + const basis_file_header* pHeader = static_cast(pData); + + if (image_index >= pHeader->m_total_images) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_image_level_info: invalid image_index\n"); + return false; } + + const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); + + const basis_slice_desc& slice_desc = pSlice_descs[slice_index]; + + image_info.m_image_index = image_index; + image_info.m_level_index = level_index; + + // For ETC1S, if anything has alpha all images have alpha. For UASTC, we only report alpha when the image actually has alpha. 
+ if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + image_info.m_alpha_flag = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; else + image_info.m_alpha_flag = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; + + const uint32_t block_width = basis_tex_format_get_block_width((basis_tex_format)((uint32_t)pHeader->m_tex_format)); + const uint32_t block_height = basis_tex_format_get_block_height((basis_tex_format)((uint32_t)pHeader->m_tex_format)); + + image_info.m_iframe_flag = (slice_desc.m_flags & cSliceDescFlagsFrameIsIFrame) != 0; + image_info.m_width = slice_desc.m_num_blocks_x * block_width; + image_info.m_height = slice_desc.m_num_blocks_y * block_height; + image_info.m_orig_width = slice_desc.m_orig_width; + image_info.m_orig_height = slice_desc.m_orig_height; + image_info.m_block_width = block_width; + image_info.m_block_height = block_height; + image_info.m_num_blocks_x = slice_desc.m_num_blocks_x; + image_info.m_num_blocks_y = slice_desc.m_num_blocks_y; + image_info.m_total_blocks = image_info.m_num_blocks_x * image_info.m_num_blocks_y; + image_info.m_first_slice_index = slice_index; + + image_info.m_rgb_file_ofs = slice_desc.m_file_ofs; + image_info.m_rgb_file_len = slice_desc.m_file_size; + image_info.m_alpha_file_ofs = 0; + image_info.m_alpha_file_len = 0; + + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) { - // ETC1S - switch (tex_type) + if (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) { - // ETC1 and uncompressed are always supported. 
- case transcoder_texture_format::cTFETC1_RGB: - case transcoder_texture_format::cTFRGBA32: - case transcoder_texture_format::cTFRGB565: - case transcoder_texture_format::cTFBGR565: - case transcoder_texture_format::cTFRGBA4444: - return true; -#if BASISD_SUPPORT_DXT1 - case transcoder_texture_format::cTFBC1_RGB: - return true; -#endif -#if BASISD_SUPPORT_DXT5A - case transcoder_texture_format::cTFBC4_R: - case transcoder_texture_format::cTFBC5_RG: - return true; -#endif -#if BASISD_SUPPORT_DXT1 && BASISD_SUPPORT_DXT5A - case transcoder_texture_format::cTFBC3_RGBA: - return true; -#endif -#if BASISD_SUPPORT_PVRTC1 - case transcoder_texture_format::cTFPVRTC1_4_RGB: - case transcoder_texture_format::cTFPVRTC1_4_RGBA: - return true; -#endif -#if BASISD_SUPPORT_BC7_MODE5 - case transcoder_texture_format::cTFBC7_RGBA: - case transcoder_texture_format::cTFBC7_ALT: - return true; -#endif -#if BASISD_SUPPORT_ETC2_EAC_A8 - case transcoder_texture_format::cTFETC2_RGBA: - return true; -#endif -#if BASISD_SUPPORT_ASTC - case transcoder_texture_format::cTFASTC_4x4_RGBA: - return true; -#endif -#if BASISD_SUPPORT_ATC - case transcoder_texture_format::cTFATC_RGB: - case transcoder_texture_format::cTFATC_RGBA: - return true; -#endif -#if BASISD_SUPPORT_FXT1 - case transcoder_texture_format::cTFFXT1_RGB: - return true; -#endif -#if BASISD_SUPPORT_PVRTC2 - case transcoder_texture_format::cTFPVRTC2_4_RGB: - case transcoder_texture_format::cTFPVRTC2_4_RGBA: - return true; -#endif -#if BASISD_SUPPORT_ETC2_EAC_RG11 - case transcoder_texture_format::cTFETC2_EAC_R11: - case transcoder_texture_format::cTFETC2_EAC_RG11: - return true; -#endif - default: - break; + assert((slice_index + 1) < (int)pHeader->m_total_slices); + image_info.m_alpha_file_ofs = pSlice_descs[slice_index + 1].m_file_ofs; + image_info.m_alpha_file_len = pSlice_descs[slice_index + 1].m_file_size; } } - return false; + return true; } - // 
------------------------------------------------------------------------------------------------------ - // UASTC - // ------------------------------------------------------------------------------------------------------ - -#if BASISD_SUPPORT_UASTC - const astc_bc7_common_partition2_desc g_astc_bc7_common_partitions2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2] = - { - { 0, 28, false }, { 1, 20, false }, { 2, 16, true }, { 3, 29, false }, - { 4, 91, true }, { 5, 9, false }, { 6, 107, true }, { 7, 72, true }, - { 8, 149, false }, { 9, 204, true }, { 10, 50, false }, { 11, 114, true }, - { 12, 496, true }, { 13, 17, true }, { 14, 78, false }, { 15, 39, true }, - { 17, 252, true }, { 18, 828, true }, { 19, 43, false }, { 20, 156, false }, - { 21, 116, false }, { 22, 210, true }, { 23, 476, true }, { 24, 273, false }, - { 25, 684, true }, { 26, 359, false }, { 29, 246, true }, { 32, 195, true }, - { 33, 694, true }, { 52, 524, true } - }; - - const bc73_astc2_common_partition_desc g_bc7_3_astc2_common_partitions[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS] = - { - { 10, 36, 4 }, { 11, 48, 4 }, { 0, 61, 3 }, { 2, 137, 4 }, - { 8, 161, 5 }, { 13, 183, 4 }, { 1, 226, 2 }, { 33, 281, 2 }, - { 40, 302, 3 }, { 20, 307, 4 }, { 21, 479, 0 }, { 58, 495, 3 }, - { 3, 593, 0 }, { 32, 594, 2 }, { 59, 605, 1 }, { 34, 799, 3 }, - { 20, 812, 1 }, { 14, 988, 4 }, { 31, 993, 3 } - }; - - const astc_bc7_common_partition3_desc g_astc_bc7_common_partitions3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3] = + bool basisu_transcoder::get_file_info(const void* pData, uint32_t data_size, basisu_file_info& file_info) const { - { 4, 260, 0 }, { 8, 74, 5 }, { 9, 32, 5 }, { 10, 156, 2 }, - { 11, 183, 2 }, { 12, 15, 0 }, { 13, 745, 4 }, { 20, 0, 1 }, - { 35, 335, 1 }, { 36, 902, 5 }, { 57, 254, 0 } - }; + if (!validate_file_checksums(pData, data_size, false)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: validate_file_checksums failed\n"); + return false; + } - const uint8_t 
g_astc_to_bc7_partition_index_perm_tables[6][3] = { { 0, 1, 2 }, { 1, 2, 0 }, { 2, 0, 1 }, { 2, 1, 0 }, { 0, 2, 1 }, { 1, 0, 2 } }; + const basis_file_header* pHeader = static_cast(pData); + const basis_slice_desc* pSlice_descs = reinterpret_cast(static_cast(pData) + pHeader->m_slice_desc_file_ofs); - const uint8_t g_bc7_to_astc_partition_index_perm_tables[6][3] = { { 0, 1, 2 }, { 2, 0, 1 }, { 1, 2, 0 }, { 2, 1, 0 }, { 0, 2, 1 }, { 1, 0, 2 } }; + file_info.m_version = pHeader->m_ver; - uint32_t bc7_convert_partition_index_3_to_2(uint32_t p, uint32_t k) - { - assert(k < 6); - switch (k >> 1) - { - case 0: - if (p <= 1) - p = 0; - else - p = 1; - break; - case 1: - if (p == 0) - p = 0; - else - p = 1; - break; - case 2: - if ((p == 0) || (p == 2)) - p = 0; - else - p = 1; - break; - } - if (k & 1) - p = 1 - p; - return p; - } + file_info.m_total_header_size = sizeof(basis_file_header) + pHeader->m_total_slices * sizeof(basis_slice_desc); - static const uint8_t g_zero_pattern[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + file_info.m_total_selectors = pHeader->m_total_selectors; + file_info.m_selector_codebook_ofs = pHeader->m_selector_cb_file_ofs; + file_info.m_selector_codebook_size = pHeader->m_selector_cb_file_size; - const uint8_t g_astc_bc7_patterns2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][16] = - { - { 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1 }, { 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1 }, { 1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0 }, { 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1 }, - { 1,1,1,1,1,1,1,0,1,1,1,0,1,1,0,0 }, { 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1 }, { 1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0 }, { 1,1,1,1,1,1,1,0,1,1,0,0,1,0,0,0 }, - { 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1 }, { 1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0 }, - { 1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0 }, { 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0 }, - { 1,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1 }, { 
1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,1 }, { 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0 }, { 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0 }, - { 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0 }, { 1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1 }, { 1,0,0,0,1,1,0,0,1,1,0,0,1,1,1,0 }, { 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0 }, - { 1,1,1,1,0,1,1,1,0,1,1,1,0,0,1,1 }, { 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0 }, { 1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1 }, { 1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0 }, - { 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0 }, { 1,0,0,1,0,0,1,1,0,1,1,0,1,1,0,0 } - }; + file_info.m_total_endpoints = pHeader->m_total_endpoints; + file_info.m_endpoint_codebook_ofs = pHeader->m_endpoint_cb_file_ofs; + file_info.m_endpoint_codebook_size = pHeader->m_endpoint_cb_file_size; - const uint8_t g_astc_bc7_patterns3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][16] = - { - { 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2 }, { 1,1,1,1,1,1,1,1,0,0,0,0,2,2,2,2 }, { 1,1,1,1,0,0,0,0,0,0,0,0,2,2,2,2 }, { 1,1,1,1,2,2,2,2,0,0,0,0,0,0,0,0 }, - { 1,1,2,0,1,1,2,0,1,1,2,0,1,1,2,0 }, { 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2 }, { 0,2,1,1,0,2,1,1,0,2,1,1,0,2,1,1 }, { 2,0,0,0,2,0,0,0,2,1,1,1,2,1,1,1 }, - { 2,0,1,2,2,0,1,2,2,0,1,2,2,0,1,2 }, { 1,1,1,1,0,0,0,0,2,2,2,2,1,1,1,1 }, { 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2 } - }; + file_info.m_tables_ofs = pHeader->m_tables_file_ofs; + file_info.m_tables_size = pHeader->m_tables_file_size; - const uint8_t g_bc7_3_astc2_patterns2[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][16] = - { - { 0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0 }, { 1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1 }, - { 1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1 }, { 0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0 }, { 0,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1 }, { 0,1,1,1,0,0,1,1,0,0,1,1,0,0,1,1 }, - { 1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0 }, { 0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,0 }, { 1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0 }, - { 0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0 }, { 1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0 }, - { 
1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0 }, { 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0 }, { 1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0 } - }; + file_info.m_tex_format = static_cast(static_cast(pHeader->m_tex_format)); - const uint8_t g_astc_bc7_pattern2_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][3] = - { - { 0, 2 }, { 0, 3 }, { 1, 0 }, { 0, 3 }, { 7, 0 }, { 0, 2 }, { 3, 0 }, { 7, 0 }, - { 0, 11 }, { 2, 0 }, { 0, 7 }, { 11, 0 }, { 3, 0 }, { 8, 0 }, { 0, 4 }, { 12, 0 }, - { 1, 0 }, { 8, 0 }, { 0, 1 }, { 0, 2 }, { 0, 4 }, { 8, 0 }, { 1, 0 }, { 0, 2 }, - { 4, 0 }, { 0, 1 }, { 4, 0 }, { 1, 0 }, { 4, 0 }, { 1, 0 } - }; + file_info.m_etc1s = (pHeader->m_tex_format == (int)basis_tex_format::cETC1S); + + file_info.m_y_flipped = (pHeader->m_flags & cBASISHeaderFlagYFlipped) != 0; + file_info.m_has_alpha_slices = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; - const uint8_t g_astc_bc7_pattern3_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][3] = - { - { 0, 8, 10 }, { 8, 0, 12 }, { 4, 0, 12 }, { 8, 0, 4 }, { 3, 0, 2 }, { 0, 1, 3 }, { 0, 2, 1 }, { 1, 9, 0 }, { 1, 2, 0 }, { 4, 0, 8 }, { 0, 6, 2 } - }; + const uint32_t total_slices = pHeader->m_total_slices; - const uint8_t g_bc7_3_astc2_patterns2_anchors[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][3] = - { - { 0, 4 }, { 0, 2 }, { 2, 0 }, { 0, 7 }, { 8, 0 }, { 0, 1 }, { 0, 3 }, { 0, 1 }, { 2, 0 }, { 0, 1 }, { 0, 8 }, { 2, 0 }, { 0, 1 }, { 0, 7 }, { 12, 0 }, { 2, 0 }, { 9, 0 }, { 0, 2 }, { 4, 0 } - }; + file_info.m_slice_info.resize(total_slices); - const uint32_t g_uastc_mode_huff_codes[TOTAL_UASTC_MODES + 1][2] = - { - { 0x1, 4 }, - { 0x35, 6 }, - { 0x1D, 5 }, - { 0x3, 5 }, + file_info.m_slices_size = 0; - { 0x13, 5 }, - { 0xB, 5 }, - { 0x1B, 5 }, - { 0x7, 5 }, + file_info.m_tex_type = static_cast(static_cast(pHeader->m_tex_type)); - { 0x17, 5 }, - { 0xF, 5 }, - { 0x2, 3 }, - { 0x0, 2 }, + if (file_info.m_tex_type > cBASISTexTypeTotal) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: invalid texture type, file is corrupted\n"); + return false; + 
} - { 0x6, 3 }, - { 0x1F, 5 }, - { 0xD, 5 }, - { 0x5, 7 }, + file_info.m_us_per_frame = pHeader->m_us_per_frame; + file_info.m_userdata0 = pHeader->m_userdata0; + file_info.m_userdata1 = pHeader->m_userdata1; - { 0x15, 6 }, - { 0x25, 6 }, - { 0x9, 4 }, - { 0x45, 7 } // future expansion - }; + file_info.m_image_mipmap_levels.resize(0); + file_info.m_image_mipmap_levels.resize(pHeader->m_total_images); - // If g_uastc_mode_huff_codes[] changes this table must be updated! - static const uint8_t g_uastc_huff_modes[128] = - { - 11,0,10,3,11,15,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,16,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11,17,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,1,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11, - 19,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,16,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11,17,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,1,12,8,11,18,10,6,11,2,12,13 - }; + file_info.m_total_images = pHeader->m_total_images; - const uint8_t g_uastc_mode_weight_bits[TOTAL_UASTC_MODES] = { 4, 2, 3, 2, 2, 3, 2, 2, 0, 2, 4, 2, 3, 1, 2, 4, 2, 2, 5 }; - const uint8_t g_uastc_mode_weight_ranges[TOTAL_UASTC_MODES] = { 8, 2, 5, 2, 2, 5, 2, 2, 0, 2, 8, 2, 5, 0, 2, 8, 2, 2, 11 }; - const uint8_t g_uastc_mode_endpoint_ranges[TOTAL_UASTC_MODES] = { 19, 20, 8, 7, 12, 20, 18, 12, 0, 8, 13, 13, 19, 20, 20, 20, 20, 20, 11 }; - const uint8_t g_uastc_mode_subsets[TOTAL_UASTC_MODES] = { 1, 1, 2, 3, 2, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1 }; - const uint8_t g_uastc_mode_planes[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1 }; - const uint8_t g_uastc_mode_comps[TOTAL_UASTC_MODES] = { 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 3 }; - const uint8_t g_uastc_mode_has_etc1_bias[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1 }; - const uint8_t g_uastc_mode_has_bc1_hint0[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; - const uint8_t g_uastc_mode_has_bc1_hint1[TOTAL_UASTC_MODES] = { 1, 1, 
1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1 }; - const uint8_t g_uastc_mode_cem[TOTAL_UASTC_MODES] = { 8, 8, 8, 8, 8, 8, 8, 8, 0, 12, 12, 12, 12, 12, 12, 4, 4, 4, 8 }; - const uint8_t g_uastc_mode_has_alpha[TOTAL_UASTC_MODES] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; - const uint8_t g_uastc_mode_is_la[TOTAL_UASTC_MODES] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0 }; - const uint8_t g_uastc_mode_total_hint_bits[TOTAL_UASTC_MODES] = { 15, 15, 15, 15, 15, 15, 15, 15, 0, 23, 17, 17, 17, 23, 23, 23, 23, 23, 15 }; + const uint32_t block_width = basis_tex_format_get_block_width((basis_tex_format)((uint32_t)pHeader->m_tex_format)); + const uint32_t block_height = basis_tex_format_get_block_height((basis_tex_format)((uint32_t)pHeader->m_tex_format)); + file_info.m_block_width = block_width; + file_info.m_block_height = block_height; - // bits, trits, quints - const int g_astc_bise_range_table[TOTAL_ASTC_RANGES][3] = - { - { 1, 0, 0 }, // 0-1 0 - { 0, 1, 0 }, // 0-2 1 - { 2, 0, 0 }, // 0-3 2 - { 0, 0, 1 }, // 0-4 3 + for (uint32_t i = 0; i < total_slices; i++) + { + file_info.m_slices_size += pSlice_descs[i].m_file_size; - { 1, 1, 0 }, // 0-5 4 - { 3, 0, 0 }, // 0-7 5 - { 1, 0, 1 }, // 0-9 6 - { 2, 1, 0 }, // 0-11 7 + basisu_slice_info& slice_info = file_info.m_slice_info[i]; - { 4, 0, 0 }, // 0-15 8 - { 2, 0, 1 }, // 0-19 9 - { 3, 1, 0 }, // 0-23 10 - { 5, 0, 0 }, // 0-31 11 + slice_info.m_orig_width = pSlice_descs[i].m_orig_width; + slice_info.m_orig_height = pSlice_descs[i].m_orig_height; + slice_info.m_width = pSlice_descs[i].m_num_blocks_x * block_width; + slice_info.m_height = pSlice_descs[i].m_num_blocks_y * block_height; + slice_info.m_num_blocks_x = pSlice_descs[i].m_num_blocks_x; + slice_info.m_num_blocks_y = pSlice_descs[i].m_num_blocks_y; + slice_info.m_block_width = block_width; + slice_info.m_block_height = block_height; + slice_info.m_total_blocks = slice_info.m_num_blocks_x * slice_info.m_num_blocks_y; + 
slice_info.m_compressed_size = pSlice_descs[i].m_file_size; + slice_info.m_slice_index = i; + slice_info.m_image_index = pSlice_descs[i].m_image_index; + slice_info.m_level_index = pSlice_descs[i].m_level_index; + slice_info.m_unpacked_slice_crc16 = pSlice_descs[i].m_slice_data_crc16; + slice_info.m_alpha_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsHasAlpha) != 0; + slice_info.m_iframe_flag = (pSlice_descs[i].m_flags & cSliceDescFlagsFrameIsIFrame) != 0; - { 3, 0, 1 }, // 0-39 12 - { 4, 1, 0 }, // 0-47 13 - { 6, 0, 0 }, // 0-63 14 - { 4, 0, 1 }, // 0-79 15 + if (pSlice_descs[i].m_image_index >= pHeader->m_total_images) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: slice desc's image index is invalid\n"); + return false; + } - { 5, 1, 0 }, // 0-95 16 - { 7, 0, 0 }, // 0-127 17 - { 5, 0, 1 }, // 0-159 18 - { 6, 1, 0 }, // 0-191 19 + file_info.m_image_mipmap_levels[pSlice_descs[i].m_image_index] = basisu::maximum(file_info.m_image_mipmap_levels[pSlice_descs[i].m_image_index], pSlice_descs[i].m_level_index + 1); - { 8, 0, 0 }, // 0-255 20 - }; + if (file_info.m_image_mipmap_levels[pSlice_descs[i].m_image_index] > 16) + { + BASISU_DEVEL_ERROR("basisu_transcoder::get_file_info: slice mipmap level is invalid\n"); + return false; + } + } - int astc_get_levels(int range) - { - assert(range < (int)BC7ENC_TOTAL_ASTC_RANGES); - return (1 + 2 * g_astc_bise_range_table[range][1] + 4 * g_astc_bise_range_table[range][2]) << g_astc_bise_range_table[range][0]; + return true; } + + bool basisu_transcoder::start_transcoding(const void* pData, uint32_t data_size) + { + if (!validate_header_quick(pData, data_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: header validation failed\n"); + return false; + } - // g_astc_unquant[] is the inverse of g_astc_sorted_order_unquant[] - astc_quant_bin g_astc_unquant[BC7ENC_TOTAL_ASTC_RANGES][256]; // [ASTC encoded endpoint index] + const basis_file_header* pHeader = reinterpret_cast(pData); + const uint8_t* 
pDataU8 = static_cast(pData); - // Taken right from the ASTC spec. - static struct - { - const char* m_pB_str; - uint32_t m_c; - } g_astc_endpoint_unquant_params[BC7ENC_TOTAL_ASTC_RANGES] = - { - { "", 0 }, - { "", 0 }, - { "", 0 }, - { "", 0 }, - { "000000000", 204, }, // 0-5 - { "", 0 }, - { "000000000", 113, }, // 0-9 - { "b000b0bb0", 93 }, // 0-11 - { "", 0 }, - { "b0000bb00", 54 }, // 0-19 - { "cb000cbcb", 44 }, // 0-23 - { "", 0 }, - { "cb0000cbc", 26 }, // 0-39 - { "dcb000dcb", 22 }, // 0-47 - { "", 0 }, - { "dcb0000dc", 13 }, // 0-79 - { "edcb000ed", 11 }, // 0-95 - { "", 0 }, - { "edcb0000e", 6 }, // 0-159 - { "fedcb000f", 5 }, // 0-191 - { "", 0 }, - }; - - bool astc_is_valid_endpoint_range(uint32_t range) - { - if ((g_astc_bise_range_table[range][1] == 0) && (g_astc_bise_range_table[range][2] == 0)) - return true; - - return g_astc_endpoint_unquant_params[range].m_c != 0; - } + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + { + if (m_lowlevel_etc1s_decoder.m_local_endpoints.size()) + { + m_lowlevel_etc1s_decoder.clear(); + } - uint32_t unquant_astc_endpoint(uint32_t packed_bits, uint32_t packed_trits, uint32_t packed_quints, uint32_t range) - { - assert(range < BC7ENC_TOTAL_ASTC_RANGES); + if (pHeader->m_flags & cBASISHeaderFlagUsesGlobalCodebook) + { + if (!m_lowlevel_etc1s_decoder.get_global_codebooks()) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: File uses global codebooks, but set_global_codebooks() has not been called\n"); + return false; + } + if (!m_lowlevel_etc1s_decoder.get_global_codebooks()->get_endpoints().size()) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: Global codebooks must be unpacked first by calling start_transcoding()\n"); + return false; + } + if ((m_lowlevel_etc1s_decoder.get_global_codebooks()->get_endpoints().size() != pHeader->m_total_endpoints) || + (m_lowlevel_etc1s_decoder.get_global_codebooks()->get_selectors().size() != pHeader->m_total_selectors)) + { + 
BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: Global codebook size mismatch (wrong codebooks for file).\n"); + return false; + } + if (!pHeader->m_tables_file_size) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted (2)\n"); + return false; + } + if (pHeader->m_tables_file_ofs > data_size) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (4)\n"); + return false; + } + if (pHeader->m_tables_file_size > (data_size - pHeader->m_tables_file_ofs)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (5)\n"); + return false; + } + } + else + { + if (!pHeader->m_endpoint_cb_file_size || !pHeader->m_selector_cb_file_size || !pHeader->m_tables_file_size) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted (0)\n"); + return false; + } - const uint32_t bits = g_astc_bise_range_table[range][0]; - const uint32_t trits = g_astc_bise_range_table[range][1]; - const uint32_t quints = g_astc_bise_range_table[range][2]; + if ((pHeader->m_endpoint_cb_file_ofs > data_size) || (pHeader->m_selector_cb_file_ofs > data_size) || (pHeader->m_tables_file_ofs > data_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (1)\n"); + return false; + } - uint32_t val = 0; - if ((!trits) && (!quints)) - { - assert(!packed_trits && !packed_quints); + if (pHeader->m_endpoint_cb_file_size > (data_size - pHeader->m_endpoint_cb_file_ofs)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (2)\n"); + return false; + } - int bits_left = 8; - while (bits_left > 0) - { - uint32_t v = packed_bits; + if (pHeader->m_selector_cb_file_size > (data_size - pHeader->m_selector_cb_file_ofs)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small 
(3)\n"); + return false; + } - int n = basisu::minimumi(bits_left, bits); - if (n < (int)bits) - v >>= (bits - n); + if (pHeader->m_tables_file_size > (data_size - pHeader->m_tables_file_ofs)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: file is corrupted or passed in buffer too small (3)\n"); + return false; + } - assert(v < (1U << n)); + if (!m_lowlevel_etc1s_decoder.decode_palettes( + pHeader->m_total_endpoints, pDataU8 + pHeader->m_endpoint_cb_file_ofs, pHeader->m_endpoint_cb_file_size, + pHeader->m_total_selectors, pDataU8 + pHeader->m_selector_cb_file_ofs, pHeader->m_selector_cb_file_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: decode_palettes failed\n"); + return false; + } + } - val |= (v << (bits_left - n)); - bits_left -= n; + if (!m_lowlevel_etc1s_decoder.decode_tables(pDataU8 + pHeader->m_tables_file_ofs, pHeader->m_tables_file_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::start_transcoding: decode_tables failed\n"); + return false; } } else { - const uint32_t A = (packed_bits & 1) ? 511 : 0; - const uint32_t C = g_astc_endpoint_unquant_params[range].m_c; - const uint32_t D = trits ? packed_trits : packed_quints; - - assert(C); - - uint32_t B = 0; - for (uint32_t i = 0; i < 9; i++) + // Nothing special to do for UASTC/UASTC HDR. 
+ if (m_lowlevel_etc1s_decoder.m_local_endpoints.size()) { - B <<= 1; - - char c = g_astc_endpoint_unquant_params[range].m_pB_str[i]; - if (c != '0') - { - c -= 'a'; - B |= ((packed_bits >> c) & 1); - } + m_lowlevel_etc1s_decoder.clear(); } - - val = D * C + B; - val = val ^ A; - val = (A & 0x80) | (val >> 2); } + + m_ready_to_transcode = true; - return val; + return true; } - uint32_t unquant_astc_endpoint_val(uint32_t packed_val, uint32_t range) + bool basisu_transcoder::stop_transcoding() { - assert(range < BC7ENC_TOTAL_ASTC_RANGES); - assert(packed_val < (uint32_t)astc_get_levels(range)); - - const uint32_t bits = g_astc_bise_range_table[range][0]; - const uint32_t trits = g_astc_bise_range_table[range][1]; - const uint32_t quints = g_astc_bise_range_table[range][2]; + m_lowlevel_etc1s_decoder.clear(); - if ((!trits) && (!quints)) - return unquant_astc_endpoint(packed_val, 0, 0, range); - else if (trits) - return unquant_astc_endpoint(packed_val & ((1 << bits) - 1), packed_val >> bits, 0, range); - else - return unquant_astc_endpoint(packed_val & ((1 << bits) - 1), 0, packed_val >> bits, range); + m_ready_to_transcode = false; + + return true; } - // BC7 - Various BC7 tables/helpers - const uint32_t g_bc7_weights1[2] = { 0, 64 }; - const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 }; - const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; - const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; - const uint32_t g_astc_weights4[16] = { 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }; - const uint32_t g_astc_weights5[32] = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64 }; - const uint32_t g_astc_weights_3levels[3] = { 0, 32, 64 }; - - const uint8_t g_bc7_partition1[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; - - const uint8_t g_bc7_partition2[64 * 16] = + bool basisu_transcoder::transcode_slice(const void* pData, 
uint32_t data_size, uint32_t slice_index, void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state* pState, void *pAlpha_blocks, uint32_t output_rows_in_pixels, int channel0, int channel1) const { - 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1, - 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, - 0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1, 0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0, 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0, 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1, - 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0, 0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0, 0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0, 0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0, 0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0, 0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0, - 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1, 0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0, 0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0, 0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0, 0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0, 0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1, 0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1, - 0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0, 0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0, 0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0, 0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0, 0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0, 0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1, 0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1, 0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0, - 
0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0, 0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0, 0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0, 0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0, 0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0, 0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0, - 0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1 - }; + if (!m_ready_to_transcode) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: must call start_transcoding first\n"); + return false; + } - const uint8_t g_bc7_partition3[64 * 16] = - { - 0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2, 0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1, 0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1, 0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2, 0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2, 0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1, 0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1, - 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2, 0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2, 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2, 0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2, 0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2, 0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0, - 0,0,0,1,0,0,1,1,0,1,1,2,1,1,2,2, 0,1,1,1,0,0,1,1,2,0,0,1,2,2,0,0, 0,0,0,0,1,1,2,2,1,1,2,2,1,1,2,2, 0,0,2,2,0,0,2,2,0,0,2,2,1,1,1,1, 0,1,1,1,0,1,1,1,0,2,2,2,0,2,2,2, 0,0,0,1,0,0,0,1,2,2,2,1,2,2,2,1, 0,0,0,0,0,0,1,1,0,1,2,2,0,1,2,2, 0,0,0,0,1,1,0,0,2,2,1,0,2,2,1,0, - 0,1,2,2,0,1,2,2,0,0,1,1,0,0,0,0, 0,0,1,2,0,0,1,2,1,1,2,2,2,2,2,2, 0,1,1,0,1,2,2,1,1,2,2,1,0,1,1,0, 0,0,0,0,0,1,1,0,1,2,2,1,1,2,2,1, 0,0,2,2,1,1,0,2,1,1,0,2,0,0,2,2, 0,1,1,0,0,1,1,0,2,0,0,2,2,2,2,2, 0,0,1,1,0,1,2,2,0,1,2,2,0,0,1,1, 0,0,0,0,2,0,0,0,2,2,1,1,2,2,2,1, - 0,0,0,0,0,0,0,2,1,1,2,2,1,2,2,2, 0,2,2,2,0,0,2,2,0,0,1,2,0,0,1,1, 0,0,1,1,0,0,1,2,0,0,2,2,0,2,2,2, 0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,0, 0,0,0,0,1,1,1,1,2,2,2,2,0,0,0,0, 0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0, 
0,1,2,0,2,0,1,2,1,2,0,1,0,1,2,0, 0,0,1,1,2,2,0,0,1,1,2,2,0,0,1,1, - 0,0,1,1,1,1,2,2,2,2,0,0,0,0,1,1, 0,1,0,1,0,1,0,1,2,2,2,2,2,2,2,2, 0,0,0,0,0,0,0,0,2,1,2,1,2,1,2,1, 0,0,2,2,1,1,2,2,0,0,2,2,1,1,2,2, 0,0,2,2,0,0,1,1,0,0,2,2,0,0,1,1, 0,2,2,0,1,2,2,1,0,2,2,0,1,2,2,1, 0,1,0,1,2,2,2,2,2,2,2,2,0,1,0,1, 0,0,0,0,2,1,2,1,2,1,2,1,2,1,2,1, - 0,1,0,1,0,1,0,1,0,1,0,1,2,2,2,2, 0,2,2,2,0,1,1,1,0,2,2,2,0,1,1,1, 0,0,0,2,1,1,1,2,0,0,0,2,1,1,1,2, 0,0,0,0,2,1,1,2,2,1,1,2,2,1,1,2, 0,2,2,2,0,1,1,1,0,1,1,1,0,2,2,2, 0,0,0,2,1,1,1,2,1,1,1,2,0,0,0,2, 0,1,1,0,0,1,1,0,0,1,1,0,2,2,2,2, 0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,2, - 0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2, 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2, 0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2, 0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2, 0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1, 0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2, 0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2, 0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0, - }; + if (decode_flags & cDecodeFlagsPVRTCDecodeToNextPow2) + { + // TODO: Not yet supported + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: cDecodeFlagsPVRTCDecodeToNextPow2 currently unsupported\n"); + return false; + } - const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 }; + if (!validate_header_quick(pData, data_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: header validation failed\n"); + return false; + } - const uint8_t g_bc7_table_anchor_index_third_subset_1[64] = - { - 3, 3,15,15, 8, 3,15,15, 8, 8, 6, 6, 6, 5, 3, 3, 3, 3, 8,15, 3, 3, 6,10, 5, 8, 8, 6, 8, 5,15,15, 8,15, 3, 5, 6,10, 8,15, 15, 3,15, 5,15,15,15,15, 3,15, 5, 5, 5, 8, 5,10, 5,10, 8,13,15,12, 3, 3 - }; + const basis_file_header* pHeader = reinterpret_cast(pData); - const uint8_t g_bc7_table_anchor_index_third_subset_2[64] = - { - 15, 8, 8, 3,15,15, 3, 8, 15,15,15,15,15,15,15, 8, 15, 8,15, 3,15, 8,15, 8, 3,15, 
6,10,15,15,10, 8, 15, 3,15,10,10, 8, 9,10, 6,15, 8,15, 3, 6, 6, 8, 15, 3,15,15,15,15,15,15, 15,15,15,15, 3,15,15, 8 - }; + const uint8_t* pDataU8 = static_cast(pData); - const uint8_t g_bc7_num_subsets[8] = { 3, 2, 3, 2, 1, 1, 1, 2 }; - const uint8_t g_bc7_partition_bits[8] = { 4, 6, 6, 6, 0, 0, 0, 6 }; - const uint8_t g_bc7_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 }; + if (slice_index >= pHeader->m_total_slices) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: slice_index >= pHeader->m_total_slices\n"); + return false; + } - const uint8_t g_bc7_mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 }; - const uint8_t g_bc7_mode_has_shared_p_bits[8] = { 0, 1, 0, 0, 0, 0, 0, 0 }; - const uint8_t g_bc7_color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 }; - const int8_t g_bc7_alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 }; + const basis_slice_desc& slice_desc = reinterpret_cast(pDataU8 + pHeader->m_slice_desc_file_ofs)[slice_index]; + + if (basis_block_format_is_uncompressed(fmt)) + { + // Assume the output buffer is orig_width by orig_height + if (!output_row_pitch_in_blocks_or_pixels) + output_row_pitch_in_blocks_or_pixels = slice_desc.m_orig_width; - const uint8_t g_bc7_alpha_index_bitcount[8] = { 0, 0, 0, 0, 3, 2, 4, 2 }; + if (!output_rows_in_pixels) + output_rows_in_pixels = slice_desc.m_orig_height; - endpoint_err g_bc7_mode_6_optimal_endpoints[256][2]; // [c][pbit] - endpoint_err g_bc7_mode_5_optimal_endpoints[256]; // [c] - - static inline void bc7_set_block_bits(uint8_t* pBytes, uint32_t val, uint32_t num_bits, uint32_t* pCur_ofs) - { - assert((num_bits <= 32) && (val < (1ULL << num_bits))); - while (num_bits) - { - const uint32_t n = basisu::minimumu(8 - (*pCur_ofs & 7), num_bits); - pBytes[*pCur_ofs >> 3] |= (uint8_t)(val << (*pCur_ofs & 7)); - val >>= n; - num_bits -= n; - *pCur_ofs += n; - } - assert(*pCur_ofs <= 128); - } - - // TODO: Optimize this. 
- void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResults) - { - const uint32_t best_mode = pResults->m_mode; - - const uint32_t total_subsets = g_bc7_num_subsets[best_mode]; - const uint32_t total_partitions = 1 << g_bc7_partition_bits[best_mode]; - //const uint32_t num_rotations = 1 << g_bc7_rotation_bits[best_mode]; - //const uint32_t num_index_selectors = (best_mode == 4) ? 2 : 1; - - const uint8_t* pPartition; - if (total_subsets == 1) - pPartition = &g_bc7_partition1[0]; - else if (total_subsets == 2) - pPartition = &g_bc7_partition2[pResults->m_partition * 16]; - else - pPartition = &g_bc7_partition3[pResults->m_partition * 16]; - - uint8_t color_selectors[16]; - memcpy(color_selectors, pResults->m_selectors, 16); - - uint8_t alpha_selectors[16]; - memcpy(alpha_selectors, pResults->m_alpha_selectors, 16); - - color_quad_u8 low[3], high[3]; - memcpy(low, pResults->m_low, sizeof(low)); - memcpy(high, pResults->m_high, sizeof(high)); - - uint32_t pbits[3][2]; - memcpy(pbits, pResults->m_pbits, sizeof(pbits)); - - int anchor[3] = { -1, -1, -1 }; - - for (uint32_t k = 0; k < total_subsets; k++) - { - uint32_t anchor_index = 0; - if (k) + // Now make sure the output buffer is large enough, or we'll overwrite memory. 
+ if (output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)) { - if ((total_subsets == 3) && (k == 1)) - anchor_index = g_bc7_table_anchor_index_third_subset_1[pResults->m_partition]; - else if ((total_subsets == 3) && (k == 2)) - anchor_index = g_bc7_table_anchor_index_third_subset_2[pResults->m_partition]; - else - anchor_index = g_bc7_table_anchor_index_second_subset[pResults->m_partition]; + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: output_blocks_buf_size_in_blocks_or_pixels < (output_rows_in_pixels * output_row_pitch_in_blocks_or_pixels)\n"); + return false; } + } + else if (fmt == block_format::cFXT1_RGB) + { + const uint32_t num_blocks_fxt1_x = (slice_desc.m_orig_width + 7) / 8; + const uint32_t num_blocks_fxt1_y = (slice_desc.m_orig_height + 3) / 4; + const uint32_t total_blocks_fxt1 = num_blocks_fxt1_x * num_blocks_fxt1_y; - anchor[k] = anchor_index; - - const uint32_t color_index_bits = get_bc7_color_index_size(best_mode, pResults->m_index_selector); - const uint32_t num_color_indices = 1 << color_index_bits; - - if (color_selectors[anchor_index] & (num_color_indices >> 1)) + if (output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1) { - for (uint32_t i = 0; i < 16; i++) - if (pPartition[i] == k) - color_selectors[i] = (uint8_t)((num_color_indices - 1) - color_selectors[i]); - - if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) - { - for (uint32_t q = 0; q < 3; q++) - { - uint8_t t = low[k].m_c[q]; - low[k].m_c[q] = high[k].m_c[q]; - high[k].m_c[q] = t; - } - } - else - { - color_quad_u8 tmp = low[k]; - low[k] = high[k]; - high[k] = tmp; - } - - if (!g_bc7_mode_has_shared_p_bits[best_mode]) - { - uint32_t t = pbits[k][0]; - pbits[k][0] = pbits[k][1]; - pbits[k][1] = t; - } + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_fxt1\n"); + return false; } + } + else if (fmt == block_format::cASTC_HDR_6x6) + { + 
const uint32_t num_blocks_6x6_x = (slice_desc.m_orig_width + 5) / 6; + const uint32_t num_blocks_6x6_y = (slice_desc.m_orig_height + 5) / 6; + const uint32_t total_blocks_6x6 = num_blocks_6x6_x * num_blocks_6x6_y; - if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) + if (output_blocks_buf_size_in_blocks_or_pixels < total_blocks_6x6) { - const uint32_t alpha_index_bits = get_bc7_alpha_index_size(best_mode, pResults->m_index_selector); - const uint32_t num_alpha_indices = 1 << alpha_index_bits; - - if (alpha_selectors[anchor_index] & (num_alpha_indices >> 1)) - { - for (uint32_t i = 0; i < 16; i++) - if (pPartition[i] == k) - alpha_selectors[i] = (uint8_t)((num_alpha_indices - 1) - alpha_selectors[i]); - - uint8_t t = low[k].m_c[3]; - low[k].m_c[3] = high[k].m_c[3]; - high[k].m_c[3] = t; - } + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: output_blocks_buf_size_in_blocks_or_pixels < total_blocks_6x6\n"); + return false; } } - - uint8_t* pBlock_bytes = (uint8_t*)(pBlock); - memset(pBlock_bytes, 0, BC7ENC_BLOCK_SIZE); - - uint32_t cur_bit_ofs = 0; - bc7_set_block_bits(pBlock_bytes, 1 << best_mode, best_mode + 1, &cur_bit_ofs); - - if ((best_mode == 4) || (best_mode == 5)) - bc7_set_block_bits(pBlock_bytes, pResults->m_rotation, 2, &cur_bit_ofs); - - if (best_mode == 4) - bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector, 1, &cur_bit_ofs); - - if (total_partitions > 1) - bc7_set_block_bits(pBlock_bytes, pResults->m_partition, (total_partitions == 64) ? 6 : 4, &cur_bit_ofs); - - const uint32_t total_comps = (best_mode >= 4) ? 
4 : 3; - for (uint32_t comp = 0; comp < total_comps; comp++) + else { - for (uint32_t subset = 0; subset < total_subsets; subset++) + // must be a 4x4 pixel block format + const uint32_t num_blocks_4x4_x = (slice_desc.m_orig_width + 3) / 4; + const uint32_t num_blocks_4x4_y = (slice_desc.m_orig_height + 3) / 4; + const uint32_t total_4x4_blocks = num_blocks_4x4_x * num_blocks_4x4_y; + + if (output_blocks_buf_size_in_blocks_or_pixels < total_4x4_blocks) { - bc7_set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); - bc7_set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: output_blocks_buf_size_in_blocks_or_pixels < total_blocks\n"); + return false; } } - if (g_bc7_mode_has_p_bits[best_mode]) + if ((pHeader->m_tex_format == (uint32_t)basis_tex_format::cETC1S) || (pHeader->m_tex_format == (uint32_t)basis_tex_format::cUASTC4x4)) { - for (uint32_t subset = 0; subset < total_subsets; subset++) + if ((fmt == block_format::cPVRTC1_4_RGB) || (fmt == block_format::cPVRTC1_4_RGBA)) { - bc7_set_block_bits(pBlock_bytes, pbits[subset][0], 1, &cur_bit_ofs); - if (!g_bc7_mode_has_shared_p_bits[best_mode]) - bc7_set_block_bits(pBlock_bytes, pbits[subset][1], 1, &cur_bit_ofs); + if ((!basisu::is_pow2(slice_desc.m_num_blocks_x * 4)) || (!basisu::is_pow2(slice_desc.m_num_blocks_y * 4))) + { + // PVRTC1 only supports power of 2 dimensions + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: PVRTC1 only supports power of 2 dimensions\n"); + return false; + } } } - for (uint32_t y = 0; y < 4; y++) + if (slice_desc.m_file_ofs > data_size) { - for (uint32_t x = 0; x < 4; x++) - { - int idx = x + y * 4; - - uint32_t n = pResults->m_index_selector ? 
get_bc7_alpha_index_size(best_mode, pResults->m_index_selector) : get_bc7_color_index_size(best_mode, pResults->m_index_selector); - - if ((idx == anchor[0]) || (idx == anchor[1]) || (idx == anchor[2])) - n--; - - bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector ? alpha_selectors[idx] : color_selectors[idx], n, &cur_bit_ofs); - } + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: invalid slice_desc.m_file_ofs, or passed in buffer too small\n"); + return false; } - if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) + const uint32_t data_size_left = data_size - slice_desc.m_file_ofs; + if (data_size_left < slice_desc.m_file_size) { - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - int idx = x + y * 4; - - uint32_t n = pResults->m_index_selector ? get_bc7_color_index_size(best_mode, pResults->m_index_selector) : get_bc7_alpha_index_size(best_mode, pResults->m_index_selector); - - if ((idx == anchor[0]) || (idx == anchor[1]) || (idx == anchor[2])) - n--; - - bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector ? 
color_selectors[idx] : alpha_selectors[idx], n, &cur_bit_ofs); - } - } + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_slice: invalid slice_desc.m_file_size, or passed in buffer too small\n"); + return false; + } + + if (pHeader->m_tex_format == (int)basis_tex_format::cASTC_HDR_6x6) + { + return m_lowlevel_astc_6x6_hdr_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, + pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, + fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, + output_rows_in_pixels, channel0, channel1, decode_flags); + } + else if (pHeader->m_tex_format == (int)basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE) + { + return m_lowlevel_astc_6x6_hdr_intermediate_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, + pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, + fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, + output_rows_in_pixels, channel0, channel1, decode_flags); + } + else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC_HDR_4x4) + { + return m_lowlevel_uastc_4x4_hdr_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, + pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, + fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, + output_rows_in_pixels, channel0, channel1, decode_flags); + } + else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + { + return m_lowlevel_uastc_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, + pDataU8 + slice_desc.m_file_ofs, 
slice_desc.m_file_size, + fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, + output_rows_in_pixels, channel0, channel1, decode_flags); + } + else + { + return m_lowlevel_etc1s_decoder.transcode_slice(pOutput_blocks, slice_desc.m_num_blocks_x, slice_desc.m_num_blocks_y, + pDataU8 + slice_desc.m_file_ofs, slice_desc.m_file_size, + fmt, output_block_or_pixel_stride_in_bytes, (decode_flags & cDecodeFlagsBC1ForbidThreeColorBlocks) == 0, *pHeader, slice_desc, output_row_pitch_in_blocks_or_pixels, pState, + (decode_flags & cDecodeFlagsOutputHasAlphaIndices) != 0, pAlpha_blocks, output_rows_in_pixels); } - - assert(cur_bit_ofs == 128); } - // ASTC - static inline void astc_set_bits_1_to_9(uint32_t* pDst, int& bit_offset, uint32_t code, uint32_t codesize) + int basisu_transcoder::find_first_slice_index(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index) const { - uint8_t* pBuf = reinterpret_cast(pDst); + BASISU_NOTE_UNUSED(data_size); - assert(codesize <= 9); - if (codesize) - { - uint32_t byte_bit_offset = bit_offset & 7; - uint32_t val = code << byte_bit_offset; + const basis_file_header* pHeader = reinterpret_cast(pData); + const uint8_t* pDataU8 = static_cast(pData); - uint32_t index = bit_offset >> 3; - pBuf[index] |= (uint8_t)val; + // For very large basis files this search could be painful + // TODO: Binary search this + for (uint32_t slice_iter = 0; slice_iter < pHeader->m_total_slices; slice_iter++) + { + const basis_slice_desc& slice_desc = reinterpret_cast(pDataU8 + pHeader->m_slice_desc_file_ofs)[slice_iter]; + if ((slice_desc.m_image_index == image_index) && (slice_desc.m_level_index == level_index)) + return slice_iter; + } - if (codesize > (8 - byte_bit_offset)) - pBuf[index + 1] |= (uint8_t)(val >> 8); + BASISU_DEVEL_ERROR("basisu_transcoder::find_first_slice_index: didn't find slice\n"); - bit_offset += 
codesize; - } + return -1; } - void pack_astc_solid_block(void* pDst_block, const color32& color) + int basisu_transcoder::find_slice(const void* pData, uint32_t data_size, uint32_t image_index, uint32_t level_index, bool alpha_data) const { - uint32_t r = color[0], g = color[1], b = color[2]; - uint32_t a = color[3]; + if (!validate_header_quick(pData, data_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::find_slice: header validation failed\n"); + return false; + } - uint32_t* pOutput = static_cast(pDst_block); - uint8_t* pBytes = reinterpret_cast(pDst_block); + const basis_file_header* pHeader = reinterpret_cast(pData); + const uint8_t* pDataU8 = static_cast(pData); + const basis_slice_desc* pSlice_descs = reinterpret_cast(pDataU8 + pHeader->m_slice_desc_file_ofs); - pBytes[0] = 0xfc; pBytes[1] = 0xfd; pBytes[2] = 0xff; pBytes[3] = 0xff; + // For very large basis files this search could be painful + // TODO: Binary search this + for (uint32_t slice_iter = 0; slice_iter < pHeader->m_total_slices; slice_iter++) + { + const basis_slice_desc& slice_desc = pSlice_descs[slice_iter]; + if ((slice_desc.m_image_index == image_index) && (slice_desc.m_level_index == level_index)) + { + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + { + const bool slice_alpha = (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0; + if (slice_alpha == alpha_data) + return slice_iter; + } + else + { + return slice_iter; + } + } + } - pOutput[1] = 0xffffffff; - pOutput[2] = 0; - pOutput[3] = 0; + BASISU_DEVEL_ERROR("basisu_transcoder::find_slice: didn't find slice\n"); - int bit_pos = 64; - astc_set_bits(reinterpret_cast(pDst_block), bit_pos, r | (r << 8), 16); - astc_set_bits(reinterpret_cast(pDst_block), bit_pos, g | (g << 8), 16); - astc_set_bits(reinterpret_cast(pDst_block), bit_pos, b | (b << 8), 16); - astc_set_bits(reinterpret_cast(pDst_block), bit_pos, a | (a << 8), 16); + return -1; } - // See 23.21 
https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_partition_pattern_generation -#ifdef _DEBUG - static inline uint32_t astc_hash52(uint32_t v) + void basisu_transcoder::write_opaque_alpha_blocks( + uint32_t num_blocks_x, uint32_t num_blocks_y, + void* pOutput_blocks, block_format fmt, + uint32_t block_stride_in_bytes, uint32_t output_row_pitch_in_blocks_or_pixels) { - uint32_t p = v; - p ^= p >> 15; p -= p << 17; p += p << 7; p += p << 4; - p ^= p >> 5; p += p << 16; p ^= p >> 7; p ^= p >> 3; - p ^= p << 6; p ^= p >> 17; - return p; - } + // 'num_blocks_y', 'pOutput_blocks' & 'block_stride_in_bytes' unused + // when disabling BASISD_SUPPORT_ETC2_EAC_A8 *and* BASISD_SUPPORT_DXT5A + BASISU_NOTE_UNUSED(num_blocks_y); + BASISU_NOTE_UNUSED(pOutput_blocks); + BASISU_NOTE_UNUSED(block_stride_in_bytes); - int astc_compute_texel_partition(int seed, int x, int y, int z, int partitioncount, bool small_block) - { - if (small_block) + if (!output_row_pitch_in_blocks_or_pixels) + output_row_pitch_in_blocks_or_pixels = num_blocks_x; + + if ((fmt == block_format::cETC2_EAC_A8) || (fmt == block_format::cETC2_EAC_R11)) { - x <<= 1; y <<= 1; z <<= 1; - } - seed += (partitioncount - 1) * 1024; - uint32_t rnum = astc_hash52(seed); - uint8_t seed1 = rnum & 0xF; - uint8_t seed2 = (rnum >> 4) & 0xF; - uint8_t seed3 = (rnum >> 8) & 0xF; - uint8_t seed4 = (rnum >> 12) & 0xF; - uint8_t seed5 = (rnum >> 16) & 0xF; - uint8_t seed6 = (rnum >> 20) & 0xF; - uint8_t seed7 = (rnum >> 24) & 0xF; - uint8_t seed8 = (rnum >> 28) & 0xF; - uint8_t seed9 = (rnum >> 18) & 0xF; - uint8_t seed10 = (rnum >> 22) & 0xF; - uint8_t seed11 = (rnum >> 26) & 0xF; - uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF; +#if BASISD_SUPPORT_ETC2_EAC_A8 + eac_block blk; + blk.m_base = 255; + blk.m_multiplier = 1; + blk.m_table = 13; - seed1 *= seed1; seed2 *= seed2; - seed3 *= seed3; seed4 *= seed4; - seed5 *= seed5; seed6 *= seed6; - seed7 *= seed7; seed8 *= seed8; - seed9 *= seed9; 
seed10 *= seed10; - seed11 *= seed11; seed12 *= seed12; + // Selectors are all 4's + memcpy(&blk.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); - int sh1, sh2, sh3; - if (seed & 1) - { - sh1 = (seed & 2 ? 4 : 5); sh2 = (partitioncount == 3 ? 6 : 5); + for (uint32_t y = 0; y < num_blocks_y; y++) + { + uint32_t dst_ofs = y * output_row_pitch_in_blocks_or_pixels * block_stride_in_bytes; + for (uint32_t x = 0; x < num_blocks_x; x++) + { + memcpy((uint8_t*)pOutput_blocks + dst_ofs, &blk, sizeof(blk)); + dst_ofs += block_stride_in_bytes; + } + } +#endif } - else + else if (fmt == block_format::cBC4) { - sh1 = (partitioncount == 3 ? 6 : 5); sh2 = (seed & 2 ? 4 : 5); - } - sh3 = (seed & 0x10) ? sh1 : sh2; - - seed1 >>= sh1; seed2 >>= sh2; seed3 >>= sh1; seed4 >>= sh2; - seed5 >>= sh1; seed6 >>= sh2; seed7 >>= sh1; seed8 >>= sh2; - seed9 >>= sh3; seed10 >>= sh3; seed11 >>= sh3; seed12 >>= sh3; - - int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); - int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); - int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); - int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); - - a &= 0x3F; b &= 0x3F; c &= 0x3F; d &= 0x3F; - - if (partitioncount < 4) d = 0; - if (partitioncount < 3) c = 0; +#if BASISD_SUPPORT_DXT5A + dxt5a_block blk; + blk.m_endpoints[0] = 255; + blk.m_endpoints[1] = 255; + memset(blk.m_selectors, 0, sizeof(blk.m_selectors)); - if (a >= b && a >= c && a >= d) - return 0; - else if (b >= c && b >= d) - return 1; - else if (c >= d) - return 2; - else - return 3; - } + for (uint32_t y = 0; y < num_blocks_y; y++) + { + uint32_t dst_ofs = y * output_row_pitch_in_blocks_or_pixels * block_stride_in_bytes; + for (uint32_t x = 0; x < num_blocks_x; x++) + { + memcpy((uint8_t*)pOutput_blocks + dst_ofs, &blk, sizeof(blk)); + dst_ofs += block_stride_in_bytes; + } + } #endif + } + } - static const uint8_t g_astc_quint_encode[125] = + bool basisu_transcoder::transcode_image_level( + const void* pData, 
uint32_t data_size, + uint32_t image_index, uint32_t level_index, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + transcoder_texture_format fmt, + uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, basisu_transcoder_state *pState, uint32_t output_rows_in_pixels) const { - 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, 25, 26, 27, 28, 5, 13, 21, 29, 6, 32, 33, 34, 35, 36, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 56, 57, - 58, 59, 60, 37, 45, 53, 61, 14, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 88, 89, 90, 91, 92, 69, 77, 85, 93, 22, 96, 97, 98, 99, 100, 104, - 105, 106, 107, 108, 112, 113, 114, 115, 116, 120, 121, 122, 123, 124, 101, 109, 117, 125, 30, 102, 103, 70, 71, 38, 110, 111, 78, 79, 46, 118, 119, 86, 87, 54, - 126, 127, 94, 95, 62, 39, 47, 55, 63, 31 - }; + const uint32_t bytes_per_block_or_pixel = basis_get_bytes_per_block_or_pixel(fmt); - // Encodes 3 values to output, usable for any range that uses quints and bits - static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, int& bit_pos, int n) - { - // First extract the quints and the bits from the 3 input values - int quints = 0, bits[3]; - const uint32_t bit_mask = (1 << n) - 1; - for (int i = 0; i < 3; i++) + if (!m_ready_to_transcode) { - static const int s_muls[3] = { 1, 5, 25 }; + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: must call start_transcoding() first\n"); + return false; + } - const int t = pValues[i] >> n; + //const bool transcode_alpha_data_to_opaque_formats = (decode_flags & cDecodeFlagsTranscodeAlphaDataToOpaqueFormats) != 0; - quints += t * s_muls[i]; - bits[i] = pValues[i] & bit_mask; + if (decode_flags & cDecodeFlagsPVRTCDecodeToNextPow2) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: cDecodeFlagsPVRTCDecodeToNextPow2 currently unsupported\n"); + // TODO: Not yet supported + return false; } - // Encode the quints, by inverting the bit 
manipulations done by the decoder, converting 3 quints into 7-bits. - // See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding - - assert(quints < 125); - const int T = g_astc_quint_encode[quints]; + if (!validate_header_quick(pData, data_size)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: header validation failed\n"); + return false; + } - // Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96. - astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) | - (bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3); - } + const basis_file_header* pHeader = reinterpret_cast(pData); - // Packs values using ASTC's BISE to output buffer. - static void astc_pack_bise(uint32_t* pDst, const uint8_t* pSrc_vals, int bit_pos, int num_vals, int range) - { - uint32_t temp[5] = { 0, 0, 0, 0, 0 }; + const uint8_t* pDataU8 = static_cast(pData); - const int num_bits = g_astc_bise_range_table[range][0]; + const basis_slice_desc* pSlice_descs = reinterpret_cast(pDataU8 + pHeader->m_slice_desc_file_ofs); - int group_size = 0; - if (g_astc_bise_range_table[range][1]) - group_size = 5; - else if (g_astc_bise_range_table[range][2]) - group_size = 3; + const bool basis_file_has_alpha_slices = (pHeader->m_flags & cBASISHeaderFlagHasAlphaSlices) != 0; - if (group_size) + int slice_index = find_first_slice_index(pData, data_size, image_index, level_index); + if (slice_index < 0) { - // Range has trits or quints - pack each group of 5 or 3 values - const int total_groups = (group_size == 5) ? 
((num_vals + 4) / 5) : ((num_vals + 2) / 3); + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: failed finding slice index\n"); + // Unable to find the requested image/level + return false; + } - for (int group_index = 0; group_index < total_groups; group_index++) + if ((fmt == transcoder_texture_format::cTFPVRTC1_4_RGBA) && (!basis_file_has_alpha_slices)) + { + // Switch to PVRTC1 RGB if the input doesn't have alpha. + fmt = transcoder_texture_format::cTFPVRTC1_4_RGB; + } + + if (pHeader->m_tex_format == (int)basis_tex_format::cETC1S) + { + if (pSlice_descs[slice_index].m_flags & cSliceDescFlagsHasAlpha) { - uint8_t vals[5] = { 0, 0, 0, 0, 0 }; - - const int limit = basisu::minimum(group_size, num_vals - group_index * group_size); - for (int i = 0; i < limit; i++) - vals[i] = pSrc_vals[group_index * group_size + i]; + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has out of order alpha slice\n"); - if (group_size == 5) - astc_encode_trits(temp, vals, bit_pos, num_bits); - else - astc_encode_quints(temp, vals, bit_pos, num_bits); + // The first slice shouldn't have alpha data in a properly formed basis file + return false; } - } - else - { - for (int i = 0; i < num_vals; i++) - astc_set_bits_1_to_9(temp, bit_pos, pSrc_vals[i], num_bits); - } - pDst[0] |= temp[0]; pDst[1] |= temp[1]; - pDst[2] |= temp[2]; pDst[3] |= temp[3]; - } - - const uint32_t ASTC_BLOCK_MODE_BITS = 11; - const uint32_t ASTC_PART_BITS = 2; - const uint32_t ASTC_CEM_BITS = 4; - const uint32_t ASTC_PARTITION_INDEX_BITS = 10; - const uint32_t ASTC_CCS_BITS = 2; - - const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 }; - - bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode) - { - assert(uastc_mode < TOTAL_UASTC_MODES); - uint8_t* pDst_bytes = reinterpret_cast(pDst); - - const int 
total_weights = pBlock->m_dual_plane ? 32 : 16; - - // Set mode bits - see Table 146-147 - uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode]; - pDst_bytes[0] = (uint8_t)mode; - pDst_bytes[1] = (uint8_t)(mode >> 8); - - memset(pDst_bytes + 2, 0, 16 - 2); - - int bit_pos = ASTC_BLOCK_MODE_BITS; - - // We only support 1-5 bit weight indices - assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]); - const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0]; - - // See table 143 - PART - astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS); + if (basis_file_has_alpha_slices) + { + // The alpha data should immediately follow the color data, and have the same resolution. + if ((slice_index + 1U) >= pHeader->m_total_slices) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has missing alpha slice\n"); + // basis file is missing the alpha slice + return false; + } - if (pBlock->m_subsets == 1) - astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS); - else - { - // See table 145 - astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS); + // Basic sanity checks + if ((pSlice_descs[slice_index + 1].m_flags & cSliceDescFlagsHasAlpha) == 0) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file has missing alpha slice (flag check)\n"); + // This slice should have alpha data + return false; + } - // Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM - astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2); + if ((pSlice_descs[slice_index].m_num_blocks_x != pSlice_descs[slice_index + 1].m_num_blocks_x) || (pSlice_descs[slice_index].m_num_blocks_y != pSlice_descs[slice_index + 1].m_num_blocks_y)) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: alpha basis file slice dimensions bad\n"); + // Alpha 
slice should have been the same res as the color slice + return false; + } + } } + + bool status = false; - if (pBlock->m_dual_plane) + if ((pHeader->m_tex_format == (int)basis_tex_format::cETC1S) || (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4)) { - const int total_weight_bits = total_weights * bits_per_weight; - - // See Illegal Encodings 23.24 - // https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings - assert((total_weight_bits >= 24) && (total_weight_bits <= 96)); - - int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS; - astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS); - } - - const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets; - assert(num_cem_pairs <= 9); - - astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]); + // Only do this on 4x4 LDR formats that supports transcoding to PVRTC1. + const uint32_t total_slice_blocks = pSlice_descs[slice_index].m_num_blocks_x * pSlice_descs[slice_index].m_num_blocks_y; - // Write the weight bits in reverse bit order. - switch (bits_per_weight) - { - case 1: - { - const uint32_t N = 1; - for (int i = 0; i < total_weights; i++) + if (((fmt == transcoder_texture_format::cTFPVRTC1_4_RGB) || (fmt == transcoder_texture_format::cTFPVRTC1_4_RGBA)) && (output_blocks_buf_size_in_blocks_or_pixels > total_slice_blocks)) { - const uint32_t ofs = 128 - N - i; - assert((ofs >> 3) < 16); - pDst_bytes[ofs >> 3] |= (pBlock->m_weights[i] << (ofs & 7)); + // The transcoder doesn't write beyond total_slice_blocks, so we need to clear the rest ourselves. + // For GL usage, PVRTC1 4bpp image size is (max(width, 8)* max(height, 8) * 4 + 7) / 8. + // However, for KTX and internally in Basis this formula isn't used, it's just ((width+3)/4) * ((height+3)/4) * bytes_per_block_or_pixel. This is all the transcoder actually writes to memory. 
+ memset(static_cast(pOutput_blocks) + total_slice_blocks * bytes_per_block_or_pixel, 0, (output_blocks_buf_size_in_blocks_or_pixels - total_slice_blocks) * bytes_per_block_or_pixel); } - break; } - case 2: + + if (pHeader->m_tex_format == (int)basis_tex_format::cASTC_HDR_6x6) { - const uint32_t N = 2; - for (int i = 0; i < total_weights; i++) - { - static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 }; - const uint32_t ofs = 128 - N - (i * N); - assert((ofs >> 3) < 16); - pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlock->m_weights[i]] << (ofs & 7)); - } - break; + const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; + + // Use the container independent image transcode method. + status = m_lowlevel_astc_6x6_hdr_decoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, + pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, + decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); } - case 3: + else if (pHeader->m_tex_format == (int)basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE) { - const uint32_t N = 3; - for (int i = 0; i < total_weights; i++) - { - static const uint8_t s_reverse_bits3[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; - const uint32_t ofs = 128 - N - (i * N); - const uint32_t rev = s_reverse_bits3[pBlock->m_weights[i]] << (ofs & 7); + // Use the container independent image transcode method. 
+ status = m_lowlevel_astc_6x6_hdr_intermediate_decoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, + pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, + decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + } + else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC_HDR_4x4) + { + const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; - uint32_t index = ofs >> 3; - assert(index < 16); - pDst_bytes[index++] |= rev & 0xFF; - if (index < 16) - pDst_bytes[index++] |= (rev >> 8); - } - break; + // Use the container independent image transcode method. + status = m_lowlevel_uastc_4x4_hdr_decoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, + pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, + decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); } - case 4: + else if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) { - const uint32_t N = 4; - for (int i = 0; i < total_weights; i++) - { - static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; - const int ofs = 128 - N - (i * N); - assert(ofs >= 0 && (ofs >> 3) < 16); - pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlock->m_weights[i]] << (ofs & 7)); - } - break; + const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; + + // Use the container independent image transcode method. 
+ status = m_lowlevel_uastc_decoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, + pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, + decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); } - case 5: + else { - const uint32_t N = 5; - for (int i = 0; i < total_weights; i++) - { - static const uint8_t s_reverse_bits5[32] = { 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31 }; + // ETC1S + const basis_slice_desc* pSlice_desc = &pSlice_descs[slice_index]; + const basis_slice_desc* pAlpha_slice_desc = basis_file_has_alpha_slices ? &pSlice_descs[slice_index + 1] : nullptr; - const uint32_t ofs = 128 - N - (i * N); - const uint32_t rev = s_reverse_bits5[pBlock->m_weights[i]] << (ofs & 7); + assert((pSlice_desc->m_flags & cSliceDescFlagsHasAlpha) == 0); - uint32_t index = ofs >> 3; - assert(index < 16); - pDst_bytes[index++] |= rev & 0xFF; - if (index < 16) - pDst_bytes[index++] |= (rev >> 8); + if (pAlpha_slice_desc) + { + // Basic sanity checks + assert((pAlpha_slice_desc->m_flags & cSliceDescFlagsHasAlpha) != 0); + assert(pSlice_desc->m_num_blocks_x == pAlpha_slice_desc->m_num_blocks_x); + assert(pSlice_desc->m_num_blocks_y == pAlpha_slice_desc->m_num_blocks_y); + assert(pSlice_desc->m_level_index == pAlpha_slice_desc->m_level_index); } - break; + // Use the container independent image transcode method. 
+ status = m_lowlevel_etc1s_decoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t *)pData, data_size, pSlice_desc->m_num_blocks_x, pSlice_desc->m_num_blocks_y, pSlice_desc->m_orig_width, pSlice_desc->m_orig_height, pSlice_desc->m_level_index, + pSlice_desc->m_file_ofs, pSlice_desc->m_file_size, + (pAlpha_slice_desc != nullptr) ? (uint32_t)pAlpha_slice_desc->m_file_ofs : 0U, (pAlpha_slice_desc != nullptr) ? (uint32_t)pAlpha_slice_desc->m_file_size : 0U, + decode_flags, basis_file_has_alpha_slices, pHeader->m_tex_type == cBASISTexTypeVideoFrames, output_row_pitch_in_blocks_or_pixels, pState, output_rows_in_pixels); + + } // if (pHeader->m_tex_format == (int)basis_tex_format::cUASTC4x4) + + if (!status) + { + BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: Returning false\n"); + } + else + { + //BASISU_DEVEL_ERROR("basisu_transcoder::transcode_image_level: Returning true\n"); } + + return status; + } + + uint32_t basis_get_bytes_per_block_or_pixel(transcoder_texture_format fmt) + { + switch (fmt) + { + case transcoder_texture_format::cTFETC1_RGB: + case transcoder_texture_format::cTFBC1_RGB: + case transcoder_texture_format::cTFBC4_R: + case transcoder_texture_format::cTFPVRTC1_4_RGB: + case transcoder_texture_format::cTFPVRTC1_4_RGBA: + case transcoder_texture_format::cTFATC_RGB: + case transcoder_texture_format::cTFPVRTC2_4_RGB: + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + case transcoder_texture_format::cTFETC2_EAC_R11: + return 8; + case transcoder_texture_format::cTFBC7_RGBA: + case transcoder_texture_format::cTFBC7_ALT: + case transcoder_texture_format::cTFBC6H: + case transcoder_texture_format::cTFETC2_RGBA: + case transcoder_texture_format::cTFBC3_RGBA: + case transcoder_texture_format::cTFBC5_RG: + case transcoder_texture_format::cTFASTC_4x4_RGBA: + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: + case 
transcoder_texture_format::cTFATC_RGBA: + case transcoder_texture_format::cTFFXT1_RGB: + case transcoder_texture_format::cTFETC2_EAC_RG11: + return 16; + case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGB_9E5: + return sizeof(uint32_t); + case transcoder_texture_format::cTFRGB565: + case transcoder_texture_format::cTFBGR565: + case transcoder_texture_format::cTFRGBA4444: + return sizeof(uint16_t); + case transcoder_texture_format::cTFRGB_HALF: + return sizeof(half_float) * 3; + case transcoder_texture_format::cTFRGBA_HALF: + return sizeof(half_float) * 4; default: assert(0); + BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); break; } - - return true; + return 0; } - const uint8_t* get_anchor_indices(uint32_t subsets, uint32_t mode, uint32_t common_pattern, const uint8_t*& pPartition_pattern) + const char* basis_get_format_name(transcoder_texture_format fmt) { - const uint8_t* pSubset_anchor_indices = g_zero_pattern; - pPartition_pattern = g_zero_pattern; - - if (subsets >= 2) + switch (fmt) { - if (subsets == 3) - { - pPartition_pattern = &g_astc_bc7_patterns3[common_pattern][0]; - pSubset_anchor_indices = &g_astc_bc7_pattern3_anchors[common_pattern][0]; - } - else if (mode == 7) - { - pPartition_pattern = &g_bc7_3_astc2_patterns2[common_pattern][0]; - pSubset_anchor_indices = &g_bc7_3_astc2_patterns2_anchors[common_pattern][0]; - } - else - { - pPartition_pattern = &g_astc_bc7_patterns2[common_pattern][0]; - pSubset_anchor_indices = &g_astc_bc7_pattern2_anchors[common_pattern][0]; - } + case transcoder_texture_format::cTFETC1_RGB: return "ETC1_RGB"; + case transcoder_texture_format::cTFBC1_RGB: return "BC1_RGB"; + case transcoder_texture_format::cTFBC4_R: return "BC4_R"; + case transcoder_texture_format::cTFPVRTC1_4_RGB: return "PVRTC1_4_RGB"; + case transcoder_texture_format::cTFPVRTC1_4_RGBA: return "PVRTC1_4_RGBA"; + case transcoder_texture_format::cTFBC7_RGBA: return "BC7_RGBA"; + case 
transcoder_texture_format::cTFBC7_ALT: return "BC7_RGBA"; + case transcoder_texture_format::cTFETC2_RGBA: return "ETC2_RGBA"; + case transcoder_texture_format::cTFBC3_RGBA: return "BC3_RGBA"; + case transcoder_texture_format::cTFBC5_RG: return "BC5_RG"; + case transcoder_texture_format::cTFASTC_4x4_RGBA: return "ASTC_RGBA"; + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: return "ASTC_HDR_4X4_RGBA"; + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: return "ASTC_HDR_6X6_RGBA"; + case transcoder_texture_format::cTFATC_RGB: return "ATC_RGB"; + case transcoder_texture_format::cTFATC_RGBA: return "ATC_RGBA"; + case transcoder_texture_format::cTFRGBA32: return "RGBA32"; + case transcoder_texture_format::cTFRGB565: return "RGB565"; + case transcoder_texture_format::cTFBGR565: return "BGR565"; + case transcoder_texture_format::cTFRGBA4444: return "RGBA4444"; + case transcoder_texture_format::cTFRGBA_HALF: return "RGBA_HALF"; + case transcoder_texture_format::cTFRGB_9E5: return "RGB_9E5"; + case transcoder_texture_format::cTFRGB_HALF: return "RGB_HALF"; + case transcoder_texture_format::cTFFXT1_RGB: return "FXT1_RGB"; + case transcoder_texture_format::cTFPVRTC2_4_RGB: return "PVRTC2_4_RGB"; + case transcoder_texture_format::cTFPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; + case transcoder_texture_format::cTFETC2_EAC_R11: return "ETC2_EAC_R11"; + case transcoder_texture_format::cTFETC2_EAC_RG11: return "ETC2_EAC_RG11"; + case transcoder_texture_format::cTFBC6H: return "BC6H"; + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); + break; } - - return pSubset_anchor_indices; + return ""; } - static inline uint32_t read_bit(const uint8_t* pBuf, uint32_t& bit_offset) + const char* basis_get_block_format_name(block_format fmt) { - uint32_t byte_bits = pBuf[bit_offset >> 3] >> (bit_offset & 7); - bit_offset += 1; - return byte_bits & 1; + switch (fmt) + { + case block_format::cETC1: return "ETC1"; + case block_format::cBC1: return 
"BC1"; + case block_format::cPVRTC1_4_RGB: return "PVRTC1_4_RGB"; + case block_format::cPVRTC1_4_RGBA: return "PVRTC1_4_RGBA"; + case block_format::cBC7: return "BC7"; + case block_format::cETC2_RGBA: return "ETC2_RGBA"; + case block_format::cBC3: return "BC3"; + case block_format::cASTC_4x4: return "ASTC_4x4"; + case block_format::cATC_RGB: return "ATC_RGB"; + case block_format::cRGBA32: return "RGBA32"; + case block_format::cRGB565: return "RGB565"; + case block_format::cBGR565: return "BGR565"; + case block_format::cRGBA4444: return "RGBA4444"; + case block_format::cRGBA_HALF: return "RGBA_HALF"; + case block_format::cRGB_HALF: return "RGB_HALF"; + case block_format::cRGB_9E5: return "RGB_9E5"; + case block_format::cUASTC_4x4: return "UASTC_4x4"; + case block_format::cUASTC_HDR_4x4: return "UASTC_HDR_4x4"; + case block_format::cBC6H: return "BC6H"; + case block_format::cASTC_HDR_4x4: return "ASTC_HDR_4x4"; + case block_format::cASTC_HDR_6x6: return "ASTC_HDR_6x6"; + case block_format::cFXT1_RGB: return "FXT1_RGB"; + case block_format::cPVRTC2_4_RGB: return "PVRTC2_4_RGB"; + case block_format::cPVRTC2_4_RGBA: return "PVRTC2_4_RGBA"; + case block_format::cETC2_EAC_R11: return "ETC2_EAC_R11"; + case block_format::cETC2_EAC_RG11: return "ETC2_EAC_RG11"; + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); + break; + } + return ""; } - static inline uint32_t read_bits1_to_9(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + const char* basis_get_texture_type_name(basis_texture_type tex_type) { - assert(codesize <= 9); - if (!codesize) - return 0; - - if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS) || (bit_offset >= 112)) + switch (tex_type) { - const uint8_t* pBytes = &pBuf[bit_offset >> 3U]; - - uint32_t byte_bit_offset = bit_offset & 7U; - - uint32_t bits = pBytes[0] >> byte_bit_offset; - uint32_t bits_read = basisu::minimum(codesize, 8 - byte_bit_offset); - - uint32_t bits_remaining = 
codesize - bits_read; - if (bits_remaining) - bits |= ((uint32_t)pBytes[1]) << bits_read; - - bit_offset += codesize; - - return bits & ((1U << codesize) - 1U); + case cBASISTexType2D: return "2D"; + case cBASISTexType2DArray: return "2D array"; + case cBASISTexTypeCubemapArray: return "cubemap array"; + case cBASISTexTypeVideoFrames: return "video"; + case cBASISTexTypeVolume: return "3D"; + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_texture_type_name: Invalid tex_type\n"); + break; } - - uint32_t byte_bit_offset = bit_offset & 7U; - const uint16_t w = *(const uint16_t *)(&pBuf[bit_offset >> 3U]); - bit_offset += codesize; - return (w >> byte_bit_offset) & ((1U << codesize) - 1U); + return ""; } - inline uint64_t read_bits64(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + bool basis_transcoder_format_has_alpha(transcoder_texture_format fmt) { - assert(codesize <= 64U); - uint64_t bits = 0; - uint32_t total_bits = 0; - - while (total_bits < codesize) + // TODO: Technically ASTC HDR does support alpha, but our ASTC HDR encoders don't yet support it. Unsure what to do here. 
+ switch (fmt) { - uint32_t byte_bit_offset = bit_offset & 7U; - uint32_t bits_to_read = basisu::minimum(codesize - total_bits, 8U - byte_bit_offset); - - uint32_t byte_bits = pBuf[bit_offset >> 3U] >> byte_bit_offset; - byte_bits &= ((1U << bits_to_read) - 1U); - - bits |= ((uint64_t)(byte_bits) << total_bits); - - total_bits += bits_to_read; - bit_offset += bits_to_read; + case transcoder_texture_format::cTFETC2_RGBA: + case transcoder_texture_format::cTFBC3_RGBA: + case transcoder_texture_format::cTFASTC_4x4_RGBA: + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: // technically this ASTC HDR format supports alpha, but we currently don't exploit that in our encoders + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: // technically this ASTC HDR format supports alpha, but we currently don't exploit that in our encoders + case transcoder_texture_format::cTFBC7_RGBA: + case transcoder_texture_format::cTFBC7_ALT: + case transcoder_texture_format::cTFPVRTC1_4_RGBA: + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + case transcoder_texture_format::cTFATC_RGBA: + case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGBA4444: + case transcoder_texture_format::cTFRGBA_HALF: + return true; + default: + break; } - - return bits; + return false; } - static inline uint32_t read_bits1_to_9_fst(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + bool basis_transcoder_format_is_hdr(transcoder_texture_format fmt) { - assert(codesize <= 9); - if (!codesize) - return 0; - assert(bit_offset < 112); - - if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS)) + switch (fmt) { - const uint8_t* pBytes = &pBuf[bit_offset >> 3U]; - - uint32_t byte_bit_offset = bit_offset & 7U; - - uint32_t bits = pBytes[0] >> byte_bit_offset; - uint32_t bits_read = basisu::minimum(codesize, 8 - byte_bit_offset); - - uint32_t bits_remaining = codesize - bits_read; - if (bits_remaining) - bits |= ((uint32_t)pBytes[1]) << bits_read; - - 
bit_offset += codesize; - - return bits & ((1U << codesize) - 1U); + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: + case transcoder_texture_format::cTFBC6H: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGB_9E5: + return true; + default: + break; } - else + return false; + } + + basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt) + { + switch (fmt) { - uint32_t byte_bit_offset = bit_offset & 7U; - const uint16_t w = *(const uint16_t*)(&pBuf[bit_offset >> 3U]); - bit_offset += codesize; - return (w >> byte_bit_offset) & ((1U << codesize) - 1U); + case transcoder_texture_format::cTFETC1_RGB: return basisu::texture_format::cETC1; + case transcoder_texture_format::cTFBC1_RGB: return basisu::texture_format::cBC1; + case transcoder_texture_format::cTFBC4_R: return basisu::texture_format::cBC4; + case transcoder_texture_format::cTFPVRTC1_4_RGB: return basisu::texture_format::cPVRTC1_4_RGB; + case transcoder_texture_format::cTFPVRTC1_4_RGBA: return basisu::texture_format::cPVRTC1_4_RGBA; + case transcoder_texture_format::cTFBC7_RGBA: return basisu::texture_format::cBC7; + case transcoder_texture_format::cTFBC7_ALT: return basisu::texture_format::cBC7; + case transcoder_texture_format::cTFETC2_RGBA: return basisu::texture_format::cETC2_RGBA; + case transcoder_texture_format::cTFBC3_RGBA: return basisu::texture_format::cBC3; + case transcoder_texture_format::cTFBC5_RG: return basisu::texture_format::cBC5; + case transcoder_texture_format::cTFASTC_4x4_RGBA: return basisu::texture_format::cASTC_LDR_4x4; + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: return basisu::texture_format::cASTC_HDR_4x4; + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: return basisu::texture_format::cASTC_HDR_6x6; + case transcoder_texture_format::cTFBC6H: return basisu::texture_format::cBC6HUnsigned; + 
case transcoder_texture_format::cTFATC_RGB: return basisu::texture_format::cATC_RGB; + case transcoder_texture_format::cTFATC_RGBA: return basisu::texture_format::cATC_RGBA_INTERPOLATED_ALPHA; + case transcoder_texture_format::cTFRGBA32: return basisu::texture_format::cRGBA32; + case transcoder_texture_format::cTFRGB565: return basisu::texture_format::cRGB565; + case transcoder_texture_format::cTFBGR565: return basisu::texture_format::cBGR565; + case transcoder_texture_format::cTFRGBA4444: return basisu::texture_format::cRGBA4444; + case transcoder_texture_format::cTFRGBA_HALF: return basisu::texture_format::cRGBA_HALF; + case transcoder_texture_format::cTFRGB_9E5: return basisu::texture_format::cRGB_9E5; + case transcoder_texture_format::cTFRGB_HALF: return basisu::texture_format::cRGB_HALF; + case transcoder_texture_format::cTFFXT1_RGB: return basisu::texture_format::cFXT1_RGB; + case transcoder_texture_format::cTFPVRTC2_4_RGB: return basisu::texture_format::cPVRTC2_4_RGBA; + case transcoder_texture_format::cTFPVRTC2_4_RGBA: return basisu::texture_format::cPVRTC2_4_RGBA; + case transcoder_texture_format::cTFETC2_EAC_R11: return basisu::texture_format::cETC2_R11_EAC; + case transcoder_texture_format::cTFETC2_EAC_RG11: return basisu::texture_format::cETC2_RG11_EAC; + default: + assert(0); + BASISU_DEVEL_ERROR("basis_get_basisu_texture_format: Invalid fmt\n"); + break; } + return basisu::texture_format::cInvalidTextureFormat; } - bool unpack_uastc(const uastc_block& blk, unpacked_uastc_block& unpacked, bool blue_contract_check, bool read_hints) + bool basis_transcoder_format_is_uncompressed(transcoder_texture_format tex_type) { - //memset(&unpacked, 0, sizeof(unpacked)); - -#if 0 - uint8_t table[128]; - memset(table, 0xFF, sizeof(table)); - + switch (tex_type) { - for (uint32_t mode = 0; mode <= TOTAL_UASTC_MODES; mode++) - { - const uint32_t code = g_uastc_mode_huff_codes[mode][0]; - const uint32_t codesize = g_uastc_mode_huff_codes[mode][1]; - - table[code] = 
mode; - - uint32_t bits_left = 7 - codesize; - for (uint32_t i = 0; i < (1 << bits_left); i++) - table[code | (i << codesize)] = mode; - } - - for (uint32_t i = 0; i < 128; i++) - printf("%u,", table[i]); - exit(0); + case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGB565: + case transcoder_texture_format::cTFBGR565: + case transcoder_texture_format::cTFRGBA4444: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_9E5: + return true; + default: + break; } -#endif - - const int mode = g_uastc_huff_modes[blk.m_bytes[0] & 127]; - if (mode >= (int)TOTAL_UASTC_MODES) - return false; - - unpacked.m_mode = mode; - unpacked.m_common_pattern = 0; - - uint32_t bit_ofs = g_uastc_mode_huff_codes[mode][1]; + return false; + } - if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + bool basis_block_format_is_uncompressed(block_format blk_fmt) + { + switch (blk_fmt) { - unpacked.m_solid_color.r = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); - unpacked.m_solid_color.g = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); - unpacked.m_solid_color.b = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); - unpacked.m_solid_color.a = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); - - if (read_hints) - { - unpacked.m_etc1_flip = false; - unpacked.m_etc1_diff = read_bit(blk.m_bytes, bit_ofs) != 0; - unpacked.m_etc1_inten0 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); - unpacked.m_etc1_inten1 = 0; - unpacked.m_etc1_selector = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 2); - unpacked.m_etc1_r = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); - unpacked.m_etc1_g = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); - unpacked.m_etc1_b = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); - unpacked.m_etc1_bias = 0; - unpacked.m_etc2_hints = 0; - } - + case block_format::cRGB32: + case block_format::cRGBA32: + case 
block_format::cA32: + case block_format::cRGB565: + case block_format::cBGR565: + case block_format::cRGBA4444: + case block_format::cRGBA4444_COLOR: + case block_format::cRGBA4444_ALPHA: + case block_format::cRGBA4444_COLOR_OPAQUE: + case block_format::cRGBA_HALF: + case block_format::cRGB_HALF: + case block_format::cRGB_9E5: return true; + default: + break; } - - if (read_hints) - { - if (g_uastc_mode_has_bc1_hint0[mode]) - unpacked.m_bc1_hint0 = read_bit(blk.m_bytes, bit_ofs) != 0; - else - unpacked.m_bc1_hint0 = false; - - if (g_uastc_mode_has_bc1_hint1[mode]) - unpacked.m_bc1_hint1 = read_bit(blk.m_bytes, bit_ofs) != 0; - else - unpacked.m_bc1_hint1 = false; - - unpacked.m_etc1_flip = read_bit(blk.m_bytes, bit_ofs) != 0; - unpacked.m_etc1_diff = read_bit(blk.m_bytes, bit_ofs) != 0; - unpacked.m_etc1_inten0 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); - unpacked.m_etc1_inten1 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); - - if (g_uastc_mode_has_etc1_bias[mode]) - unpacked.m_etc1_bias = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); - else - unpacked.m_etc1_bias = 0; - - if (g_uastc_mode_has_alpha[mode]) - { - unpacked.m_etc2_hints = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); - //assert(unpacked.m_etc2_hints > 0); - } - else - unpacked.m_etc2_hints = 0; - } - else - bit_ofs += g_uastc_mode_total_hint_bits[mode]; - - uint32_t subsets = 1; - switch (mode) + return false; + } + + uint32_t basis_get_uncompressed_bytes_per_pixel(transcoder_texture_format fmt) + { + switch (fmt) { - case 2: - case 4: - case 7: - case 9: - case 16: - unpacked.m_common_pattern = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); - subsets = 2; - break; - case 3: - unpacked.m_common_pattern = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 4); - subsets = 3; - break; + case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGB_9E5: + return sizeof(uint32_t); + case transcoder_texture_format::cTFRGB565: + 
case transcoder_texture_format::cTFBGR565: + case transcoder_texture_format::cTFRGBA4444: + return sizeof(uint16_t); + case transcoder_texture_format::cTFRGB_HALF: + return sizeof(half_float) * 3; + case transcoder_texture_format::cTFRGBA_HALF: + return sizeof(half_float) * 4; default: break; } - - uint32_t part_seed = 0; - switch (mode) + return 0; + } + + uint32_t basis_get_block_width(transcoder_texture_format tex_type) + { + switch (tex_type) { - case 2: - case 4: - case 9: - case 16: - if (unpacked.m_common_pattern >= TOTAL_ASTC_BC7_COMMON_PARTITIONS2) - return false; + case transcoder_texture_format::cTFFXT1_RGB: + return 8; + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: + return 6; + default: + break; + } + return 4; + } - part_seed = g_astc_bc7_common_partitions2[unpacked.m_common_pattern].m_astc; + uint32_t basis_get_block_height(transcoder_texture_format tex_type) + { + switch (tex_type) + { + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: + return 6; + default: break; - case 3: - if (unpacked.m_common_pattern >= TOTAL_ASTC_BC7_COMMON_PARTITIONS3) - return false; + } + return 4; + } - part_seed = g_astc_bc7_common_partitions3[unpacked.m_common_pattern].m_astc; + uint32_t basis_tex_format_get_block_width(basis_tex_format fmt) + { + switch (fmt) + { + case basis_tex_format::cASTC_HDR_6x6: + case basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE: + return 6; + default: break; - case 7: - if (unpacked.m_common_pattern >= TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS) - return false; + } + return 4; + } - part_seed = g_bc7_3_astc2_common_partitions[unpacked.m_common_pattern].m_astc2; - break; + uint32_t basis_tex_format_get_block_height(basis_tex_format fmt) + { + switch (fmt) + { + case basis_tex_format::cASTC_HDR_6x6: + case basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE: + return 6; default: break; } + return 4; + } - uint32_t total_planes = 1; - switch (mode) + bool basis_tex_format_is_hdr(basis_tex_format fmt) + { + switch (fmt) { - case 6: - case 11: - 
case 13: - unpacked.m_astc.m_ccs = (int)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 2); - total_planes = 2; - break; - case 17: - unpacked.m_astc.m_ccs = 3; - total_planes = 2; - break; + case basis_tex_format::cUASTC_HDR_4x4: + case basis_tex_format::cASTC_HDR_6x6: + case basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE: + return true; default: break; } - - unpacked.m_astc.m_dual_plane = (total_planes == 2); - - unpacked.m_astc.m_subsets = subsets; - unpacked.m_astc.m_partition_seed = part_seed; - - const uint32_t total_comps = g_uastc_mode_comps[mode]; - - const uint32_t weight_bits = g_uastc_mode_weight_bits[mode]; - - unpacked.m_astc.m_weight_range = g_uastc_mode_weight_ranges[mode]; - - const uint32_t total_values = total_comps * 2 * subsets; - const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; - - const uint32_t cem = g_uastc_mode_cem[mode]; - unpacked.m_astc.m_cem = cem; - - const uint32_t ep_bits = g_astc_bise_range_table[endpoint_range][0]; - const uint32_t ep_trits = g_astc_bise_range_table[endpoint_range][1]; - const uint32_t ep_quints = g_astc_bise_range_table[endpoint_range][2]; - - uint32_t total_tqs = 0; - uint32_t bundle_size = 0, mul = 0; - if (ep_trits) + return false; + } + + bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt) + { + if ((fmt == basis_tex_format::cASTC_HDR_6x6) || (fmt == basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE)) { - total_tqs = (total_values + 4) / 5; - bundle_size = 5; - mul = 3; + // RDO UASTC HDR 6x6, or our custom intermediate format +#if BASISD_SUPPORT_UASTC_HDR + switch (tex_type) + { + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: + case transcoder_texture_format::cTFBC6H: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGB_9E5: + return true; + default: + break; + } +#endif } - else if (ep_quints) + else if (fmt == basis_tex_format::cUASTC_HDR_4x4) { - total_tqs = (total_values + 
2) / 3; - bundle_size = 3; - mul = 5; + // UASTC HDR 4x4 +#if BASISD_SUPPORT_UASTC_HDR + switch (tex_type) + { + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: + case transcoder_texture_format::cTFBC6H: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGB_9E5: + return true; + default: + break; + } +#endif } - - uint32_t tq_values[8]; - for (uint32_t i = 0; i < total_tqs; i++) + else if (fmt == basis_tex_format::cUASTC4x4) { - uint32_t num_bits = ep_trits ? 8 : 7; - if (i == (total_tqs - 1)) + // UASTC LDR 4x4 +#if BASISD_SUPPORT_UASTC + switch (tex_type) { - uint32_t num_remaining = total_values - (total_tqs - 1) * bundle_size; - if (ep_trits) + // These niche formats aren't currently supported for UASTC - everything else is. + case transcoder_texture_format::cTFPVRTC2_4_RGB: + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + case transcoder_texture_format::cTFATC_RGB: + case transcoder_texture_format::cTFATC_RGBA: + case transcoder_texture_format::cTFFXT1_RGB: + // UASTC LDR doesn't support transcoding to HDR formats + case transcoder_texture_format::cTFASTC_HDR_4x4_RGBA: + case transcoder_texture_format::cTFASTC_HDR_6x6_RGBA: + case transcoder_texture_format::cTFBC6H: + case transcoder_texture_format::cTFRGBA_HALF: + case transcoder_texture_format::cTFRGB_HALF: + case transcoder_texture_format::cTFRGB_9E5: + return false; + default: + return true; + } +#endif + } + else + { + // ETC1S + switch (tex_type) + { + // ETC1 and uncompressed are always supported. 
+ case transcoder_texture_format::cTFETC1_RGB: + case transcoder_texture_format::cTFRGBA32: + case transcoder_texture_format::cTFRGB565: + case transcoder_texture_format::cTFBGR565: + case transcoder_texture_format::cTFRGBA4444: + return true; +#if BASISD_SUPPORT_DXT1 + case transcoder_texture_format::cTFBC1_RGB: + return true; +#endif +#if BASISD_SUPPORT_DXT5A + case transcoder_texture_format::cTFBC4_R: + case transcoder_texture_format::cTFBC5_RG: + return true; +#endif +#if BASISD_SUPPORT_DXT1 && BASISD_SUPPORT_DXT5A + case transcoder_texture_format::cTFBC3_RGBA: + return true; +#endif +#if BASISD_SUPPORT_PVRTC1 + case transcoder_texture_format::cTFPVRTC1_4_RGB: + case transcoder_texture_format::cTFPVRTC1_4_RGBA: + return true; +#endif +#if BASISD_SUPPORT_BC7_MODE5 + case transcoder_texture_format::cTFBC7_RGBA: + case transcoder_texture_format::cTFBC7_ALT: + return true; +#endif +#if BASISD_SUPPORT_ETC2_EAC_A8 + case transcoder_texture_format::cTFETC2_RGBA: + return true; +#endif +#if BASISD_SUPPORT_ASTC + case transcoder_texture_format::cTFASTC_4x4_RGBA: + return true; +#endif +#if BASISD_SUPPORT_ATC + case transcoder_texture_format::cTFATC_RGB: + case transcoder_texture_format::cTFATC_RGBA: + return true; +#endif +#if BASISD_SUPPORT_FXT1 + case transcoder_texture_format::cTFFXT1_RGB: + return true; +#endif +#if BASISD_SUPPORT_PVRTC2 + case transcoder_texture_format::cTFPVRTC2_4_RGB: + case transcoder_texture_format::cTFPVRTC2_4_RGBA: + return true; +#endif +#if BASISD_SUPPORT_ETC2_EAC_RG11 + case transcoder_texture_format::cTFETC2_EAC_R11: + case transcoder_texture_format::cTFETC2_EAC_RG11: + return true; +#endif + default: + break; + } + } + + return false; + } + + // ------------------------------------------------------------------------------------------------------ + // UASTC LDR 4x4 + // ------------------------------------------------------------------------------------------------------ + +#if BASISD_SUPPORT_UASTC + const astc_bc7_common_partition2_desc 
g_astc_bc7_common_partitions2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2] = + { + { 0, 28, false }, { 1, 20, false }, { 2, 16, true }, { 3, 29, false }, + { 4, 91, true }, { 5, 9, false }, { 6, 107, true }, { 7, 72, true }, + { 8, 149, false }, { 9, 204, true }, { 10, 50, false }, { 11, 114, true }, + { 12, 496, true }, { 13, 17, true }, { 14, 78, false }, { 15, 39, true }, + { 17, 252, true }, { 18, 828, true }, { 19, 43, false }, { 20, 156, false }, + { 21, 116, false }, { 22, 210, true }, { 23, 476, true }, { 24, 273, false }, + { 25, 684, true }, { 26, 359, false }, { 29, 246, true }, { 32, 195, true }, + { 33, 694, true }, { 52, 524, true } + }; + + const bc73_astc2_common_partition_desc g_bc7_3_astc2_common_partitions[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS] = + { + { 10, 36, 4 }, { 11, 48, 4 }, { 0, 61, 3 }, { 2, 137, 4 }, + { 8, 161, 5 }, { 13, 183, 4 }, { 1, 226, 2 }, { 33, 281, 2 }, + { 40, 302, 3 }, { 20, 307, 4 }, { 21, 479, 0 }, { 58, 495, 3 }, + { 3, 593, 0 }, { 32, 594, 2 }, { 59, 605, 1 }, { 34, 799, 3 }, + { 20, 812, 1 }, { 14, 988, 4 }, { 31, 993, 3 } + }; + + const astc_bc7_common_partition3_desc g_astc_bc7_common_partitions3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3] = + { + { 4, 260, 0 }, { 8, 74, 5 }, { 9, 32, 5 }, { 10, 156, 2 }, + { 11, 183, 2 }, { 12, 15, 0 }, { 13, 745, 4 }, { 20, 0, 1 }, + { 35, 335, 1 }, { 36, 902, 5 }, { 57, 254, 0 } + }; + + const uint8_t g_astc_to_bc7_partition_index_perm_tables[6][3] = { { 0, 1, 2 }, { 1, 2, 0 }, { 2, 0, 1 }, { 2, 1, 0 }, { 0, 2, 1 }, { 1, 0, 2 } }; + + const uint8_t g_bc7_to_astc_partition_index_perm_tables[6][3] = { { 0, 1, 2 }, { 2, 0, 1 }, { 1, 2, 0 }, { 2, 1, 0 }, { 0, 2, 1 }, { 1, 0, 2 } }; + + uint32_t bc7_convert_partition_index_3_to_2(uint32_t p, uint32_t k) + { + assert(k < 6); + switch (k >> 1) + { + case 0: + if (p <= 1) + p = 0; + else + p = 1; + break; + case 1: + if (p == 0) + p = 0; + else + p = 1; + break; + case 2: + if ((p == 0) || (p == 2)) + p = 0; + else + p = 1; + break; + } + if (k & 1) + p = 1 - 
p; + return p; + } + + static const uint8_t g_zero_pattern[16] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; + + const uint8_t g_astc_bc7_patterns2[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][16] = + { + { 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1 }, { 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1 }, { 1,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0 }, { 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1 }, + { 1,1,1,1,1,1,1,0,1,1,1,0,1,1,0,0 }, { 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1 }, { 1,1,1,0,1,1,0,0,1,0,0,0,0,0,0,0 }, { 1,1,1,1,1,1,1,0,1,1,0,0,1,0,0,0 }, + { 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1 }, { 1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,0 }, + { 1,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0 }, { 1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0 }, + { 1,0,0,0,1,1,1,0,1,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,0,1,1,1,0,0,0,1 }, { 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0 }, { 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0 }, + { 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0 }, { 1,1,1,1,1,1,1,1,0,1,1,1,0,0,1,1 }, { 1,0,0,0,1,1,0,0,1,1,0,0,1,1,1,0 }, { 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0 }, + { 1,1,1,1,0,1,1,1,0,1,1,1,0,0,1,1 }, { 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0 }, { 1,1,1,1,0,0,0,0,0,0,0,0,1,1,1,1 }, { 1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0 }, + { 1,1,1,1,0,0,0,0,1,1,1,1,0,0,0,0 }, { 1,0,0,1,0,0,1,1,0,1,1,0,1,1,0,0 } + }; + + const uint8_t g_astc_bc7_patterns3[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][16] = + { + { 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2 }, { 1,1,1,1,1,1,1,1,0,0,0,0,2,2,2,2 }, { 1,1,1,1,0,0,0,0,0,0,0,0,2,2,2,2 }, { 1,1,1,1,2,2,2,2,0,0,0,0,0,0,0,0 }, + { 1,1,2,0,1,1,2,0,1,1,2,0,1,1,2,0 }, { 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2 }, { 0,2,1,1,0,2,1,1,0,2,1,1,0,2,1,1 }, { 2,0,0,0,2,0,0,0,2,1,1,1,2,1,1,1 }, + { 2,0,1,2,2,0,1,2,2,0,1,2,2,0,1,2 }, { 1,1,1,1,0,0,0,0,2,2,2,2,1,1,1,1 }, { 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2 } + }; + + const uint8_t g_bc7_3_astc2_patterns2[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][16] = + { + { 0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,1,0,0,0,1,0,0,0,1,0,0,0,1,0 }, { 
1,1,0,0,1,1,0,0,1,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,0,0,1,1,0,0,1,1 }, + { 1,1,1,1,1,1,1,1,0,0,0,0,1,1,1,1 }, { 0,1,0,0,0,1,0,0,0,1,0,0,0,1,0,0 }, { 0,0,0,1,0,0,1,1,1,1,1,1,1,1,1,1 }, { 0,1,1,1,0,0,1,1,0,0,1,1,0,0,1,1 }, + { 1,1,0,0,0,0,0,0,0,0,1,1,1,1,0,0 }, { 0,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,0,1,1,1,0,1,1,1,0 }, { 1,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0 }, + { 0,1,1,1,0,0,1,1,0,0,0,0,0,0,0,0 }, { 0,0,0,0,0,0,0,1,1,1,1,1,1,1,1,1 }, { 1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0 }, { 1,1,0,0,1,1,0,0,1,1,0,0,1,0,0,0 }, + { 1,1,1,1,1,1,1,1,1,0,0,0,1,0,0,0 }, { 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,0 }, { 1,1,1,1,0,1,1,1,0,0,0,0,0,0,0,0 } + }; + + const uint8_t g_astc_bc7_pattern2_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS2][3] = + { + { 0, 2 }, { 0, 3 }, { 1, 0 }, { 0, 3 }, { 7, 0 }, { 0, 2 }, { 3, 0 }, { 7, 0 }, + { 0, 11 }, { 2, 0 }, { 0, 7 }, { 11, 0 }, { 3, 0 }, { 8, 0 }, { 0, 4 }, { 12, 0 }, + { 1, 0 }, { 8, 0 }, { 0, 1 }, { 0, 2 }, { 0, 4 }, { 8, 0 }, { 1, 0 }, { 0, 2 }, + { 4, 0 }, { 0, 1 }, { 4, 0 }, { 1, 0 }, { 4, 0 }, { 1, 0 } + }; + + const uint8_t g_astc_bc7_pattern3_anchors[TOTAL_ASTC_BC7_COMMON_PARTITIONS3][3] = + { + { 0, 8, 10 }, { 8, 0, 12 }, { 4, 0, 12 }, { 8, 0, 4 }, { 3, 0, 2 }, { 0, 1, 3 }, { 0, 2, 1 }, { 1, 9, 0 }, { 1, 2, 0 }, { 4, 0, 8 }, { 0, 6, 2 } + }; + + const uint8_t g_bc7_3_astc2_patterns2_anchors[TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS][3] = + { + { 0, 4 }, { 0, 2 }, { 2, 0 }, { 0, 7 }, { 8, 0 }, { 0, 1 }, { 0, 3 }, { 0, 1 }, { 2, 0 }, { 0, 1 }, { 0, 8 }, { 2, 0 }, { 0, 1 }, { 0, 7 }, { 12, 0 }, { 2, 0 }, { 9, 0 }, { 0, 2 }, { 4, 0 } + }; + + const uint32_t g_uastc_mode_huff_codes[TOTAL_UASTC_MODES + 1][2] = + { + { 0x1, 4 }, + { 0x35, 6 }, + { 0x1D, 5 }, + { 0x3, 5 }, + + { 0x13, 5 }, + { 0xB, 5 }, + { 0x1B, 5 }, + { 0x7, 5 }, + + { 0x17, 5 }, + { 0xF, 5 }, + { 0x2, 3 }, + { 0x0, 2 }, + + { 0x6, 3 }, + { 0x1F, 5 }, + { 0xD, 5 }, + { 0x5, 7 }, + + { 0x15, 6 }, + { 0x25, 6 }, + { 0x9, 4 }, + { 0x45, 7 } // future expansion + }; + + // If 
g_uastc_mode_huff_codes[] changes this table must be updated! + static const uint8_t g_uastc_huff_modes[128] = + { + 11,0,10,3,11,15,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,16,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11,17,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,1,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11, + 19,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,16,12,8,11,18,10,6,11,2,12,13,11,0,10,3,11,17,12,7,11,18,10,5,11,14,12,9,11,0,10,4,11,1,12,8,11,18,10,6,11,2,12,13 + }; + + const uint8_t g_uastc_mode_weight_bits[TOTAL_UASTC_MODES] = { 4, 2, 3, 2, 2, 3, 2, 2, 0, 2, 4, 2, 3, 1, 2, 4, 2, 2, 5 }; + const uint8_t g_uastc_mode_weight_ranges[TOTAL_UASTC_MODES] = { 8, 2, 5, 2, 2, 5, 2, 2, 0, 2, 8, 2, 5, 0, 2, 8, 2, 2, 11 }; + const uint8_t g_uastc_mode_endpoint_ranges[TOTAL_UASTC_MODES] = { 19, 20, 8, 7, 12, 20, 18, 12, 0, 8, 13, 13, 19, 20, 20, 20, 20, 20, 11 }; + const uint8_t g_uastc_mode_subsets[TOTAL_UASTC_MODES] = { 1, 1, 2, 3, 2, 1, 1, 2, 0, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1 }; + const uint8_t g_uastc_mode_planes[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 2, 1, 0, 1, 1, 2, 1, 2, 1, 1, 1, 2, 1 }; + const uint8_t g_uastc_mode_comps[TOTAL_UASTC_MODES] = { 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 2, 2, 2, 3 }; + const uint8_t g_uastc_mode_has_etc1_bias[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1 }; + const uint8_t g_uastc_mode_has_bc1_hint0[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; + const uint8_t g_uastc_mode_has_bc1_hint1[TOTAL_UASTC_MODES] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1 }; + const uint8_t g_uastc_mode_cem[TOTAL_UASTC_MODES] = { 8, 8, 8, 8, 8, 8, 8, 8, 0, 12, 12, 12, 12, 12, 12, 4, 4, 4, 8 }; + const uint8_t g_uastc_mode_has_alpha[TOTAL_UASTC_MODES] = { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0 }; + const uint8_t g_uastc_mode_is_la[TOTAL_UASTC_MODES] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0 }; + const uint8_t 
g_uastc_mode_total_hint_bits[TOTAL_UASTC_MODES] = { 15, 15, 15, 15, 15, 15, 15, 15, 0, 23, 17, 17, 17, 23, 23, 23, 23, 23, 15 }; + + // bits, trits, quints + const int g_astc_bise_range_table[TOTAL_ASTC_RANGES][3] = + { + { 1, 0, 0 }, // 0-1 0 + { 0, 1, 0 }, // 0-2 1 + { 2, 0, 0 }, // 0-3 2 + { 0, 0, 1 }, // 0-4 3 + + { 1, 1, 0 }, // 0-5 4 + { 3, 0, 0 }, // 0-7 5 + { 1, 0, 1 }, // 0-9 6 + { 2, 1, 0 }, // 0-11 7 + + { 4, 0, 0 }, // 0-15 8 + { 2, 0, 1 }, // 0-19 9 + { 3, 1, 0 }, // 0-23 10 + { 5, 0, 0 }, // 0-31 11 + + { 3, 0, 1 }, // 0-39 12 + { 4, 1, 0 }, // 0-47 13 + { 6, 0, 0 }, // 0-63 14 + { 4, 0, 1 }, // 0-79 15 + + { 5, 1, 0 }, // 0-95 16 + { 7, 0, 0 }, // 0-127 17 + { 5, 0, 1 }, // 0-159 18 + { 6, 1, 0 }, // 0-191 19 + + { 8, 0, 0 }, // 0-255 20 + }; + + int astc_get_levels(int range) + { + assert(range < (int)BC7ENC_TOTAL_ASTC_RANGES); + return (1 + 2 * g_astc_bise_range_table[range][1] + 4 * g_astc_bise_range_table[range][2]) << g_astc_bise_range_table[range][0]; + } + + // g_astc_unquant[] is the inverse of g_astc_sorted_order_unquant[] + astc_quant_bin g_astc_unquant[BC7ENC_TOTAL_ASTC_RANGES][256]; // [ASTC encoded endpoint index] + + // Taken right from the ASTC spec. 
+ static struct + { + const char* m_pB_str; + uint32_t m_c; + } g_astc_endpoint_unquant_params[BC7ENC_TOTAL_ASTC_RANGES] = + { + { "", 0 }, + { "", 0 }, + { "", 0 }, + { "", 0 }, + { "000000000", 204, }, // 0-5 + { "", 0 }, + { "000000000", 113, }, // 0-9 + { "b000b0bb0", 93 }, // 0-11 + { "", 0 }, + { "b0000bb00", 54 }, // 0-19 + { "cb000cbcb", 44 }, // 0-23 + { "", 0 }, + { "cb0000cbc", 26 }, // 0-39 + { "dcb000dcb", 22 }, // 0-47 + { "", 0 }, + { "dcb0000dc", 13 }, // 0-79 + { "edcb000ed", 11 }, // 0-95 + { "", 0 }, + { "edcb0000e", 6 }, // 0-159 + { "fedcb000f", 5 }, // 0-191 + { "", 0 }, + }; + + bool astc_is_valid_endpoint_range(uint32_t range) + { + if ((g_astc_bise_range_table[range][1] == 0) && (g_astc_bise_range_table[range][2] == 0)) + return true; + + return g_astc_endpoint_unquant_params[range].m_c != 0; + } + + uint32_t unquant_astc_endpoint(uint32_t packed_bits, uint32_t packed_trits, uint32_t packed_quints, uint32_t range) + { + assert(range < BC7ENC_TOTAL_ASTC_RANGES); + + const uint32_t bits = g_astc_bise_range_table[range][0]; + const uint32_t trits = g_astc_bise_range_table[range][1]; + const uint32_t quints = g_astc_bise_range_table[range][2]; + + uint32_t val = 0; + if ((!trits) && (!quints)) + { + assert(!packed_trits && !packed_quints); + + int bits_left = 8; + while (bits_left > 0) + { + uint32_t v = packed_bits; + + int n = basisu::minimumi(bits_left, bits); + if (n < (int)bits) + v >>= (bits - n); + + assert(v < (1U << n)); + + val |= (v << (bits_left - n)); + bits_left -= n; + } + } + else + { + const uint32_t A = (packed_bits & 1) ? 511 : 0; + const uint32_t C = g_astc_endpoint_unquant_params[range].m_c; + const uint32_t D = trits ? 
packed_trits : packed_quints; + + assert(C); + + uint32_t B = 0; + for (uint32_t i = 0; i < 9; i++) + { + B <<= 1; + + char c = g_astc_endpoint_unquant_params[range].m_pB_str[i]; + if (c != '0') + { + c -= 'a'; + B |= ((packed_bits >> c) & 1); + } + } + + val = D * C + B; + val = val ^ A; + val = (A & 0x80) | (val >> 2); + } + + return val; + } + + uint32_t unquant_astc_endpoint_val(uint32_t packed_val, uint32_t range) + { + assert(range < BC7ENC_TOTAL_ASTC_RANGES); + assert(packed_val < (uint32_t)astc_get_levels(range)); + + const uint32_t bits = g_astc_bise_range_table[range][0]; + const uint32_t trits = g_astc_bise_range_table[range][1]; + const uint32_t quints = g_astc_bise_range_table[range][2]; + + if ((!trits) && (!quints)) + return unquant_astc_endpoint(packed_val, 0, 0, range); + else if (trits) + return unquant_astc_endpoint(packed_val & ((1 << bits) - 1), packed_val >> bits, 0, range); + else + return unquant_astc_endpoint(packed_val & ((1 << bits) - 1), 0, packed_val >> bits, range); + } + + // BC7 - Various BC7 tables/helpers + const uint32_t g_bc7_weights1[2] = { 0, 64 }; + const uint32_t g_bc7_weights2[4] = { 0, 21, 43, 64 }; + const uint32_t g_bc7_weights3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; + const uint32_t g_bc7_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + const uint32_t g_astc_weights4[16] = { 0, 4, 8, 12, 17, 21, 25, 29, 35, 39, 43, 47, 52, 56, 60, 64 }; + const uint32_t g_astc_weights5[32] = { 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64 }; + const uint32_t g_astc_weights_3levels[3] = { 0, 32, 64 }; + + const uint8_t g_bc7_partition1[16] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 }; + + const uint8_t g_bc7_partition2[64 * 16] = + { + 0,0,1,1,0,0,1,1,0,0,1,1,0,0,1,1, 0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,1, 0,1,1,1,0,1,1,1,0,1,1,1,0,1,1,1, 0,0,0,1,0,0,1,1,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,1,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,0,1,1,1,1,1,1,1, 
0,0,0,1,0,0,1,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,0,1,1,0,1,1,1, + 0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,1,0,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,1,0,1,1,1, 0,0,0,1,0,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1, 0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1, 0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1, + 0,0,0,0,1,0,0,0,1,1,1,0,1,1,1,1, 0,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,1,0, 0,1,1,1,0,0,1,1,0,0,0,1,0,0,0,0, 0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0, 0,0,0,0,1,0,0,0,1,1,0,0,1,1,1,0, 0,0,0,0,0,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,1,0,0,1,1,0,0,1,1,0,0,0,1, + 0,0,1,1,0,0,0,1,0,0,0,1,0,0,0,0, 0,0,0,0,1,0,0,0,1,0,0,0,1,1,0,0, 0,1,1,0,0,1,1,0,0,1,1,0,0,1,1,0, 0,0,1,1,0,1,1,0,0,1,1,0,1,1,0,0, 0,0,0,1,0,1,1,1,1,1,1,0,1,0,0,0, 0,0,0,0,1,1,1,1,1,1,1,1,0,0,0,0, 0,1,1,1,0,0,0,1,1,0,0,0,1,1,1,0, 0,0,1,1,1,0,0,1,1,0,0,1,1,1,0,0, + 0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1, 0,0,0,0,1,1,1,1,0,0,0,0,1,1,1,1, 0,1,0,1,1,0,1,0,0,1,0,1,1,0,1,0, 0,0,1,1,0,0,1,1,1,1,0,0,1,1,0,0, 0,0,1,1,1,1,0,0,0,0,1,1,1,1,0,0, 0,1,0,1,0,1,0,1,1,0,1,0,1,0,1,0, 0,1,1,0,1,0,0,1,0,1,1,0,1,0,0,1, 0,1,0,1,1,0,1,0,1,0,1,0,0,1,0,1, + 0,1,1,1,0,0,1,1,1,1,0,0,1,1,1,0, 0,0,0,1,0,0,1,1,1,1,0,0,1,0,0,0, 0,0,1,1,0,0,1,0,0,1,0,0,1,1,0,0, 0,0,1,1,1,0,1,1,1,1,0,1,1,1,0,0, 0,1,1,0,1,0,0,1,1,0,0,1,0,1,1,0, 0,0,1,1,1,1,0,0,1,1,0,0,0,0,1,1, 0,1,1,0,0,1,1,0,1,0,0,1,1,0,0,1, 0,0,0,0,0,1,1,0,0,1,1,0,0,0,0,0, + 0,1,0,0,1,1,1,0,0,1,0,0,0,0,0,0, 0,0,1,0,0,1,1,1,0,0,1,0,0,0,0,0, 0,0,0,0,0,0,1,0,0,1,1,1,0,0,1,0, 0,0,0,0,0,1,0,0,1,1,1,0,0,1,0,0, 0,1,1,0,1,1,0,0,1,0,0,1,0,0,1,1, 0,0,1,1,0,1,1,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,1,0,0,1,1,1,0,0, 0,0,1,1,1,0,0,1,1,1,0,0,0,1,1,0, + 0,1,1,0,1,1,0,0,1,1,0,0,1,0,0,1, 0,1,1,0,0,0,1,1,0,0,1,1,1,0,0,1, 0,1,1,1,1,1,1,0,1,0,0,0,0,0,0,1, 0,0,0,1,1,0,0,0,1,1,1,0,0,1,1,1, 0,0,0,0,1,1,1,1,0,0,1,1,0,0,1,1, 0,0,1,1,0,0,1,1,1,1,1,1,0,0,0,0, 0,0,1,0,0,0,1,0,1,1,1,0,1,1,1,0, 0,1,0,0,0,1,0,0,0,1,1,1,0,1,1,1 + }; + + const uint8_t g_bc7_partition3[64 * 16] = + { + 
0,0,1,1,0,0,1,1,0,2,2,1,2,2,2,2, 0,0,0,1,0,0,1,1,2,2,1,1,2,2,2,1, 0,0,0,0,2,0,0,1,2,2,1,1,2,2,1,1, 0,2,2,2,0,0,2,2,0,0,1,1,0,1,1,1, 0,0,0,0,0,0,0,0,1,1,2,2,1,1,2,2, 0,0,1,1,0,0,1,1,0,0,2,2,0,0,2,2, 0,0,2,2,0,0,2,2,1,1,1,1,1,1,1,1, 0,0,1,1,0,0,1,1,2,2,1,1,2,2,1,1, + 0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,1,1,1,1,2,2,2,2, 0,0,0,0,1,1,1,1,2,2,2,2,2,2,2,2, 0,0,1,2,0,0,1,2,0,0,1,2,0,0,1,2, 0,1,1,2,0,1,1,2,0,1,1,2,0,1,1,2, 0,1,2,2,0,1,2,2,0,1,2,2,0,1,2,2, 0,0,1,1,0,1,1,2,1,1,2,2,1,2,2,2, 0,0,1,1,2,0,0,1,2,2,0,0,2,2,2,0, + 0,0,0,1,0,0,1,1,0,1,1,2,1,1,2,2, 0,1,1,1,0,0,1,1,2,0,0,1,2,2,0,0, 0,0,0,0,1,1,2,2,1,1,2,2,1,1,2,2, 0,0,2,2,0,0,2,2,0,0,2,2,1,1,1,1, 0,1,1,1,0,1,1,1,0,2,2,2,0,2,2,2, 0,0,0,1,0,0,0,1,2,2,2,1,2,2,2,1, 0,0,0,0,0,0,1,1,0,1,2,2,0,1,2,2, 0,0,0,0,1,1,0,0,2,2,1,0,2,2,1,0, + 0,1,2,2,0,1,2,2,0,0,1,1,0,0,0,0, 0,0,1,2,0,0,1,2,1,1,2,2,2,2,2,2, 0,1,1,0,1,2,2,1,1,2,2,1,0,1,1,0, 0,0,0,0,0,1,1,0,1,2,2,1,1,2,2,1, 0,0,2,2,1,1,0,2,1,1,0,2,0,0,2,2, 0,1,1,0,0,1,1,0,2,0,0,2,2,2,2,2, 0,0,1,1,0,1,2,2,0,1,2,2,0,0,1,1, 0,0,0,0,2,0,0,0,2,2,1,1,2,2,2,1, + 0,0,0,0,0,0,0,2,1,1,2,2,1,2,2,2, 0,2,2,2,0,0,2,2,0,0,1,2,0,0,1,1, 0,0,1,1,0,0,1,2,0,0,2,2,0,2,2,2, 0,1,2,0,0,1,2,0,0,1,2,0,0,1,2,0, 0,0,0,0,1,1,1,1,2,2,2,2,0,0,0,0, 0,1,2,0,1,2,0,1,2,0,1,2,0,1,2,0, 0,1,2,0,2,0,1,2,1,2,0,1,0,1,2,0, 0,0,1,1,2,2,0,0,1,1,2,2,0,0,1,1, + 0,0,1,1,1,1,2,2,2,2,0,0,0,0,1,1, 0,1,0,1,0,1,0,1,2,2,2,2,2,2,2,2, 0,0,0,0,0,0,0,0,2,1,2,1,2,1,2,1, 0,0,2,2,1,1,2,2,0,0,2,2,1,1,2,2, 0,0,2,2,0,0,1,1,0,0,2,2,0,0,1,1, 0,2,2,0,1,2,2,1,0,2,2,0,1,2,2,1, 0,1,0,1,2,2,2,2,2,2,2,2,0,1,0,1, 0,0,0,0,2,1,2,1,2,1,2,1,2,1,2,1, + 0,1,0,1,0,1,0,1,0,1,0,1,2,2,2,2, 0,2,2,2,0,1,1,1,0,2,2,2,0,1,1,1, 0,0,0,2,1,1,1,2,0,0,0,2,1,1,1,2, 0,0,0,0,2,1,1,2,2,1,1,2,2,1,1,2, 0,2,2,2,0,1,1,1,0,1,1,1,0,2,2,2, 0,0,0,2,1,1,1,2,1,1,1,2,0,0,0,2, 0,1,1,0,0,1,1,0,0,1,1,0,2,2,2,2, 0,0,0,0,0,0,0,0,2,1,1,2,2,1,1,2, + 0,1,1,0,0,1,1,0,2,2,2,2,2,2,2,2, 0,0,2,2,0,0,1,1,0,0,1,1,0,0,2,2, 0,0,2,2,1,1,2,2,1,1,2,2,0,0,2,2, 0,0,0,0,0,0,0,0,0,0,0,0,2,1,1,2, 
0,0,0,2,0,0,0,1,0,0,0,2,0,0,0,1, 0,2,2,2,1,2,2,2,0,2,2,2,1,2,2,2, 0,1,0,1,2,2,2,2,2,2,2,2,2,2,2,2, 0,1,1,1,2,0,1,1,2,2,0,1,2,2,2,0, + }; + + const uint8_t g_bc7_table_anchor_index_second_subset[64] = { 15,15,15,15,15,15,15,15, 15,15,15,15,15,15,15,15, 15, 2, 8, 2, 2, 8, 8,15, 2, 8, 2, 2, 8, 8, 2, 2, 15,15, 6, 8, 2, 8,15,15, 2, 8, 2, 2, 2,15,15, 6, 6, 2, 6, 8,15,15, 2, 2, 15,15,15,15,15, 2, 2,15 }; + + const uint8_t g_bc7_table_anchor_index_third_subset_1[64] = + { + 3, 3,15,15, 8, 3,15,15, 8, 8, 6, 6, 6, 5, 3, 3, 3, 3, 8,15, 3, 3, 6,10, 5, 8, 8, 6, 8, 5,15,15, 8,15, 3, 5, 6,10, 8,15, 15, 3,15, 5,15,15,15,15, 3,15, 5, 5, 5, 8, 5,10, 5,10, 8,13,15,12, 3, 3 + }; + + const uint8_t g_bc7_table_anchor_index_third_subset_2[64] = + { + 15, 8, 8, 3,15,15, 3, 8, 15,15,15,15,15,15,15, 8, 15, 8,15, 3,15, 8,15, 8, 3,15, 6,10,15,15,10, 8, 15, 3,15,10,10, 8, 9,10, 6,15, 8,15, 3, 6, 6, 8, 15, 3,15,15,15,15,15,15, 15,15,15,15, 3,15,15, 8 + }; + + const uint8_t g_bc7_num_subsets[8] = { 3, 2, 3, 2, 1, 1, 1, 2 }; + const uint8_t g_bc7_partition_bits[8] = { 4, 6, 6, 6, 0, 0, 0, 6 }; + const uint8_t g_bc7_color_index_bitcount[8] = { 3, 3, 2, 2, 2, 2, 4, 2 }; + + const uint8_t g_bc7_mode_has_p_bits[8] = { 1, 1, 0, 1, 0, 0, 1, 1 }; + const uint8_t g_bc7_mode_has_shared_p_bits[8] = { 0, 1, 0, 0, 0, 0, 0, 0 }; + const uint8_t g_bc7_color_precision_table[8] = { 4, 6, 5, 7, 5, 7, 7, 5 }; + const int8_t g_bc7_alpha_precision_table[8] = { 0, 0, 0, 0, 6, 8, 7, 5 }; + + const uint8_t g_bc7_alpha_index_bitcount[8] = { 0, 0, 0, 0, 3, 2, 4, 2 }; + + endpoint_err g_bc7_mode_6_optimal_endpoints[256][2]; // [c][pbit] + endpoint_err g_bc7_mode_5_optimal_endpoints[256]; // [c] + + static inline void bc7_set_block_bits(uint8_t* pBytes, uint32_t val, uint32_t num_bits, uint32_t* pCur_ofs) + { + assert((num_bits <= 32) && (val < (1ULL << num_bits))); + while (num_bits) + { + const uint32_t n = basisu::minimumu(8 - (*pCur_ofs & 7), num_bits); + pBytes[*pCur_ofs >> 3] |= (uint8_t)(val << (*pCur_ofs & 7)); + 
val >>= n; + num_bits -= n; + *pCur_ofs += n; + } + assert(*pCur_ofs <= 128); + } + + // TODO: Optimize this. + void encode_bc7_block(void* pBlock, const bc7_optimization_results* pResults) + { + const uint32_t best_mode = pResults->m_mode; + + const uint32_t total_subsets = g_bc7_num_subsets[best_mode]; + const uint32_t total_partitions = 1 << g_bc7_partition_bits[best_mode]; + //const uint32_t num_rotations = 1 << g_bc7_rotation_bits[best_mode]; + //const uint32_t num_index_selectors = (best_mode == 4) ? 2 : 1; + + const uint8_t* pPartition; + if (total_subsets == 1) + pPartition = &g_bc7_partition1[0]; + else if (total_subsets == 2) + pPartition = &g_bc7_partition2[pResults->m_partition * 16]; + else + pPartition = &g_bc7_partition3[pResults->m_partition * 16]; + + uint8_t color_selectors[16]; + memcpy(color_selectors, pResults->m_selectors, 16); + + uint8_t alpha_selectors[16]; + memcpy(alpha_selectors, pResults->m_alpha_selectors, 16); + + color_quad_u8 low[3], high[3]; + memcpy(low, pResults->m_low, sizeof(low)); + memcpy(high, pResults->m_high, sizeof(high)); + + uint32_t pbits[3][2]; + memcpy(pbits, pResults->m_pbits, sizeof(pbits)); + + int anchor[3] = { -1, -1, -1 }; + + for (uint32_t k = 0; k < total_subsets; k++) + { + uint32_t anchor_index = 0; + if (k) + { + if ((total_subsets == 3) && (k == 1)) + anchor_index = g_bc7_table_anchor_index_third_subset_1[pResults->m_partition]; + else if ((total_subsets == 3) && (k == 2)) + anchor_index = g_bc7_table_anchor_index_third_subset_2[pResults->m_partition]; + else + anchor_index = g_bc7_table_anchor_index_second_subset[pResults->m_partition]; + } + + anchor[k] = anchor_index; + + const uint32_t color_index_bits = get_bc7_color_index_size(best_mode, pResults->m_index_selector); + const uint32_t num_color_indices = 1 << color_index_bits; + + if (color_selectors[anchor_index] & (num_color_indices >> 1)) + { + for (uint32_t i = 0; i < 16; i++) + if (pPartition[i] == k) + color_selectors[i] = 
(uint8_t)((num_color_indices - 1) - color_selectors[i]); + + if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) + { + for (uint32_t q = 0; q < 3; q++) + { + uint8_t t = low[k].m_c[q]; + low[k].m_c[q] = high[k].m_c[q]; + high[k].m_c[q] = t; + } + } + else + { + color_quad_u8 tmp = low[k]; + low[k] = high[k]; + high[k] = tmp; + } + + if (!g_bc7_mode_has_shared_p_bits[best_mode]) + { + uint32_t t = pbits[k][0]; + pbits[k][0] = pbits[k][1]; + pbits[k][1] = t; + } + } + + if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) + { + const uint32_t alpha_index_bits = get_bc7_alpha_index_size(best_mode, pResults->m_index_selector); + const uint32_t num_alpha_indices = 1 << alpha_index_bits; + + if (alpha_selectors[anchor_index] & (num_alpha_indices >> 1)) + { + for (uint32_t i = 0; i < 16; i++) + if (pPartition[i] == k) + alpha_selectors[i] = (uint8_t)((num_alpha_indices - 1) - alpha_selectors[i]); + + uint8_t t = low[k].m_c[3]; + low[k].m_c[3] = high[k].m_c[3]; + high[k].m_c[3] = t; + } + } + } + + uint8_t* pBlock_bytes = (uint8_t*)(pBlock); + memset(pBlock_bytes, 0, BC7ENC_BLOCK_SIZE); + + uint32_t cur_bit_ofs = 0; + bc7_set_block_bits(pBlock_bytes, 1 << best_mode, best_mode + 1, &cur_bit_ofs); + + if ((best_mode == 4) || (best_mode == 5)) + bc7_set_block_bits(pBlock_bytes, pResults->m_rotation, 2, &cur_bit_ofs); + + if (best_mode == 4) + bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector, 1, &cur_bit_ofs); + + if (total_partitions > 1) + bc7_set_block_bits(pBlock_bytes, pResults->m_partition, (total_partitions == 64) ? 6 : 4, &cur_bit_ofs); + + const uint32_t total_comps = (best_mode >= 4) ? 4 : 3; + for (uint32_t comp = 0; comp < total_comps; comp++) + { + for (uint32_t subset = 0; subset < total_subsets; subset++) + { + bc7_set_block_bits(pBlock_bytes, low[subset].m_c[comp], (comp == 3) ? 
g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); + bc7_set_block_bits(pBlock_bytes, high[subset].m_c[comp], (comp == 3) ? g_bc7_alpha_precision_table[best_mode] : g_bc7_color_precision_table[best_mode], &cur_bit_ofs); + } + } + + if (g_bc7_mode_has_p_bits[best_mode]) + { + for (uint32_t subset = 0; subset < total_subsets; subset++) + { + bc7_set_block_bits(pBlock_bytes, pbits[subset][0], 1, &cur_bit_ofs); + if (!g_bc7_mode_has_shared_p_bits[best_mode]) + bc7_set_block_bits(pBlock_bytes, pbits[subset][1], 1, &cur_bit_ofs); + } + } + + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + int idx = x + y * 4; + + uint32_t n = pResults->m_index_selector ? get_bc7_alpha_index_size(best_mode, pResults->m_index_selector) : get_bc7_color_index_size(best_mode, pResults->m_index_selector); + + if ((idx == anchor[0]) || (idx == anchor[1]) || (idx == anchor[2])) + n--; + + bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector ? alpha_selectors[idx] : color_selectors[idx], n, &cur_bit_ofs); + } + } + + if (get_bc7_mode_has_seperate_alpha_selectors(best_mode)) + { + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + int idx = x + y * 4; + + uint32_t n = pResults->m_index_selector ? get_bc7_color_index_size(best_mode, pResults->m_index_selector) : get_bc7_alpha_index_size(best_mode, pResults->m_index_selector); + + if ((idx == anchor[0]) || (idx == anchor[1]) || (idx == anchor[2])) + n--; + + bc7_set_block_bits(pBlock_bytes, pResults->m_index_selector ? 
color_selectors[idx] : alpha_selectors[idx], n, &cur_bit_ofs); + } + } + } + + assert(cur_bit_ofs == 128); + } + + // ASTC + static inline void astc_set_bits_1_to_9(uint32_t* pDst, int& bit_offset, uint32_t code, uint32_t codesize) + { + uint8_t* pBuf = reinterpret_cast(pDst); + + assert(codesize <= 9); + if (codesize) + { + uint32_t byte_bit_offset = bit_offset & 7; + uint32_t val = code << byte_bit_offset; + + uint32_t index = bit_offset >> 3; + pBuf[index] |= (uint8_t)val; + + if (codesize > (8 - byte_bit_offset)) + pBuf[index + 1] |= (uint8_t)(val >> 8); + + bit_offset += codesize; + } + } + + void pack_astc_solid_block(void* pDst_block, const color32& color) + { + uint32_t r = color[0], g = color[1], b = color[2]; + uint32_t a = color[3]; + + uint32_t* pOutput = static_cast(pDst_block); + uint8_t* pBytes = reinterpret_cast(pDst_block); + + pBytes[0] = 0xfc; pBytes[1] = 0xfd; pBytes[2] = 0xff; pBytes[3] = 0xff; + + pOutput[1] = 0xffffffff; + pOutput[2] = 0; + pOutput[3] = 0; + + int bit_pos = 64; + astc_set_bits(reinterpret_cast(pDst_block), bit_pos, r | (r << 8), 16); + astc_set_bits(reinterpret_cast(pDst_block), bit_pos, g | (g << 8), 16); + astc_set_bits(reinterpret_cast(pDst_block), bit_pos, b | (b << 8), 16); + astc_set_bits(reinterpret_cast(pDst_block), bit_pos, a | (a << 8), 16); + } + + // See 23.21 https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_partition_pattern_generation +#ifdef _DEBUG + static inline uint32_t astc_hash52(uint32_t v) + { + uint32_t p = v; + p ^= p >> 15; p -= p << 17; p += p << 7; p += p << 4; + p ^= p >> 5; p += p << 16; p ^= p >> 7; p ^= p >> 3; + p ^= p << 6; p ^= p >> 17; + return p; + } + + int astc_compute_texel_partition(int seed, int x, int y, int z, int partitioncount, bool small_block) + { + if (small_block) + { + x <<= 1; y <<= 1; z <<= 1; + } + seed += (partitioncount - 1) * 1024; + uint32_t rnum = astc_hash52(seed); + uint8_t seed1 = rnum & 0xF; + uint8_t seed2 = (rnum >> 4) & 0xF; + 
uint8_t seed3 = (rnum >> 8) & 0xF; + uint8_t seed4 = (rnum >> 12) & 0xF; + uint8_t seed5 = (rnum >> 16) & 0xF; + uint8_t seed6 = (rnum >> 20) & 0xF; + uint8_t seed7 = (rnum >> 24) & 0xF; + uint8_t seed8 = (rnum >> 28) & 0xF; + uint8_t seed9 = (rnum >> 18) & 0xF; + uint8_t seed10 = (rnum >> 22) & 0xF; + uint8_t seed11 = (rnum >> 26) & 0xF; + uint8_t seed12 = ((rnum >> 30) | (rnum << 2)) & 0xF; + + seed1 *= seed1; seed2 *= seed2; + seed3 *= seed3; seed4 *= seed4; + seed5 *= seed5; seed6 *= seed6; + seed7 *= seed7; seed8 *= seed8; + seed9 *= seed9; seed10 *= seed10; + seed11 *= seed11; seed12 *= seed12; + + int sh1, sh2, sh3; + if (seed & 1) + { + sh1 = (seed & 2 ? 4 : 5); sh2 = (partitioncount == 3 ? 6 : 5); + } + else + { + sh1 = (partitioncount == 3 ? 6 : 5); sh2 = (seed & 2 ? 4 : 5); + } + sh3 = (seed & 0x10) ? sh1 : sh2; + + seed1 >>= sh1; seed2 >>= sh2; seed3 >>= sh1; seed4 >>= sh2; + seed5 >>= sh1; seed6 >>= sh2; seed7 >>= sh1; seed8 >>= sh2; + seed9 >>= sh3; seed10 >>= sh3; seed11 >>= sh3; seed12 >>= sh3; + + int a = seed1 * x + seed2 * y + seed11 * z + (rnum >> 14); + int b = seed3 * x + seed4 * y + seed12 * z + (rnum >> 10); + int c = seed5 * x + seed6 * y + seed9 * z + (rnum >> 6); + int d = seed7 * x + seed8 * y + seed10 * z + (rnum >> 2); + + a &= 0x3F; b &= 0x3F; c &= 0x3F; d &= 0x3F; + + if (partitioncount < 4) d = 0; + if (partitioncount < 3) c = 0; + + if (a >= b && a >= c && a >= d) + return 0; + else if (b >= c && b >= d) + return 1; + else if (c >= d) + return 2; + else + return 3; + } +#endif + + static const uint8_t g_astc_quint_encode[125] = + { + 0, 1, 2, 3, 4, 8, 9, 10, 11, 12, 16, 17, 18, 19, 20, 24, 25, 26, 27, 28, 5, 13, 21, 29, 6, 32, 33, 34, 35, 36, 40, 41, 42, 43, 44, 48, 49, 50, 51, 52, 56, 57, + 58, 59, 60, 37, 45, 53, 61, 14, 64, 65, 66, 67, 68, 72, 73, 74, 75, 76, 80, 81, 82, 83, 84, 88, 89, 90, 91, 92, 69, 77, 85, 93, 22, 96, 97, 98, 99, 100, 104, + 105, 106, 107, 108, 112, 113, 114, 115, 116, 120, 121, 122, 123, 124, 101, 109, 117, 
125, 30, 102, 103, 70, 71, 38, 110, 111, 78, 79, 46, 118, 119, 86, 87, 54, + 126, 127, 94, 95, 62, 39, 47, 55, 63, 31 + }; + + // Encodes 3 values to output, usable for any range that uses quints and bits + static inline void astc_encode_quints(uint32_t* pOutput, const uint8_t* pValues, int& bit_pos, int n) + { + // First extract the quints and the bits from the 3 input values + int quints = 0, bits[3]; + const uint32_t bit_mask = (1 << n) - 1; + for (int i = 0; i < 3; i++) + { + static const int s_muls[3] = { 1, 5, 25 }; + + const int t = pValues[i] >> n; + + quints += t * s_muls[i]; + bits[i] = pValues[i] & bit_mask; + } + + // Encode the quints, by inverting the bit manipulations done by the decoder, converting 3 quints into 7-bits. + // See https://www.khronos.org/registry/DataFormat/specs/1.2/dataformat.1.2.html#astc-integer-sequence-encoding + + assert(quints < 125); + const int T = g_astc_quint_encode[quints]; + + // Now interleave the 7 encoded quint bits with the bits to form the encoded output. See table 95-96. + astc_set_bits(pOutput, bit_pos, bits[0] | (astc_extract_bits(T, 0, 2) << n) | (bits[1] << (3 + n)) | (astc_extract_bits(T, 3, 4) << (3 + n * 2)) | + (bits[2] << (5 + n * 2)) | (astc_extract_bits(T, 5, 6) << (5 + n * 3)), 7 + n * 3); + } + + // Packs values using ASTC's BISE to output buffer. + static void astc_pack_bise(uint32_t* pDst, const uint8_t* pSrc_vals, int bit_pos, int num_vals, int range) + { + uint32_t temp[5] = { 0, 0, 0, 0, 0 }; + + const int num_bits = g_astc_bise_range_table[range][0]; + + int group_size = 0; + if (g_astc_bise_range_table[range][1]) + group_size = 5; + else if (g_astc_bise_range_table[range][2]) + group_size = 3; + + if (group_size) + { + // Range has trits or quints - pack each group of 5 or 3 values + const int total_groups = (group_size == 5) ? 
((num_vals + 4) / 5) : ((num_vals + 2) / 3); + + for (int group_index = 0; group_index < total_groups; group_index++) + { + uint8_t vals[5] = { 0, 0, 0, 0, 0 }; + + const int limit = basisu::minimum(group_size, num_vals - group_index * group_size); + for (int i = 0; i < limit; i++) + vals[i] = pSrc_vals[group_index * group_size + i]; + + if (group_size == 5) + astc_encode_trits(temp, vals, bit_pos, num_bits); + else + astc_encode_quints(temp, vals, bit_pos, num_bits); + } + } + else + { + for (int i = 0; i < num_vals; i++) + astc_set_bits_1_to_9(temp, bit_pos, pSrc_vals[i], num_bits); + } + + pDst[0] |= temp[0]; pDst[1] |= temp[1]; + pDst[2] |= temp[2]; pDst[3] |= temp[3]; + } + + const uint32_t ASTC_BLOCK_MODE_BITS = 11; + const uint32_t ASTC_PART_BITS = 2; + const uint32_t ASTC_CEM_BITS = 4; + const uint32_t ASTC_PARTITION_INDEX_BITS = 10; + const uint32_t ASTC_CCS_BITS = 2; + + const uint32_t g_uastc_mode_astc_block_mode[TOTAL_UASTC_MODES] = { 0x242, 0x42, 0x53, 0x42, 0x42, 0x53, 0x442, 0x42, 0, 0x42, 0x242, 0x442, 0x53, 0x441, 0x42, 0x242, 0x42, 0x442, 0x253 }; + + bool pack_astc_block(uint32_t* pDst, const astc_block_desc* pBlock, uint32_t uastc_mode) + { + assert(uastc_mode < TOTAL_UASTC_MODES); + uint8_t* pDst_bytes = reinterpret_cast(pDst); + + const int total_weights = pBlock->m_dual_plane ? 
32 : 16; + + // Set mode bits - see Table 146-147 + uint32_t mode = g_uastc_mode_astc_block_mode[uastc_mode]; + pDst_bytes[0] = (uint8_t)mode; + pDst_bytes[1] = (uint8_t)(mode >> 8); + + memset(pDst_bytes + 2, 0, 16 - 2); + + int bit_pos = ASTC_BLOCK_MODE_BITS; + + // We only support 1-5 bit weight indices + assert(!g_astc_bise_range_table[pBlock->m_weight_range][1] && !g_astc_bise_range_table[pBlock->m_weight_range][2]); + const int bits_per_weight = g_astc_bise_range_table[pBlock->m_weight_range][0]; + + // See table 143 - PART + astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_subsets - 1, ASTC_PART_BITS); + + if (pBlock->m_subsets == 1) + astc_set_bits_1_to_9(pDst, bit_pos, pBlock->m_cem, ASTC_CEM_BITS); + else + { + // See table 145 + astc_set_bits(pDst, bit_pos, pBlock->m_partition_seed, ASTC_PARTITION_INDEX_BITS); + + // Table 150 - we assume all CEM's are equal, so write 2 0's along with the CEM + astc_set_bits_1_to_9(pDst, bit_pos, (pBlock->m_cem << 2) & 63, ASTC_CEM_BITS + 2); + } + + if (pBlock->m_dual_plane) + { + const int total_weight_bits = total_weights * bits_per_weight; + + // See Illegal Encodings 23.24 + // https://www.khronos.org/registry/DataFormat/specs/1.3/dataformat.1.3.inline.html#_illegal_encodings + assert((total_weight_bits >= 24) && (total_weight_bits <= 96)); + + int ccs_bit_pos = 128 - total_weight_bits - ASTC_CCS_BITS; + astc_set_bits_1_to_9(pDst, ccs_bit_pos, pBlock->m_ccs, ASTC_CCS_BITS); + } + + const int num_cem_pairs = (1 + (pBlock->m_cem >> 2)) * pBlock->m_subsets; + assert(num_cem_pairs <= 9); + + astc_pack_bise(pDst, pBlock->m_endpoints, bit_pos, num_cem_pairs * 2, g_uastc_mode_endpoint_ranges[uastc_mode]); + + // Write the weight bits in reverse bit order. 
+ switch (bits_per_weight) + { + case 1: + { + const uint32_t N = 1; + for (int i = 0; i < total_weights; i++) + { + const uint32_t ofs = 128 - N - i; + assert((ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (pBlock->m_weights[i] << (ofs & 7)); + } + break; + } + case 2: + { + const uint32_t N = 2; + for (int i = 0; i < total_weights; i++) + { + static const uint8_t s_reverse_bits2[4] = { 0, 2, 1, 3 }; + const uint32_t ofs = 128 - N - (i * N); + assert((ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (s_reverse_bits2[pBlock->m_weights[i]] << (ofs & 7)); + } + break; + } + case 3: + { + const uint32_t N = 3; + for (int i = 0; i < total_weights; i++) + { + static const uint8_t s_reverse_bits3[8] = { 0, 4, 2, 6, 1, 5, 3, 7 }; + + const uint32_t ofs = 128 - N - (i * N); + const uint32_t rev = s_reverse_bits3[pBlock->m_weights[i]] << (ofs & 7); + + uint32_t index = ofs >> 3; + assert(index < 16); + pDst_bytes[index++] |= rev & 0xFF; + if (index < 16) + pDst_bytes[index++] |= (rev >> 8); + } + break; + } + case 4: + { + const uint32_t N = 4; + for (int i = 0; i < total_weights; i++) + { + static const uint8_t s_reverse_bits4[16] = { 0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15 }; + const int ofs = 128 - N - (i * N); + assert(ofs >= 0 && (ofs >> 3) < 16); + pDst_bytes[ofs >> 3] |= (s_reverse_bits4[pBlock->m_weights[i]] << (ofs & 7)); + } + break; + } + case 5: + { + const uint32_t N = 5; + for (int i = 0; i < total_weights; i++) + { + static const uint8_t s_reverse_bits5[32] = { 0, 16, 8, 24, 4, 20, 12, 28, 2, 18, 10, 26, 6, 22, 14, 30, 1, 17, 9, 25, 5, 21, 13, 29, 3, 19, 11, 27, 7, 23, 15, 31 }; + + const uint32_t ofs = 128 - N - (i * N); + const uint32_t rev = s_reverse_bits5[pBlock->m_weights[i]] << (ofs & 7); + + uint32_t index = ofs >> 3; + assert(index < 16); + pDst_bytes[index++] |= rev & 0xFF; + if (index < 16) + pDst_bytes[index++] |= (rev >> 8); + } + + break; + } + default: + assert(0); + break; + } + + return true; + } + + const uint8_t* 
get_anchor_indices(uint32_t subsets, uint32_t mode, uint32_t common_pattern, const uint8_t*& pPartition_pattern) + { + const uint8_t* pSubset_anchor_indices = g_zero_pattern; + pPartition_pattern = g_zero_pattern; + + if (subsets >= 2) + { + if (subsets == 3) + { + pPartition_pattern = &g_astc_bc7_patterns3[common_pattern][0]; + pSubset_anchor_indices = &g_astc_bc7_pattern3_anchors[common_pattern][0]; + } + else if (mode == 7) + { + pPartition_pattern = &g_bc7_3_astc2_patterns2[common_pattern][0]; + pSubset_anchor_indices = &g_bc7_3_astc2_patterns2_anchors[common_pattern][0]; + } + else + { + pPartition_pattern = &g_astc_bc7_patterns2[common_pattern][0]; + pSubset_anchor_indices = &g_astc_bc7_pattern2_anchors[common_pattern][0]; + } + } + + return pSubset_anchor_indices; + } + + static inline uint32_t read_bit(const uint8_t* pBuf, uint32_t& bit_offset) + { + uint32_t byte_bits = pBuf[bit_offset >> 3] >> (bit_offset & 7); + bit_offset += 1; + return byte_bits & 1; + } + + static inline uint32_t read_bits1_to_9(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + { + assert(codesize <= 9); + if (!codesize) + return 0; + + if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS) || (bit_offset >= 112)) + { + const uint8_t* pBytes = &pBuf[bit_offset >> 3U]; + + uint32_t byte_bit_offset = bit_offset & 7U; + + uint32_t bits = pBytes[0] >> byte_bit_offset; + uint32_t bits_read = basisu::minimum(codesize, 8 - byte_bit_offset); + + uint32_t bits_remaining = codesize - bits_read; + if (bits_remaining) + bits |= ((uint32_t)pBytes[1]) << bits_read; + + bit_offset += codesize; + + return bits & ((1U << codesize) - 1U); + } + + uint32_t byte_bit_offset = bit_offset & 7U; + const uint16_t w = *(const uint16_t *)(&pBuf[bit_offset >> 3U]); + bit_offset += codesize; + return (w >> byte_bit_offset) & ((1U << codesize) - 1U); + } + + inline uint64_t read_bits64(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + { + assert(codesize <= 64U); + uint64_t 
bits = 0; + uint32_t total_bits = 0; + + while (total_bits < codesize) + { + uint32_t byte_bit_offset = bit_offset & 7U; + uint32_t bits_to_read = basisu::minimum(codesize - total_bits, 8U - byte_bit_offset); + + uint32_t byte_bits = pBuf[bit_offset >> 3U] >> byte_bit_offset; + byte_bits &= ((1U << bits_to_read) - 1U); + + bits |= ((uint64_t)(byte_bits) << total_bits); + + total_bits += bits_to_read; + bit_offset += bits_to_read; + } + + return bits; + } + + static inline uint32_t read_bits1_to_9_fst(const uint8_t* pBuf, uint32_t& bit_offset, uint32_t codesize) + { + assert(codesize <= 9); + if (!codesize) + return 0; + assert(bit_offset < 112); + + if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS)) + { + const uint8_t* pBytes = &pBuf[bit_offset >> 3U]; + + uint32_t byte_bit_offset = bit_offset & 7U; + + uint32_t bits = pBytes[0] >> byte_bit_offset; + uint32_t bits_read = basisu::minimum(codesize, 8 - byte_bit_offset); + + uint32_t bits_remaining = codesize - bits_read; + if (bits_remaining) + bits |= ((uint32_t)pBytes[1]) << bits_read; + + bit_offset += codesize; + + return bits & ((1U << codesize) - 1U); + } + else + { + uint32_t byte_bit_offset = bit_offset & 7U; + const uint16_t w = *(const uint16_t*)(&pBuf[bit_offset >> 3U]); + bit_offset += codesize; + return (w >> byte_bit_offset) & ((1U << codesize) - 1U); + } + } + + bool unpack_uastc(const uastc_block& blk, unpacked_uastc_block& unpacked, bool blue_contract_check, bool read_hints) + { + //memset(&unpacked, 0, sizeof(unpacked)); + +#if 0 + uint8_t table[128]; + memset(table, 0xFF, sizeof(table)); + + { + for (uint32_t mode = 0; mode <= TOTAL_UASTC_MODES; mode++) + { + const uint32_t code = g_uastc_mode_huff_codes[mode][0]; + const uint32_t codesize = g_uastc_mode_huff_codes[mode][1]; + + table[code] = mode; + + uint32_t bits_left = 7 - codesize; + for (uint32_t i = 0; i < (1 << bits_left); i++) + table[code | (i << codesize)] = mode; + } + + for (uint32_t i = 0; i < 128; i++) + printf("%u,", 
table[i]); + exit(0); + } +#endif + + const int mode = g_uastc_huff_modes[blk.m_bytes[0] & 127]; + if (mode >= (int)TOTAL_UASTC_MODES) + return false; + + unpacked.m_mode = mode; + unpacked.m_common_pattern = 0; + + uint32_t bit_ofs = g_uastc_mode_huff_codes[mode][1]; + + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + unpacked.m_solid_color.r = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + unpacked.m_solid_color.g = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + unpacked.m_solid_color.b = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + unpacked.m_solid_color.a = (uint8_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + + if (read_hints) + { + unpacked.m_etc1_flip = false; + unpacked.m_etc1_diff = read_bit(blk.m_bytes, bit_ofs) != 0; + unpacked.m_etc1_inten0 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); + unpacked.m_etc1_inten1 = 0; + unpacked.m_etc1_selector = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 2); + unpacked.m_etc1_r = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + unpacked.m_etc1_g = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + unpacked.m_etc1_b = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + unpacked.m_etc1_bias = 0; + unpacked.m_etc2_hints = 0; + } + + return true; + } + + if (read_hints) + { + if (g_uastc_mode_has_bc1_hint0[mode]) + unpacked.m_bc1_hint0 = read_bit(blk.m_bytes, bit_ofs) != 0; + else + unpacked.m_bc1_hint0 = false; + + if (g_uastc_mode_has_bc1_hint1[mode]) + unpacked.m_bc1_hint1 = read_bit(blk.m_bytes, bit_ofs) != 0; + else + unpacked.m_bc1_hint1 = false; + + unpacked.m_etc1_flip = read_bit(blk.m_bytes, bit_ofs) != 0; + unpacked.m_etc1_diff = read_bit(blk.m_bytes, bit_ofs) != 0; + unpacked.m_etc1_inten0 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); + unpacked.m_etc1_inten1 = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 3); + + if (g_uastc_mode_has_etc1_bias[mode]) + unpacked.m_etc1_bias = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, 
bit_ofs, 5); + else + unpacked.m_etc1_bias = 0; + + if (g_uastc_mode_has_alpha[mode]) + { + unpacked.m_etc2_hints = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 8); + //assert(unpacked.m_etc2_hints > 0); + } + else + unpacked.m_etc2_hints = 0; + } + else + bit_ofs += g_uastc_mode_total_hint_bits[mode]; + + uint32_t subsets = 1; + switch (mode) + { + case 2: + case 4: + case 7: + case 9: + case 16: + unpacked.m_common_pattern = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 5); + subsets = 2; + break; + case 3: + unpacked.m_common_pattern = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 4); + subsets = 3; + break; + default: + break; + } + + uint32_t part_seed = 0; + switch (mode) + { + case 2: + case 4: + case 9: + case 16: + if (unpacked.m_common_pattern >= TOTAL_ASTC_BC7_COMMON_PARTITIONS2) + return false; + + part_seed = g_astc_bc7_common_partitions2[unpacked.m_common_pattern].m_astc; + break; + case 3: + if (unpacked.m_common_pattern >= TOTAL_ASTC_BC7_COMMON_PARTITIONS3) + return false; + + part_seed = g_astc_bc7_common_partitions3[unpacked.m_common_pattern].m_astc; + break; + case 7: + if (unpacked.m_common_pattern >= TOTAL_BC7_3_ASTC2_COMMON_PARTITIONS) + return false; + + part_seed = g_bc7_3_astc2_common_partitions[unpacked.m_common_pattern].m_astc2; + break; + default: + break; + } + + uint32_t total_planes = 1; + switch (mode) + { + case 6: + case 11: + case 13: + unpacked.m_astc.m_ccs = (int)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, 2); + total_planes = 2; + break; + case 17: + unpacked.m_astc.m_ccs = 3; + total_planes = 2; + break; + default: + break; + } + + unpacked.m_astc.m_dual_plane = (total_planes == 2); + + unpacked.m_astc.m_subsets = subsets; + unpacked.m_astc.m_partition_seed = part_seed; + + const uint32_t total_comps = g_uastc_mode_comps[mode]; + + const uint32_t weight_bits = g_uastc_mode_weight_bits[mode]; + + unpacked.m_astc.m_weight_range = g_uastc_mode_weight_ranges[mode]; + + const uint32_t total_values = total_comps * 2 
* subsets; + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; + + const uint32_t cem = g_uastc_mode_cem[mode]; + unpacked.m_astc.m_cem = cem; + + const uint32_t ep_bits = g_astc_bise_range_table[endpoint_range][0]; + const uint32_t ep_trits = g_astc_bise_range_table[endpoint_range][1]; + const uint32_t ep_quints = g_astc_bise_range_table[endpoint_range][2]; + + uint32_t total_tqs = 0; + uint32_t bundle_size = 0, mul = 0; + if (ep_trits) + { + total_tqs = (total_values + 4) / 5; + bundle_size = 5; + mul = 3; + } + else if (ep_quints) + { + total_tqs = (total_values + 2) / 3; + bundle_size = 3; + mul = 5; + } + + uint32_t tq_values[8]; + for (uint32_t i = 0; i < total_tqs; i++) + { + uint32_t num_bits = ep_trits ? 8 : 7; + if (i == (total_tqs - 1)) + { + uint32_t num_remaining = total_values - (total_tqs - 1) * bundle_size; + if (ep_trits) + { + switch (num_remaining) + { + case 1: num_bits = 2; break; + case 2: num_bits = 4; break; + case 3: num_bits = 5; break; + case 4: num_bits = 7; break; + default: break; + } + } + else if (ep_quints) + { + switch (num_remaining) + { + case 1: num_bits = 3; break; + case 2: num_bits = 5; break; + default: break; + } + } + } + + tq_values[i] = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, num_bits); + } // i + + uint32_t accum = 0; + uint32_t accum_remaining = 0; + uint32_t next_tq_index = 0; + + for (uint32_t i = 0; i < total_values; i++) + { + uint32_t value = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, ep_bits); + + if (total_tqs) + { + if (!accum_remaining) + { + assert(next_tq_index < total_tqs); + accum = tq_values[next_tq_index++]; + accum_remaining = bundle_size; + } + + // TODO: Optimize with tables + uint32_t v = accum % mul; + accum /= mul; + accum_remaining--; + + value |= (v << ep_bits); + } + + unpacked.m_astc.m_endpoints[i] = (uint8_t)value; + } + + const uint8_t* pPartition_pattern; + const uint8_t* pSubset_anchor_indices = get_anchor_indices(subsets, mode, 
unpacked.m_common_pattern, pPartition_pattern); + +#ifdef _DEBUG + for (uint32_t i = 0; i < 16; i++) + assert(pPartition_pattern[i] == astc_compute_texel_partition(part_seed, i & 3, i >> 2, 0, subsets, true)); + + for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) + { + uint32_t anchor_index = 0; + + for (uint32_t i = 0; i < 16; i++) + { + if (pPartition_pattern[i] == subset_index) + { + anchor_index = i; + break; + } + } + + assert(pSubset_anchor_indices[subset_index] == anchor_index); + } +#endif + +#if 0 + const uint32_t total_planes_shift = total_planes - 1; + for (uint32_t i = 0; i < 16 * total_planes; i++) + { + uint32_t num_bits = weight_bits; + for (uint32_t s = 0; s < subsets; s++) + { + if (pSubset_anchor_indices[s] == (i >> total_planes_shift)) + { + num_bits--; + break; + } + } + + unpacked.m_astc.m_weights[i] = (uint8_t)read_bits1_to_9(blk.m_bytes, bit_ofs, num_bits); + } +#endif + + if (mode == 18) + { + // Mode 18 is the only mode with more than 64 weight bits. + for (uint32_t i = 0; i < 16; i++) + unpacked.m_astc.m_weights[i] = (uint8_t)read_bits1_to_9(blk.m_bytes, bit_ofs, i ? weight_bits : (weight_bits - 1)); + } + else + { + // All other modes have <= 64 weight bits. + uint64_t bits; + + // Read the weight bits + if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS)) + bits = read_bits64(blk.m_bytes, bit_ofs, basisu::minimum(64, 128 - (int)bit_ofs)); + else + { + bits = blk.m_dwords[2]; + bits |= (((uint64_t)blk.m_dwords[3]) << 32U); + + if (bit_ofs >= 64U) + bits >>= (bit_ofs - 64U); + else + { + assert(bit_ofs >= 56U); + + uint32_t bits_needed = 64U - bit_ofs; + bits <<= bits_needed; + bits |= (blk.m_bytes[7] >> (8U - bits_needed)); + } + } + + bit_ofs = 0; + + const uint32_t mask = (1U << weight_bits) - 1U; + const uint32_t anchor_mask = (1U << (weight_bits - 1U)) - 1U; + + if (total_planes == 2) + { + // Dual plane modes always have a single subset, and the first 2 weights are anchors. 
+ + unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); + bit_ofs += (weight_bits - 1); + + unpacked.m_astc.m_weights[1] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); + bit_ofs += (weight_bits - 1); + + for (uint32_t i = 2; i < 32; i++) + { + unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); + bit_ofs += weight_bits; + } + } + else + { + if (subsets == 1) + { + // Specialize the single subset case. + if (weight_bits == 4) + { + assert(bit_ofs == 0); + + // Specialize the most common case: 4-bit weights. + unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits) & 7); + unpacked.m_astc.m_weights[1] = (uint8_t)((uint32_t)(bits >> 3) & 15); + unpacked.m_astc.m_weights[2] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 1)) & 15); + unpacked.m_astc.m_weights[3] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 2)) & 15); + + unpacked.m_astc.m_weights[4] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 3)) & 15); + unpacked.m_astc.m_weights[5] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 4)) & 15); + unpacked.m_astc.m_weights[6] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 5)) & 15); + unpacked.m_astc.m_weights[7] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 6)) & 15); + + unpacked.m_astc.m_weights[8] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 7)) & 15); + unpacked.m_astc.m_weights[9] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 8)) & 15); + unpacked.m_astc.m_weights[10] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 9)) & 15); + unpacked.m_astc.m_weights[11] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 10)) & 15); + + unpacked.m_astc.m_weights[12] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 11)) & 15); + unpacked.m_astc.m_weights[13] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 12)) & 15); + unpacked.m_astc.m_weights[14] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 13)) & 15); + unpacked.m_astc.m_weights[15] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 14)) & 15); + } + else + { + // First weight is always an anchor. 
+ unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); + bit_ofs += (weight_bits - 1); + + for (uint32_t i = 1; i < 16; i++) + { + unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); + bit_ofs += weight_bits; + } + } + } + else + { + const uint32_t a0 = pSubset_anchor_indices[0], a1 = pSubset_anchor_indices[1], a2 = pSubset_anchor_indices[2]; + + for (uint32_t i = 0; i < 16; i++) + { + if ((i == a0) || (i == a1) || (i == a2)) + { + unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); + bit_ofs += (weight_bits - 1); + } + else + { + unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); + bit_ofs += weight_bits; + } + } + } + } + } + + if ((blue_contract_check) && (total_comps >= 3)) + { + // We only need to disable ASTC Blue Contraction when we'll be packing to ASTC. The other transcoders don't care. + bool invert_subset[3] = { false, false, false }; + bool any_flag = false; + + for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) + { + const int s0 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 0]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 2]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 4]].m_unquant; + + const int s1 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 1]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 3]].m_unquant + + g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 5]].m_unquant; + + if (s1 < s0) + { + for (uint32_t c = 0; c < total_comps; c++) + std::swap(unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 0], unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 1]); + + 
invert_subset[subset_index] = true; + any_flag = true; + } + } + + if (any_flag) + { + const uint32_t weight_mask = (1 << weight_bits) - 1; + + for (uint32_t i = 0; i < 16; i++) + { + uint32_t subset = pPartition_pattern[i]; + + if (invert_subset[subset]) + { + unpacked.m_astc.m_weights[i * total_planes] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes]); + + if (total_planes == 2) + unpacked.m_astc.m_weights[i * total_planes + 1] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes + 1]); + } + } + } + } + + return true; + } + + static const uint32_t* g_astc_weight_tables[6] = { nullptr, g_bc7_weights1, g_bc7_weights2, g_bc7_weights3, g_astc_weights4, g_astc_weights5 }; + + bool unpack_uastc(uint32_t mode, uint32_t common_pattern, const color32& solid_color, const astc_block_desc& astc, color32* pPixels, bool srgb) + { + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + for (uint32_t i = 0; i < 16; i++) + pPixels[i] = solid_color; + return true; + } + + color32 endpoints[3][2]; + + const uint32_t total_subsets = g_uastc_mode_subsets[mode]; + const uint32_t total_comps = basisu::minimum(4U, g_uastc_mode_comps[mode]); + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; + const uint32_t total_planes = g_uastc_mode_planes[mode]; + const uint32_t weight_bits = g_uastc_mode_weight_bits[mode]; + const uint32_t weight_levels = 1 << weight_bits; + + for (uint32_t subset_index = 0; subset_index < total_subsets; subset_index++) + { + if (total_comps == 2) + { + const uint32_t ll = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 0 * 2 + 0]].m_unquant; + const uint32_t lh = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 0 * 2 + 1]].m_unquant; + + const uint32_t al = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 1 * 2 + 0]].m_unquant; + const uint32_t ah = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * 
total_comps * 2 + 1 * 2 + 1]].m_unquant; + + endpoints[subset_index][0].set_noclamp_rgba(ll, ll, ll, al); + endpoints[subset_index][1].set_noclamp_rgba(lh, lh, lh, ah); + } + else + { + for (uint32_t comp_index = 0; comp_index < total_comps; comp_index++) + { + endpoints[subset_index][0][comp_index] = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + comp_index * 2 + 0]].m_unquant; + endpoints[subset_index][1][comp_index] = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + comp_index * 2 + 1]].m_unquant; + } + for (uint32_t comp_index = total_comps; comp_index < 4; comp_index++) + { + endpoints[subset_index][0][comp_index] = 255; + endpoints[subset_index][1][comp_index] = 255; + } + } + } + + color32 block_colors[3][32]; + + const uint32_t* pWeights = g_astc_weight_tables[weight_bits]; + + for (uint32_t subset_index = 0; subset_index < total_subsets; subset_index++) + { + for (uint32_t l = 0; l < weight_levels; l++) + { + if (total_comps == 2) + { + const uint8_t lc = (uint8_t)astc_interpolate(endpoints[subset_index][0][0], endpoints[subset_index][1][0], pWeights[l], srgb); + const uint8_t ac = (uint8_t)astc_interpolate(endpoints[subset_index][0][3], endpoints[subset_index][1][3], pWeights[l], srgb); + + block_colors[subset_index][l].set(lc, lc, lc, ac); + } + else + { + uint32_t comp_index; + for (comp_index = 0; comp_index < total_comps; comp_index++) + block_colors[subset_index][l][comp_index] = (uint8_t)astc_interpolate(endpoints[subset_index][0][comp_index], endpoints[subset_index][1][comp_index], pWeights[l], srgb); + + for (; comp_index < 4; comp_index++) + block_colors[subset_index][l][comp_index] = 255; + } + } + } + + const uint8_t* pPartition_pattern = g_zero_pattern; + + if (total_subsets >= 2) + { + if (total_subsets == 3) + pPartition_pattern = &g_astc_bc7_patterns3[common_pattern][0]; + else if (mode == 7) + pPartition_pattern = &g_bc7_3_astc2_patterns2[common_pattern][0]; + else + 
pPartition_pattern = &g_astc_bc7_patterns2[common_pattern][0]; + +#ifdef _DEBUG + for (uint32_t i = 0; i < 16; i++) + { + assert(pPartition_pattern[i] == (uint8_t)astc_compute_texel_partition(astc.m_partition_seed, i & 3, i >> 2, 0, total_subsets, true)); + } +#endif + } + + if (total_planes == 1) + { + if (total_subsets == 1) + { + for (uint32_t i = 0; i < 16; i++) + { + assert(astc.m_weights[i] < weight_levels); + pPixels[i] = block_colors[0][astc.m_weights[i]]; + } + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + assert(astc.m_weights[i] < weight_levels); + pPixels[i] = block_colors[pPartition_pattern[i]][astc.m_weights[i]]; + } + } + } + else + { + assert(total_subsets == 1); + + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset_index = 0; // pPartition_pattern[i]; + + const uint32_t weight_index0 = astc.m_weights[i * 2]; + const uint32_t weight_index1 = astc.m_weights[i * 2 + 1]; + + assert(weight_index0 < weight_levels && weight_index1 < weight_levels); + + color32& c = pPixels[i]; + for (uint32_t comp = 0; comp < 4; comp++) + { + if ((int)comp == astc.m_ccs) + c[comp] = block_colors[subset_index][weight_index1][comp]; + else + c[comp] = block_colors[subset_index][weight_index0][comp]; + } + } + } + + return true; + } + + bool unpack_uastc(const unpacked_uastc_block& unpacked_blk, color32* pPixels, bool srgb) + { + return unpack_uastc(unpacked_blk.m_mode, unpacked_blk.m_common_pattern, unpacked_blk.m_solid_color, unpacked_blk.m_astc, pPixels, srgb); + } + + bool unpack_uastc(const uastc_block& blk, color32* pPixels, bool srgb) + { + unpacked_uastc_block unpacked_blk; + + if (!unpack_uastc(blk, unpacked_blk, false, false)) + return false; + + return unpack_uastc(unpacked_blk, pPixels, srgb); + } + + // Determines the best shared pbits to use to encode xl/xh + static void determine_shared_pbits( + uint32_t total_comps, uint32_t comp_bits, float xl[4], float xh[4], + color_quad_u8& bestMinColor, color_quad_u8& bestMaxColor, uint32_t 
best_pbits[2]) + { + const uint32_t total_bits = comp_bits + 1; + assert(total_bits >= 4 && total_bits <= 8); + + const int iscalep = (1 << total_bits) - 1; + const float scalep = (float)iscalep; + + float best_err = 1e+9f; + + for (int p = 0; p < 2; p++) + { + color_quad_u8 xMinColor, xMaxColor; + for (uint32_t c = 0; c < 4; c++) + { + xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + } + + color_quad_u8 scaledLow, scaledHigh; + + for (uint32_t i = 0; i < 4; i++) + { + scaledLow.m_c[i] = (xMinColor.m_c[i] << (8 - total_bits)); + scaledLow.m_c[i] |= (scaledLow.m_c[i] >> total_bits); + assert(scaledLow.m_c[i] <= 255); + + scaledHigh.m_c[i] = (xMaxColor.m_c[i] << (8 - total_bits)); + scaledHigh.m_c[i] |= (scaledHigh.m_c[i] >> total_bits); + assert(scaledHigh.m_c[i] <= 255); + } + + float err = 0; + for (uint32_t i = 0; i < total_comps; i++) + err += basisu::squaref((scaledLow.m_c[i] / 255.0f) - xl[i]) + basisu::squaref((scaledHigh.m_c[i] / 255.0f) - xh[i]); + + if (err < best_err) + { + best_err = err; + best_pbits[0] = p; + best_pbits[1] = p; + for (uint32_t j = 0; j < 4; j++) + { + bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1; + bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1; + } + } + } + } + + // Determines the best unique pbits to use to encode xl/xh + static void determine_unique_pbits( + uint32_t total_comps, uint32_t comp_bits, float xl[4], float xh[4], + color_quad_u8& bestMinColor, color_quad_u8& bestMaxColor, uint32_t best_pbits[2]) + { + const uint32_t total_bits = comp_bits + 1; + const int iscalep = (1 << total_bits) - 1; + const float scalep = (float)iscalep; + + float best_err0 = 1e+9f; + float best_err1 = 1e+9f; + + for (int p = 0; p < 2; p++) + { + color_quad_u8 xMinColor, xMaxColor; + + for (uint32_t c = 0; c < 4; c++) + { + xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl[c] * 
scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); + } + + color_quad_u8 scaledLow, scaledHigh; + for (uint32_t i = 0; i < 4; i++) + { + scaledLow.m_c[i] = (xMinColor.m_c[i] << (8 - total_bits)); + scaledLow.m_c[i] |= (scaledLow.m_c[i] >> total_bits); + assert(scaledLow.m_c[i] <= 255); + + scaledHigh.m_c[i] = (xMaxColor.m_c[i] << (8 - total_bits)); + scaledHigh.m_c[i] |= (scaledHigh.m_c[i] >> total_bits); + assert(scaledHigh.m_c[i] <= 255); + } + + float err0 = 0, err1 = 0; + for (uint32_t i = 0; i < total_comps; i++) + { + err0 += basisu::squaref(scaledLow.m_c[i] - xl[i] * 255.0f); + err1 += basisu::squaref(scaledHigh.m_c[i] - xh[i] * 255.0f); + } + + if (err0 < best_err0) + { + best_err0 = err0; + best_pbits[0] = p; + + bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1; + bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1; + bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1; + bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1; + } + + if (err1 < best_err1) + { + best_err1 = err1; + best_pbits[1] = p; + + bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1; + bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1; + bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1; + bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1; + } + } + } + + bool transcode_uastc_to_astc(const uastc_block& src_blk, void* pDst) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, true, false)) + return false; + + bool success = false; + if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + pack_astc_solid_block(pDst, unpacked_src_blk.m_solid_color); + success = true; + } + else + { + success = pack_astc_block(static_cast(pDst), &unpacked_src_blk.m_astc, unpacked_src_blk.m_mode); + } + + return success; + } + + bool transcode_uastc_to_bc7(const unpacked_uastc_block& unpacked_src_blk, bc7_optimization_results& dst_blk) + { + memset(&dst_blk, 0, sizeof(dst_blk)); + + const 
uint32_t mode = unpacked_src_blk.m_mode; + + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; + const uint32_t total_comps = g_uastc_mode_comps[mode]; + + switch (mode) + { + case 0: + case 5: + case 10: + case 12: + case 14: + case 15: + case 18: + { + // MODE 0: DualPlane: 0, WeightRange: 8 (16), Subsets: 1, EndpointRange: 19 (192) - BC7 MODE6 RGB + // MODE 5: DualPlane: 0, WeightRange : 5 (8), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE6 RGB + // MODE 10 DualPlane: 0, WeightRange: 8 (16), Subsets: 1, EndpointRange: 13 (48) - BC7 MODE6 + // MODE 12: DualPlane: 0, WeightRange : 5 (8), Subsets : 1, EndpointRange : 19 (192) - BC7 MODE6 + // MODE 14: DualPlane: 0, WeightRange : 2 (4), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE6 + // MODE 18: DualPlane: 0, WeightRange : 11 (32), Subsets : 1, CEM : 8, EndpointRange : 11 (32) - BC7 MODE6 + // MODE 15: DualPlane: 0, WeightRange : 8 (16), Subsets : 1, CEM : 4 (LA Direct), EndpointRange : 20 (256) - BC7 MODE6 + dst_blk.m_mode = 6; + + float xl[4], xh[4]; + if (total_comps == 2) + { + xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant / 255.0f; + xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant / 255.0f; + + xl[1] = xl[0]; + xh[1] = xh[0]; + + xl[2] = xl[0]; + xh[2] = xh[0]; + + xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant / 255.0f; + xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant / 255.0f; + } + else + { + xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant / 255.0f; + xl[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant / 255.0f; + xl[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[4]].m_unquant / 255.0f; + + xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant / 255.0f; + xh[1] = 
g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant / 255.0f; + xh[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[5]].m_unquant / 255.0f; + + if (total_comps == 4) + { + xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[6]].m_unquant / 255.0f; + xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[7]].m_unquant / 255.0f; + } + else + { + xl[3] = 1.0f; + xh[3] = 1.0f; + } + } + + uint32_t best_pbits[2]; + color_quad_u8 bestMinColor, bestMaxColor; + determine_unique_pbits((total_comps == 2) ? 4 : total_comps, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + dst_blk.m_low[0] = bestMinColor; + dst_blk.m_high[0] = bestMaxColor; + + if (total_comps == 3) + { + dst_blk.m_low[0].m_c[3] = 127; + dst_blk.m_high[0].m_c[3] = 127; + } + + dst_blk.m_pbits[0][0] = best_pbits[0]; + dst_blk.m_pbits[0][1] = best_pbits[1]; + + if (mode == 18) + { + const uint8_t s_bc7_5_to_4[32] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 7, 8, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15 }; + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = s_bc7_5_to_4[unpacked_src_blk.m_astc.m_weights[i]]; + } + else if (mode == 14) + { + const uint8_t s_bc7_2_to_4[4] = { 0, 5, 10, 15 }; + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = s_bc7_2_to_4[unpacked_src_blk.m_astc.m_weights[i]]; + } + else if ((mode == 5) || (mode == 12)) + { + const uint8_t s_bc7_3_to_4[8] = { 0, 2, 4, 6, 9, 11, 13, 15 }; + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = s_bc7_3_to_4[unpacked_src_blk.m_astc.m_weights[i]]; + } + else + { + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + } + + break; + } + case 1: + { + // DualPlane: 0, WeightRange : 2 (4), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE3 + // Mode 1 uses endpoint range 20 - no need to use ASTC dequant tables. 
+ dst_blk.m_mode = 3; + + float xl[4], xh[4]; + xl[0] = unpacked_src_blk.m_astc.m_endpoints[0] / 255.0f; + xl[1] = unpacked_src_blk.m_astc.m_endpoints[2] / 255.0f; + xl[2] = unpacked_src_blk.m_astc.m_endpoints[4] / 255.0f; + xl[3] = 1.0f; + + xh[0] = unpacked_src_blk.m_astc.m_endpoints[1] / 255.0f; + xh[1] = unpacked_src_blk.m_astc.m_endpoints[3] / 255.0f; + xh[2] = unpacked_src_blk.m_astc.m_endpoints[5] / 255.0f; + xh[3] = 1.0f; + + uint32_t best_pbits[2]; + color_quad_u8 bestMinColor, bestMaxColor; + memset(&bestMinColor, 0, sizeof(bestMinColor)); + memset(&bestMaxColor, 0, sizeof(bestMaxColor)); + determine_unique_pbits(3, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + for (uint32_t i = 0; i < 3; i++) + { + dst_blk.m_low[0].m_c[i] = bestMinColor.m_c[i]; + dst_blk.m_high[0].m_c[i] = bestMaxColor.m_c[i]; + dst_blk.m_low[1].m_c[i] = bestMinColor.m_c[i]; + dst_blk.m_high[1].m_c[i] = bestMaxColor.m_c[i]; + } + dst_blk.m_pbits[0][0] = best_pbits[0]; + dst_blk.m_pbits[0][1] = best_pbits[1]; + dst_blk.m_pbits[1][0] = best_pbits[0]; + dst_blk.m_pbits[1][1] = best_pbits[1]; + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + case 2: + { + // 2. 
DualPlane: 0, WeightRange : 5 (8), Subsets : 2, EndpointRange : 8 (16) - BC7 MODE1 + dst_blk.m_mode = 1; + dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; + + const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + + float xl[4], xh[4]; + xl[3] = 1.0f; + xh[3] = 1.0f; + + for (uint32_t subset = 0; subset < 2; subset++) + { + for (uint32_t i = 0; i < 3; i++) + { + uint32_t v = unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6]; + v = (v << 4) | v; + xl[i] = v / 255.0f; + + v = unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6 + 1]; + v = (v << 4) | v; + xh[i] = v / 255.0f; + } + + uint32_t best_pbits[2] = { 0, 0 }; + color_quad_u8 bestMinColor, bestMaxColor; + memset(&bestMinColor, 0, sizeof(bestMinColor)); + memset(&bestMaxColor, 0, sizeof(bestMaxColor)); + determine_shared_pbits(3, 6, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + const uint32_t bc7_subset_index = invert_partition ? 
(1 - subset) : subset; + + for (uint32_t i = 0; i < 3; i++) + { + dst_blk.m_low[bc7_subset_index].m_c[i] = bestMinColor.m_c[i]; + dst_blk.m_high[bc7_subset_index].m_c[i] = bestMaxColor.m_c[i]; + } + + dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; + } // subset + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + case 3: + { + // DualPlane: 0, WeightRange : 2 (4), Subsets : 3, EndpointRange : 7 (12) - BC7 MODE2 + dst_blk.m_mode = 2; + dst_blk.m_partition = g_astc_bc7_common_partitions3[unpacked_src_blk.m_common_pattern].m_bc7; + + const uint32_t perm = g_astc_bc7_common_partitions3[unpacked_src_blk.m_common_pattern].m_astc_to_bc7_perm; + + for (uint32_t subset = 0; subset < 3; subset++) + { + for (uint32_t comp = 0; comp < 3; comp++) + { + uint32_t lo = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[comp * 2 + 0 + subset * 6]].m_unquant; + uint32_t hi = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[comp * 2 + 1 + subset * 6]].m_unquant; + + // TODO: I think this can be improved by using tables like Basis Universal does with ETC1S conversion. + lo = (lo * 31 + 127) / 255; + hi = (hi * 31 + 127) / 255; + + const uint32_t bc7_subset_index = g_astc_to_bc7_partition_index_perm_tables[perm][subset]; + + dst_blk.m_low[bc7_subset_index].m_c[comp] = (uint8_t)lo; + dst_blk.m_high[bc7_subset_index].m_c[comp] = (uint8_t)hi; + } + } + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + case 4: + { + // 4. 
DualPlane: 0, WeightRange: 2 (4), Subsets: 2, EndpointRange: 12 (40) - BC7 MODE3 + dst_blk.m_mode = 3; + dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; + + const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + + float xl[4], xh[4]; + xl[3] = 1.0f; + xh[3] = 1.0f; + + for (uint32_t subset = 0; subset < 2; subset++) + { + for (uint32_t i = 0; i < 3; i++) + { + xl[i] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6]].m_unquant / 255.0f; + xh[i] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6 + 1]].m_unquant / 255.0f; + } + + uint32_t best_pbits[2] = { 0, 0 }; + color_quad_u8 bestMinColor, bestMaxColor; + memset(&bestMinColor, 0, sizeof(bestMinColor)); + memset(&bestMaxColor, 0, sizeof(bestMaxColor)); + determine_unique_pbits(3, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + const uint32_t bc7_subset_index = invert_partition ? 
(1 - subset) : subset; + + for (uint32_t i = 0; i < 3; i++) + { + dst_blk.m_low[bc7_subset_index].m_c[i] = bestMinColor.m_c[i]; + dst_blk.m_high[bc7_subset_index].m_c[i] = bestMaxColor.m_c[i]; + } + dst_blk.m_low[bc7_subset_index].m_c[3] = 127; + dst_blk.m_high[bc7_subset_index].m_c[3] = 127; + + dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; + dst_blk.m_pbits[bc7_subset_index][1] = best_pbits[1]; + + } // subset + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + case 6: + case 11: + case 13: + case 17: + { + // MODE 6: DualPlane: 1, WeightRange : 2 (4), Subsets : 1, EndpointRange : 18 (160) - BC7 MODE5 RGB + // MODE 11: DualPlane: 1, WeightRange: 2 (4), Subsets: 1, EndpointRange: 13 (48) - BC7 MODE5 + // MODE 13: DualPlane: 1, WeightRange: 0 (2), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE5 + // MODE 17: DualPlane: 1, WeightRange: 2 (4), Subsets: 1, CEM: 4 (LA Direct), EndpointRange: 20 (256) - BC7 MODE5 + dst_blk.m_mode = 5; + dst_blk.m_rotation = (unpacked_src_blk.m_astc.m_ccs + 1) & 3; + + if (total_comps == 2) + { + assert(unpacked_src_blk.m_astc.m_ccs == 3); + + dst_blk.m_low->m_c[0] = (uint8_t)((g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant * 127 + 127) / 255); + dst_blk.m_high->m_c[0] = (uint8_t)((g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant * 127 + 127) / 255); + + dst_blk.m_low->m_c[1] = dst_blk.m_low->m_c[0]; + dst_blk.m_high->m_c[1] = dst_blk.m_high->m_c[0]; + + dst_blk.m_low->m_c[2] = dst_blk.m_low->m_c[0]; + dst_blk.m_high->m_c[2] = dst_blk.m_high->m_c[0]; + + dst_blk.m_low->m_c[3] = (uint8_t)(g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant); + dst_blk.m_high->m_c[3] = (uint8_t)(g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant); + } + else + { + for (uint32_t astc_comp = 0; astc_comp < 4; astc_comp++) + { + uint32_t bc7_comp = astc_comp; 
+ // ASTC and BC7 handle dual plane component rotations differently: + // ASTC: 2nd plane separately interpolates the CCS channel. + // BC7: 2nd plane channel is swapped with alpha, 2nd plane controls alpha interpolation, then we swap alpha with the desired channel. + if (astc_comp == (uint32_t)unpacked_src_blk.m_astc.m_ccs) + bc7_comp = 3; + else if (astc_comp == 3) + bc7_comp = unpacked_src_blk.m_astc.m_ccs; + + uint32_t l = 255, h = 255; + if (astc_comp < total_comps) + { + l = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[astc_comp * 2 + 0]].m_unquant; + h = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[astc_comp * 2 + 1]].m_unquant; + } + + if (bc7_comp < 3) + { + l = (l * 127 + 127) / 255; + h = (h * 127 + 127) / 255; + } + + dst_blk.m_low->m_c[bc7_comp] = (uint8_t)l; + dst_blk.m_high->m_c[bc7_comp] = (uint8_t)h; + } + } + + if (mode == 13) + { + for (uint32_t i = 0; i < 16; i++) + { + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2] ? 3 : 0; + dst_blk.m_alpha_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2 + 1] ? 
3 : 0; + } + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2]; + dst_blk.m_alpha_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2 + 1]; + } + } + + break; + } + case 7: + { + // DualPlane: 0, WeightRange : 2 (4), Subsets : 2, EndpointRange : 12 (40) - BC7 MODE2 + dst_blk.m_mode = 2; + dst_blk.m_partition = g_bc7_3_astc2_common_partitions[unpacked_src_blk.m_common_pattern].m_bc73; + + const uint32_t common_pattern_k = g_bc7_3_astc2_common_partitions[unpacked_src_blk.m_common_pattern].k; + + for (uint32_t bc7_part = 0; bc7_part < 3; bc7_part++) + { + const uint32_t astc_part = bc7_convert_partition_index_3_to_2(bc7_part, common_pattern_k); + + for (uint32_t c = 0; c < 3; c++) + { + dst_blk.m_low[bc7_part].m_c[c] = (g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[c * 2 + 0 + astc_part * 6]].m_unquant * 31 + 127) / 255; + dst_blk.m_high[bc7_part].m_c[c] = (g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[c * 2 + 1 + astc_part * 6]].m_unquant * 31 + 127) / 255; + } + } + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + case UASTC_MODE_INDEX_SOLID_COLOR: + { + // Void-Extent: Solid Color RGBA (BC7 MODE5 or MODE6) + const color32& solid_color = unpacked_src_blk.m_solid_color; + + uint32_t best_err0 = g_bc7_mode_6_optimal_endpoints[solid_color.r][0].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.g][0].m_error + + g_bc7_mode_6_optimal_endpoints[solid_color.b][0].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.a][0].m_error; + + uint32_t best_err1 = g_bc7_mode_6_optimal_endpoints[solid_color.r][1].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.g][1].m_error + + g_bc7_mode_6_optimal_endpoints[solid_color.b][1].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.a][1].m_error; + + if (best_err0 > 0 && best_err1 > 0) + { + dst_blk.m_mode = 5; + + for (uint32_t c = 0; c < 3; c++) 
+ { + dst_blk.m_low[0].m_c[c] = g_bc7_mode_5_optimal_endpoints[solid_color.c[c]].m_lo; + dst_blk.m_high[0].m_c[c] = g_bc7_mode_5_optimal_endpoints[solid_color.c[c]].m_hi; + } + + memset(dst_blk.m_selectors, BC7ENC_MODE_5_OPTIMAL_INDEX, 16); + + dst_blk.m_low[0].m_c[3] = solid_color.c[3]; + dst_blk.m_high[0].m_c[3] = solid_color.c[3]; + + //memset(dst_blk.m_alpha_selectors, 0, 16); + } + else + { + dst_blk.m_mode = 6; + + uint32_t best_p = 0; + if (best_err1 < best_err0) + best_p = 1; + + for (uint32_t c = 0; c < 4; c++) + { + dst_blk.m_low[0].m_c[c] = g_bc7_mode_6_optimal_endpoints[solid_color.c[c]][best_p].m_lo; + dst_blk.m_high[0].m_c[c] = g_bc7_mode_6_optimal_endpoints[solid_color.c[c]][best_p].m_hi; + } + + dst_blk.m_pbits[0][0] = best_p; + dst_blk.m_pbits[0][1] = best_p; + memset(dst_blk.m_selectors, BC7ENC_MODE_6_OPTIMAL_INDEX, 16); + } + + break; + } + case 9: + case 16: + { + // 9. DualPlane: 0, WeightRange : 2 (4), Subsets : 2, EndpointRange : 8 (16) - BC7 MODE7 + // 16. DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 4 (LA Direct), EndpointRange: 20 (256) - BC7 MODE7 + + dst_blk.m_mode = 7; + dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; + + const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + + for (uint32_t astc_subset = 0; astc_subset < 2; astc_subset++) + { + float xl[4], xh[4]; + + if (total_comps == 2) + { + xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0 + astc_subset * 4]].m_unquant / 255.0f; + xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1 + astc_subset * 4]].m_unquant / 255.0f; + + xl[1] = xl[0]; + xh[1] = xh[0]; + + xl[2] = xl[0]; + xh[2] = xh[0]; + + xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2 + astc_subset * 4]].m_unquant / 255.0f; + xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3 + astc_subset * 4]].m_unquant / 255.0f; 
+ } + else + { + xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0 + astc_subset * 8]].m_unquant / 255.0f; + xl[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2 + astc_subset * 8]].m_unquant / 255.0f; + xl[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[4 + astc_subset * 8]].m_unquant / 255.0f; + xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[6 + astc_subset * 8]].m_unquant / 255.0f; + + xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1 + astc_subset * 8]].m_unquant / 255.0f; + xh[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3 + astc_subset * 8]].m_unquant / 255.0f; + xh[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[5 + astc_subset * 8]].m_unquant / 255.0f; + xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[7 + astc_subset * 8]].m_unquant / 255.0f; + } + + uint32_t best_pbits[2] = { 0, 0 }; + color_quad_u8 bestMinColor, bestMaxColor; + memset(&bestMinColor, 0, sizeof(bestMinColor)); + memset(&bestMaxColor, 0, sizeof(bestMaxColor)); + determine_unique_pbits(4, 5, xl, xh, bestMinColor, bestMaxColor, best_pbits); + + const uint32_t bc7_subset_index = invert_partition ? 
(1 - astc_subset) : astc_subset; + + dst_blk.m_low[bc7_subset_index] = bestMinColor; + dst_blk.m_high[bc7_subset_index] = bestMaxColor; + + dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; + dst_blk.m_pbits[bc7_subset_index][1] = best_pbits[1]; + } // astc_subset + + for (uint32_t i = 0; i < 16; i++) + dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + + break; + } + default: + return false; + } + + return true; + } + + bool transcode_uastc_to_bc7(const uastc_block& src_blk, bc7_optimization_results& dst_blk) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false, false)) + return false; + + return transcode_uastc_to_bc7(unpacked_src_blk, dst_blk); + } + + bool transcode_uastc_to_bc7(const uastc_block& src_blk, void* pDst) + { + bc7_optimization_results temp; + if (!transcode_uastc_to_bc7(src_blk, temp)) + return false; + + encode_bc7_block(pDst, &temp); + return true; + } + + color32 apply_etc1_bias(const color32 &block_color, uint32_t bias, uint32_t limit, uint32_t subblock) + { + color32 result; + + for (uint32_t c = 0; c < 3; c++) + { + static const int s_divs[3] = { 1, 3, 9 }; + + int delta = 0; + + switch (bias) + { + case 2: delta = subblock ? 0 : ((c == 0) ? -1 : 0); break; + case 5: delta = subblock ? 0 : ((c == 1) ? -1 : 0); break; + case 6: delta = subblock ? 0 : ((c == 2) ? -1 : 0); break; + + case 7: delta = subblock ? 0 : ((c == 0) ? 1 : 0); break; + case 11: delta = subblock ? 0 : ((c == 1) ? 1 : 0); break; + case 15: delta = subblock ? 0 : ((c == 2) ? 1 : 0); break; + + case 18: delta = subblock ? ((c == 0) ? -1 : 0) : 0; break; + case 19: delta = subblock ? ((c == 1) ? -1 : 0) : 0; break; + case 20: delta = subblock ? ((c == 2) ? -1 : 0) : 0; break; + + case 21: delta = subblock ? ((c == 0) ? 1 : 0) : 0; break; + case 24: delta = subblock ? ((c == 1) ? 1 : 0) : 0; break; + case 8: delta = subblock ? ((c == 2) ? 
1 : 0) : 0; break; + + case 10: delta = -2; break; + + case 27: delta = subblock ? 0 : -1; break; + case 28: delta = subblock ? -1 : 1; break; + case 29: delta = subblock ? 1 : 0; break; + case 30: delta = subblock ? -1 : 0; break; + case 31: delta = subblock ? 0 : 1; break; + + default: + delta = ((bias / s_divs[c]) % 3) - 1; + break; + } + + int v = block_color[c]; + if (v == 0) + { + if (delta == -2) + v += 3; + else + v += delta + 1; + } + else if (v == (int)limit) + { + v += (delta - 1); + } + else + { + v += delta; + if ((v < 0) || (v > (int)limit)) + v = (v - delta) - delta; + } + + assert(v >= 0); + assert(v <= (int)limit); + + result[c] = (uint8_t)v; + } + + return result; + } + + static void etc1_determine_selectors(decoder_etc_block& dst_blk, const color32* pSource_pixels, uint32_t first_subblock, uint32_t last_subblock) + { + static const uint8_t s_tran[4] = { 1, 0, 2, 3 }; + + uint16_t l_bitmask = 0; + uint16_t h_bitmask = 0; + + for (uint32_t subblock = first_subblock; subblock < last_subblock; subblock++) + { + color32 block_colors[4]; + dst_blk.get_block_colors(block_colors, subblock); + + uint32_t block_y[4]; + for (uint32_t i = 0; i < 4; i++) + block_y[i] = block_colors[i][0] * 54 + block_colors[i][1] * 183 + block_colors[i][2] * 19; + + const uint32_t block_y01 = block_y[0] + block_y[1]; + const uint32_t block_y12 = block_y[1] + block_y[2]; + const uint32_t block_y23 = block_y[2] + block_y[3]; + + // X0 X0 X0 X0 X1 X1 X1 X1 X2 X2 X2 X2 X3 X3 X3 X3 + // Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 + + if (dst_blk.get_flip_bit()) + { + uint32_t ofs = subblock * 2; + + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const color32& c = pSource_pixels[x + (subblock * 2 + y) * 4]; + const uint32_t l = c[0] * 108 + c[1] * 366 + c[2] * 38; + + uint32_t t = s_tran[(l < block_y01) + (l < block_y12) + (l < block_y23)]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ofs += 4; + } + 
+ ofs = (int)ofs + 1 - 4 * 4; + } + } + else + { + uint32_t ofs = (subblock * 2) * 4; + for (uint32_t x = 0; x < 2; x++) + { + for (uint32_t y = 0; y < 4; y++) + { + const color32& c = pSource_pixels[subblock * 2 + x + y * 4]; + const uint32_t l = c[0] * 108 + c[1] * 366 + c[2] * 38; + + uint32_t t = s_tran[(l < block_y01) + (l < block_y12) + (l < block_y23)]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ++ofs; + } + } + } + } + + dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); + dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); + dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); + dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); + } + + static const uint8_t s_etc1_solid_selectors[4][4] = { { 255, 255, 255, 255 }, { 255, 255, 0, 0 }, { 0, 0, 0, 0 }, {0, 0, 255, 255 } }; + + struct etc_coord2 + { + uint8_t m_x, m_y; + }; + + // [flip][subblock][pixel_index] + const etc_coord2 g_etc1_pixel_coords[2][2][8] = + { + { + { + { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, + { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 } + }, + { + { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, + { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 } + } + }, + { + { + { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, + { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } + }, + { + { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 }, + { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 } + }, + } + }; + + void transcode_uastc_to_etc1(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst) + { + decoder_etc_block& dst_blk = *static_cast(pDst); + + if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + dst_blk.m_bytes[3] = (uint8_t)((unpacked_src_blk.m_etc1_diff << 1) | (unpacked_src_blk.m_etc1_inten0 << 5) | (unpacked_src_blk.m_etc1_inten0 << 2)); + + if (unpacked_src_blk.m_etc1_diff) + { + dst_blk.m_bytes[0] = (uint8_t)(unpacked_src_blk.m_etc1_r << 3); + dst_blk.m_bytes[1] = (uint8_t)(unpacked_src_blk.m_etc1_g << 3); + dst_blk.m_bytes[2] = (uint8_t)(unpacked_src_blk.m_etc1_b << 3); + } + else + { + 
dst_blk.m_bytes[0] = (uint8_t)(unpacked_src_blk.m_etc1_r | (unpacked_src_blk.m_etc1_r << 4)); + dst_blk.m_bytes[1] = (uint8_t)(unpacked_src_blk.m_etc1_g | (unpacked_src_blk.m_etc1_g << 4)); + dst_blk.m_bytes[2] = (uint8_t)(unpacked_src_blk.m_etc1_b | (unpacked_src_blk.m_etc1_b << 4)); + } + + memcpy(dst_blk.m_bytes + 4, &s_etc1_solid_selectors[unpacked_src_blk.m_etc1_selector][0], 4); + + return; + } + + const bool flip = unpacked_src_blk.m_etc1_flip != 0; + const bool diff = unpacked_src_blk.m_etc1_diff != 0; + + dst_blk.m_bytes[3] = (uint8_t)((int)flip | (diff << 1) | (unpacked_src_blk.m_etc1_inten0 << 5) | (unpacked_src_blk.m_etc1_inten1 << 2)); + + const uint32_t limit = diff ? 31 : 15; + + color32 block_colors[2]; + + for (uint32_t subset = 0; subset < 2; subset++) + { + uint32_t avg_color[3]; + memset(avg_color, 0, sizeof(avg_color)); + + for (uint32_t j = 0; j < 8; j++) + { + const etc_coord2& c = g_etc1_pixel_coords[flip][subset][j]; + + avg_color[0] += block_pixels[c.m_y][c.m_x].r; + avg_color[1] += block_pixels[c.m_y][c.m_x].g; + avg_color[2] += block_pixels[c.m_y][c.m_x].b; + } // j + + block_colors[subset][0] = (uint8_t)((avg_color[0] * limit + 1020) / (8 * 255)); + block_colors[subset][1] = (uint8_t)((avg_color[1] * limit + 1020) / (8 * 255)); + block_colors[subset][2] = (uint8_t)((avg_color[2] * limit + 1020) / (8 * 255)); + block_colors[subset][3] = 0; + + if (g_uastc_mode_has_etc1_bias[unpacked_src_blk.m_mode]) + { + block_colors[subset] = apply_etc1_bias(block_colors[subset], unpacked_src_blk.m_etc1_bias, limit, subset); + } + + } // subset + + if (diff) + { + int dr = block_colors[1].r - block_colors[0].r; + int dg = block_colors[1].g - block_colors[0].g; + int db = block_colors[1].b - block_colors[0].b; + + dr = basisu::clamp(dr, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + dg = basisu::clamp(dg, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + db = basisu::clamp(db, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + + if (dr < 0) dr += 8; + if (dg < 0) dg += 8; 
+ if (db < 0) db += 8; + + dst_blk.m_bytes[0] = (uint8_t)((block_colors[0].r << 3) | dr); + dst_blk.m_bytes[1] = (uint8_t)((block_colors[0].g << 3) | dg); + dst_blk.m_bytes[2] = (uint8_t)((block_colors[0].b << 3) | db); + } + else + { + dst_blk.m_bytes[0] = (uint8_t)(block_colors[1].r | (block_colors[0].r << 4)); + dst_blk.m_bytes[1] = (uint8_t)(block_colors[1].g | (block_colors[0].g << 4)); + dst_blk.m_bytes[2] = (uint8_t)(block_colors[1].b | (block_colors[0].b << 4)); + } + + etc1_determine_selectors(dst_blk, &block_pixels[0][0], 0, 2); + } + + bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + + color32 block_pixels[4][4]; + if (unpacked_src_blk.m_mode != UASTC_MODE_INDEX_SOLID_COLOR) + { + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + } + + transcode_uastc_to_etc1(unpacked_src_blk, block_pixels, pDst); + + return true; + } + + static inline int gray_distance2(const uint8_t c, int y) + { + int gray_dist = (int)c - y; + return gray_dist * gray_dist; + } + + static bool pack_etc1_y_estimate_flipped(const uint8_t* pSrc_pixels, + int& upper_avg, int& lower_avg, int& left_avg, int& right_avg) + { + int sums[2][2]; + +#define GET_XY(x, y) pSrc_pixels[(x) + ((y) * 4)] + + sums[0][0] = GET_XY(0, 0) + GET_XY(0, 1) + GET_XY(1, 0) + GET_XY(1, 1); + sums[1][0] = GET_XY(2, 0) + GET_XY(2, 1) + GET_XY(3, 0) + GET_XY(3, 1); + sums[0][1] = GET_XY(0, 2) + GET_XY(0, 3) + GET_XY(1, 2) + GET_XY(1, 3); + sums[1][1] = GET_XY(2, 2) + GET_XY(2, 3) + GET_XY(3, 2) + GET_XY(3, 3); + + upper_avg = (sums[0][0] + sums[1][0] + 4) / 8; + lower_avg = (sums[0][1] + sums[1][1] + 4) / 8; + left_avg = (sums[0][0] + sums[0][1] + 4) / 8; + right_avg = (sums[1][0] + sums[1][1] + 4) / 8; + +#undef GET_XY +#define GET_XY(x, y, a) gray_distance2(pSrc_pixels[(x) + ((y) * 4)], a) + + int 
upper_gray_dist = 0, lower_gray_dist = 0, left_gray_dist = 0, right_gray_dist = 0; + for (uint32_t i = 0; i < 4; i++) + { + for (uint32_t j = 0; j < 2; j++) + { + upper_gray_dist += GET_XY(i, j, upper_avg); + lower_gray_dist += GET_XY(i, 2 + j, lower_avg); + left_gray_dist += GET_XY(j, i, left_avg); + right_gray_dist += GET_XY(2 + j, i, right_avg); + } + } + +#undef GET_XY + + int upper_lower_sum = upper_gray_dist + lower_gray_dist; + int left_right_sum = left_gray_dist + right_gray_dist; + + return upper_lower_sum < left_right_sum; + } + + // Base Sel Table + // XXXXX XX XXX + static const uint16_t g_etc1_y_solid_block_configs[256] = + { + 0,781,64,161,260,192,33,131,96,320,65,162,261,193,34,291,97,224,66,163,262,194,35,549,98,4,67,653,164,195,523,36,99,5,578,68,165,353,196,37,135,100,324,69,166,354,197,38,295,101,228,70,167, + 355,198,39,553,102,8,71,608,168,199,527,40,103,9,582,72,169,357,200,41,139,104,328,73,170,358,201,42,299,105,232,74,171,359,202,43,557,106,12,75,612,172,203,531,44,107,13,586,76,173,361, + 204,45,143,108,332,77,174,362,205,46,303,109,236,78,175,363,206,47,561,110,16,79,616,176,207,535,48,111,17,590,80,177,365,208,49,147,112,336,81,178,366,209,50,307,113,240,82,179,367,210, + 51,565,114,20,83,620,180,211,539,52,115,21,594,84,181,369,212,53,151,116,340,85,182,370,213,54,311,117,244,86,183,371,214,55,569,118,24,87,624,184,215,543,56,119,25,598,88,185,373,216,57, + 155,120,344,89,186,374,217,58,315,121,248,90,187,375,218,59,573,122,28,91,628,188,219,754,60,123,29,602,92,189,377,220,61,159,124,348,93,190,378,221,62,319,125,252,94,191,379,222,63,882,126 + }; + + // individual + // table base sel0 sel1 sel2 sel3 + static const uint16_t g_etc1_y_solid_block_4i_configs[256] = + { + 0xA000,0xA800,0x540B,0xAA01,0xAA01,0xFE00,0xFF00,0xFF00,0x8,0x5515,0x5509,0x5509,0xAA03,0x5508,0x5508,0x9508,0xA508,0xA908,0xAA08,0x5513,0xAA09,0xAA09,0xAA05,0xFF08,0xFF08,0x10,0x551D,0x5511,0x5511, + 
0xAA0B,0x5510,0x5510,0x9510,0xA510,0xA910,0xAA10,0x551B,0xAA11,0xAA11,0xAA0D,0xFF10,0xFF10,0x18,0x5525,0x5519,0x5519,0xAA13,0x5518,0x5518,0x9518,0xA518,0xA918,0xAA18,0x5523,0xAA19,0xAA19,0xAA15, + 0xFF18,0xFF18,0x20,0x552D,0x5521,0x5521,0xAA1B,0x5520,0x5520,0x9520,0xA520,0xA920,0xAA20,0x552B,0xAA21,0xAA21,0xAA1D,0xFF20,0xFF20,0x28,0x5535,0x5529,0x5529,0xAA23,0x5528,0x5528,0x9528,0xA528,0xA928, + 0xAA28,0x5533,0xAA29,0xAA29,0xAA25,0xFF28,0xFF28,0x30,0x553D,0x5531,0x5531,0xAA2B,0x5530,0x5530,0x9530,0xA530,0xA930,0xAA30,0x553B,0xAA31,0xAA31,0xAA2D,0xFF30,0xFF30,0x38,0x5545,0x5539,0x5539,0xAA33, + 0x5538,0x5538,0x9538,0xA538,0xA938,0xAA38,0x5543,0xAA39,0xAA39,0xAA35,0xFF38,0xFF38,0x40,0x554D,0x5541,0x5541,0xAA3B,0x5540,0x5540,0x9540,0xA540,0xA940,0xAA40,0x554B,0xAA41,0xAA41,0xAA3D,0xFF40,0xFF40, + 0x48,0x5555,0x5549,0x5549,0xAA43,0x5548,0x5548,0x9548,0xA548,0xA948,0xAA48,0x5553,0xAA49,0xAA49,0xAA45,0xFF48,0xFF48,0x50,0x555D,0x5551,0x5551,0xAA4B,0x5550,0x5550,0x9550,0xA550,0xA950,0xAA50,0x555B, + 0xAA51,0xAA51,0xAA4D,0xFF50,0xFF50,0x58,0x5565,0x5559,0x5559,0xAA53,0x5558,0x5558,0x9558,0xA558,0xA958,0xAA58,0x5563,0xAA59,0xAA59,0xAA55,0xFF58,0xFF58,0x60,0x556D,0x5561,0x5561,0xAA5B,0x5560,0x5560, + 0x9560,0xA560,0xA960,0xAA60,0x556B,0xAA61,0xAA61,0xAA5D,0xFF60,0xFF60,0x68,0x5575,0x5569,0x5569,0xAA63,0x5568,0x5568,0x9568,0xA568,0xA968,0xAA68,0x5573,0xAA69,0xAA69,0xAA65,0xFF68,0xFF68,0x70,0x557D, + 0x5571,0x5571,0xAA6B,0x5570,0x5570,0x9570,0xA570,0xA970,0xAA70,0x557B,0xAA71,0xAA71,0xAA6D,0xFF70,0xFF70,0x78,0x78,0x5579,0x5579,0xAA73,0x5578,0x9578,0x2578,0xE6E,0x278 + }; + + static const uint16_t g_etc1_y_solid_block_2i_configs[256] = + { + 0x416,0x800,0xA00,0x50B,0xA01,0xA01,0xF00,0xF00,0xF00,0x8,0x515,0x509,0x509,0xA03,0x508,0x508,0xF01,0xF01,0xA08,0xA08,0x513,0xA09,0xA09,0xA05,0xF08,0xF08,0x10,0x51D,0x511,0x511,0xA0B,0x510,0x510,0xF09, + 
0xF09,0xA10,0xA10,0x51B,0xA11,0xA11,0xA0D,0xF10,0xF10,0x18,0x525,0x519,0x519,0xA13,0x518,0x518,0xF11,0xF11,0xA18,0xA18,0x523,0xA19,0xA19,0xA15,0xF18,0xF18,0x20,0x52D,0x521,0x521,0xA1B,0x520,0x520,0xF19, + 0xF19,0xA20,0xA20,0x52B,0xA21,0xA21,0xA1D,0xF20,0xF20,0x28,0x535,0x529,0x529,0xA23,0x528,0x528,0xF21,0xF21,0xA28,0xA28,0x533,0xA29,0xA29,0xA25,0xF28,0xF28,0x30,0x53D,0x531,0x531,0xA2B,0x530,0x530,0xF29, + 0xF29,0xA30,0xA30,0x53B,0xA31,0xA31,0xA2D,0xF30,0xF30,0x38,0x545,0x539,0x539,0xA33,0x538,0x538,0xF31,0xF31,0xA38,0xA38,0x543,0xA39,0xA39,0xA35,0xF38,0xF38,0x40,0x54D,0x541,0x541,0xA3B,0x540,0x540,0xF39, + 0xF39,0xA40,0xA40,0x54B,0xA41,0xA41,0xA3D,0xF40,0xF40,0x48,0x555,0x549,0x549,0xA43,0x548,0x548,0xF41,0xF41,0xA48,0xA48,0x553,0xA49,0xA49,0xA45,0xF48,0xF48,0x50,0x55D,0x551,0x551,0xA4B,0x550,0x550,0xF49, + 0xF49,0xA50,0xA50,0x55B,0xA51,0xA51,0xA4D,0xF50,0xF50,0x58,0x565,0x559,0x559,0xA53,0x558,0x558,0xF51,0xF51,0xA58,0xA58,0x563,0xA59,0xA59,0xA55,0xF58,0xF58,0x60,0x56D,0x561,0x561,0xA5B,0x560,0x560,0xF59, + 0xF59,0xA60,0xA60,0x56B,0xA61,0xA61,0xA5D,0xF60,0xF60,0x68,0x575,0x569,0x569,0xA63,0x568,0x568,0xF61,0xF61,0xA68,0xA68,0x573,0xA69,0xA69,0xA65,0xF68,0xF68,0x70,0x57D,0x571,0x571,0xA6B,0x570,0x570,0xF69, + 0xF69,0xA70,0xA70,0x57B,0xA71,0xA71,0xA6D,0xF70,0xF70,0x78,0x78,0x579,0x579,0xA73,0x578,0x578,0xE6E,0x278 + }; + + static const uint16_t g_etc1_y_solid_block_1i_configs[256] = + { + 0x0,0x116,0x200,0x200,0x10B,0x201,0x201,0x300,0x300,0x8,0x115,0x109,0x109,0x203,0x108,0x108,0x114,0x301,0x204,0x208,0x208,0x113,0x209,0x209,0x205,0x308,0x10,0x11D,0x111,0x111,0x20B,0x110,0x110,0x11C,0x309, + 0x20C,0x210,0x210,0x11B,0x211,0x211,0x20D,0x310,0x18,0x125,0x119,0x119,0x213,0x118,0x118,0x124,0x311,0x214,0x218,0x218,0x123,0x219,0x219,0x215,0x318,0x20,0x12D,0x121,0x121,0x21B,0x120,0x120,0x12C,0x319,0x21C, + 
0x220,0x220,0x12B,0x221,0x221,0x21D,0x320,0x28,0x135,0x129,0x129,0x223,0x128,0x128,0x134,0x321,0x224,0x228,0x228,0x133,0x229,0x229,0x225,0x328,0x30,0x13D,0x131,0x131,0x22B,0x130,0x130,0x13C,0x329,0x22C,0x230, + 0x230,0x13B,0x231,0x231,0x22D,0x330,0x38,0x145,0x139,0x139,0x233,0x138,0x138,0x144,0x331,0x234,0x238,0x238,0x143,0x239,0x239,0x235,0x338,0x40,0x14D,0x141,0x141,0x23B,0x140,0x140,0x14C,0x339,0x23C,0x240,0x240, + 0x14B,0x241,0x241,0x23D,0x340,0x48,0x155,0x149,0x149,0x243,0x148,0x148,0x154,0x341,0x244,0x248,0x248,0x153,0x249,0x249,0x245,0x348,0x50,0x15D,0x151,0x151,0x24B,0x150,0x150,0x15C,0x349,0x24C,0x250,0x250,0x15B, + 0x251,0x251,0x24D,0x350,0x58,0x165,0x159,0x159,0x253,0x158,0x158,0x164,0x351,0x254,0x258,0x258,0x163,0x259,0x259,0x255,0x358,0x60,0x16D,0x161,0x161,0x25B,0x160,0x160,0x16C,0x359,0x25C,0x260,0x260,0x16B,0x261, + 0x261,0x25D,0x360,0x68,0x175,0x169,0x169,0x263,0x168,0x168,0x174,0x361,0x264,0x268,0x268,0x173,0x269,0x269,0x265,0x368,0x70,0x17D,0x171,0x171,0x26B,0x170,0x170,0x17C,0x369,0x26C,0x270,0x270,0x17B,0x271,0x271, + 0x26D,0x370,0x78,0x78,0x179,0x179,0x273,0x178,0x178,0x26E,0x278 + }; + + // We don't have any useful hints to accelerate single channel ETC1, so we need to real-time encode from scratch. + bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst, uint32_t channel) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; + +#if 0 + for (uint32_t individ = 0; individ < 2; individ++) + { + uint32_t overall_error = 0; + + for (uint32_t c = 0; c < 256; c++) + { + uint32_t best_err = UINT32_MAX; + uint32_t best_individ = 0; + uint32_t best_base = 0; + uint32_t best_sels[4] = { 0,0,0,0 }; + uint32_t best_table = 0; + + const uint32_t limit = individ ? 
16 : 32; + + for (uint32_t table = 0; table < 8; table++) + { + for (uint32_t base = 0; base < limit; base++) + { + uint32_t total_e = 0; + uint32_t sels[4] = { 0,0,0,0 }; + + const uint32_t N = 4; + for (uint32_t i = 0; i < basisu::minimum(N, (256 - c)); i++) + { + uint32_t best_sel_e = UINT32_MAX; + uint32_t best_sel = 0; + + for (uint32_t sel = 0; sel < 4; sel++) + { + int val = individ ? ((base << 4) | base) : ((base << 3) | (base >> 2)); + val = clamp255(val + g_etc1_inten_tables[table][sel]); + + int e = iabs(val - clamp255(c + i)); + if (e < best_sel_e) + { + best_sel_e = e; + best_sel = sel; + } + + } // sel + + sels[i] = best_sel; + total_e += best_sel_e * best_sel_e; + + } // i + + if (total_e < best_err) + { + best_err = total_e; + best_individ = individ; + best_base = base; + memcpy(best_sels, sels, sizeof(best_sels)); + best_table = table; + } + + } // base + } // table + + //printf("%u: %u,%u,%u,%u,%u,%u,%u,%u\n", c, best_err, best_individ, best_table, best_base, best_sels[0], best_sels[1], best_sels[2], best_sels[3]); + + uint32_t encoded = best_table | (best_base << 3) | + (best_sels[0] << 8) | + (best_sels[1] << 10) | + (best_sels[2] << 12) | + (best_sels[3] << 14); + + printf("0x%X,", encoded); + + overall_error += best_err; + } // c + + printf("\n"); + printf("Overall error: %u\n", overall_error); + + } // individ + + exit(0); +#endif + +#if 0 + for (uint32_t individ = 0; individ < 2; individ++) + { + uint32_t overall_error = 0; + + for (uint32_t c = 0; c < 256; c++) + { + uint32_t best_err = UINT32_MAX; + uint32_t best_individ = 0; + uint32_t best_base = 0; + uint32_t best_sels[4] = { 0,0,0,0 }; + uint32_t best_table = 0; + + const uint32_t limit = individ ? 
16 : 32; + + for (uint32_t table = 0; table < 8; table++) + { + for (uint32_t base = 0; base < limit; base++) + { + uint32_t total_e = 0; + uint32_t sels[4] = { 0,0,0,0 }; + + const uint32_t N = 1; + for (uint32_t i = 0; i < basisu::minimum(N, (256 - c)); i++) + { + uint32_t best_sel_e = UINT32_MAX; + uint32_t best_sel = 0; + + for (uint32_t sel = 0; sel < 4; sel++) + { + int val = individ ? ((base << 4) | base) : ((base << 3) | (base >> 2)); + val = clamp255(val + g_etc1_inten_tables[table][sel]); + + int e = iabs(val - clamp255(c + i)); + if (e < best_sel_e) + { + best_sel_e = e; + best_sel = sel; + } + + } // sel + + sels[i] = best_sel; + total_e += best_sel_e * best_sel_e; + + } // i + + if (total_e < best_err) + { + best_err = total_e; + best_individ = individ; + best_base = base; + memcpy(best_sels, sels, sizeof(best_sels)); + best_table = table; + } + + } // base + } // table + + //printf("%u: %u,%u,%u,%u,%u,%u,%u,%u\n", c, best_err, best_individ, best_table, best_base, best_sels[0], best_sels[1], best_sels[2], best_sels[3]); + + uint32_t encoded = best_table | (best_base << 3) | + (best_sels[0] << 8) | + (best_sels[1] << 10) | + (best_sels[2] << 12) | + (best_sels[3] << 14); + + printf("0x%X,", encoded); + + overall_error += best_err; + } // c + + printf("\n"); + printf("Overall error: %u\n", overall_error); + + } // individ + + exit(0); +#endif + + decoder_etc_block& dst_blk = *static_cast(pDst); + + if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + const uint32_t y = unpacked_src_blk.m_solid_color[channel]; + const uint32_t encoded_config = g_etc1_y_solid_block_configs[y]; + + const uint32_t base = encoded_config & 31; + const uint32_t sel = (encoded_config >> 5) & 3; + const uint32_t table = encoded_config >> 7; + + dst_blk.m_bytes[3] = (uint8_t)(2 | (table << 5) | (table << 2)); + + dst_blk.m_bytes[0] = (uint8_t)(base << 3); + dst_blk.m_bytes[1] = (uint8_t)(base << 3); + dst_blk.m_bytes[2] = (uint8_t)(base << 3); + + 
memcpy(dst_blk.m_bytes + 4, &s_etc1_solid_selectors[sel][0], 4); + return true; + } + + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; + + uint8_t block_y[4][4]; + for (uint32_t i = 0; i < 16; i++) + ((uint8_t*)block_y)[i] = ((color32*)block_pixels)[i][channel]; + + int upper_avg, lower_avg, left_avg, right_avg; + bool flip = pack_etc1_y_estimate_flipped(&block_y[0][0], upper_avg, lower_avg, left_avg, right_avg); + + // non-flipped: | | + // vs. + // flipped: -- + // -- + + uint32_t low[2] = { 255, 255 }, high[2] = { 0, 0 }; + + if (flip) + { + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) { - switch (num_remaining) - { - case 1: num_bits = 2; break; - case 2: num_bits = 4; break; - case 3: num_bits = 5; break; - case 4: num_bits = 7; break; - default: break; - } + const uint32_t v = block_y[y][x]; + low[0] = basisu::minimum(low[0], v); + high[0] = basisu::maximum(high[0], v); } - else if (ep_quints) + } + for (uint32_t y = 2; y < 4; y++) + { + for (uint32_t x = 0; x < 4; x++) { - switch (num_remaining) - { - case 1: num_bits = 3; break; - case 2: num_bits = 5; break; - default: break; - } + const uint32_t v = block_y[y][x]; + low[1] = basisu::minimum(low[1], v); + high[1] = basisu::maximum(high[1], v); + } + } + } + else + { + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const uint32_t v = block_y[y][x]; + low[0] = basisu::minimum(low[0], v); + high[0] = basisu::maximum(high[0], v); + } + } + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 2; x < 4; x++) + { + const uint32_t v = block_y[y][x]; + low[1] = basisu::minimum(low[1], v); + high[1] = basisu::maximum(high[1], v); } } + } - tq_values[i] = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, num_bits); - } // i + const uint32_t range[2] = { high[0] - low[0], high[1] - low[1] }; - uint32_t accum = 0; - uint32_t accum_remaining = 0; - 
uint32_t next_tq_index = 0; + dst_blk.m_bytes[3] = (uint8_t)((int)flip); - for (uint32_t i = 0; i < total_values; i++) + if ((range[0] <= 3) && (range[1] <= 3)) { - uint32_t value = (uint32_t)read_bits1_to_9_fst(blk.m_bytes, bit_ofs, ep_bits); + // This is primarily for better gradients. + dst_blk.m_bytes[0] = 0; + dst_blk.m_bytes[1] = 0; + dst_blk.m_bytes[2] = 0; - if (total_tqs) + uint16_t l_bitmask = 0, h_bitmask = 0; + + for (uint32_t subblock = 0; subblock < 2; subblock++) { - if (!accum_remaining) + const uint32_t encoded = (range[subblock] == 0) ? g_etc1_y_solid_block_1i_configs[low[subblock]] : ((range[subblock] < 2) ? g_etc1_y_solid_block_2i_configs[low[subblock]] : g_etc1_y_solid_block_4i_configs[low[subblock]]); + + const uint32_t table = encoded & 7; + const uint32_t base = (encoded >> 3) & 31; + assert(base <= 15); + const uint32_t sels[4] = { (encoded >> 8) & 3, (encoded >> 10) & 3, (encoded >> 12) & 3, (encoded >> 14) & 3 }; + + dst_blk.m_bytes[3] |= (uint8_t)(table << (subblock ? 2 : 5)); + + const uint32_t sv = base << (subblock ? 
0 : 4); + dst_blk.m_bytes[0] |= (uint8_t)(sv); + dst_blk.m_bytes[1] |= (uint8_t)(sv); + dst_blk.m_bytes[2] |= (uint8_t)(sv); + + if (flip) { - assert(next_tq_index < total_tqs); - accum = tq_values[next_tq_index++]; - accum_remaining = bundle_size; + uint32_t ofs = subblock * 2; + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + uint32_t t = block_y[y + subblock * 2][x]; + assert(t >= low[subblock] && t <= high[subblock]); + t -= low[subblock]; + assert(t <= 3); + + t = g_selector_index_to_etc1[sels[t]]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ofs += 4; + } + + ofs = (int)ofs + 1 - 4 * 4; + } } + else + { + uint32_t ofs = (subblock * 2) * 4; + for (uint32_t x = 0; x < 2; x++) + { + for (uint32_t y = 0; y < 4; y++) + { + uint32_t t = block_y[y][x + subblock * 2]; + assert(t >= low[subblock] && t <= high[subblock]); + t -= low[subblock]; + assert(t <= 3); - // TODO: Optimize with tables - uint32_t v = accum % mul; - accum /= mul; - accum_remaining--; + t = g_selector_index_to_etc1[sels[t]]; - value |= (v << ep_bits); - } + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ++ofs; + } + } + } + } // subblock - unpacked.m_astc.m_endpoints[i] = (uint8_t)value; + dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); + dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); + dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); + dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); + + return true; } - const uint8_t* pPartition_pattern; - const uint8_t* pSubset_anchor_indices = get_anchor_indices(subsets, mode, unpacked.m_common_pattern, pPartition_pattern); + uint32_t y0 = ((flip ? upper_avg : left_avg) * 31 + 127) / 255; + uint32_t y1 = ((flip ? 
lower_avg : right_avg) * 31 + 127) / 255; -#ifdef _DEBUG - for (uint32_t i = 0; i < 16; i++) - assert(pPartition_pattern[i] == astc_compute_texel_partition(part_seed, i & 3, i >> 2, 0, subsets, true)); + bool diff = true; - for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) + int dy = y1 - y0; + + if ((dy < cETC1ColorDeltaMin) || (dy > cETC1ColorDeltaMax)) { - uint32_t anchor_index = 0; + diff = false; - for (uint32_t i = 0; i < 16; i++) - { - if (pPartition_pattern[i] == subset_index) - { - anchor_index = i; - break; - } - } + y0 = ((flip ? upper_avg : left_avg) * 15 + 127) / 255; + y1 = ((flip ? lower_avg : right_avg) * 15 + 127) / 255; - assert(pSubset_anchor_indices[subset_index] == anchor_index); + dst_blk.m_bytes[0] = (uint8_t)(y1 | (y0 << 4)); + dst_blk.m_bytes[1] = (uint8_t)(y1 | (y0 << 4)); + dst_blk.m_bytes[2] = (uint8_t)(y1 | (y0 << 4)); } -#endif - -#if 0 - const uint32_t total_planes_shift = total_planes - 1; - for (uint32_t i = 0; i < 16 * total_planes; i++) + else { - uint32_t num_bits = weight_bits; - for (uint32_t s = 0; s < subsets; s++) - { - if (pSubset_anchor_indices[s] == (i >> total_planes_shift)) - { - num_bits--; - break; - } - } + dy = basisu::clamp(dy, cETC1ColorDeltaMin, cETC1ColorDeltaMax); - unpacked.m_astc.m_weights[i] = (uint8_t)read_bits1_to_9(blk.m_bytes, bit_ofs, num_bits); + y1 = y0 + dy; + + if (dy < 0) dy += 8; + + dst_blk.m_bytes[0] = (uint8_t)((y0 << 3) | dy); + dst_blk.m_bytes[1] = (uint8_t)((y0 << 3) | dy); + dst_blk.m_bytes[2] = (uint8_t)((y0 << 3) | dy); + + dst_blk.m_bytes[3] |= 2; } -#endif - if (mode == 18) + const uint32_t base_y[2] = { diff ? ((y0 << 3) | (y0 >> 2)) : ((y0 << 4) | y0), diff ? ((y1 << 3) | (y1 >> 2)) : ((y1 << 4) | y1) }; + + uint32_t enc_range[2]; + for (uint32_t subset = 0; subset < 2; subset++) { - // Mode 18 is the only mode with more than 64 weight bits. - for (uint32_t i = 0; i < 16; i++) - unpacked.m_astc.m_weights[i] = (uint8_t)read_bits1_to_9(blk.m_bytes, bit_ofs, i ? 
weight_bits : (weight_bits - 1)); + const int pos = basisu::iabs((int)high[subset] - (int)base_y[subset]); + const int neg = basisu::iabs((int)base_y[subset] - (int)low[subset]); + + enc_range[subset] = basisu::maximum(pos, neg); } - else - { - // All other modes have <= 64 weight bits. - uint64_t bits; - - // Read the weight bits - if ((BASISD_IS_BIG_ENDIAN) || (!BASISD_USE_UNALIGNED_WORD_READS)) - bits = read_bits64(blk.m_bytes, bit_ofs, basisu::minimum(64, 128 - (int)bit_ofs)); - else - { - bits = blk.m_dwords[2]; - bits |= (((uint64_t)blk.m_dwords[3]) << 32U); - - if (bit_ofs >= 64U) - bits >>= (bit_ofs - 64U); - else - { - assert(bit_ofs >= 56U); - - uint32_t bits_needed = 64U - bit_ofs; - bits <<= bits_needed; - bits |= (blk.m_bytes[7] >> (8U - bits_needed)); - } - } - - bit_ofs = 0; - const uint32_t mask = (1U << weight_bits) - 1U; - const uint32_t anchor_mask = (1U << (weight_bits - 1U)) - 1U; - - if (total_planes == 2) + uint16_t l_bitmask = 0, h_bitmask = 0; + for (uint32_t subblock = 0; subblock < 2; subblock++) + { + if ((!diff) && (range[subblock] <= 3)) { - // Dual plane modes always have a single subset, and the first 2 weights are anchors. + const uint32_t encoded = (range[subblock] == 0) ? g_etc1_y_solid_block_1i_configs[low[subblock]] : ((range[subblock] < 2) ? 
g_etc1_y_solid_block_2i_configs[low[subblock]] : g_etc1_y_solid_block_4i_configs[low[subblock]]); - unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); - bit_ofs += (weight_bits - 1); - - unpacked.m_astc.m_weights[1] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); - bit_ofs += (weight_bits - 1); + const uint32_t table = encoded & 7; + const uint32_t base = (encoded >> 3) & 31; + assert(base <= 15); + const uint32_t sels[4] = { (encoded >> 8) & 3, (encoded >> 10) & 3, (encoded >> 12) & 3, (encoded >> 14) & 3 }; - for (uint32_t i = 2; i < 32; i++) - { - unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); - bit_ofs += weight_bits; - } - } - else - { - if (subsets == 1) - { - // Specialize the single subset case. - if (weight_bits == 4) - { - assert(bit_ofs == 0); - - // Specialize the most common case: 4-bit weights. - unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits) & 7); - unpacked.m_astc.m_weights[1] = (uint8_t)((uint32_t)(bits >> 3) & 15); - unpacked.m_astc.m_weights[2] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 1)) & 15); - unpacked.m_astc.m_weights[3] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 2)) & 15); + dst_blk.m_bytes[3] |= (uint8_t)(table << (subblock ? 2 : 5)); - unpacked.m_astc.m_weights[4] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 3)) & 15); - unpacked.m_astc.m_weights[5] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 4)) & 15); - unpacked.m_astc.m_weights[6] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 5)) & 15); - unpacked.m_astc.m_weights[7] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 6)) & 15); + const uint32_t mask = ~(0xF << (subblock ? 
0 : 4)); - unpacked.m_astc.m_weights[8] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 7)) & 15); - unpacked.m_astc.m_weights[9] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 8)) & 15); - unpacked.m_astc.m_weights[10] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 9)) & 15); - unpacked.m_astc.m_weights[11] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 10)) & 15); + dst_blk.m_bytes[0] &= mask; + dst_blk.m_bytes[1] &= mask; + dst_blk.m_bytes[2] &= mask; - unpacked.m_astc.m_weights[12] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 11)) & 15); - unpacked.m_astc.m_weights[13] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 12)) & 15); - unpacked.m_astc.m_weights[14] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 13)) & 15); - unpacked.m_astc.m_weights[15] = (uint8_t)((uint32_t)(bits >> (3 + 4 * 14)) & 15); - } - else - { - // First weight is always an anchor. - unpacked.m_astc.m_weights[0] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); - bit_ofs += (weight_bits - 1); + const uint32_t sv = base << (subblock ? 0 : 4); + dst_blk.m_bytes[0] |= (uint8_t)(sv); + dst_blk.m_bytes[1] |= (uint8_t)(sv); + dst_blk.m_bytes[2] |= (uint8_t)(sv); - for (uint32_t i = 1; i < 16; i++) + if (flip) + { + uint32_t ofs = subblock * 2; + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) { - unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); - bit_ofs += weight_bits; + uint32_t t = block_y[y + subblock * 2][x]; + assert(t >= low[subblock] && t <= high[subblock]); + t -= low[subblock]; + assert(t <= 3); + + t = g_selector_index_to_etc1[sels[t]]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ofs += 4; } + + ofs = (int)ofs + 1 - 4 * 4; } } else { - const uint32_t a0 = pSubset_anchor_indices[0], a1 = pSubset_anchor_indices[1], a2 = pSubset_anchor_indices[2]; - - for (uint32_t i = 0; i < 16; i++) + uint32_t ofs = (subblock * 2) * 4; + for (uint32_t x = 0; x < 2; x++) { - if ((i == a0) || (i == a1) || (i == a2)) - { - 
unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & anchor_mask); - bit_ofs += (weight_bits - 1); - } - else + for (uint32_t y = 0; y < 4; y++) { - unpacked.m_astc.m_weights[i] = (uint8_t)((uint32_t)(bits >> bit_ofs) & mask); - bit_ofs += weight_bits; + uint32_t t = block_y[y][x + subblock * 2]; + assert(t >= low[subblock] && t <= high[subblock]); + t -= low[subblock]; + assert(t <= 3); + + t = g_selector_index_to_etc1[sels[t]]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ++ofs; } } } - } - } - if ((blue_contract_check) && (total_comps >= 3)) - { - // We only need to disable ASTC Blue Contraction when we'll be packing to ASTC. The other transcoders don't care. - bool invert_subset[3] = { false, false, false }; - bool any_flag = false; + continue; + } // if - for (uint32_t subset_index = 0; subset_index < subsets; subset_index++) + uint32_t best_err = UINT32_MAX; + uint8_t best_sels[8]; + uint32_t best_inten = 0; + + const int base = base_y[subblock]; + + const int low_limit = -base; + const int high_limit = 255 - base; + + assert(low_limit <= 0 && high_limit >= 0); + + uint32_t inten_table_mask = 0xFF; + const uint32_t er = enc_range[subblock]; + // Each one of these tables is expensive to evaluate, so let's only examine the ones we know may be useful. 
+ if (er <= 51) { - const int s0 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 0]].m_unquant + - g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 2]].m_unquant + - g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 4]].m_unquant; + inten_table_mask = 0xF; - const int s1 = g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 1]].m_unquant + - g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 3]].m_unquant + - g_astc_unquant[endpoint_range][unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + 5]].m_unquant; + if (er > 22) + inten_table_mask &= ~(1 << 0); - if (s1 < s0) - { - for (uint32_t c = 0; c < total_comps; c++) - std::swap(unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 0], unpacked.m_astc.m_endpoints[subset_index * total_comps * 2 + c * 2 + 1]); + if ((er < 4) || (er > 39)) + inten_table_mask &= ~(1 << 1); - invert_subset[subset_index] = true; - any_flag = true; - } + if (er < 9) + inten_table_mask &= ~(1 << 2); + + if (er < 12) + inten_table_mask &= ~(1 << 3); } + else + { + inten_table_mask &= ~((1 << 0) | (1 << 1)); - if (any_flag) + if (er > 60) + inten_table_mask &= ~(1 << 2); + + if (er > 89) + inten_table_mask &= ~(1 << 3); + + if (er > 120) + inten_table_mask &= ~(1 << 4); + + if (er > 136) + inten_table_mask &= ~(1 << 5); + + if (er > 174) + inten_table_mask &= ~(1 << 6); + } + + for (uint32_t inten = 0; inten < 8; inten++) { - const uint32_t weight_mask = (1 << weight_bits) - 1; + if ((inten_table_mask & (1 << inten)) == 0) + continue; - for (uint32_t i = 0; i < 16; i++) + const int t0 = basisu::maximum(low_limit, g_etc1_inten_tables[inten][0]); + const int t1 = basisu::maximum(low_limit, g_etc1_inten_tables[inten][1]); + const int t2 = basisu::minimum(high_limit, g_etc1_inten_tables[inten][2]); + const int t3 = 
basisu::minimum(high_limit, g_etc1_inten_tables[inten][3]); + assert((t0 <= t1) && (t1 <= t2) && (t2 <= t3)); + + const int tv[4] = { t2, t3, t1, t0 }; + + const int thresh01 = t0 + t1; + const int thresh12 = t1 + t2; + const int thresh23 = t2 + t3; + + assert(thresh01 <= thresh12 && thresh12 <= thresh23); + + static const uint8_t s_table[4] = { 1, 0, 2, 3 }; + + uint32_t total_err = 0; + uint8_t sels[8]; + + if (flip) { - uint32_t subset = pPartition_pattern[i]; + if (((int)high[subblock] - base) * 2 < thresh01) + { + memset(sels, 3, 8); - if (invert_subset[subset]) + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const int delta = (int)block_y[y + subblock * 2][x] - base; + + const uint32_t c = 3; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + else if (((int)low[subblock] - base) * 2 >= thresh23) { - unpacked.m_astc.m_weights[i * total_planes] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes]); + memset(sels, 1, 8); - if (total_planes == 2) - unpacked.m_astc.m_weights[i * total_planes + 1] = (uint8_t)(weight_mask - unpacked.m_astc.m_weights[i * total_planes + 1]); + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const int delta = (int)block_y[y + subblock * 2][x] - base; + + const uint32_t c = 1; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + else + { + for (uint32_t y = 0; y < 2; y++) + { + for (uint32_t x = 0; x < 4; x++) + { + const int delta = (int)block_y[y + subblock * 2][x] - base; + const int delta2 = delta * 2; + + uint32_t c = s_table[(delta2 < thresh01) + (delta2 < thresh12) + (delta2 < thresh23)]; + sels[y * 4 + x] = (uint8_t)c; + + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } } } - } - } + else + { + if (((int)high[subblock] - base) * 2 < thresh01) + { + 
memset(sels, 3, 8); - return true; - } + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const int delta = (int)block_y[y][x + subblock * 2] - base; - static const uint32_t* g_astc_weight_tables[6] = { nullptr, g_bc7_weights1, g_bc7_weights2, g_bc7_weights3, g_astc_weights4, g_astc_weights5 }; + const uint32_t c = 3; - bool unpack_uastc(uint32_t mode, uint32_t common_pattern, const color32& solid_color, const astc_block_desc& astc, color32* pPixels, bool srgb) - { - if (mode == UASTC_MODE_INDEX_SOLID_COLOR) - { - for (uint32_t i = 0; i < 16; i++) - pPixels[i] = solid_color; - return true; - } + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + else if (((int)low[subblock] - base) * 2 >= thresh23) + { + memset(sels, 1, 8); - color32 endpoints[3][2]; + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const int delta = (int)block_y[y][x + subblock * 2] - base; - const uint32_t total_subsets = g_uastc_mode_subsets[mode]; - const uint32_t total_comps = basisu::minimum(4U, g_uastc_mode_comps[mode]); - const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; - const uint32_t total_planes = g_uastc_mode_planes[mode]; - const uint32_t weight_bits = g_uastc_mode_weight_bits[mode]; - const uint32_t weight_levels = 1 << weight_bits; + const uint32_t c = 1; - for (uint32_t subset_index = 0; subset_index < total_subsets; subset_index++) - { - if (total_comps == 2) - { - const uint32_t ll = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 0 * 2 + 0]].m_unquant; - const uint32_t lh = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 0 * 2 + 1]].m_unquant; + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } + else + { + for (uint32_t y = 0; y < 4; y++) + { + for (uint32_t x = 0; x < 2; x++) + { + const int delta = 
(int)block_y[y][x + subblock * 2] - base; + const int delta2 = delta * 2; - const uint32_t al = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 1 * 2 + 0]].m_unquant; - const uint32_t ah = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + 1 * 2 + 1]].m_unquant; + uint32_t c = s_table[(delta2 < thresh01) + (delta2 < thresh12) + (delta2 < thresh23)]; + sels[y * 2 + x] = (uint8_t)c; - endpoints[subset_index][0].set_noclamp_rgba(ll, ll, ll, al); - endpoints[subset_index][1].set_noclamp_rgba(lh, lh, lh, ah); - } - else - { - for (uint32_t comp_index = 0; comp_index < total_comps; comp_index++) - { - endpoints[subset_index][0][comp_index] = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + comp_index * 2 + 0]].m_unquant; - endpoints[subset_index][1][comp_index] = g_astc_unquant[endpoint_range][astc.m_endpoints[subset_index * total_comps * 2 + comp_index * 2 + 1]].m_unquant; - } - for (uint32_t comp_index = total_comps; comp_index < 4; comp_index++) - { - endpoints[subset_index][0][comp_index] = 255; - endpoints[subset_index][1][comp_index] = 255; + uint32_t e = basisu::iabs(tv[c] - delta); + total_err += e * e; + } + if (total_err >= best_err) + break; + } + } } - } - } - - color32 block_colors[3][32]; - - const uint32_t* pWeights = g_astc_weight_tables[weight_bits]; - - for (uint32_t subset_index = 0; subset_index < total_subsets; subset_index++) - { - for (uint32_t l = 0; l < weight_levels; l++) - { - if (total_comps == 2) - { - const uint8_t lc = (uint8_t)astc_interpolate(endpoints[subset_index][0][0], endpoints[subset_index][1][0], pWeights[l], srgb); - const uint8_t ac = (uint8_t)astc_interpolate(endpoints[subset_index][0][3], endpoints[subset_index][1][3], pWeights[l], srgb); - block_colors[subset_index][l].set(lc, lc, lc, ac); - } - else + if (total_err < best_err) { - uint32_t comp_index; - for (comp_index = 0; comp_index < total_comps; comp_index++) - 
block_colors[subset_index][l][comp_index] = (uint8_t)astc_interpolate(endpoints[subset_index][0][comp_index], endpoints[subset_index][1][comp_index], pWeights[l], srgb); - - for (; comp_index < 4; comp_index++) - block_colors[subset_index][l][comp_index] = 255; + best_err = total_err; + best_inten = inten; + memcpy(best_sels, sels, 8); } - } - } - const uint8_t* pPartition_pattern = g_zero_pattern; + } // inten - if (total_subsets >= 2) - { - if (total_subsets == 3) - pPartition_pattern = &g_astc_bc7_patterns3[common_pattern][0]; - else if (mode == 7) - pPartition_pattern = &g_bc7_3_astc2_patterns2[common_pattern][0]; - else - pPartition_pattern = &g_astc_bc7_patterns2[common_pattern][0]; + //g_inten_hist[best_inten][enc_range[subblock]]++; -#ifdef _DEBUG - for (uint32_t i = 0; i < 16; i++) - { - assert(pPartition_pattern[i] == (uint8_t)astc_compute_texel_partition(astc.m_partition_seed, i & 3, i >> 2, 0, total_subsets, true)); - } -#endif - } + dst_blk.m_bytes[3] |= (uint8_t)(best_inten << (subblock ? 
2 : 5)); - if (total_planes == 1) - { - if (total_subsets == 1) + if (flip) { - for (uint32_t i = 0; i < 16; i++) + uint32_t ofs = subblock * 2; + for (uint32_t y = 0; y < 2; y++) { - assert(astc.m_weights[i] < weight_levels); - pPixels[i] = block_colors[0][astc.m_weights[i]]; + for (uint32_t x = 0; x < 4; x++) + { + uint32_t t = best_sels[y * 4 + x]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ofs += 4; + } + + ofs = (int)ofs + 1 - 4 * 4; } } else { - for (uint32_t i = 0; i < 16; i++) + uint32_t ofs = (subblock * 2) * 4; + for (uint32_t x = 0; x < 2; x++) { - assert(astc.m_weights[i] < weight_levels); - pPixels[i] = block_colors[pPartition_pattern[i]][astc.m_weights[i]]; + for (uint32_t y = 0; y < 4; y++) + { + uint32_t t = best_sels[y * 2 + x]; + + assert(ofs < 16); + l_bitmask |= ((t & 1) << ofs); + h_bitmask |= ((t >> 1) << ofs); + ++ofs; + } } } - } - else - { - assert(total_subsets == 1); - - for (uint32_t i = 0; i < 16; i++) - { - const uint32_t subset_index = 0; // pPartition_pattern[i]; - - const uint32_t weight_index0 = astc.m_weights[i * 2]; - const uint32_t weight_index1 = astc.m_weights[i * 2 + 1]; - assert(weight_index0 < weight_levels && weight_index1 < weight_levels); + } // subblock - color32& c = pPixels[i]; - for (uint32_t comp = 0; comp < 4; comp++) - { - if ((int)comp == astc.m_ccs) - c[comp] = block_colors[subset_index][weight_index1][comp]; - else - c[comp] = block_colors[subset_index][weight_index0][comp]; - } - } - } + dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); + dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); + dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); + dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); return true; } - bool unpack_uastc(const unpacked_uastc_block& unpacked_blk, color32* pPixels, bool srgb) - { - return unpack_uastc(unpacked_blk.m_mode, unpacked_blk.m_common_pattern, unpacked_blk.m_solid_color, unpacked_blk.m_astc, pPixels, srgb); - } - - bool unpack_uastc(const uastc_block& 
blk, color32* pPixels, bool srgb) - { - unpacked_uastc_block unpacked_blk; - - if (!unpack_uastc(blk, unpacked_blk, false, false)) - return false; - - return unpack_uastc(unpacked_blk, pPixels, srgb); - } + const uint32_t ETC2_EAC_MIN_VALUE_SELECTOR = 3, ETC2_EAC_MAX_VALUE_SELECTOR = 7; - // Determines the best shared pbits to use to encode xl/xh - static void determine_shared_pbits( - uint32_t total_comps, uint32_t comp_bits, float xl[4], float xh[4], - color_quad_u8& bestMinColor, color_quad_u8& bestMaxColor, uint32_t best_pbits[2]) + void transcode_uastc_to_etc2_eac_a8(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst) { - const uint32_t total_bits = comp_bits + 1; - assert(total_bits >= 4 && total_bits <= 8); - - const int iscalep = (1 << total_bits) - 1; - const float scalep = (float)iscalep; - - float best_err = 1e+9f; + eac_block& dst = *static_cast(pDst); + const color32* pSrc_pixels = &block_pixels[0][0]; - for (int p = 0; p < 2; p++) + if ((!g_uastc_mode_has_alpha[unpacked_src_blk.m_mode]) || (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR)) { - color_quad_u8 xMinColor, xMaxColor; - for (uint32_t c = 0; c < 4; c++) - { - xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); - xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); - } + const uint32_t a = (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) ? 
unpacked_src_blk.m_solid_color[3] : 255; - color_quad_u8 scaledLow, scaledHigh; + dst.m_base = a; + dst.m_table = 13; + dst.m_multiplier = 1; - for (uint32_t i = 0; i < 4; i++) - { - scaledLow.m_c[i] = (xMinColor.m_c[i] << (8 - total_bits)); - scaledLow.m_c[i] |= (scaledLow.m_c[i] >> total_bits); - assert(scaledLow.m_c[i] <= 255); + memcpy(dst.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); + + return; + } - scaledHigh.m_c[i] = (xMaxColor.m_c[i] << (8 - total_bits)); - scaledHigh.m_c[i] |= (scaledHigh.m_c[i] >> total_bits); - assert(scaledHigh.m_c[i] <= 255); - } + uint32_t min_a = 255, max_a = 0; + for (uint32_t i = 0; i < 16; i++) + { + min_a = basisu::minimum(min_a, pSrc_pixels[i].a); + max_a = basisu::maximum(max_a, pSrc_pixels[i].a); + } - float err = 0; - for (uint32_t i = 0; i < total_comps; i++) - err += basisu::squaref((scaledLow.m_c[i] / 255.0f) - xl[i]) + basisu::squaref((scaledHigh.m_c[i] / 255.0f) - xh[i]); + if (min_a == max_a) + { + dst.m_base = min_a; + dst.m_table = 13; + dst.m_multiplier = 1; - if (err < best_err) - { - best_err = err; - best_pbits[0] = p; - best_pbits[1] = p; - for (uint32_t j = 0; j < 4; j++) - { - bestMinColor.m_c[j] = xMinColor.m_c[j] >> 1; - bestMaxColor.m_c[j] = xMaxColor.m_c[j] >> 1; - } - } + memcpy(dst.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); + return; } - } - // Determines the best unique pbits to use to encode xl/xh - static void determine_unique_pbits( - uint32_t total_comps, uint32_t comp_bits, float xl[4], float xh[4], - color_quad_u8& bestMinColor, color_quad_u8& bestMaxColor, uint32_t best_pbits[2]) - { - const uint32_t total_bits = comp_bits + 1; - const int iscalep = (1 << total_bits) - 1; - const float scalep = (float)iscalep; + const uint32_t table = unpacked_src_blk.m_etc2_hints & 0xF; + const int multiplier = unpacked_src_blk.m_etc2_hints >> 4; - float best_err0 = 1e+9f; - float best_err1 = 1e+9f; + assert(multiplier >= 1); - for (int p = 0; p < 2; p++) - { - color_quad_u8 
xMinColor, xMaxColor; + dst.m_multiplier = multiplier; + dst.m_table = table; - for (uint32_t c = 0; c < 4; c++) - { - xMinColor.m_c[c] = (uint8_t)(clampi(((int)((xl[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); - xMaxColor.m_c[c] = (uint8_t)(clampi(((int)((xh[c] * scalep - p) / 2.0f + .5f)) * 2 + p, p, iscalep - 1 + p)); - } + const float range = (float)(g_eac_modifier_table[dst.m_table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[dst.m_table][ETC2_EAC_MIN_VALUE_SELECTOR]); + const int center = (int)roundf(basisu::lerp((float)min_a, (float)max_a, (float)(0 - g_eac_modifier_table[dst.m_table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range)); - color_quad_u8 scaledLow, scaledHigh; - for (uint32_t i = 0; i < 4; i++) - { - scaledLow.m_c[i] = (xMinColor.m_c[i] << (8 - total_bits)); - scaledLow.m_c[i] |= (scaledLow.m_c[i] >> total_bits); - assert(scaledLow.m_c[i] <= 255); + dst.m_base = center; - scaledHigh.m_c[i] = (xMaxColor.m_c[i] << (8 - total_bits)); - scaledHigh.m_c[i] |= (scaledHigh.m_c[i] >> total_bits); - assert(scaledHigh.m_c[i] <= 255); - } + const int8_t* pTable = &g_eac_modifier_table[dst.m_table][0]; - float err0 = 0, err1 = 0; - for (uint32_t i = 0; i < total_comps; i++) - { - err0 += basisu::squaref(scaledLow.m_c[i] - xl[i] * 255.0f); - err1 += basisu::squaref(scaledHigh.m_c[i] - xh[i] * 255.0f); - } + uint32_t vals[8]; + for (uint32_t j = 0; j < 8; j++) + vals[j] = clamp255(center + (pTable[j] * multiplier)); - if (err0 < best_err0) - { - best_err0 = err0; - best_pbits[0] = p; + uint64_t sels = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t a = block_pixels[i & 3][i >> 2].a; - bestMinColor.m_c[0] = xMinColor.m_c[0] >> 1; - bestMinColor.m_c[1] = xMinColor.m_c[1] >> 1; - bestMinColor.m_c[2] = xMinColor.m_c[2] >> 1; - bestMinColor.m_c[3] = xMinColor.m_c[3] >> 1; - } + const uint32_t err0 = (basisu::iabs(vals[0] - a) << 3) | 0; + const uint32_t err1 = (basisu::iabs(vals[1] - a) << 3) | 1; + const uint32_t err2 = 
(basisu::iabs(vals[2] - a) << 3) | 2; + const uint32_t err3 = (basisu::iabs(vals[3] - a) << 3) | 3; + const uint32_t err4 = (basisu::iabs(vals[4] - a) << 3) | 4; + const uint32_t err5 = (basisu::iabs(vals[5] - a) << 3) | 5; + const uint32_t err6 = (basisu::iabs(vals[6] - a) << 3) | 6; + const uint32_t err7 = (basisu::iabs(vals[7] - a) << 3) | 7; - if (err1 < best_err1) - { - best_err1 = err1; - best_pbits[1] = p; + const uint32_t min_err = basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(err0, err1, err2), err3), err4), err5), err6), err7); - bestMaxColor.m_c[0] = xMaxColor.m_c[0] >> 1; - bestMaxColor.m_c[1] = xMaxColor.m_c[1] >> 1; - bestMaxColor.m_c[2] = xMaxColor.m_c[2] >> 1; - bestMaxColor.m_c[3] = xMaxColor.m_c[3] >> 1; - } + const uint64_t best_index = min_err & 7; + sels |= (best_index << (45 - i * 3)); } + + dst.set_selector_bits(sels); } - bool transcode_uastc_to_astc(const uastc_block& src_blk, void* pDst) + bool transcode_uastc_to_etc2_rgba(const uastc_block& src_blk, void* pDst) { + eac_block& dst_etc2_eac_a8_blk = *static_cast(pDst); + decoder_etc_block& dst_etc1_blk = static_cast(pDst)[1]; + unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, true, false)) + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) return false; - bool success = false; - if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) - { - pack_astc_solid_block(pDst, unpacked_src_blk.m_solid_color); - success = true; - } - else + color32 block_pixels[4][4]; + if (unpacked_src_blk.m_mode != UASTC_MODE_INDEX_SOLID_COLOR) { - success = pack_astc_block(static_cast(pDst), &unpacked_src_blk.m_astc, unpacked_src_blk.m_mode); + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; } - return success; + transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &dst_etc2_eac_a8_blk); + + transcode_uastc_to_etc1(unpacked_src_blk, 
block_pixels, &dst_etc1_blk); + + return true; } - bool transcode_uastc_to_bc7(const unpacked_uastc_block& unpacked_src_blk, bc7_optimization_results& dst_blk) + static const uint8_t s_uastc5_to_bc1[32] = { 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1 }; + static const uint8_t s_uastc4_to_bc1[16] = { 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1 }; + static const uint8_t s_uastc3_to_bc1[8] = { 0, 0, 2, 2, 3, 3, 1, 1 }; + static const uint8_t s_uastc2_to_bc1[4] = { 0, 2, 3, 1 }; + static const uint8_t s_uastc1_to_bc1[2] = { 0, 1 }; + const uint8_t* s_uastc_to_bc1_weights[6] = { nullptr, s_uastc1_to_bc1, s_uastc2_to_bc1, s_uastc3_to_bc1, s_uastc4_to_bc1, s_uastc5_to_bc1 }; + + void encode_bc4(void* pDst, const uint8_t* pPixels, uint32_t stride) { - memset(&dst_blk, 0, sizeof(dst_blk)); + uint32_t min0_v, max0_v, min1_v, max1_v,min2_v, max2_v, min3_v, max3_v; - const uint32_t mode = unpacked_src_blk.m_mode; + { + min0_v = max0_v = pPixels[0 * stride]; + min1_v = max1_v = pPixels[1 * stride]; + min2_v = max2_v = pPixels[2 * stride]; + min3_v = max3_v = pPixels[3 * stride]; + } - const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; - const uint32_t total_comps = g_uastc_mode_comps[mode]; + { + uint32_t v0 = pPixels[4 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); + uint32_t v1 = pPixels[5 * stride]; min1_v = basisu::minimum(min1_v, v1); max1_v = basisu::maximum(max1_v, v1); + uint32_t v2 = pPixels[6 * stride]; min2_v = basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); + uint32_t v3 = pPixels[7 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); + } - switch (mode) { - case 0: - case 5: - case 10: - case 12: - case 14: - case 15: - case 18: + uint32_t v0 = pPixels[8 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); + uint32_t v1 = pPixels[9 * stride]; min1_v = 
basisu::minimum(min1_v, v1); max1_v = basisu::maximum(max1_v, v1); + uint32_t v2 = pPixels[10 * stride]; min2_v = basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); + uint32_t v3 = pPixels[11 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); + } + { - // MODE 0: DualPlane: 0, WeightRange: 8 (16), Subsets: 1, EndpointRange: 19 (192) - BC7 MODE6 RGB - // MODE 5: DualPlane: 0, WeightRange : 5 (8), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE6 RGB - // MODE 10 DualPlane: 0, WeightRange: 8 (16), Subsets: 1, EndpointRange: 13 (48) - BC7 MODE6 - // MODE 12: DualPlane: 0, WeightRange : 5 (8), Subsets : 1, EndpointRange : 19 (192) - BC7 MODE6 - // MODE 14: DualPlane: 0, WeightRange : 2 (4), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE6 - // MODE 18: DualPlane: 0, WeightRange : 11 (32), Subsets : 1, CEM : 8, EndpointRange : 11 (32) - BC7 MODE6 - // MODE 15: DualPlane: 0, WeightRange : 8 (16), Subsets : 1, CEM : 4 (LA Direct), EndpointRange : 20 (256) - BC7 MODE6 - dst_blk.m_mode = 6; + uint32_t v0 = pPixels[12 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); + uint32_t v1 = pPixels[13 * stride]; min1_v = basisu::minimum(min1_v, v1); max1_v = basisu::maximum(max1_v, v1); + uint32_t v2 = pPixels[14 * stride]; min2_v = basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); + uint32_t v3 = pPixels[15 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); + } - float xl[4], xh[4]; - if (total_comps == 2) - { - xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant / 255.0f; - xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant / 255.0f; + const uint32_t min_v = basisu::minimum(min0_v, min1_v, min2_v, min3_v); + const uint32_t max_v = basisu::maximum(max0_v, max1_v, max2_v, max3_v); - xl[1] = xl[0]; - xh[1] = xh[0]; + uint8_t* pDst_bytes = static_cast(pDst); + 
pDst_bytes[0] = (uint8_t)max_v; + pDst_bytes[1] = (uint8_t)min_v; + + if (max_v == min_v) + { + memset(pDst_bytes + 2, 0, 6); + return; + } + + const uint32_t delta = max_v - min_v; + + // min_v is now 0. Compute thresholds between values by scaling max_v. It's x14 because we're adding two x7 scale factors. + const int t0 = delta * 13; + const int t1 = delta * 11; + const int t2 = delta * 9; + const int t3 = delta * 7; + const int t4 = delta * 5; + const int t5 = delta * 3; + const int t6 = delta * 1; + + // BC4 floors in its divisions, which we compensate for with the 4 bias. + // This function is optimal for all possible inputs (i.e. it outputs the same results as checking all 8 values and choosing the closest one). + const int bias = 4 - min_v * 14; + + static const uint32_t s_tran0[8] = { 1U , 7U , 6U , 5U , 4U , 3U , 2U , 0U }; + static const uint32_t s_tran1[8] = { 1U << 3U, 7U << 3U, 6U << 3U, 5U << 3U, 4U << 3U, 3U << 3U, 2U << 3U, 0U << 3U }; + static const uint32_t s_tran2[8] = { 1U << 6U, 7U << 6U, 6U << 6U, 5U << 6U, 4U << 6U, 3U << 6U, 2U << 6U, 0U << 6U }; + static const uint32_t s_tran3[8] = { 1U << 9U, 7U << 9U, 6U << 9U, 5U << 9U, 4U << 9U, 3U << 9U, 2U << 9U, 0U << 9U }; + + uint64_t a0, a1, a2, a3; + { + const int v0 = pPixels[0 * stride] * 14 + bias; + const int v1 = pPixels[1 * stride] * 14 + bias; + const int v2 = pPixels[2 * stride] * 14 + bias; + const int v3 = pPixels[3 * stride] * 14 + bias; + a0 = s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]; + a1 = s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]; + a2 = s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]; + a3 = s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]; + } + + { + const int v0 = pPixels[4 * stride] * 14 + bias; + const int v1 = pPixels[5 * stride] * 14 + bias; + const int v2 = 
pPixels[6 * stride] * 14 + bias; + const int v3 = pPixels[7 * stride] * 14 + bias; + a0 |= (s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)] << 12U); + a1 |= (s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)] << 12U); + a2 |= (s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)] << 12U); + a3 |= (s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)] << 12U); + } + + { + const int v0 = pPixels[8 * stride] * 14 + bias; + const int v1 = pPixels[9 * stride] * 14 + bias; + const int v2 = pPixels[10 * stride] * 14 + bias; + const int v3 = pPixels[11 * stride] * 14 + bias; + a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 24U); + a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 24U); + a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 24U); + a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 24U); + } - xl[2] = xl[0]; - xh[2] = xh[0]; + { + const int v0 = pPixels[12 * stride] * 14 + bias; + const int v1 = pPixels[13 * stride] * 14 + bias; + const int v2 = pPixels[14 * stride] * 14 + bias; + const int v3 = pPixels[15 * stride] * 14 + bias; + a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 36U); + a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 36U); + a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 36U); + a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 
>= t5) + (v3 >= t6)]) << 36U); + } - xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant / 255.0f; - xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant / 255.0f; - } - else - { - xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant / 255.0f; - xl[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant / 255.0f; - xl[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[4]].m_unquant / 255.0f; + const uint64_t f = a0 | a1 | a2 | a3; + + pDst_bytes[2] = (uint8_t)f; + pDst_bytes[3] = (uint8_t)(f >> 8U); + pDst_bytes[4] = (uint8_t)(f >> 16U); + pDst_bytes[5] = (uint8_t)(f >> 24U); + pDst_bytes[6] = (uint8_t)(f >> 32U); + pDst_bytes[7] = (uint8_t)(f >> 40U); + } - xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant / 255.0f; - xh[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant / 255.0f; - xh[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[5]].m_unquant / 255.0f; + static void bc1_find_sels(const color32 *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16]) + { + uint32_t block_r[4], block_g[4], block_b[4]; - if (total_comps == 4) - { - xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[6]].m_unquant / 255.0f; - xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[7]].m_unquant / 255.0f; - } - else - { - xl[3] = 1.0f; - xh[3] = 1.0f; - } - } + block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); + block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2); + block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3; + block_r[2] = 
(block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3; - uint32_t best_pbits[2]; - color_quad_u8 bestMinColor, bestMaxColor; - determine_unique_pbits((total_comps == 2) ? 4 : total_comps, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; - dst_blk.m_low[0] = bestMinColor; - dst_blk.m_high[0] = bestMaxColor; + int dots[4]; + for (uint32_t i = 0; i < 4; i++) + dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; + + int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; - if (total_comps == 3) - { - dst_blk.m_low[0].m_c[3] = 127; - dst_blk.m_high[0].m_c[3] = 127; - } + ar *= 2; ag *= 2; ab *= 2; - dst_blk.m_pbits[0][0] = best_pbits[0]; - dst_blk.m_pbits[0][1] = best_pbits[1]; + for (uint32_t i = 0; i < 16; i++) + { + const int d = pSrc_pixels[i].r * ar + pSrc_pixels[i].g * ag + pSrc_pixels[i].b * ab; + static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; + + // Rounding matters here! + // d <= t0: <=, not <, to the later LS step "sees" a wider range of selectors. It matters for quality. 
+ sels[i] = s_sels[(d <= t0) + (d < t1) + (d < t2)]; + } + } - if (mode == 18) - { - const uint8_t s_bc7_5_to_4[32] = { 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 6, 7, 8, 9, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15 }; - for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = s_bc7_5_to_4[unpacked_src_blk.m_astc.m_weights[i]]; - } - else if (mode == 14) - { - const uint8_t s_bc7_2_to_4[4] = { 0, 5, 10, 15 }; - for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = s_bc7_2_to_4[unpacked_src_blk.m_astc.m_weights[i]]; - } - else if ((mode == 5) || (mode == 12)) - { - const uint8_t s_bc7_3_to_4[8] = { 0, 2, 4, 6, 9, 11, 13, 15 }; - for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = s_bc7_3_to_4[unpacked_src_blk.m_astc.m_weights[i]]; - } - else - { - for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; - } + static inline void bc1_find_sels_2(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16]) + { + uint32_t block_r[4], block_g[4], block_b[4]; - break; - } - case 1: - { - // DualPlane: 0, WeightRange : 2 (4), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE3 - // Mode 1 uses endpoint range 20 - no need to use ASTC dequant tables. 
- dst_blk.m_mode = 3; + block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); + block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2); + block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3; + block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3; - float xl[4], xh[4]; - xl[0] = unpacked_src_blk.m_astc.m_endpoints[0] / 255.0f; - xl[1] = unpacked_src_blk.m_astc.m_endpoints[2] / 255.0f; - xl[2] = unpacked_src_blk.m_astc.m_endpoints[4] / 255.0f; - xl[3] = 1.0f; + int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; - xh[0] = unpacked_src_blk.m_astc.m_endpoints[1] / 255.0f; - xh[1] = unpacked_src_blk.m_astc.m_endpoints[3] / 255.0f; - xh[2] = unpacked_src_blk.m_astc.m_endpoints[5] / 255.0f; - xh[3] = 1.0f; + int dots[4]; + for (uint32_t i = 0; i < 4; i++) + dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; - uint32_t best_pbits[2]; - color_quad_u8 bestMinColor, bestMaxColor; - memset(&bestMinColor, 0, sizeof(bestMinColor)); - memset(&bestMaxColor, 0, sizeof(bestMaxColor)); - determine_unique_pbits(3, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; - for (uint32_t i = 0; i < 3; i++) - { - dst_blk.m_low[0].m_c[i] = bestMinColor.m_c[i]; - dst_blk.m_high[0].m_c[i] = bestMaxColor.m_c[i]; - dst_blk.m_low[1].m_c[i] = bestMinColor.m_c[i]; - dst_blk.m_high[1].m_c[i] = bestMaxColor.m_c[i]; - } - dst_blk.m_pbits[0][0] = best_pbits[0]; - dst_blk.m_pbits[0][1] = best_pbits[1]; - dst_blk.m_pbits[1][0] = best_pbits[0]; - dst_blk.m_pbits[1][1] = best_pbits[1]; + ar *= 2; ag *= 2; ab *= 2; - for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = 
unpacked_src_blk.m_astc.m_weights[i]; + static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; - break; - } - case 2: + for (uint32_t i = 0; i < 16; i += 4) { - // 2. DualPlane: 0, WeightRange : 5 (8), Subsets : 2, EndpointRange : 8 (16) - BC7 MODE1 - dst_blk.m_mode = 1; - dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; - - const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + const int d0 = pSrc_pixels[i+0].r * ar + pSrc_pixels[i+0].g * ag + pSrc_pixels[i+0].b * ab; + const int d1 = pSrc_pixels[i+1].r * ar + pSrc_pixels[i+1].g * ag + pSrc_pixels[i+1].b * ab; + const int d2 = pSrc_pixels[i+2].r * ar + pSrc_pixels[i+2].g * ag + pSrc_pixels[i+2].b * ab; + const int d3 = pSrc_pixels[i+3].r * ar + pSrc_pixels[i+3].g * ag + pSrc_pixels[i+3].b * ab; - float xl[4], xh[4]; - xl[3] = 1.0f; - xh[3] = 1.0f; + sels[i+0] = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)]; + sels[i+1] = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)]; + sels[i+2] = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)]; + sels[i+3] = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)]; + } + } + + static bool compute_least_squares_endpoints_rgb(const color32* pColors, const uint8_t* pSelectors, vec3F* pXl, vec3F* pXh) + { + // Derived from bc7enc16's LS function. + // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf + // I did this in matrix form first, expanded out all the ops, then optimized it a bit. + uint32_t uq00_r = 0, uq10_r = 0, ut_r = 0, uq00_g = 0, uq10_g = 0, ut_g = 0, uq00_b = 0, uq10_b = 0, ut_b = 0; - for (uint32_t subset = 0; subset < 2; subset++) - { - for (uint32_t i = 0; i < 3; i++) - { - uint32_t v = unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6]; - v = (v << 4) | v; - xl[i] = v / 255.0f; + // This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w)) + // where w is [0,1/3,2/3,1]. 9 is the perfect multiplier. 
+ static const uint32_t s_weight_vals[4] = { 0x000009, 0x010204, 0x040201, 0x090000 }; - v = unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6 + 1]; - v = (v << 4) | v; - xh[i] = v / 255.0f; - } + uint32_t weight_accum = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2]; + const uint32_t sel = pSelectors[i]; + ut_r += r; + ut_g += g; + ut_b += b; + weight_accum += s_weight_vals[sel]; + uq00_r += sel * r; + uq00_g += sel * g; + uq00_b += sel * b; + } - uint32_t best_pbits[2] = { 0, 0 }; - color_quad_u8 bestMinColor, bestMaxColor; - memset(&bestMinColor, 0, sizeof(bestMinColor)); - memset(&bestMaxColor, 0, sizeof(bestMaxColor)); - determine_shared_pbits(3, 6, xl, xh, bestMinColor, bestMaxColor, best_pbits); + float q00_r = (float)uq00_r, q10_r = (float)uq10_r, t_r = (float)ut_r; + float q00_g = (float)uq00_g, q10_g = (float)uq10_g, t_g = (float)ut_g; + float q00_b = (float)uq00_b, q10_b = (float)uq10_b, t_b = (float)ut_b; - const uint32_t bc7_subset_index = invert_partition ? 
(1 - subset) : subset; + q10_r = t_r * 3.0f - q00_r; + q10_g = t_g * 3.0f - q00_g; + q10_b = t_b * 3.0f - q00_b; - for (uint32_t i = 0; i < 3; i++) - { - dst_blk.m_low[bc7_subset_index].m_c[i] = bestMinColor.m_c[i]; - dst_blk.m_high[bc7_subset_index].m_c[i] = bestMaxColor.m_c[i]; - } + float z00 = (float)((weight_accum >> 16) & 0xFF); + float z10 = (float)((weight_accum >> 8) & 0xFF); + float z11 = (float)(weight_accum & 0xFF); + float z01 = z10; - dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; - } // subset + float det = z00 * z11 - z01 * z10; + if (fabs(det) < 1e-8f) + return false; - for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + det = 3.0f / det; - break; - } - case 3: - { - // DualPlane: 0, WeightRange : 2 (4), Subsets : 3, EndpointRange : 7 (12) - BC7 MODE2 - dst_blk.m_mode = 2; - dst_blk.m_partition = g_astc_bc7_common_partitions3[unpacked_src_blk.m_common_pattern].m_bc7; + float iz00, iz01, iz10, iz11; + iz00 = z11 * det; + iz01 = -z01 * det; + iz10 = -z10 * det; + iz11 = z00 * det; - const uint32_t perm = g_astc_bc7_common_partitions3[unpacked_src_blk.m_common_pattern].m_astc_to_bc7_perm; + pXl->c[0] = iz00 * q00_r + iz01 * q10_r; pXh->c[0] = iz10 * q00_r + iz11 * q10_r; + pXl->c[1] = iz00 * q00_g + iz01 * q10_g; pXh->c[1] = iz10 * q00_g + iz11 * q10_g; + pXl->c[2] = iz00 * q00_b + iz01 * q10_b; pXh->c[2] = iz10 * q00_b + iz11 * q10_b; - for (uint32_t subset = 0; subset < 3; subset++) + // Check and fix channel singularities - might not be needed, but is in UASTC's encoder. 
+ for (uint32_t c = 0; c < 3; c++) + { + if ((pXl->c[c] < 0.0f) || (pXh->c[c] > 255.0f)) { - for (uint32_t comp = 0; comp < 3; comp++) + uint32_t lo_v = UINT32_MAX, hi_v = 0; + for (uint32_t i = 0; i < 16; i++) { - uint32_t lo = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[comp * 2 + 0 + subset * 6]].m_unquant; - uint32_t hi = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[comp * 2 + 1 + subset * 6]].m_unquant; - - // TODO: I think this can be improved by using tables like Basis Universal does with ETC1S conversion. - lo = (lo * 31 + 127) / 255; - hi = (hi * 31 + 127) / 255; - - const uint32_t bc7_subset_index = g_astc_to_bc7_partition_index_perm_tables[perm][subset]; + lo_v = basisu::minimumu(lo_v, pColors[i].c[c]); + hi_v = basisu::maximumu(hi_v, pColors[i].c[c]); + } - dst_blk.m_low[bc7_subset_index].m_c[comp] = (uint8_t)lo; - dst_blk.m_high[bc7_subset_index].m_c[comp] = (uint8_t)hi; + if (lo_v == hi_v) + { + pXl->c[c] = (float)lo_v; + pXh->c[c] = (float)hi_v; } } + } - for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + return true; + } - break; - } - case 4: - { - // 4. 
DualPlane: 0, WeightRange: 2 (4), Subsets: 2, EndpointRange: 12 (40) - BC7 MODE3 - dst_blk.m_mode = 3; - dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; + void encode_bc1_solid_block(void* pDst, uint32_t fr, uint32_t fg, uint32_t fb) + { + dxt1_block* pDst_block = static_cast(pDst); - const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + uint32_t mask = 0xAA; + uint32_t max16 = (g_bc1_match5_equals_1[fr].m_hi << 11) | (g_bc1_match6_equals_1[fg].m_hi << 5) | g_bc1_match5_equals_1[fb].m_hi; + uint32_t min16 = (g_bc1_match5_equals_1[fr].m_lo << 11) | (g_bc1_match6_equals_1[fg].m_lo << 5) | g_bc1_match5_equals_1[fb].m_lo; - float xl[4], xh[4]; - xl[3] = 1.0f; - xh[3] = 1.0f; + if (min16 == max16) + { + // Always forbid 3 color blocks + // This is to guarantee that BC3 blocks never use punchthrough alpha (3 color) mode, which isn't supported on some (all?) GPU's. + mask = 0; - for (uint32_t subset = 0; subset < 2; subset++) + // Make l > h + if (min16 > 0) + min16--; + else { - for (uint32_t i = 0; i < 3; i++) - { - xl[i] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6]].m_unquant / 255.0f; - xh[i] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[i * 2 + subset * 6 + 1]].m_unquant / 255.0f; - } + // l = h = 0 + assert(min16 == max16 && max16 == 0); - uint32_t best_pbits[2] = { 0, 0 }; - color_quad_u8 bestMinColor, bestMaxColor; - memset(&bestMinColor, 0, sizeof(bestMinColor)); - memset(&bestMaxColor, 0, sizeof(bestMaxColor)); - determine_unique_pbits(3, 7, xl, xh, bestMinColor, bestMaxColor, best_pbits); + max16 = 1; + min16 = 0; + mask = 0x55; + } - const uint32_t bc7_subset_index = invert_partition ? 
(1 - subset) : subset; + assert(max16 > min16); + } - for (uint32_t i = 0; i < 3; i++) - { - dst_blk.m_low[bc7_subset_index].m_c[i] = bestMinColor.m_c[i]; - dst_blk.m_high[bc7_subset_index].m_c[i] = bestMaxColor.m_c[i]; - } - dst_blk.m_low[bc7_subset_index].m_c[3] = 127; - dst_blk.m_high[bc7_subset_index].m_c[3] = 127; + if (max16 < min16) + { + std::swap(max16, min16); + mask ^= 0x55; + } - dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; - dst_blk.m_pbits[bc7_subset_index][1] = best_pbits[1]; + pDst_block->set_low_color(static_cast(max16)); + pDst_block->set_high_color(static_cast(min16)); + pDst_block->m_selectors[0] = static_cast(mask); + pDst_block->m_selectors[1] = static_cast(mask); + pDst_block->m_selectors[2] = static_cast(mask); + pDst_block->m_selectors[3] = static_cast(mask); + } - } // subset + static inline uint8_t to_5(uint32_t v) { v = v * 31 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + static inline uint8_t to_6(uint32_t v) { v = v * 63 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + // Good references: squish library, stb_dxt. + void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags) + { + const color32* pSrc_pixels = (const color32*)pPixels; + dxt1_block* pDst_block = static_cast(pDst); + + int avg_r = -1, avg_g = 0, avg_b = 0; + int lr = 0, lg = 0, lb = 0, hr = 0, hg = 0, hb = 0; + uint8_t sels[16]; + + const bool use_sels = (flags & cEncodeBC1UseSelectors) != 0; + if (use_sels) + { + // Caller is jamming in their own selectors for us to try. 
+ const uint32_t s = pDst_block->m_selectors[0] | (pDst_block->m_selectors[1] << 8) | (pDst_block->m_selectors[2] << 16) | (pDst_block->m_selectors[3] << 24); + + static const uint8_t s_sel_tran[4] = { 0, 3, 1, 2 }; + for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; - - break; + sels[i] = s_sel_tran[(s >> (i * 2)) & 3]; } - case 6: - case 11: - case 13: - case 17: + else { - // MODE 6: DualPlane: 1, WeightRange : 2 (4), Subsets : 1, EndpointRange : 18 (160) - BC7 MODE5 RGB - // MODE 11: DualPlane: 1, WeightRange: 2 (4), Subsets: 1, EndpointRange: 13 (48) - BC7 MODE5 - // MODE 13: DualPlane: 1, WeightRange: 0 (2), Subsets : 1, EndpointRange : 20 (256) - BC7 MODE5 - // MODE 17: DualPlane: 1, WeightRange: 2 (4), Subsets: 1, CEM: 4 (LA Direct), EndpointRange: 20 (256) - BC7 MODE5 - dst_blk.m_mode = 5; - dst_blk.m_rotation = (unpacked_src_blk.m_astc.m_ccs + 1) & 3; + const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b; - if (total_comps == 2) + uint32_t j; + for (j = 1; j < 16; j++) + if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) + break; + + if (j == 16) { - assert(unpacked_src_blk.m_astc.m_ccs == 3); - - dst_blk.m_low->m_c[0] = (uint8_t)((g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0]].m_unquant * 127 + 127) / 255); - dst_blk.m_high->m_c[0] = (uint8_t)((g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1]].m_unquant * 127 + 127) / 255); - - dst_blk.m_low->m_c[1] = dst_blk.m_low->m_c[0]; - dst_blk.m_high->m_c[1] = dst_blk.m_high->m_c[0]; - - dst_blk.m_low->m_c[2] = dst_blk.m_low->m_c[0]; - dst_blk.m_high->m_c[2] = dst_blk.m_high->m_c[0]; - - dst_blk.m_low->m_c[3] = (uint8_t)(g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2]].m_unquant); - dst_blk.m_high->m_c[3] = (uint8_t)(g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3]].m_unquant); + encode_bc1_solid_block(pDst, fr, 
fg, fb); + return; } - else + + // Select 2 colors along the principle axis. (There must be a faster/simpler way.) + int total_r = fr, total_g = fg, total_b = fb; + int max_r = fr, max_g = fg, max_b = fb; + int min_r = fr, min_g = fg, min_b = fb; + for (uint32_t i = 1; i < 16; i++) { - for (uint32_t astc_comp = 0; astc_comp < 4; astc_comp++) - { - uint32_t bc7_comp = astc_comp; - // ASTC and BC7 handle dual plane component rotations differently: - // ASTC: 2nd plane separately interpolates the CCS channel. - // BC7: 2nd plane channel is swapped with alpha, 2nd plane controls alpha interpolation, then we swap alpha with the desired channel. - if (astc_comp == (uint32_t)unpacked_src_blk.m_astc.m_ccs) - bc7_comp = 3; - else if (astc_comp == 3) - bc7_comp = unpacked_src_blk.m_astc.m_ccs; + const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + max_r = basisu::maximum(max_r, r); max_g = basisu::maximum(max_g, g); max_b = basisu::maximum(max_b, b); + min_r = basisu::minimum(min_r, r); min_g = basisu::minimum(min_g, g); min_b = basisu::minimum(min_b, b); + total_r += r; total_g += g; total_b += b; + } - uint32_t l = 255, h = 255; - if (astc_comp < total_comps) - { - l = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[astc_comp * 2 + 0]].m_unquant; - h = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[astc_comp * 2 + 1]].m_unquant; - } + avg_r = (total_r + 8) >> 4; + avg_g = (total_g + 8) >> 4; + avg_b = (total_b + 8) >> 4; - if (bc7_comp < 3) - { - l = (l * 127 + 127) / 255; - h = (h * 127 + 127) / 255; - } + int icov[6] = { 0, 0, 0, 0, 0, 0 }; + for (uint32_t i = 0; i < 16; i++) + { + int r = (int)pSrc_pixels[i].r - avg_r; + int g = (int)pSrc_pixels[i].g - avg_g; + int b = (int)pSrc_pixels[i].b - avg_b; + icov[0] += r * r; + icov[1] += r * g; + icov[2] += r * b; + icov[3] += g * g; + icov[4] += g * b; + icov[5] += b * b; + } - dst_blk.m_low->m_c[bc7_comp] = (uint8_t)l; - dst_blk.m_high->m_c[bc7_comp] = 
(uint8_t)h; - } + float cov[6]; + for (uint32_t i = 0; i < 6; i++) + cov[i] = static_cast(icov[i])* (1.0f / 255.0f); + +#if 0 + // Seems silly to use full PCA to choose 2 colors. The diff in avg. PSNR between using PCA vs. not is small (~.025 difference). + // TODO: Try 2 or 3 different normalized diagonal vectors, choose the one that results in the largest dot delta + int saxis_r = max_r - min_r; + int saxis_g = max_g - min_g; + int saxis_b = max_b - min_b; +#else + float xr = (float)(max_r - min_r); + float xg = (float)(max_g - min_g); + float xb = (float)(max_b - min_b); + //float xr = (float)(max_r - avg_r); // max-avg is nearly the same, and doesn't require computing min's + //float xg = (float)(max_g - avg_g); + //float xb = (float)(max_b - avg_b); + for (uint32_t power_iter = 0; power_iter < 4; power_iter++) + { + float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; + float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; + float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; + xr = r; xg = g; xb = b; } - if (mode == 13) + float k = basisu::maximum(fabsf(xr), fabsf(xg), fabsf(xb)); + int saxis_r = 306, saxis_g = 601, saxis_b = 117; + if (k >= 2) { - for (uint32_t i = 0; i < 16; i++) - { - dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2] ? 3 : 0; - dst_blk.m_alpha_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2 + 1] ? 
3 : 0; - } + float m = 1024.0f / k; + saxis_r = (int)(xr * m); + saxis_g = (int)(xg * m); + saxis_b = (int)(xb * m); } - else +#endif + + int low_dot = INT_MAX, high_dot = INT_MIN, low_c = 0, high_c = 0; + for (uint32_t i = 0; i < 16; i++) { - for (uint32_t i = 0; i < 16; i++) + int dot = pSrc_pixels[i].r * saxis_r + pSrc_pixels[i].g * saxis_g + pSrc_pixels[i].b * saxis_b; + if (dot < low_dot) { - dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2]; - dst_blk.m_alpha_selectors[i] = unpacked_src_blk.m_astc.m_weights[i * 2 + 1]; + low_dot = dot; + low_c = i; } - } - - break; - } - case 7: - { - // DualPlane: 0, WeightRange : 2 (4), Subsets : 2, EndpointRange : 12 (40) - BC7 MODE2 - dst_blk.m_mode = 2; - dst_blk.m_partition = g_bc7_3_astc2_common_partitions[unpacked_src_blk.m_common_pattern].m_bc73; - - const uint32_t common_pattern_k = g_bc7_3_astc2_common_partitions[unpacked_src_blk.m_common_pattern].k; - - for (uint32_t bc7_part = 0; bc7_part < 3; bc7_part++) - { - const uint32_t astc_part = bc7_convert_partition_index_3_to_2(bc7_part, common_pattern_k); - - for (uint32_t c = 0; c < 3; c++) + if (dot > high_dot) { - dst_blk.m_low[bc7_part].m_c[c] = (g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[c * 2 + 0 + astc_part * 6]].m_unquant * 31 + 127) / 255; - dst_blk.m_high[bc7_part].m_c[c] = (g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[c * 2 + 1 + astc_part * 6]].m_unquant * 31 + 127) / 255; + high_dot = dot; + high_c = i; } } - for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; - - break; - } - case UASTC_MODE_INDEX_SOLID_COLOR: - { - // Void-Extent: Solid Color RGBA (BC7 MODE5 or MODE6) - const color32& solid_color = unpacked_src_blk.m_solid_color; - - uint32_t best_err0 = g_bc7_mode_6_optimal_endpoints[solid_color.r][0].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.g][0].m_error + - g_bc7_mode_6_optimal_endpoints[solid_color.b][0].m_error + 
g_bc7_mode_6_optimal_endpoints[solid_color.a][0].m_error; + lr = to_5(pSrc_pixels[low_c].r); + lg = to_6(pSrc_pixels[low_c].g); + lb = to_5(pSrc_pixels[low_c].b); - uint32_t best_err1 = g_bc7_mode_6_optimal_endpoints[solid_color.r][1].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.g][1].m_error + - g_bc7_mode_6_optimal_endpoints[solid_color.b][1].m_error + g_bc7_mode_6_optimal_endpoints[solid_color.a][1].m_error; + hr = to_5(pSrc_pixels[high_c].r); + hg = to_6(pSrc_pixels[high_c].g); + hb = to_5(pSrc_pixels[high_c].b); + + bc1_find_sels(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); + } // if (use_sels) - if (best_err0 > 0 && best_err1 > 0) + const uint32_t total_ls_passes = (flags & cEncodeBC1HigherQuality) ? 3 : (flags & cEncodeBC1HighQuality ? 2 : 1); + for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) + { + // This is where the real magic happens. We have an array of candidate selectors, so let's use least squares to compute the optimal low/high endpoint colors. + vec3F xl, xh; + if (!compute_least_squares_endpoints_rgb(pSrc_pixels, sels, &xl, &xh)) { - dst_blk.m_mode = 5; - - for (uint32_t c = 0; c < 3; c++) + if (avg_r < 0) { - dst_blk.m_low[0].m_c[c] = g_bc7_mode_5_optimal_endpoints[solid_color.c[c]].m_lo; - dst_blk.m_high[0].m_c[c] = g_bc7_mode_5_optimal_endpoints[solid_color.c[c]].m_hi; + int total_r = 0, total_g = 0, total_b = 0; + for (uint32_t i = 0; i < 16; i++) + { + total_r += pSrc_pixels[i].r; + total_g += pSrc_pixels[i].g; + total_b += pSrc_pixels[i].b; + } + + avg_r = (total_r + 8) >> 4; + avg_g = (total_g + 8) >> 4; + avg_b = (total_b + 8) >> 4; } - memset(dst_blk.m_selectors, BC7ENC_MODE_5_OPTIMAL_INDEX, 16); + // All selectors equal - treat it as a solid block which should always be equal or better. 
+ lr = g_bc1_match5_equals_1[avg_r].m_hi; + lg = g_bc1_match6_equals_1[avg_g].m_hi; + lb = g_bc1_match5_equals_1[avg_b].m_hi; - dst_blk.m_low[0].m_c[3] = solid_color.c[3]; - dst_blk.m_high[0].m_c[3] = solid_color.c[3]; + hr = g_bc1_match5_equals_1[avg_r].m_lo; + hg = g_bc1_match6_equals_1[avg_g].m_lo; + hb = g_bc1_match5_equals_1[avg_b].m_lo; - //memset(dst_blk.m_alpha_selectors, 0, 16); + // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. } else { - dst_blk.m_mode = 6; - - uint32_t best_p = 0; - if (best_err1 < best_err0) - best_p = 1; - - for (uint32_t c = 0; c < 4; c++) - { - dst_blk.m_low[0].m_c[c] = g_bc7_mode_6_optimal_endpoints[solid_color.c[c]][best_p].m_lo; - dst_blk.m_high[0].m_c[c] = g_bc7_mode_6_optimal_endpoints[solid_color.c[c]][best_p].m_hi; - } + lr = basisu::clamp((int)((xl.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); + lg = basisu::clamp((int)((xl.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); + lb = basisu::clamp((int)((xl.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); - dst_blk.m_pbits[0][0] = best_p; - dst_blk.m_pbits[0][1] = best_p; - memset(dst_blk.m_selectors, BC7ENC_MODE_6_OPTIMAL_INDEX, 16); + hr = basisu::clamp((int)((xh.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); + hg = basisu::clamp((int)((xh.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); + hb = basisu::clamp((int)((xh.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); } - - break; + + bc1_find_sels(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); } - case 9: - case 16: - { - // 9. DualPlane: 0, WeightRange : 2 (4), Subsets : 2, EndpointRange : 8 (16) - BC7 MODE7 - // 16. 
DualPlane: 0, WeightRange: 2 (4), Subsets: 2, CEM: 4 (LA Direct), EndpointRange: 20 (256) - BC7 MODE7 - - dst_blk.m_mode = 7; - dst_blk.m_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_bc7; - const bool invert_partition = g_astc_bc7_common_partitions2[unpacked_src_blk.m_common_pattern].m_invert; + uint32_t lc16 = dxt1_block::pack_unscaled_color(lr, lg, lb); + uint32_t hc16 = dxt1_block::pack_unscaled_color(hr, hg, hb); + + // Always forbid 3 color blocks + if (lc16 == hc16) + { + uint8_t mask = 0; - for (uint32_t astc_subset = 0; astc_subset < 2; astc_subset++) + // Make l > h + if (hc16 > 0) + hc16--; + else { - float xl[4], xh[4]; - - if (total_comps == 2) - { - xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0 + astc_subset * 4]].m_unquant / 255.0f; - xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1 + astc_subset * 4]].m_unquant / 255.0f; - - xl[1] = xl[0]; - xh[1] = xh[0]; - - xl[2] = xl[0]; - xh[2] = xh[0]; - - xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2 + astc_subset * 4]].m_unquant / 255.0f; - xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3 + astc_subset * 4]].m_unquant / 255.0f; - } - else - { - xl[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[0 + astc_subset * 8]].m_unquant / 255.0f; - xl[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[2 + astc_subset * 8]].m_unquant / 255.0f; - xl[2] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[4 + astc_subset * 8]].m_unquant / 255.0f; - xl[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[6 + astc_subset * 8]].m_unquant / 255.0f; - - xh[0] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[1 + astc_subset * 8]].m_unquant / 255.0f; - xh[1] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[3 + astc_subset * 8]].m_unquant / 255.0f; - xh[2] = 
g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[5 + astc_subset * 8]].m_unquant / 255.0f; - xh[3] = g_astc_unquant[endpoint_range][unpacked_src_blk.m_astc.m_endpoints[7 + astc_subset * 8]].m_unquant / 255.0f; - } + // lc16 = hc16 = 0 + assert(lc16 == hc16 && hc16 == 0); - uint32_t best_pbits[2] = { 0, 0 }; - color_quad_u8 bestMinColor, bestMaxColor; - memset(&bestMinColor, 0, sizeof(bestMinColor)); - memset(&bestMaxColor, 0, sizeof(bestMaxColor)); - determine_unique_pbits(4, 5, xl, xh, bestMinColor, bestMaxColor, best_pbits); + hc16 = 0; + lc16 = 1; + mask = 0x55; // select hc16 + } - const uint32_t bc7_subset_index = invert_partition ? (1 - astc_subset) : astc_subset; + assert(lc16 > hc16); + pDst_block->set_low_color(static_cast(lc16)); + pDst_block->set_high_color(static_cast(hc16)); - dst_blk.m_low[bc7_subset_index] = bestMinColor; - dst_blk.m_high[bc7_subset_index] = bestMaxColor; + pDst_block->m_selectors[0] = mask; + pDst_block->m_selectors[1] = mask; + pDst_block->m_selectors[2] = mask; + pDst_block->m_selectors[3] = mask; + } + else + { + uint8_t invert_mask = 0; + if (lc16 < hc16) + { + std::swap(lc16, hc16); + invert_mask = 0x55; + } - dst_blk.m_pbits[bc7_subset_index][0] = best_pbits[0]; - dst_blk.m_pbits[bc7_subset_index][1] = best_pbits[1]; - } // astc_subset + assert(lc16 > hc16); + pDst_block->set_low_color((uint16_t)lc16); + pDst_block->set_high_color((uint16_t)hc16); + uint32_t packed_sels = 0; + static const uint8_t s_sel_trans[4] = { 0, 2, 3, 1 }; for (uint32_t i = 0; i < 16; i++) - dst_blk.m_selectors[i] = unpacked_src_blk.m_astc.m_weights[i]; + packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2)); - break; - } - default: - return false; + pDst_block->m_selectors[0] = (uint8_t)packed_sels ^ invert_mask; + pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask; + pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask; + pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24) ^ 
invert_mask; } - - return true; } - - bool transcode_uastc_to_bc7(const uastc_block& src_blk, bc7_optimization_results& dst_blk) - { - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false, false)) - return false; - - return transcode_uastc_to_bc7(unpacked_src_blk, dst_blk); - } - - bool transcode_uastc_to_bc7(const uastc_block& src_blk, void* pDst) + + void encode_bc1_alt(void* pDst, const uint8_t* pPixels, uint32_t flags) { - bc7_optimization_results temp; - if (!transcode_uastc_to_bc7(src_blk, temp)) - return false; - - encode_bc7_block(pDst, &temp); - return true; - } + const color32* pSrc_pixels = (const color32*)pPixels; + dxt1_block* pDst_block = static_cast(pDst); - color32 apply_etc1_bias(const color32 &block_color, uint32_t bias, uint32_t limit, uint32_t subblock) - { - color32 result; + int avg_r = -1, avg_g = 0, avg_b = 0; + int lr = 0, lg = 0, lb = 0, hr = 0, hg = 0, hb = 0; + uint8_t sels[16]; - for (uint32_t c = 0; c < 3; c++) + const bool use_sels = (flags & cEncodeBC1UseSelectors) != 0; + if (use_sels) { - static const int s_divs[3] = { 1, 3, 9 }; - - int delta = 0; - - switch (bias) - { - case 2: delta = subblock ? 0 : ((c == 0) ? -1 : 0); break; - case 5: delta = subblock ? 0 : ((c == 1) ? -1 : 0); break; - case 6: delta = subblock ? 0 : ((c == 2) ? -1 : 0); break; - - case 7: delta = subblock ? 0 : ((c == 0) ? 1 : 0); break; - case 11: delta = subblock ? 0 : ((c == 1) ? 1 : 0); break; - case 15: delta = subblock ? 0 : ((c == 2) ? 1 : 0); break; - - case 18: delta = subblock ? ((c == 0) ? -1 : 0) : 0; break; - case 19: delta = subblock ? ((c == 1) ? -1 : 0) : 0; break; - case 20: delta = subblock ? ((c == 2) ? -1 : 0) : 0; break; + // Caller is jamming in their own selectors for us to try. + const uint32_t s = pDst_block->m_selectors[0] | (pDst_block->m_selectors[1] << 8) | (pDst_block->m_selectors[2] << 16) | (pDst_block->m_selectors[3] << 24); - case 21: delta = subblock ? ((c == 0) ? 
1 : 0) : 0; break; - case 24: delta = subblock ? ((c == 1) ? 1 : 0) : 0; break; - case 8: delta = subblock ? ((c == 2) ? 1 : 0) : 0; break; + static const uint8_t s_sel_tran[4] = { 0, 3, 1, 2 }; - case 10: delta = -2; break; + for (uint32_t i = 0; i < 16; i++) + sels[i] = s_sel_tran[(s >> (i * 2)) & 3]; + } + else + { + const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b; - case 27: delta = subblock ? 0 : -1; break; - case 28: delta = subblock ? -1 : 1; break; - case 29: delta = subblock ? 1 : 0; break; - case 30: delta = subblock ? -1 : 0; break; - case 31: delta = subblock ? 0 : 1; break; + uint32_t j; + for (j = 1; j < 16; j++) + if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) + break; - default: - delta = ((bias / s_divs[c]) % 3) - 1; - break; + if (j == 16) + { + encode_bc1_solid_block(pDst, fr, fg, fb); + return; } - int v = block_color[c]; - if (v == 0) + // Select 2 colors along the principle axis. (There must be a faster/simpler way.) + int total_r = fr, total_g = fg, total_b = fb; + int max_r = fr, max_g = fg, max_b = fb; + int min_r = fr, min_g = fg, min_b = fb; + uint32_t grayscale_flag = (fr == fg) && (fr == fb); + for (uint32_t i = 1; i < 16; i++) { - if (delta == -2) - v += 3; - else - v += delta + 1; + const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; + grayscale_flag &= ((r == g) && (r == b)); + max_r = basisu::maximum(max_r, r); max_g = basisu::maximum(max_g, g); max_b = basisu::maximum(max_b, b); + min_r = basisu::minimum(min_r, r); min_g = basisu::minimum(min_g, g); min_b = basisu::minimum(min_b, b); + total_r += r; total_g += g; total_b += b; } - else if (v == (int)limit) + + if (grayscale_flag) { - v += (delta - 1); + // Grayscale blocks are a common enough case to specialize. 
+ if ((max_r - min_r) < 2) + { + lr = lb = hr = hb = to_5(fr); + lg = hg = to_6(fr); + } + else + { + lr = lb = to_5(min_r); + lg = to_6(min_r); + + hr = hb = to_5(max_r); + hg = to_6(max_r); + } } else { - v += delta; - if ((v < 0) || (v > (int)limit)) - v = (v - delta) - delta; - } + avg_r = (total_r + 8) >> 4; + avg_g = (total_g + 8) >> 4; + avg_b = (total_b + 8) >> 4; - assert(v >= 0); - assert(v <= (int)limit); + // Find the shortest vector from a AABB corner to the block's average color. + // This is to help avoid outliers. - result[c] = (uint8_t)v; - } + uint32_t dist[3][2]; + dist[0][0] = basisu::square(min_r - avg_r) << 3; dist[0][1] = basisu::square(max_r - avg_r) << 3; + dist[1][0] = basisu::square(min_g - avg_g) << 3; dist[1][1] = basisu::square(max_g - avg_g) << 3; + dist[2][0] = basisu::square(min_b - avg_b) << 3; dist[2][1] = basisu::square(max_b - avg_b) << 3; - return result; - } + uint32_t min_d0 = (dist[0][0] + dist[1][0] + dist[2][0]); + uint32_t d4 = (dist[0][0] + dist[1][0] + dist[2][1]) | 4; + min_d0 = basisu::minimum(min_d0, d4); - static void etc1_determine_selectors(decoder_etc_block& dst_blk, const color32* pSource_pixels, uint32_t first_subblock, uint32_t last_subblock) - { - static const uint8_t s_tran[4] = { 1, 0, 2, 3 }; + uint32_t min_d1 = (dist[0][1] + dist[1][0] + dist[2][0]) | 1; + uint32_t d5 = (dist[0][1] + dist[1][0] + dist[2][1]) | 5; + min_d1 = basisu::minimum(min_d1, d5); - uint16_t l_bitmask = 0; - uint16_t h_bitmask = 0; + uint32_t d2 = (dist[0][0] + dist[1][1] + dist[2][0]) | 2; + min_d0 = basisu::minimum(min_d0, d2); - for (uint32_t subblock = first_subblock; subblock < last_subblock; subblock++) - { - color32 block_colors[4]; - dst_blk.get_block_colors(block_colors, subblock); + uint32_t d3 = (dist[0][1] + dist[1][1] + dist[2][0]) | 3; + min_d1 = basisu::minimum(min_d1, d3); - uint32_t block_y[4]; - for (uint32_t i = 0; i < 4; i++) - block_y[i] = block_colors[i][0] * 54 + block_colors[i][1] * 183 + block_colors[i][2] * 
19; + uint32_t d6 = (dist[0][0] + dist[1][1] + dist[2][1]) | 6; + min_d0 = basisu::minimum(min_d0, d6); - const uint32_t block_y01 = block_y[0] + block_y[1]; - const uint32_t block_y12 = block_y[1] + block_y[2]; - const uint32_t block_y23 = block_y[2] + block_y[3]; + uint32_t d7 = (dist[0][1] + dist[1][1] + dist[2][1]) | 7; + min_d1 = basisu::minimum(min_d1, d7); - // X0 X0 X0 X0 X1 X1 X1 X1 X2 X2 X2 X2 X3 X3 X3 X3 - // Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 Y0 Y1 Y2 Y3 + uint32_t min_d = basisu::minimum(min_d0, min_d1); + uint32_t best_i = min_d & 7; - if (dst_blk.get_flip_bit()) - { - uint32_t ofs = subblock * 2; + int delta_r = (best_i & 1) ? (max_r - avg_r) : (avg_r - min_r); + int delta_g = (best_i & 2) ? (max_g - avg_g) : (avg_g - min_g); + int delta_b = (best_i & 4) ? (max_b - avg_b) : (avg_b - min_b); - for (uint32_t y = 0; y < 2; y++) + // Note: if delta_r/g/b==0, we actually want to choose a single color, so the block average color optimization kicks in. + uint32_t low_c = 0, high_c = 0; + if ((delta_r | delta_g | delta_b) != 0) { - for (uint32_t x = 0; x < 4; x++) - { - const color32& c = pSource_pixels[x + (subblock * 2 + y) * 4]; - const uint32_t l = c[0] * 108 + c[1] * 366 + c[2] * 38; + // Now we have a smaller AABB going from the block's average color to a cornerpoint of the larger AABB. + // Project all pixels colors along the 4 vectors going from a smaller AABB cornerpoint to the opposite cornerpoint, find largest projection. + // One of these vectors will be a decent approximation of the block's PCA. 
+ const int saxis0_r = delta_r, saxis0_g = delta_g, saxis0_b = delta_b; - uint32_t t = s_tran[(l < block_y01) + (l < block_y12) + (l < block_y23)]; + int low_dot0 = INT_MAX, high_dot0 = INT_MIN; + int low_dot1 = INT_MAX, high_dot1 = INT_MIN; + int low_dot2 = INT_MAX, high_dot2 = INT_MIN; + int low_dot3 = INT_MAX, high_dot3 = INT_MIN; - assert(ofs < 16); - l_bitmask |= ((t & 1) << ofs); - h_bitmask |= ((t >> 1) << ofs); - ofs += 4; - } + //int low_c0, low_c1, low_c2, low_c3; + //int high_c0, high_c1, high_c2, high_c3; - ofs = (int)ofs + 1 - 4 * 4; - } - } - else - { - uint32_t ofs = (subblock * 2) * 4; - for (uint32_t x = 0; x < 2; x++) - { - for (uint32_t y = 0; y < 4; y++) + for (uint32_t i = 0; i < 16; i++) { - const color32& c = pSource_pixels[subblock * 2 + x + y * 4]; - const uint32_t l = c[0] * 108 + c[1] * 366 + c[2] * 38; - - uint32_t t = s_tran[(l < block_y01) + (l < block_y12) + (l < block_y23)]; - - assert(ofs < 16); - l_bitmask |= ((t & 1) << ofs); - h_bitmask |= ((t >> 1) << ofs); - ++ofs; - } - } - } - } - - dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); - dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); - dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); - dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); - } + const int dotx = pSrc_pixels[i].r * saxis0_r; + const int doty = pSrc_pixels[i].g * saxis0_g; + const int dotz = pSrc_pixels[i].b * saxis0_b; - static const uint8_t s_etc1_solid_selectors[4][4] = { { 255, 255, 255, 255 }, { 255, 255, 0, 0 }, { 0, 0, 0, 0 }, {0, 0, 255, 255 } }; + const int dot0 = ((dotz + dotx + doty) << 4) + i; + const int dot1 = ((dotz - dotx - doty) << 4) + i; + const int dot2 = ((dotz - dotx + doty) << 4) + i; + const int dot3 = ((dotz + dotx - doty) << 4) + i; - struct etc_coord2 - { - uint8_t m_x, m_y; - }; + if (dot0 < low_dot0) + { + low_dot0 = dot0; + //low_c0 = i; + } + if ((dot0 ^ 15) > high_dot0) + { + high_dot0 = dot0 ^ 15; + //high_c0 = i; + } - // [flip][subblock][pixel_index] - const etc_coord2 g_etc1_pixel_coords[2][2][8] = 
- { - { - { - { 0, 0 }, { 0, 1 }, { 0, 2 }, { 0, 3 }, - { 1, 0 }, { 1, 1 }, { 1, 2 }, { 1, 3 } - }, - { - { 2, 0 }, { 2, 1 }, { 2, 2 }, { 2, 3 }, - { 3, 0 }, { 3, 1 }, { 3, 2 }, { 3, 3 } - } - }, - { - { - { 0, 0 }, { 1, 0 }, { 2, 0 }, { 3, 0 }, - { 0, 1 }, { 1, 1 }, { 2, 1 }, { 3, 1 } - }, - { - { 0, 2 }, { 1, 2 }, { 2, 2 }, { 3, 2 }, - { 0, 3 }, { 1, 3 }, { 2, 3 }, { 3, 3 } - }, - } - }; + if (dot1 < low_dot1) + { + low_dot1 = dot1; + //low_c1 = i; + } + if ((dot1 ^ 15) > high_dot1) + { + high_dot1 = dot1 ^ 15; + //high_c1 = i; + } - void transcode_uastc_to_etc1(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst) - { - decoder_etc_block& dst_blk = *static_cast(pDst); + if (dot2 < low_dot2) + { + low_dot2 = dot2; + //low_c2 = i; + } + if ((dot2 ^ 15) > high_dot2) + { + high_dot2 = dot2 ^ 15; + //high_c2 = i; + } - if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) - { - dst_blk.m_bytes[3] = (uint8_t)((unpacked_src_blk.m_etc1_diff << 1) | (unpacked_src_blk.m_etc1_inten0 << 5) | (unpacked_src_blk.m_etc1_inten0 << 2)); + if (dot3 < low_dot3) + { + low_dot3 = dot3; + //low_c3 = i; + } + if ((dot3 ^ 15) > high_dot3) + { + high_dot3 = dot3 ^ 15; + //high_c3 = i; + } + } - if (unpacked_src_blk.m_etc1_diff) - { - dst_blk.m_bytes[0] = (uint8_t)(unpacked_src_blk.m_etc1_r << 3); - dst_blk.m_bytes[1] = (uint8_t)(unpacked_src_blk.m_etc1_g << 3); - dst_blk.m_bytes[2] = (uint8_t)(unpacked_src_blk.m_etc1_b << 3); - } - else - { - dst_blk.m_bytes[0] = (uint8_t)(unpacked_src_blk.m_etc1_r | (unpacked_src_blk.m_etc1_r << 4)); - dst_blk.m_bytes[1] = (uint8_t)(unpacked_src_blk.m_etc1_g | (unpacked_src_blk.m_etc1_g << 4)); - dst_blk.m_bytes[2] = (uint8_t)(unpacked_src_blk.m_etc1_b | (unpacked_src_blk.m_etc1_b << 4)); - } + low_c = low_dot0 & 15; + high_c = ~high_dot0 & 15; + uint32_t r = (high_dot0 & ~15) - (low_dot0 & ~15); - memcpy(dst_blk.m_bytes + 4, &s_etc1_solid_selectors[unpacked_src_blk.m_etc1_selector][0], 4); + uint32_t tr = (high_dot1 & 
~15) - (low_dot1 & ~15); + if (tr > r) { + low_c = low_dot1 & 15; + high_c = ~high_dot1 & 15; + r = tr; + } - return; - } + tr = (high_dot2 & ~15) - (low_dot2 & ~15); + if (tr > r) { + low_c = low_dot2 & 15; + high_c = ~high_dot2 & 15; + r = tr; + } - const bool flip = unpacked_src_blk.m_etc1_flip != 0; - const bool diff = unpacked_src_blk.m_etc1_diff != 0; + tr = (high_dot3 & ~15) - (low_dot3 & ~15); + if (tr > r) { + low_c = low_dot3 & 15; + high_c = ~high_dot3 & 15; + } + } - dst_blk.m_bytes[3] = (uint8_t)((int)flip | (diff << 1) | (unpacked_src_blk.m_etc1_inten0 << 5) | (unpacked_src_blk.m_etc1_inten1 << 2)); + lr = to_5(pSrc_pixels[low_c].r); + lg = to_6(pSrc_pixels[low_c].g); + lb = to_5(pSrc_pixels[low_c].b); - const uint32_t limit = diff ? 31 : 15; + hr = to_5(pSrc_pixels[high_c].r); + hg = to_6(pSrc_pixels[high_c].g); + hb = to_5(pSrc_pixels[high_c].b); + } - color32 block_colors[2]; + bc1_find_sels_2(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); + } // if (use_sels) - for (uint32_t subset = 0; subset < 2; subset++) + const uint32_t total_ls_passes = (flags & cEncodeBC1HigherQuality) ? 3 : (flags & cEncodeBC1HighQuality ? 2 : 1); + for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) { - uint32_t avg_color[3]; - memset(avg_color, 0, sizeof(avg_color)); + int prev_lr = lr, prev_lg = lg, prev_lb = lb, prev_hr = hr, prev_hg = hg, prev_hb = hb; - for (uint32_t j = 0; j < 8; j++) + // This is where the real magic happens. We have an array of candidate selectors, so let's use least squares to compute the optimal low/high endpoint colors. 
+ vec3F xl, xh; + if (!compute_least_squares_endpoints_rgb(pSrc_pixels, sels, &xl, &xh)) { - const etc_coord2& c = g_etc1_pixel_coords[flip][subset][j]; + if (avg_r < 0) + { + int total_r = 0, total_g = 0, total_b = 0; + for (uint32_t i = 0; i < 16; i++) + { + total_r += pSrc_pixels[i].r; + total_g += pSrc_pixels[i].g; + total_b += pSrc_pixels[i].b; + } - avg_color[0] += block_pixels[c.m_y][c.m_x].r; - avg_color[1] += block_pixels[c.m_y][c.m_x].g; - avg_color[2] += block_pixels[c.m_y][c.m_x].b; - } // j + avg_r = (total_r + 8) >> 4; + avg_g = (total_g + 8) >> 4; + avg_b = (total_b + 8) >> 4; + } - block_colors[subset][0] = (uint8_t)((avg_color[0] * limit + 1020) / (8 * 255)); - block_colors[subset][1] = (uint8_t)((avg_color[1] * limit + 1020) / (8 * 255)); - block_colors[subset][2] = (uint8_t)((avg_color[2] * limit + 1020) / (8 * 255)); - block_colors[subset][3] = 0; + // All selectors equal - treat it as a solid block which should always be equal or better. + lr = g_bc1_match5_equals_1[avg_r].m_hi; + lg = g_bc1_match6_equals_1[avg_g].m_hi; + lb = g_bc1_match5_equals_1[avg_b].m_hi; - if (g_uastc_mode_has_etc1_bias[unpacked_src_blk.m_mode]) + hr = g_bc1_match5_equals_1[avg_r].m_lo; + hg = g_bc1_match6_equals_1[avg_g].m_lo; + hb = g_bc1_match5_equals_1[avg_b].m_lo; + + // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. 
+ } + else { - block_colors[subset] = apply_etc1_bias(block_colors[subset], unpacked_src_blk.m_etc1_bias, limit, subset); + lr = basisu::clamp((int)((xl.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); + lg = basisu::clamp((int)((xl.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); + lb = basisu::clamp((int)((xl.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); + + hr = basisu::clamp((int)((xh.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); + hg = basisu::clamp((int)((xh.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); + hb = basisu::clamp((int)((xh.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); } - } // subset + if ((prev_lr == lr) && (prev_lg == lg) && (prev_lb == lb) && (prev_hr == hr) && (prev_hg == hg) && (prev_hb == hb)) + break; - if (diff) + bc1_find_sels_2(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); + } + + uint32_t lc16 = dxt1_block::pack_unscaled_color(lr, lg, lb); + uint32_t hc16 = dxt1_block::pack_unscaled_color(hr, hg, hb); + + // Always forbid 3 color blocks + if (lc16 == hc16) { - int dr = block_colors[1].r - block_colors[0].r; - int dg = block_colors[1].g - block_colors[0].g; - int db = block_colors[1].b - block_colors[0].b; + uint8_t mask = 0; - dr = basisu::clamp(dr, cETC1ColorDeltaMin, cETC1ColorDeltaMax); - dg = basisu::clamp(dg, cETC1ColorDeltaMin, cETC1ColorDeltaMax); - db = basisu::clamp(db, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + // Make l > h + if (hc16 > 0) + hc16--; + else + { + // lc16 = hc16 = 0 + assert(lc16 == hc16 && hc16 == 0); - if (dr < 0) dr += 8; - if (dg < 0) dg += 8; - if (db < 0) db += 8; + hc16 = 0; + lc16 = 1; + mask = 0x55; // select hc16 + } - dst_blk.m_bytes[0] = (uint8_t)((block_colors[0].r << 3) | dr); - dst_blk.m_bytes[1] = (uint8_t)((block_colors[0].g << 3) | dg); - dst_blk.m_bytes[2] = (uint8_t)((block_colors[0].b << 3) | db); + assert(lc16 > hc16); + pDst_block->set_low_color(static_cast(lc16)); + pDst_block->set_high_color(static_cast(hc16)); + + pDst_block->m_selectors[0] = mask; + pDst_block->m_selectors[1] = mask; + pDst_block->m_selectors[2] = mask; + 
pDst_block->m_selectors[3] = mask; } else { - dst_blk.m_bytes[0] = (uint8_t)(block_colors[1].r | (block_colors[0].r << 4)); - dst_blk.m_bytes[1] = (uint8_t)(block_colors[1].g | (block_colors[0].g << 4)); - dst_blk.m_bytes[2] = (uint8_t)(block_colors[1].b | (block_colors[0].b << 4)); - } + uint8_t invert_mask = 0; + if (lc16 < hc16) + { + std::swap(lc16, hc16); + invert_mask = 0x55; + } - etc1_determine_selectors(dst_blk, &block_pixels[0][0], 0, 2); - } + assert(lc16 > hc16); + pDst_block->set_low_color((uint16_t)lc16); + pDst_block->set_high_color((uint16_t)hc16); - bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst) - { - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) - return false; + uint32_t packed_sels = 0; + static const uint8_t s_sel_trans[4] = { 0, 2, 3, 1 }; + for (uint32_t i = 0; i < 16; i++) + packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2)); - color32 block_pixels[4][4]; - if (unpacked_src_blk.m_mode != UASTC_MODE_INDEX_SOLID_COLOR) - { - const bool unpack_srgb = false; - if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) - return false; + pDst_block->m_selectors[0] = (uint8_t)packed_sels ^ invert_mask; + pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask; + pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask; + pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24) ^ invert_mask; } - - transcode_uastc_to_etc1(unpacked_src_blk, block_pixels, pDst); - - return true; - } - - static inline int gray_distance2(const uint8_t c, int y) - { - int gray_dist = (int)c - y; - return gray_dist * gray_dist; } - static bool pack_etc1_y_estimate_flipped(const uint8_t* pSrc_pixels, - int& upper_avg, int& lower_avg, int& left_avg, int& right_avg) + // Scale the UASTC first subset endpoints and first plane's weight indices directly to BC1's - fastest. 
+ void transcode_uastc_to_bc1_hint0(const unpacked_uastc_block& unpacked_src_blk, void* pDst) { - int sums[2][2]; - -#define GET_XY(x, y) pSrc_pixels[(x) + ((y) * 4)] + const uint32_t mode = unpacked_src_blk.m_mode; + const astc_block_desc& astc_blk = unpacked_src_blk.m_astc; - sums[0][0] = GET_XY(0, 0) + GET_XY(0, 1) + GET_XY(1, 0) + GET_XY(1, 1); - sums[1][0] = GET_XY(2, 0) + GET_XY(2, 1) + GET_XY(3, 0) + GET_XY(3, 1); - sums[0][1] = GET_XY(0, 2) + GET_XY(0, 3) + GET_XY(1, 2) + GET_XY(1, 3); - sums[1][1] = GET_XY(2, 2) + GET_XY(2, 3) + GET_XY(3, 2) + GET_XY(3, 3); + dxt1_block& b = *static_cast(pDst); - upper_avg = (sums[0][0] + sums[1][0] + 4) / 8; - lower_avg = (sums[0][1] + sums[1][1] + 4) / 8; - left_avg = (sums[0][0] + sums[0][1] + 4) / 8; - right_avg = (sums[1][0] + sums[1][1] + 4) / 8; + const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; -#undef GET_XY -#define GET_XY(x, y, a) gray_distance2(pSrc_pixels[(x) + ((y) * 4)], a) + const uint32_t total_comps = g_uastc_mode_comps[mode]; - int upper_gray_dist = 0, lower_gray_dist = 0, left_gray_dist = 0, right_gray_dist = 0; - for (uint32_t i = 0; i < 4; i++) + if (total_comps == 2) { - for (uint32_t j = 0; j < 2; j++) - { - upper_gray_dist += GET_XY(i, j, upper_avg); - lower_gray_dist += GET_XY(i, 2 + j, lower_avg); - left_gray_dist += GET_XY(j, i, left_avg); - right_gray_dist += GET_XY(2 + j, i, right_avg); - } - } - -#undef GET_XY - - int upper_lower_sum = upper_gray_dist + lower_gray_dist; - int left_right_sum = left_gray_dist + right_gray_dist; + const uint32_t l = g_astc_unquant[endpoint_range][astc_blk.m_endpoints[0]].m_unquant; + const uint32_t h = g_astc_unquant[endpoint_range][astc_blk.m_endpoints[1]].m_unquant; - return upper_lower_sum < left_right_sum; - } + b.set_low_color(dxt1_block::pack_color(color32(l, l, l, 255), true, 127)); + b.set_high_color(dxt1_block::pack_color(color32(h, h, h, 255), true, 127)); + } + else + { + b.set_low_color(dxt1_block::pack_color( + 
color32(g_astc_unquant[endpoint_range][astc_blk.m_endpoints[0]].m_unquant, + g_astc_unquant[endpoint_range][astc_blk.m_endpoints[2]].m_unquant, + g_astc_unquant[endpoint_range][astc_blk.m_endpoints[4]].m_unquant, + 255), true, 127) + ); - // Base Sel Table - // XXXXX XX XXX - static const uint16_t g_etc1_y_solid_block_configs[256] = - { - 0,781,64,161,260,192,33,131,96,320,65,162,261,193,34,291,97,224,66,163,262,194,35,549,98,4,67,653,164,195,523,36,99,5,578,68,165,353,196,37,135,100,324,69,166,354,197,38,295,101,228,70,167, - 355,198,39,553,102,8,71,608,168,199,527,40,103,9,582,72,169,357,200,41,139,104,328,73,170,358,201,42,299,105,232,74,171,359,202,43,557,106,12,75,612,172,203,531,44,107,13,586,76,173,361, - 204,45,143,108,332,77,174,362,205,46,303,109,236,78,175,363,206,47,561,110,16,79,616,176,207,535,48,111,17,590,80,177,365,208,49,147,112,336,81,178,366,209,50,307,113,240,82,179,367,210, - 51,565,114,20,83,620,180,211,539,52,115,21,594,84,181,369,212,53,151,116,340,85,182,370,213,54,311,117,244,86,183,371,214,55,569,118,24,87,624,184,215,543,56,119,25,598,88,185,373,216,57, - 155,120,344,89,186,374,217,58,315,121,248,90,187,375,218,59,573,122,28,91,628,188,219,754,60,123,29,602,92,189,377,220,61,159,124,348,93,190,378,221,62,319,125,252,94,191,379,222,63,882,126 - }; + b.set_high_color(dxt1_block::pack_color( + color32(g_astc_unquant[endpoint_range][astc_blk.m_endpoints[1]].m_unquant, + g_astc_unquant[endpoint_range][astc_blk.m_endpoints[3]].m_unquant, + g_astc_unquant[endpoint_range][astc_blk.m_endpoints[5]].m_unquant, + 255), true, 127) + ); + } - // individual - // table base sel0 sel1 sel2 sel3 - static const uint16_t g_etc1_y_solid_block_4i_configs[256] = - { - 0xA000,0xA800,0x540B,0xAA01,0xAA01,0xFE00,0xFF00,0xFF00,0x8,0x5515,0x5509,0x5509,0xAA03,0x5508,0x5508,0x9508,0xA508,0xA908,0xAA08,0x5513,0xAA09,0xAA09,0xAA05,0xFF08,0xFF08,0x10,0x551D,0x5511,0x5511, - 
0xAA0B,0x5510,0x5510,0x9510,0xA510,0xA910,0xAA10,0x551B,0xAA11,0xAA11,0xAA0D,0xFF10,0xFF10,0x18,0x5525,0x5519,0x5519,0xAA13,0x5518,0x5518,0x9518,0xA518,0xA918,0xAA18,0x5523,0xAA19,0xAA19,0xAA15, - 0xFF18,0xFF18,0x20,0x552D,0x5521,0x5521,0xAA1B,0x5520,0x5520,0x9520,0xA520,0xA920,0xAA20,0x552B,0xAA21,0xAA21,0xAA1D,0xFF20,0xFF20,0x28,0x5535,0x5529,0x5529,0xAA23,0x5528,0x5528,0x9528,0xA528,0xA928, - 0xAA28,0x5533,0xAA29,0xAA29,0xAA25,0xFF28,0xFF28,0x30,0x553D,0x5531,0x5531,0xAA2B,0x5530,0x5530,0x9530,0xA530,0xA930,0xAA30,0x553B,0xAA31,0xAA31,0xAA2D,0xFF30,0xFF30,0x38,0x5545,0x5539,0x5539,0xAA33, - 0x5538,0x5538,0x9538,0xA538,0xA938,0xAA38,0x5543,0xAA39,0xAA39,0xAA35,0xFF38,0xFF38,0x40,0x554D,0x5541,0x5541,0xAA3B,0x5540,0x5540,0x9540,0xA540,0xA940,0xAA40,0x554B,0xAA41,0xAA41,0xAA3D,0xFF40,0xFF40, - 0x48,0x5555,0x5549,0x5549,0xAA43,0x5548,0x5548,0x9548,0xA548,0xA948,0xAA48,0x5553,0xAA49,0xAA49,0xAA45,0xFF48,0xFF48,0x50,0x555D,0x5551,0x5551,0xAA4B,0x5550,0x5550,0x9550,0xA550,0xA950,0xAA50,0x555B, - 0xAA51,0xAA51,0xAA4D,0xFF50,0xFF50,0x58,0x5565,0x5559,0x5559,0xAA53,0x5558,0x5558,0x9558,0xA558,0xA958,0xAA58,0x5563,0xAA59,0xAA59,0xAA55,0xFF58,0xFF58,0x60,0x556D,0x5561,0x5561,0xAA5B,0x5560,0x5560, - 0x9560,0xA560,0xA960,0xAA60,0x556B,0xAA61,0xAA61,0xAA5D,0xFF60,0xFF60,0x68,0x5575,0x5569,0x5569,0xAA63,0x5568,0x5568,0x9568,0xA568,0xA968,0xAA68,0x5573,0xAA69,0xAA69,0xAA65,0xFF68,0xFF68,0x70,0x557D, - 0x5571,0x5571,0xAA6B,0x5570,0x5570,0x9570,0xA570,0xA970,0xAA70,0x557B,0xAA71,0xAA71,0xAA6D,0xFF70,0xFF70,0x78,0x78,0x5579,0x5579,0xAA73,0x5578,0x9578,0x2578,0xE6E,0x278 - }; + if (b.get_low_color() == b.get_high_color()) + { + // Always forbid 3 color blocks + uint16_t lc16 = (uint16_t)b.get_low_color(); + uint16_t hc16 = (uint16_t)b.get_high_color(); + + uint8_t mask = 0; - static const uint16_t g_etc1_y_solid_block_2i_configs[256] = - { - 
0x416,0x800,0xA00,0x50B,0xA01,0xA01,0xF00,0xF00,0xF00,0x8,0x515,0x509,0x509,0xA03,0x508,0x508,0xF01,0xF01,0xA08,0xA08,0x513,0xA09,0xA09,0xA05,0xF08,0xF08,0x10,0x51D,0x511,0x511,0xA0B,0x510,0x510,0xF09, - 0xF09,0xA10,0xA10,0x51B,0xA11,0xA11,0xA0D,0xF10,0xF10,0x18,0x525,0x519,0x519,0xA13,0x518,0x518,0xF11,0xF11,0xA18,0xA18,0x523,0xA19,0xA19,0xA15,0xF18,0xF18,0x20,0x52D,0x521,0x521,0xA1B,0x520,0x520,0xF19, - 0xF19,0xA20,0xA20,0x52B,0xA21,0xA21,0xA1D,0xF20,0xF20,0x28,0x535,0x529,0x529,0xA23,0x528,0x528,0xF21,0xF21,0xA28,0xA28,0x533,0xA29,0xA29,0xA25,0xF28,0xF28,0x30,0x53D,0x531,0x531,0xA2B,0x530,0x530,0xF29, - 0xF29,0xA30,0xA30,0x53B,0xA31,0xA31,0xA2D,0xF30,0xF30,0x38,0x545,0x539,0x539,0xA33,0x538,0x538,0xF31,0xF31,0xA38,0xA38,0x543,0xA39,0xA39,0xA35,0xF38,0xF38,0x40,0x54D,0x541,0x541,0xA3B,0x540,0x540,0xF39, - 0xF39,0xA40,0xA40,0x54B,0xA41,0xA41,0xA3D,0xF40,0xF40,0x48,0x555,0x549,0x549,0xA43,0x548,0x548,0xF41,0xF41,0xA48,0xA48,0x553,0xA49,0xA49,0xA45,0xF48,0xF48,0x50,0x55D,0x551,0x551,0xA4B,0x550,0x550,0xF49, - 0xF49,0xA50,0xA50,0x55B,0xA51,0xA51,0xA4D,0xF50,0xF50,0x58,0x565,0x559,0x559,0xA53,0x558,0x558,0xF51,0xF51,0xA58,0xA58,0x563,0xA59,0xA59,0xA55,0xF58,0xF58,0x60,0x56D,0x561,0x561,0xA5B,0x560,0x560,0xF59, - 0xF59,0xA60,0xA60,0x56B,0xA61,0xA61,0xA5D,0xF60,0xF60,0x68,0x575,0x569,0x569,0xA63,0x568,0x568,0xF61,0xF61,0xA68,0xA68,0x573,0xA69,0xA69,0xA65,0xF68,0xF68,0x70,0x57D,0x571,0x571,0xA6B,0x570,0x570,0xF69, - 0xF69,0xA70,0xA70,0x57B,0xA71,0xA71,0xA6D,0xF70,0xF70,0x78,0x78,0x579,0x579,0xA73,0x578,0x578,0xE6E,0x278 - }; + // Make l > h + if (hc16 > 0) + hc16--; + else + { + // lc16 = hc16 = 0 + assert(lc16 == hc16 && hc16 == 0); - static const uint16_t g_etc1_y_solid_block_1i_configs[256] = - { - 0x0,0x116,0x200,0x200,0x10B,0x201,0x201,0x300,0x300,0x8,0x115,0x109,0x109,0x203,0x108,0x108,0x114,0x301,0x204,0x208,0x208,0x113,0x209,0x209,0x205,0x308,0x10,0x11D,0x111,0x111,0x20B,0x110,0x110,0x11C,0x309, - 
0x20C,0x210,0x210,0x11B,0x211,0x211,0x20D,0x310,0x18,0x125,0x119,0x119,0x213,0x118,0x118,0x124,0x311,0x214,0x218,0x218,0x123,0x219,0x219,0x215,0x318,0x20,0x12D,0x121,0x121,0x21B,0x120,0x120,0x12C,0x319,0x21C, - 0x220,0x220,0x12B,0x221,0x221,0x21D,0x320,0x28,0x135,0x129,0x129,0x223,0x128,0x128,0x134,0x321,0x224,0x228,0x228,0x133,0x229,0x229,0x225,0x328,0x30,0x13D,0x131,0x131,0x22B,0x130,0x130,0x13C,0x329,0x22C,0x230, - 0x230,0x13B,0x231,0x231,0x22D,0x330,0x38,0x145,0x139,0x139,0x233,0x138,0x138,0x144,0x331,0x234,0x238,0x238,0x143,0x239,0x239,0x235,0x338,0x40,0x14D,0x141,0x141,0x23B,0x140,0x140,0x14C,0x339,0x23C,0x240,0x240, - 0x14B,0x241,0x241,0x23D,0x340,0x48,0x155,0x149,0x149,0x243,0x148,0x148,0x154,0x341,0x244,0x248,0x248,0x153,0x249,0x249,0x245,0x348,0x50,0x15D,0x151,0x151,0x24B,0x150,0x150,0x15C,0x349,0x24C,0x250,0x250,0x15B, - 0x251,0x251,0x24D,0x350,0x58,0x165,0x159,0x159,0x253,0x158,0x158,0x164,0x351,0x254,0x258,0x258,0x163,0x259,0x259,0x255,0x358,0x60,0x16D,0x161,0x161,0x25B,0x160,0x160,0x16C,0x359,0x25C,0x260,0x260,0x16B,0x261, - 0x261,0x25D,0x360,0x68,0x175,0x169,0x169,0x263,0x168,0x168,0x174,0x361,0x264,0x268,0x268,0x173,0x269,0x269,0x265,0x368,0x70,0x17D,0x171,0x171,0x26B,0x170,0x170,0x17C,0x369,0x26C,0x270,0x270,0x17B,0x271,0x271, - 0x26D,0x370,0x78,0x78,0x179,0x179,0x273,0x178,0x178,0x26E,0x278 - }; + hc16 = 0; + lc16 = 1; + mask = 0x55; // select hc16 + } - // We don't have any useful hints to accelerate single channel ETC1, so we need to real-time encode from scratch. 
- bool transcode_uastc_to_etc1(const uastc_block& src_blk, void* pDst, uint32_t channel) - { - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) - return false; + assert(lc16 > hc16); + b.set_low_color(static_cast(lc16)); + b.set_high_color(static_cast(hc16)); -#if 0 - for (uint32_t individ = 0; individ < 2; individ++) + b.m_selectors[0] = mask; + b.m_selectors[1] = mask; + b.m_selectors[2] = mask; + b.m_selectors[3] = mask; + } + else { - uint32_t overall_error = 0; - - for (uint32_t c = 0; c < 256; c++) + bool invert = false; + if (b.get_low_color() < b.get_high_color()) { - uint32_t best_err = UINT32_MAX; - uint32_t best_individ = 0; - uint32_t best_base = 0; - uint32_t best_sels[4] = { 0,0,0,0 }; - uint32_t best_table = 0; + std::swap(b.m_low_color[0], b.m_high_color[0]); + std::swap(b.m_low_color[1], b.m_high_color[1]); + invert = true; + } - const uint32_t limit = individ ? 16 : 32; + const uint8_t* pTran = s_uastc_to_bc1_weights[g_uastc_mode_weight_bits[mode]]; - for (uint32_t table = 0; table < 8; table++) - { - for (uint32_t base = 0; base < limit; base++) - { - uint32_t total_e = 0; - uint32_t sels[4] = { 0,0,0,0 }; + const uint32_t plane_shift = g_uastc_mode_planes[mode] - 1; - const uint32_t N = 4; - for (uint32_t i = 0; i < basisu::minimum(N, (256 - c)); i++) - { - uint32_t best_sel_e = UINT32_MAX; - uint32_t best_sel = 0; + uint32_t sels = 0; + for (int i = 15; i >= 0; --i) + { + uint32_t s = pTran[astc_blk.m_weights[i << plane_shift]]; - for (uint32_t sel = 0; sel < 4; sel++) - { - int val = individ ? 
((base << 4) | base) : ((base << 3) | (base >> 2)); - val = clamp255(val + g_etc1_inten_tables[table][sel]); + if (invert) + s ^= 1; - int e = iabs(val - clamp255(c + i)); - if (e < best_sel_e) - { - best_sel_e = e; - best_sel = sel; - } + sels = (sels << 2) | s; + } + b.m_selectors[0] = sels & 0xFF; + b.m_selectors[1] = (sels >> 8) & 0xFF; + b.m_selectors[2] = (sels >> 16) & 0xFF; + b.m_selectors[3] = (sels >> 24) & 0xFF; + } + } - } // sel + // Scale the UASTC first plane's weight indices to BC1, use 1 or 2 least squares passes to compute endpoints - no PCA needed. + void transcode_uastc_to_bc1_hint1(const unpacked_uastc_block& unpacked_src_blk, const color32 block_pixels[4][4], void* pDst, bool high_quality) + { + const uint32_t mode = unpacked_src_blk.m_mode; - sels[i] = best_sel; - total_e += best_sel_e * best_sel_e; + const astc_block_desc& astc_blk = unpacked_src_blk.m_astc; - } // i + dxt1_block& b = *static_cast(pDst); - if (total_e < best_err) - { - best_err = total_e; - best_individ = individ; - best_base = base; - memcpy(best_sels, sels, sizeof(best_sels)); - best_table = table; - } + b.set_low_color(1); + b.set_high_color(0); - } // base - } // table + const uint8_t* pTran = s_uastc_to_bc1_weights[g_uastc_mode_weight_bits[mode]]; - //printf("%u: %u,%u,%u,%u,%u,%u,%u,%u\n", c, best_err, best_individ, best_table, best_base, best_sels[0], best_sels[1], best_sels[2], best_sels[3]); + const uint32_t plane_shift = g_uastc_mode_planes[mode] - 1; - uint32_t encoded = best_table | (best_base << 3) | - (best_sels[0] << 8) | - (best_sels[1] << 10) | - (best_sels[2] << 12) | - (best_sels[3] << 14); + uint32_t sels = 0; + for (int i = 15; i >= 0; --i) + { + sels <<= 2; + sels |= pTran[astc_blk.m_weights[i << plane_shift]]; + } - printf("0x%X,", encoded); + b.m_selectors[0] = sels & 0xFF; + b.m_selectors[1] = (sels >> 8) & 0xFF; + b.m_selectors[2] = (sels >> 16) & 0xFF; + b.m_selectors[3] = (sels >> 24) & 0xFF; - overall_error += best_err; - } // c + encode_bc1(&b, 
(const uint8_t*)&block_pixels[0][0].c[0], (high_quality ? cEncodeBC1HighQuality : 0) | cEncodeBC1UseSelectors); + } - printf("\n"); - printf("Overall error: %u\n", overall_error); + bool transcode_uastc_to_bc1(const uastc_block& src_blk, void* pDst, bool high_quality) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; - } // individ + const uint32_t mode = unpacked_src_blk.m_mode; - exit(0); -#endif + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + encode_bc1_solid_block(pDst, unpacked_src_blk.m_solid_color.r, unpacked_src_blk.m_solid_color.g, unpacked_src_blk.m_solid_color.b); + return true; + } -#if 0 - for (uint32_t individ = 0; individ < 2; individ++) + if ((!high_quality) && (unpacked_src_blk.m_bc1_hint0)) + transcode_uastc_to_bc1_hint0(unpacked_src_blk, pDst); + else { - uint32_t overall_error = 0; + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; - for (uint32_t c = 0; c < 256; c++) - { - uint32_t best_err = UINT32_MAX; - uint32_t best_individ = 0; - uint32_t best_base = 0; - uint32_t best_sels[4] = { 0,0,0,0 }; - uint32_t best_table = 0; + if (unpacked_src_blk.m_bc1_hint1) + transcode_uastc_to_bc1_hint1(unpacked_src_blk, block_pixels, pDst, high_quality); + else + encode_bc1(pDst, &block_pixels[0][0].r, high_quality ? cEncodeBC1HighQuality : 0); + } - const uint32_t limit = individ ? 
16 : 32; + return true; + } - for (uint32_t table = 0; table < 8; table++) - { - for (uint32_t base = 0; base < limit; base++) - { - uint32_t total_e = 0; - uint32_t sels[4] = { 0,0,0,0 }; + static void write_bc4_solid_block(uint8_t* pDst, uint32_t a) + { + pDst[0] = (uint8_t)a; + pDst[1] = (uint8_t)a; + memset(pDst + 2, 0, 6); + } - const uint32_t N = 1; - for (uint32_t i = 0; i < basisu::minimum(N, (256 - c)); i++) - { - uint32_t best_sel_e = UINT32_MAX; - uint32_t best_sel = 0; + bool transcode_uastc_to_bc3(const uastc_block& src_blk, void* pDst, bool high_quality) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; - for (uint32_t sel = 0; sel < 4; sel++) - { - int val = individ ? ((base << 4) | base) : ((base << 3) | (base >> 2)); - val = clamp255(val + g_etc1_inten_tables[table][sel]); + const uint32_t mode = unpacked_src_blk.m_mode; - int e = iabs(val - clamp255(c + i)); - if (e < best_sel_e) - { - best_sel_e = e; - best_sel = sel; - } + void* pBC4_block = pDst; + dxt1_block* pBC1_block = &static_cast(pDst)[1]; - } // sel + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + write_bc4_solid_block(static_cast(pBC4_block), unpacked_src_blk.m_solid_color.a); + encode_bc1_solid_block(pBC1_block, unpacked_src_blk.m_solid_color.r, unpacked_src_blk.m_solid_color.g, unpacked_src_blk.m_solid_color.b); + return true; + } - sels[i] = best_sel; - total_e += best_sel_e * best_sel_e; + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; - } // i + basist::encode_bc4(pBC4_block, &block_pixels[0][0].a, sizeof(color32)); - if (total_e < best_err) - { - best_err = total_e; - best_individ = individ; - best_base = base; - memcpy(best_sels, sels, sizeof(best_sels)); - best_table = table; - } + if ((!high_quality) && (unpacked_src_blk.m_bc1_hint0)) + transcode_uastc_to_bc1_hint0(unpacked_src_blk, pBC1_block); + else + { + 
if (unpacked_src_blk.m_bc1_hint1) + transcode_uastc_to_bc1_hint1(unpacked_src_blk, block_pixels, pBC1_block, high_quality); + else + encode_bc1(pBC1_block, &block_pixels[0][0].r, high_quality ? cEncodeBC1HighQuality : 0); + } - } // base - } // table + return true; + } - //printf("%u: %u,%u,%u,%u,%u,%u,%u,%u\n", c, best_err, best_individ, best_table, best_base, best_sels[0], best_sels[1], best_sels[2], best_sels[3]); + bool transcode_uastc_to_bc4(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0) + { + BASISU_NOTE_UNUSED(high_quality); - uint32_t encoded = best_table | (best_base << 3) | - (best_sels[0] << 8) | - (best_sels[1] << 10) | - (best_sels[2] << 12) | - (best_sels[3] << 14); + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; - printf("0x%X,", encoded); + const uint32_t mode = unpacked_src_blk.m_mode; - overall_error += best_err; - } // c + void* pBC4_block = pDst; - printf("\n"); - printf("Overall error: %u\n", overall_error); + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + write_bc4_solid_block(static_cast(pBC4_block), unpacked_src_blk.m_solid_color.c[chan0]); + return true; + } - } // individ + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; - exit(0); -#endif + basist::encode_bc4(pBC4_block, &block_pixels[0][0].c[chan0], sizeof(color32)); - decoder_etc_block& dst_blk = *static_cast(pDst); + return true; + } - if (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) - { - const uint32_t y = unpacked_src_blk.m_solid_color[channel]; - const uint32_t encoded_config = g_etc1_y_solid_block_configs[y]; + bool transcode_uastc_to_bc5(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1) + { + BASISU_NOTE_UNUSED(high_quality); - const uint32_t base = encoded_config & 31; - const uint32_t sel = (encoded_config >> 5) & 3; - const uint32_t 
table = encoded_config >> 7; + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; - dst_blk.m_bytes[3] = (uint8_t)(2 | (table << 5) | (table << 2)); + const uint32_t mode = unpacked_src_blk.m_mode; - dst_blk.m_bytes[0] = (uint8_t)(base << 3); - dst_blk.m_bytes[1] = (uint8_t)(base << 3); - dst_blk.m_bytes[2] = (uint8_t)(base << 3); + void* pBC4_block0 = pDst; + void* pBC4_block1 = (uint8_t*)pDst + 8; - memcpy(dst_blk.m_bytes + 4, &s_etc1_solid_selectors[sel][0], 4); + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + write_bc4_solid_block(static_cast(pBC4_block0), unpacked_src_blk.m_solid_color.c[chan0]); + write_bc4_solid_block(static_cast(pBC4_block1), unpacked_src_blk.m_solid_color.c[chan1]); return true; } @@ -14405,5066 +17951,6023 @@ namespace basist if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) return false; - uint8_t block_y[4][4]; - for (uint32_t i = 0; i < 16; i++) - ((uint8_t*)block_y)[i] = ((color32*)block_pixels)[i][channel]; + basist::encode_bc4(pBC4_block0, &block_pixels[0][0].c[chan0], sizeof(color32)); + basist::encode_bc4(pBC4_block1, &block_pixels[0][0].c[chan1], sizeof(color32)); - int upper_avg, lower_avg, left_avg, right_avg; - bool flip = pack_etc1_y_estimate_flipped(&block_y[0][0], upper_avg, lower_avg, left_avg, right_avg); + return true; + } - // non-flipped: | | - // vs. - // flipped: -- - // -- + static const uint8_t s_etc2_eac_bit_ofs[16] = { 45, 33, 21, 9, 42, 30, 18, 6, 39, 27, 15, 3, 36, 24, 12, 0 }; - uint32_t low[2] = { 255, 255 }, high[2] = { 0, 0 }; + static void pack_eac_solid_block(eac_block& blk, uint32_t a) + { + blk.m_base = static_cast(a); + blk.m_table = 13; + blk.m_multiplier = 0; + + memcpy(blk.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); - if (flip) + return; + } + + // Only checks 4 tables. 
+ static void pack_eac(eac_block& blk, const uint8_t* pPixels, uint32_t stride) + { + uint32_t min_alpha = 255, max_alpha = 0; + for (uint32_t i = 0; i < 16; i++) { - for (uint32_t y = 0; y < 2; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - const uint32_t v = block_y[y][x]; - low[0] = basisu::minimum(low[0], v); - high[0] = basisu::maximum(high[0], v); - } - } - for (uint32_t y = 2; y < 4; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - const uint32_t v = block_y[y][x]; - low[1] = basisu::minimum(low[1], v); - high[1] = basisu::maximum(high[1], v); - } - } + const uint32_t a = pPixels[i * stride]; + if (a < min_alpha) min_alpha = a; + if (a > max_alpha) max_alpha = a; } - else + + if (min_alpha == max_alpha) { - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 2; x++) - { - const uint32_t v = block_y[y][x]; - low[0] = basisu::minimum(low[0], v); - high[0] = basisu::maximum(high[0], v); - } - } - for (uint32_t y = 0; y < 4; y++) + pack_eac_solid_block(blk, min_alpha); + return; + } + + const uint32_t alpha_range = max_alpha - min_alpha; + + const uint32_t SINGLE_TABLE_THRESH = 5; + if (alpha_range <= SINGLE_TABLE_THRESH) + { + // If alpha_range <= 5 table 13 is lossless + int base = clamp255((int)max_alpha - 2); + + blk.m_base = base; + blk.m_multiplier = 1; + blk.m_table = 13; + + base -= 3; + + uint64_t packed_sels = 0; + for (uint32_t i = 0; i < 16; i++) { - for (uint32_t x = 2; x < 4; x++) - { - const uint32_t v = block_y[y][x]; - low[1] = basisu::minimum(low[1], v); - high[1] = basisu::maximum(high[1], v); - } + const int a = pPixels[i * stride]; + + static const uint8_t s_sels[6] = { 2, 1, 0, 4, 5, 6 }; + + int sel = a - base; + assert(sel >= 0 && sel <= 5); + + packed_sels |= (static_cast(s_sels[sel]) << s_etc2_eac_bit_ofs[i]); } - } - const uint32_t range[2] = { high[0] - low[0], high[1] - low[1] }; + blk.set_selector_bits(packed_sels); - dst_blk.m_bytes[3] = (uint8_t)((int)flip); + return; + } - if ((range[0] <= 3) && (range[1] <= 3)) + 
const uint32_t T0 = 2, T1 = 8, T2 = 11, T3 = 13; + static const uint8_t s_tables[4] = { T0, T1, T2, T3 }; + + int base[4], mul[4]; + uint32_t mul_or = 0; + for (uint32_t i = 0; i < 4; i++) { - // This is primarily for better gradients. - dst_blk.m_bytes[0] = 0; - dst_blk.m_bytes[1] = 0; - dst_blk.m_bytes[2] = 0; + const uint32_t table = s_tables[i]; - uint16_t l_bitmask = 0, h_bitmask = 0; + const float range = (float)(g_eac_modifier_table[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]); - for (uint32_t subblock = 0; subblock < 2; subblock++) - { - const uint32_t encoded = (range[subblock] == 0) ? g_etc1_y_solid_block_1i_configs[low[subblock]] : ((range[subblock] < 2) ? g_etc1_y_solid_block_2i_configs[low[subblock]] : g_etc1_y_solid_block_4i_configs[low[subblock]]); + base[i] = clamp255((int)roundf(basisu::lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range))); + mul[i] = clampi((int)roundf(alpha_range / range), 1, 15); + mul_or |= mul[i]; + } - const uint32_t table = encoded & 7; - const uint32_t base = (encoded >> 3) & 31; - assert(base <= 15); - const uint32_t sels[4] = { (encoded >> 8) & 3, (encoded >> 10) & 3, (encoded >> 12) & 3, (encoded >> 14) & 3 }; + uint32_t total_err[4] = { 0, 0, 0, 0 }; + uint8_t sels[4][16]; - dst_blk.m_bytes[3] |= (uint8_t)(table << (subblock ? 2 : 5)); + for (uint32_t i = 0; i < 16; i++) + { + const int a = pPixels[i * stride]; - const uint32_t sv = base << (subblock ? 
0 : 4); - dst_blk.m_bytes[0] |= (uint8_t)(sv); - dst_blk.m_bytes[1] |= (uint8_t)(sv); - dst_blk.m_bytes[2] |= (uint8_t)(sv); + uint32_t l0 = UINT32_MAX, l1 = UINT32_MAX, l2 = UINT32_MAX, l3 = UINT32_MAX; - if (flip) + if ((a < 7) || (a > (255 - 7))) + { + for (uint32_t s = 0; s < 8; s++) { - uint32_t ofs = subblock * 2; - for (uint32_t y = 0; y < 2; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - uint32_t t = block_y[y + subblock * 2][x]; - assert(t >= low[subblock] && t <= high[subblock]); - t -= low[subblock]; - assert(t <= 3); + const int v0 = clamp255(mul[0] * g_eac_modifier_table[T0][s] + base[0]); + const int v1 = clamp255(mul[1] * g_eac_modifier_table[T1][s] + base[1]); + const int v2 = clamp255(mul[2] * g_eac_modifier_table[T2][s] + base[2]); + const int v3 = clamp255(mul[3] * g_eac_modifier_table[T3][s] + base[3]); - t = g_selector_index_to_etc1[sels[t]]; + l0 = basisu::minimum(l0, (basisu::iabs(v0 - a) << 3) | s); + l1 = basisu::minimum(l1, (basisu::iabs(v1 - a) << 3) | s); + l2 = basisu::minimum(l2, (basisu::iabs(v2 - a) << 3) | s); + l3 = basisu::minimum(l3, (basisu::iabs(v3 - a) << 3) | s); + } + } + else if (mul_or == 1) + { + const int a0 = base[0] - a, a1 = base[1] - a, a2 = base[2] - a, a3 = base[3] - a; - assert(ofs < 16); - l_bitmask |= ((t & 1) << ofs); - h_bitmask |= ((t >> 1) << ofs); - ofs += 4; - } + for (uint32_t s = 0; s < 8; s++) + { + const int v0 = g_eac_modifier_table[T0][s] + a0; + const int v1 = g_eac_modifier_table[T1][s] + a1; + const int v2 = g_eac_modifier_table[T2][s] + a2; + const int v3 = g_eac_modifier_table[T3][s] + a3; - ofs = (int)ofs + 1 - 4 * 4; - } + l0 = basisu::minimum(l0, (basisu::iabs(v0) << 3) | s); + l1 = basisu::minimum(l1, (basisu::iabs(v1) << 3) | s); + l2 = basisu::minimum(l2, (basisu::iabs(v2) << 3) | s); + l3 = basisu::minimum(l3, (basisu::iabs(v3) << 3) | s); } - else - { - uint32_t ofs = (subblock * 2) * 4; - for (uint32_t x = 0; x < 2; x++) - { - for (uint32_t y = 0; y < 4; y++) - { - uint32_t t = 
block_y[y][x + subblock * 2]; - assert(t >= low[subblock] && t <= high[subblock]); - t -= low[subblock]; - assert(t <= 3); + } + else + { + const int a0 = base[0] - a, a1 = base[1] - a, a2 = base[2] - a, a3 = base[3] - a; - t = g_selector_index_to_etc1[sels[t]]; + for (uint32_t s = 0; s < 8; s++) + { + const int v0 = mul[0] * g_eac_modifier_table[T0][s] + a0; + const int v1 = mul[1] * g_eac_modifier_table[T1][s] + a1; + const int v2 = mul[2] * g_eac_modifier_table[T2][s] + a2; + const int v3 = mul[3] * g_eac_modifier_table[T3][s] + a3; - assert(ofs < 16); - l_bitmask |= ((t & 1) << ofs); - h_bitmask |= ((t >> 1) << ofs); - ++ofs; - } - } + l0 = basisu::minimum(l0, (basisu::iabs(v0) << 3) | s); + l1 = basisu::minimum(l1, (basisu::iabs(v1) << 3) | s); + l2 = basisu::minimum(l2, (basisu::iabs(v2) << 3) | s); + l3 = basisu::minimum(l3, (basisu::iabs(v3) << 3) | s); } - } // subblock + } - dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); - dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); - dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); - dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); + sels[0][i] = l0 & 7; + sels[1][i] = l1 & 7; + sels[2][i] = l2 & 7; + sels[3][i] = l3 & 7; - return true; + total_err[0] += basisu::square(l0 >> 3); + total_err[1] += basisu::square(l1 >> 3); + total_err[2] += basisu::square(l2 >> 3); + total_err[3] += basisu::square(l3 >> 3); } - uint32_t y0 = ((flip ? upper_avg : left_avg) * 31 + 127) / 255; - uint32_t y1 = ((flip ? 
lower_avg : right_avg) * 31 + 127) / 255; + uint32_t min_err = total_err[0], min_index = 0; + for (uint32_t i = 1; i < 4; i++) + { + if (total_err[i] < min_err) + { + min_err = total_err[i]; + min_index = i; + } + } - bool diff = true; + blk.m_base = base[min_index]; + blk.m_multiplier = mul[min_index]; + blk.m_table = s_tables[min_index]; - int dy = y1 - y0; + uint64_t packed_sels = 0; + const uint8_t* pSels = &sels[min_index][0]; + for (uint32_t i = 0; i < 16; i++) + packed_sels |= (static_cast(pSels[i]) << s_etc2_eac_bit_ofs[i]); - if ((dy < cETC1ColorDeltaMin) || (dy > cETC1ColorDeltaMax)) - { - diff = false; + blk.set_selector_bits(packed_sels); + } - y0 = ((flip ? upper_avg : left_avg) * 15 + 127) / 255; - y1 = ((flip ? lower_avg : right_avg) * 15 + 127) / 255; + // Checks all 16 tables. Around ~2 dB better vs. pack_eac(), ~1.2 dB less than near-optimal. + static void pack_eac_high_quality(eac_block& blk, const uint8_t* pPixels, uint32_t stride) + { + uint32_t min_alpha = 255, max_alpha = 0; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t a = pPixels[i * stride]; + if (a < min_alpha) min_alpha = a; + if (a > max_alpha) max_alpha = a; + } - dst_blk.m_bytes[0] = (uint8_t)(y1 | (y0 << 4)); - dst_blk.m_bytes[1] = (uint8_t)(y1 | (y0 << 4)); - dst_blk.m_bytes[2] = (uint8_t)(y1 | (y0 << 4)); + if (min_alpha == max_alpha) + { + pack_eac_solid_block(blk, min_alpha); + return; } - else + + const uint32_t alpha_range = max_alpha - min_alpha; + + const uint32_t SINGLE_TABLE_THRESH = 5; + if (alpha_range <= SINGLE_TABLE_THRESH) { - dy = basisu::clamp(dy, cETC1ColorDeltaMin, cETC1ColorDeltaMax); + // If alpha_range <= 5 table 13 is lossless + int base = clamp255((int)max_alpha - 2); - y1 = y0 + dy; + blk.m_base = base; + blk.m_multiplier = 1; + blk.m_table = 13; - if (dy < 0) dy += 8; + base -= 3; - dst_blk.m_bytes[0] = (uint8_t)((y0 << 3) | dy); - dst_blk.m_bytes[1] = (uint8_t)((y0 << 3) | dy); - dst_blk.m_bytes[2] = (uint8_t)((y0 << 3) | dy); + uint64_t 
packed_sels = 0; + for (uint32_t i = 0; i < 16; i++) + { + const int a = pPixels[i * stride]; - dst_blk.m_bytes[3] |= 2; - } + static const uint8_t s_sels[6] = { 2, 1, 0, 4, 5, 6 }; - const uint32_t base_y[2] = { diff ? ((y0 << 3) | (y0 >> 2)) : ((y0 << 4) | y0), diff ? ((y1 << 3) | (y1 >> 2)) : ((y1 << 4) | y1) }; + int sel = a - base; + assert(sel >= 0 && sel <= 5); - uint32_t enc_range[2]; - for (uint32_t subset = 0; subset < 2; subset++) - { - const int pos = basisu::iabs((int)high[subset] - (int)base_y[subset]); - const int neg = basisu::iabs((int)base_y[subset] - (int)low[subset]); + packed_sels |= (static_cast(s_sels[sel]) << s_etc2_eac_bit_ofs[i]); + } - enc_range[subset] = basisu::maximum(pos, neg); + blk.set_selector_bits(packed_sels); + + return; } - uint16_t l_bitmask = 0, h_bitmask = 0; - for (uint32_t subblock = 0; subblock < 2; subblock++) + int base[16], mul[16]; + for (uint32_t table = 0; table < 16; table++) { - if ((!diff) && (range[subblock] <= 3)) - { - const uint32_t encoded = (range[subblock] == 0) ? g_etc1_y_solid_block_1i_configs[low[subblock]] : ((range[subblock] < 2) ? g_etc1_y_solid_block_2i_configs[low[subblock]] : g_etc1_y_solid_block_4i_configs[low[subblock]]); - - const uint32_t table = encoded & 7; - const uint32_t base = (encoded >> 3) & 31; - assert(base <= 15); - const uint32_t sels[4] = { (encoded >> 8) & 3, (encoded >> 10) & 3, (encoded >> 12) & 3, (encoded >> 14) & 3 }; - - dst_blk.m_bytes[3] |= (uint8_t)(table << (subblock ? 2 : 5)); + const float range = (float)(g_eac_modifier_table[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]); - const uint32_t mask = ~(0xF << (subblock ? 
0 : 4)); + base[table] = clamp255((int)roundf(basisu::lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range))); + mul[table] = clampi((int)roundf(alpha_range / range), 1, 15); + } - dst_blk.m_bytes[0] &= mask; - dst_blk.m_bytes[1] &= mask; - dst_blk.m_bytes[2] &= mask; + uint32_t total_err[16]; + memset(total_err, 0, sizeof(total_err)); - const uint32_t sv = base << (subblock ? 0 : 4); - dst_blk.m_bytes[0] |= (uint8_t)(sv); - dst_blk.m_bytes[1] |= (uint8_t)(sv); - dst_blk.m_bytes[2] |= (uint8_t)(sv); + uint8_t sels[16][16]; - if (flip) - { - uint32_t ofs = subblock * 2; - for (uint32_t y = 0; y < 2; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - uint32_t t = block_y[y + subblock * 2][x]; - assert(t >= low[subblock] && t <= high[subblock]); - t -= low[subblock]; - assert(t <= 3); + for (uint32_t table = 0; table < 16; table++) + { + const int8_t* pTable = &g_eac_modifier_table[table][0]; + const int m = mul[table], b = base[table]; - t = g_selector_index_to_etc1[sels[t]]; + uint32_t prev_l = 0, prev_a = UINT32_MAX; - assert(ofs < 16); - l_bitmask |= ((t & 1) << ofs); - h_bitmask |= ((t >> 1) << ofs); - ofs += 4; - } + for (uint32_t i = 0; i < 16; i++) + { + const int a = pPixels[i * stride]; - ofs = (int)ofs + 1 - 4 * 4; - } + if ((uint32_t)a == prev_a) + { + sels[table][i] = prev_l & 7; + total_err[table] += basisu::square(prev_l >> 3); } else { - uint32_t ofs = (subblock * 2) * 4; - for (uint32_t x = 0; x < 2; x++) - { - for (uint32_t y = 0; y < 4; y++) - { - uint32_t t = block_y[y][x + subblock * 2]; - assert(t >= low[subblock] && t <= high[subblock]); - t -= low[subblock]; - assert(t <= 3); + uint32_t l = basisu::iabs(clamp255(m * pTable[0] + b) - a) << 3; + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[1] + b) - a) << 3) | 1); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[2] + b) - a) << 3) | 2); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[3] + b) - a) << 
3) | 3); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[4] + b) - a) << 3) | 4); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[5] + b) - a) << 3) | 5); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[6] + b) - a) << 3) | 6); + l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[7] + b) - a) << 3) | 7); - t = g_selector_index_to_etc1[sels[t]]; + sels[table][i] = l & 7; + total_err[table] += basisu::square(l >> 3); + + prev_l = l; + prev_a = a; + } + } + } + + uint32_t min_err = total_err[0], min_index = 0; + for (uint32_t i = 1; i < 16; i++) + { + if (total_err[i] < min_err) + { + min_err = total_err[i]; + min_index = i; + } + } + + blk.m_base = base[min_index]; + blk.m_multiplier = mul[min_index]; + blk.m_table = min_index; + + uint64_t packed_sels = 0; + const uint8_t* pSels = &sels[min_index][0]; + for (uint32_t i = 0; i < 16; i++) + packed_sels |= (static_cast(pSels[i]) << s_etc2_eac_bit_ofs[i]); + + blk.set_selector_bits(packed_sels); + } - assert(ofs < 16); - l_bitmask |= ((t & 1) << ofs); - h_bitmask |= ((t >> 1) << ofs); - ++ofs; - } - } - } + bool transcode_uastc_to_etc2_eac_r11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; - continue; - } // if + const uint32_t mode = unpacked_src_blk.m_mode; - uint32_t best_err = UINT32_MAX; - uint8_t best_sels[8]; - uint32_t best_inten = 0; + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + pack_eac_solid_block(*static_cast(pDst), unpacked_src_blk.m_solid_color.c[chan0]); + return true; + } - const int base = base_y[subblock]; + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; - const int low_limit = -base; - const int high_limit = 255 - base; + if (chan0 == 3) + transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, pDst); + else 
+ (high_quality ? pack_eac_high_quality : pack_eac)(*static_cast(pDst), &block_pixels[0][0].c[chan0], sizeof(color32)); - assert(low_limit <= 0 && high_limit >= 0); + return true; + } - uint32_t inten_table_mask = 0xFF; - const uint32_t er = enc_range[subblock]; - // Each one of these tables is expensive to evaluate, so let's only examine the ones we know may be useful. - if (er <= 51) - { - inten_table_mask = 0xF; + bool transcode_uastc_to_etc2_eac_rg11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1) + { + unpacked_uastc_block unpacked_src_blk; + if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + return false; - if (er > 22) - inten_table_mask &= ~(1 << 0); + const uint32_t mode = unpacked_src_blk.m_mode; - if ((er < 4) || (er > 39)) - inten_table_mask &= ~(1 << 1); + if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + { + pack_eac_solid_block(static_cast(pDst)[0], unpacked_src_blk.m_solid_color.c[chan0]); + pack_eac_solid_block(static_cast(pDst)[1], unpacked_src_blk.m_solid_color.c[chan1]); + return true; + } - if (er < 9) - inten_table_mask &= ~(1 << 2); + color32 block_pixels[4][4]; + const bool unpack_srgb = false; + if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + return false; - if (er < 12) - inten_table_mask &= ~(1 << 3); - } - else - { - inten_table_mask &= ~((1 << 0) | (1 << 1)); + if (chan0 == 3) + transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &static_cast(pDst)[0]); + else + (high_quality ? pack_eac_high_quality : pack_eac)(static_cast(pDst)[0], &block_pixels[0][0].c[chan0], sizeof(color32)); - if (er > 60) - inten_table_mask &= ~(1 << 2); + if (chan1 == 3) + transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &static_cast(pDst)[1]); + else + (high_quality ? 
pack_eac_high_quality : pack_eac)(static_cast(pDst)[1], &block_pixels[0][0].c[chan1], sizeof(color32)); + return true; + } - if (er > 89) - inten_table_mask &= ~(1 << 3); + // PVRTC1 + static void fixup_pvrtc1_4_modulation_rgb( + const uastc_block* pSrc_blocks, + const uint32_t* pPVRTC_endpoints, + void* pDst_blocks, + uint32_t num_blocks_x, uint32_t num_blocks_y, bool from_alpha) + { + const uint32_t x_mask = num_blocks_x - 1; + const uint32_t y_mask = num_blocks_y - 1; + const uint32_t x_bits = basisu::total_bits(x_mask); + const uint32_t y_bits = basisu::total_bits(y_mask); + const uint32_t min_bits = basisu::minimum(x_bits, y_bits); + //const uint32_t max_bits = basisu::maximum(x_bits, y_bits); + const uint32_t swizzle_mask = (1 << (min_bits * 2)) - 1; - if (er > 120) - inten_table_mask &= ~(1 << 4); + uint32_t block_index = 0; - if (er > 136) - inten_table_mask &= ~(1 << 5); + // really 3x3 + int e0[4][4], e1[4][4]; - if (er > 174) - inten_table_mask &= ~(1 << 6); - } + for (int y = 0; y < static_cast(num_blocks_y); y++) + { + const uint32_t* pE_rows[3]; - for (uint32_t inten = 0; inten < 8; inten++) + for (int ey = 0; ey < 3; ey++) { - if ((inten_table_mask & (1 << inten)) == 0) - continue; + int by = y + ey - 1; - const int t0 = basisu::maximum(low_limit, g_etc1_inten_tables[inten][0]); - const int t1 = basisu::maximum(low_limit, g_etc1_inten_tables[inten][1]); - const int t2 = basisu::minimum(high_limit, g_etc1_inten_tables[inten][2]); - const int t3 = basisu::minimum(high_limit, g_etc1_inten_tables[inten][3]); - assert((t0 <= t1) && (t1 <= t2) && (t2 <= t3)); + const uint32_t* pE = &pPVRTC_endpoints[(by & y_mask) * num_blocks_x]; - const int tv[4] = { t2, t3, t1, t0 }; + pE_rows[ey] = pE; - const int thresh01 = t0 + t1; - const int thresh12 = t1 + t2; - const int thresh23 = t2 + t3; + for (int ex = 0; ex < 3; ex++) + { + int bx = 0 + ex - 1; - assert(thresh01 <= thresh12 && thresh12 <= thresh23); + const uint32_t e = pE[bx & x_mask]; - static const uint8_t 
s_table[4] = { 1, 0, 2, 3 }; + e0[ex][ey] = (get_opaque_endpoint_l0(e) * 255) / 31; + e1[ex][ey] = (get_opaque_endpoint_l1(e) * 255) / 31; + } + } - uint32_t total_err = 0; - uint8_t sels[8]; + const uint32_t y_swizzle = (g_pvrtc_swizzle_table[y >> 8] << 16) | g_pvrtc_swizzle_table[y & 0xFF]; - if (flip) + for (int x = 0; x < static_cast(num_blocks_x); x++, block_index++) + { + const uastc_block& src_block = pSrc_blocks[block_index]; + + color32 block_pixels[4][4]; + unpack_uastc(src_block, &block_pixels[0][0], false); + if (from_alpha) { - if (((int)high[subblock] - base) * 2 < thresh01) + // Just set RGB to alpha to avoid adding complexity below. + for (uint32_t i = 0; i < 16; i++) { - memset(sels, 3, 8); + const uint8_t a = ((color32*)block_pixels)[i].a; + ((color32*)block_pixels)[i].set(a, a, a, 255); + } + } - for (uint32_t y = 0; y < 2; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - const int delta = (int)block_y[y + subblock * 2][x] - base; + const uint32_t x_swizzle = (g_pvrtc_swizzle_table[x >> 8] << 17) | (g_pvrtc_swizzle_table[x & 0xFF] << 1); - const uint32_t c = 3; + uint32_t swizzled = x_swizzle | y_swizzle; + if (num_blocks_x != num_blocks_y) + { + swizzled &= swizzle_mask; - uint32_t e = basisu::iabs(tv[c] - delta); - total_err += e * e; - } - if (total_err >= best_err) - break; - } - } - else if (((int)low[subblock] - base) * 2 >= thresh23) - { - memset(sels, 1, 8); + if (num_blocks_x > num_blocks_y) + swizzled |= ((x >> min_bits) << (min_bits * 2)); + else + swizzled |= ((y >> min_bits) << (min_bits * 2)); + } - for (uint32_t y = 0; y < 2; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - const int delta = (int)block_y[y + subblock * 2][x] - base; + pvrtc4_block* pDst_block = static_cast(pDst_blocks) + swizzled; + pDst_block->m_endpoints = pPVRTC_endpoints[block_index]; - const uint32_t c = 1; + { + const uint32_t ex = 2; + int bx = x + ex - 1; + bx &= x_mask; - uint32_t e = basisu::iabs(tv[c] - delta); - total_err += e * e; - } - if (total_err 
>= best_err) - break; - } +#define DO_ROW(ey) \ + { \ + const uint32_t e = pE_rows[ey][bx]; \ + e0[ex][ey] = (get_opaque_endpoint_l0(e) * 255) / 31; \ + e1[ex][ey] = (get_opaque_endpoint_l1(e) * 255) / 31; \ } - else - { - for (uint32_t y = 0; y < 2; y++) - { - for (uint32_t x = 0; x < 4; x++) - { - const int delta = (int)block_y[y + subblock * 2][x] - base; - const int delta2 = delta * 2; - uint32_t c = s_table[(delta2 < thresh01) + (delta2 < thresh12) + (delta2 < thresh23)]; - sels[y * 4 + x] = (uint8_t)c; + DO_ROW(0); + DO_ROW(1); + DO_ROW(2); +#undef DO_ROW + } - uint32_t e = basisu::iabs(tv[c] - delta); - total_err += e * e; - } - if (total_err >= best_err) - break; - } - } + uint32_t mod = 0; + +#define DO_PIX(lx, ly, w0, w1, w2, w3) \ + { \ + int ca_l = a0 * w0 + a1 * w1 + a2 * w2 + a3 * w3; \ + int cb_l = b0 * w0 + b1 * w1 + b2 * w2 + b3 * w3; \ + int cl = (block_pixels[ly][lx].r + block_pixels[ly][lx].g + block_pixels[ly][lx].b) * 16; \ + int dl = cb_l - ca_l; \ + int vl = cl - ca_l; \ + int p = vl * 16; \ + if (ca_l > cb_l) { p = -p; dl = -dl; } \ + uint32_t m = 0; \ + if (p > 3 * dl) m = (uint32_t)(1 << ((ly) * 8 + (lx) * 2)); \ + if (p > 8 * dl) m = (uint32_t)(2 << ((ly) * 8 + (lx) * 2)); \ + if (p > 13 * dl) m = (uint32_t)(3 << ((ly) * 8 + (lx) * 2)); \ + mod |= m; \ + } + + { + const uint32_t ex = 0, ey = 0; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(0, 0, 4, 4, 4, 4); + DO_PIX(1, 0, 2, 6, 2, 6); + DO_PIX(0, 1, 2, 2, 6, 6); + DO_PIX(1, 1, 1, 3, 3, 9); + } + + { + const uint32_t ex = 1, ey = 0; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(2, 0, 8, 0, 8, 0); + DO_PIX(3, 0, 6, 2, 6, 2); + DO_PIX(2, 1, 4, 0, 12, 0); + DO_PIX(3, 1, 3, 1, 9, 3); } - 
else + { - if (((int)high[subblock] - base) * 2 < thresh01) - { - memset(sels, 3, 8); + const uint32_t ex = 0, ey = 1; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(0, 2, 8, 8, 0, 0); + DO_PIX(1, 2, 4, 12, 0, 0); + DO_PIX(0, 3, 6, 6, 2, 2); + DO_PIX(1, 3, 3, 9, 1, 3); + } - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 2; x++) - { - const int delta = (int)block_y[y][x + subblock * 2] - base; + { + const uint32_t ex = 1, ey = 1; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(2, 2, 16, 0, 0, 0); + DO_PIX(3, 2, 12, 4, 0, 0); + DO_PIX(2, 3, 12, 0, 4, 0); + DO_PIX(3, 3, 9, 3, 3, 1); + } +#undef DO_PIX - const uint32_t c = 3; + pDst_block->m_modulation = mod; - uint32_t e = basisu::iabs(tv[c] - delta); - total_err += e * e; - } - if (total_err >= best_err) - break; - } - } - else if (((int)low[subblock] - base) * 2 >= thresh23) - { - memset(sels, 1, 8); + e0[0][0] = e0[1][0]; e0[1][0] = e0[2][0]; + e0[0][1] = e0[1][1]; e0[1][1] = e0[2][1]; + e0[0][2] = e0[1][2]; e0[1][2] = e0[2][2]; - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 2; x++) - { - const int delta = (int)block_y[y][x + subblock * 2] - base; + e1[0][0] = e1[1][0]; e1[1][0] = e1[2][0]; + e1[0][1] = e1[1][1]; e1[1][1] = e1[2][1]; + e1[0][2] = e1[1][2]; e1[1][2] = e1[2][2]; - const uint32_t c = 1; + } // x + } // y + } - uint32_t e = basisu::iabs(tv[c] - delta); - total_err += e * e; - } - if (total_err >= best_err) - break; - } - } - else - { - for (uint32_t y = 0; y < 4; y++) - { - for (uint32_t x = 0; x < 2; x++) - { - const int delta = (int)block_y[y][x + subblock * 2] - base; - const int delta2 = delta * 2; + static void fixup_pvrtc1_4_modulation_rgba( + const 
uastc_block* pSrc_blocks, + const uint32_t* pPVRTC_endpoints, + void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y) + { + const uint32_t x_mask = num_blocks_x - 1; + const uint32_t y_mask = num_blocks_y - 1; + const uint32_t x_bits = basisu::total_bits(x_mask); + const uint32_t y_bits = basisu::total_bits(y_mask); + const uint32_t min_bits = basisu::minimum(x_bits, y_bits); + //const uint32_t max_bits = basisu::maximum(x_bits, y_bits); + const uint32_t swizzle_mask = (1 << (min_bits * 2)) - 1; - uint32_t c = s_table[(delta2 < thresh01) + (delta2 < thresh12) + (delta2 < thresh23)]; - sels[y * 2 + x] = (uint8_t)c; + uint32_t block_index = 0; - uint32_t e = basisu::iabs(tv[c] - delta); - total_err += e * e; - } - if (total_err >= best_err) - break; - } - } - } + // really 3x3 + int e0[4][4], e1[4][4]; - if (total_err < best_err) - { - best_err = total_err; - best_inten = inten; - memcpy(best_sels, sels, 8); - } + for (int y = 0; y < static_cast(num_blocks_y); y++) + { + const uint32_t* pE_rows[3]; - } // inten + for (int ey = 0; ey < 3; ey++) + { + int by = y + ey - 1; - //g_inten_hist[best_inten][enc_range[subblock]]++; + const uint32_t* pE = &pPVRTC_endpoints[(by & y_mask) * num_blocks_x]; - dst_blk.m_bytes[3] |= (uint8_t)(best_inten << (subblock ? 
2 : 5)); + pE_rows[ey] = pE; - if (flip) - { - uint32_t ofs = subblock * 2; - for (uint32_t y = 0; y < 2; y++) + for (int ex = 0; ex < 3; ex++) { - for (uint32_t x = 0; x < 4; x++) - { - uint32_t t = best_sels[y * 4 + x]; + int bx = 0 + ex - 1; - assert(ofs < 16); - l_bitmask |= ((t & 1) << ofs); - h_bitmask |= ((t >> 1) << ofs); - ofs += 4; - } + const uint32_t e = pE[bx & x_mask]; - ofs = (int)ofs + 1 - 4 * 4; + e0[ex][ey] = get_endpoint_l8(e, 0); + e1[ex][ey] = get_endpoint_l8(e, 1); } } - else + + const uint32_t y_swizzle = (g_pvrtc_swizzle_table[y >> 8] << 16) | g_pvrtc_swizzle_table[y & 0xFF]; + + for (int x = 0; x < static_cast(num_blocks_x); x++, block_index++) { - uint32_t ofs = (subblock * 2) * 4; - for (uint32_t x = 0; x < 2; x++) + const uastc_block& src_block = pSrc_blocks[block_index]; + + color32 block_pixels[4][4]; + unpack_uastc(src_block, &block_pixels[0][0], false); + + const uint32_t x_swizzle = (g_pvrtc_swizzle_table[x >> 8] << 17) | (g_pvrtc_swizzle_table[x & 0xFF] << 1); + + uint32_t swizzled = x_swizzle | y_swizzle; + if (num_blocks_x != num_blocks_y) { - for (uint32_t y = 0; y < 4; y++) - { - uint32_t t = best_sels[y * 2 + x]; + swizzled &= swizzle_mask; - assert(ofs < 16); - l_bitmask |= ((t & 1) << ofs); - h_bitmask |= ((t >> 1) << ofs); - ++ofs; - } + if (num_blocks_x > num_blocks_y) + swizzled |= ((x >> min_bits) << (min_bits * 2)); + else + swizzled |= ((y >> min_bits) << (min_bits * 2)); } - } - - } // subblock - dst_blk.m_bytes[7] = (uint8_t)(l_bitmask); - dst_blk.m_bytes[6] = (uint8_t)(l_bitmask >> 8); - dst_blk.m_bytes[5] = (uint8_t)(h_bitmask); - dst_blk.m_bytes[4] = (uint8_t)(h_bitmask >> 8); + pvrtc4_block* pDst_block = static_cast(pDst_blocks) + swizzled; + pDst_block->m_endpoints = pPVRTC_endpoints[block_index]; - return true; - } + { + const uint32_t ex = 2; + int bx = x + ex - 1; + bx &= x_mask; - const uint32_t ETC2_EAC_MIN_VALUE_SELECTOR = 3, ETC2_EAC_MAX_VALUE_SELECTOR = 7; +#define DO_ROW(ey) \ + { \ + const uint32_t e = 
pE_rows[ey][bx]; \ + e0[ex][ey] = get_endpoint_l8(e, 0); \ + e1[ex][ey] = get_endpoint_l8(e, 1); \ + } - void transcode_uastc_to_etc2_eac_a8(unpacked_uastc_block& unpacked_src_blk, color32 block_pixels[4][4], void* pDst) - { - eac_block& dst = *static_cast(pDst); - const color32* pSrc_pixels = &block_pixels[0][0]; + DO_ROW(0); + DO_ROW(1); + DO_ROW(2); +#undef DO_ROW + } - if ((!g_uastc_mode_has_alpha[unpacked_src_blk.m_mode]) || (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR)) - { - const uint32_t a = (unpacked_src_blk.m_mode == UASTC_MODE_INDEX_SOLID_COLOR) ? unpacked_src_blk.m_solid_color[3] : 255; + uint32_t mod = 0; - dst.m_base = a; - dst.m_table = 13; - dst.m_multiplier = 1; +#define DO_PIX(lx, ly, w0, w1, w2, w3) \ + { \ + int ca_l = a0 * w0 + a1 * w1 + a2 * w2 + a3 * w3; \ + int cb_l = b0 * w0 + b1 * w1 + b2 * w2 + b3 * w3; \ + int cl = 16 * (block_pixels[ly][lx].r + block_pixels[ly][lx].g + block_pixels[ly][lx].b + block_pixels[ly][lx].a); \ + int dl = cb_l - ca_l; \ + int vl = cl - ca_l; \ + int p = vl * 16; \ + if (ca_l > cb_l) { p = -p; dl = -dl; } \ + uint32_t m = 0; \ + if (p > 3 * dl) m = (uint32_t)(1 << ((ly) * 8 + (lx) * 2)); \ + if (p > 8 * dl) m = (uint32_t)(2 << ((ly) * 8 + (lx) * 2)); \ + if (p > 13 * dl) m = (uint32_t)(3 << ((ly) * 8 + (lx) * 2)); \ + mod |= m; \ + } - memcpy(dst.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); + { + const uint32_t ex = 0, ey = 0; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(0, 0, 4, 4, 4, 4); + DO_PIX(1, 0, 2, 6, 2, 6); + DO_PIX(0, 1, 2, 2, 6, 6); + DO_PIX(1, 1, 1, 3, 3, 9); + } - return; - } + { + const uint32_t ex = 1, ey = 0; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(2, 0, 
8, 0, 8, 0); + DO_PIX(3, 0, 6, 2, 6, 2); + DO_PIX(2, 1, 4, 0, 12, 0); + DO_PIX(3, 1, 3, 1, 9, 3); + } - uint32_t min_a = 255, max_a = 0; - for (uint32_t i = 0; i < 16; i++) - { - min_a = basisu::minimum(min_a, pSrc_pixels[i].a); - max_a = basisu::maximum(max_a, pSrc_pixels[i].a); - } + { + const uint32_t ex = 0, ey = 1; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(0, 2, 8, 8, 0, 0); + DO_PIX(1, 2, 4, 12, 0, 0); + DO_PIX(0, 3, 6, 6, 2, 2); + DO_PIX(1, 3, 3, 9, 1, 3); + } - if (min_a == max_a) - { - dst.m_base = min_a; - dst.m_table = 13; - dst.m_multiplier = 1; + { + const uint32_t ex = 1, ey = 1; + const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; + const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; + DO_PIX(2, 2, 16, 0, 0, 0); + DO_PIX(3, 2, 12, 4, 0, 0); + DO_PIX(2, 3, 12, 0, 4, 0); + DO_PIX(3, 3, 9, 3, 3, 1); + } +#undef DO_PIX - memcpy(dst.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); - return; - } + pDst_block->m_modulation = mod; - const uint32_t table = unpacked_src_blk.m_etc2_hints & 0xF; - const int multiplier = unpacked_src_blk.m_etc2_hints >> 4; + e0[0][0] = e0[1][0]; e0[1][0] = e0[2][0]; + e0[0][1] = e0[1][1]; e0[1][1] = e0[2][1]; + e0[0][2] = e0[1][2]; e0[1][2] = e0[2][2]; - assert(multiplier >= 1); + e1[0][0] = e1[1][0]; e1[1][0] = e1[2][0]; + e1[0][1] = e1[1][1]; e1[1][1] = e1[2][1]; + e1[0][2] = e1[1][2]; e1[1][2] = e1[2][2]; - dst.m_multiplier = multiplier; - dst.m_table = table; + } // x + } // y + } - const float range = (float)(g_eac_modifier_table[dst.m_table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[dst.m_table][ETC2_EAC_MIN_VALUE_SELECTOR]); - const int center = (int)roundf(basisu::lerp((float)min_a, (float)max_a, (float)(0 - 
g_eac_modifier_table[dst.m_table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range)); + bool transcode_uastc_to_pvrtc1_4_rgb(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality, bool from_alpha) + { + BASISU_NOTE_UNUSED(high_quality); - dst.m_base = center; + if ((!num_blocks_x) || (!num_blocks_y)) + return false; - const int8_t* pTable = &g_eac_modifier_table[dst.m_table][0]; + const uint32_t width = num_blocks_x * 4; + const uint32_t height = num_blocks_y * 4; + if (!basisu::is_pow2(width) || !basisu::is_pow2(height)) + return false; - uint32_t vals[8]; - for (uint32_t j = 0; j < 8; j++) - vals[j] = clamp255(center + (pTable[j] * multiplier)); + basisu::vector temp_endpoints(num_blocks_x * num_blocks_y); - uint64_t sels = 0; - for (uint32_t i = 0; i < 16; i++) + for (uint32_t y = 0; y < num_blocks_y; y++) { - const uint32_t a = block_pixels[i & 3][i >> 2].a; + for (uint32_t x = 0; x < num_blocks_x; x++) + { + color32 block_pixels[16]; + if (!unpack_uastc(pSrc_blocks[x + y * num_blocks_x], block_pixels, false)) + return false; - const uint32_t err0 = (basisu::iabs(vals[0] - a) << 3) | 0; - const uint32_t err1 = (basisu::iabs(vals[1] - a) << 3) | 1; - const uint32_t err2 = (basisu::iabs(vals[2] - a) << 3) | 2; - const uint32_t err3 = (basisu::iabs(vals[3] - a) << 3) | 3; - const uint32_t err4 = (basisu::iabs(vals[4] - a) << 3) | 4; - const uint32_t err5 = (basisu::iabs(vals[5] - a) << 3) | 5; - const uint32_t err6 = (basisu::iabs(vals[6] - a) << 3) | 6; - const uint32_t err7 = (basisu::iabs(vals[7] - a) << 3) | 7; + // Get block's RGB bounding box + color32 low_color(255, 255, 255, 255), high_color(0, 0, 0, 0); - const uint32_t min_err = basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(basisu::minimum(err0, err1, err2), err3), err4), err5), err6), err7); + if (from_alpha) + { + uint32_t low_a = 255, high_a = 0; + for (uint32_t i = 0; i < 16; i++) + { + low_a = basisu::minimum(low_a, 
block_pixels[i].a); + high_a = basisu::maximum(high_a, block_pixels[i].a); + } + low_color.set(low_a, low_a, low_a, 255); + high_color.set(high_a, high_a, high_a, 255); + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + low_color = color32::comp_min(low_color, block_pixels[i]); + high_color = color32::comp_max(high_color, block_pixels[i]); + } + } - const uint64_t best_index = min_err & 7; - sels |= (best_index << (45 - i * 3)); + // Set PVRTC1 endpoints to floor/ceil of bounding box's coordinates. + pvrtc4_block temp; + temp.set_opaque_endpoint_floor(0, low_color); + temp.set_opaque_endpoint_ceil(1, high_color); + + temp_endpoints[x + y * num_blocks_x] = temp.m_endpoints; + } } - dst.set_selector_bits(sels); + fixup_pvrtc1_4_modulation_rgb(pSrc_blocks, &temp_endpoints[0], pDst_blocks, num_blocks_x, num_blocks_y, from_alpha); + + return true; } - bool transcode_uastc_to_etc2_rgba(const uastc_block& src_blk, void* pDst) + bool transcode_uastc_to_pvrtc1_4_rgba(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality) { - eac_block& dst_etc2_eac_a8_blk = *static_cast(pDst); - decoder_etc_block& dst_etc1_blk = static_cast(pDst)[1]; + BASISU_NOTE_UNUSED(high_quality); - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) + if ((!num_blocks_x) || (!num_blocks_y)) return false; - color32 block_pixels[4][4]; - if (unpacked_src_blk.m_mode != UASTC_MODE_INDEX_SOLID_COLOR) - { - const bool unpack_srgb = false; - if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) - return false; - } - - transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &dst_etc2_eac_a8_blk); - - transcode_uastc_to_etc1(unpacked_src_blk, block_pixels, &dst_etc1_blk); - - return true; - } + const uint32_t width = num_blocks_x * 4; + const uint32_t height = num_blocks_y * 4; + if (!basisu::is_pow2(width) || !basisu::is_pow2(height)) + return false; - static const uint8_t 
s_uastc5_to_bc1[32] = { 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1 }; - static const uint8_t s_uastc4_to_bc1[16] = { 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 1, 1, 1 }; - static const uint8_t s_uastc3_to_bc1[8] = { 0, 0, 2, 2, 3, 3, 1, 1 }; - static const uint8_t s_uastc2_to_bc1[4] = { 0, 2, 3, 1 }; - static const uint8_t s_uastc1_to_bc1[2] = { 0, 1 }; - const uint8_t* s_uastc_to_bc1_weights[6] = { nullptr, s_uastc1_to_bc1, s_uastc2_to_bc1, s_uastc3_to_bc1, s_uastc4_to_bc1, s_uastc5_to_bc1 }; - - void encode_bc4(void* pDst, const uint8_t* pPixels, uint32_t stride) - { - uint32_t min0_v, max0_v, min1_v, max1_v,min2_v, max2_v, min3_v, max3_v; + basisu::vector temp_endpoints(num_blocks_x * num_blocks_y); + for (uint32_t y = 0; y < num_blocks_y; y++) { - min0_v = max0_v = pPixels[0 * stride]; - min1_v = max1_v = pPixels[1 * stride]; - min2_v = max2_v = pPixels[2 * stride]; - min3_v = max3_v = pPixels[3 * stride]; - } + for (uint32_t x = 0; x < num_blocks_x; x++) + { + color32 block_pixels[16]; + if (!unpack_uastc(pSrc_blocks[x + y * num_blocks_x], block_pixels, false)) + return false; - { - uint32_t v0 = pPixels[4 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); - uint32_t v1 = pPixels[5 * stride]; min1_v = basisu::minimum(min1_v, v1); max1_v = basisu::maximum(max1_v, v1); - uint32_t v2 = pPixels[6 * stride]; min2_v = basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); - uint32_t v3 = pPixels[7 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); - } + // Get block's RGBA bounding box + color32 low_color(255, 255, 255, 255), high_color(0, 0, 0, 0); - { - uint32_t v0 = pPixels[8 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); - uint32_t v1 = pPixels[9 * stride]; min1_v = basisu::minimum(min1_v, v1); max1_v = basisu::maximum(max1_v, v1); - uint32_t v2 = pPixels[10 * stride]; min2_v = 
basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); - uint32_t v3 = pPixels[11 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); - } + for (uint32_t i = 0; i < 16; i++) + { + low_color = color32::comp_min(low_color, block_pixels[i]); + high_color = color32::comp_max(high_color, block_pixels[i]); + } - { - uint32_t v0 = pPixels[12 * stride]; min0_v = basisu::minimum(min0_v, v0); max0_v = basisu::maximum(max0_v, v0); - uint32_t v1 = pPixels[13 * stride]; min1_v = basisu::minimum(min1_v, v1); max1_v = basisu::maximum(max1_v, v1); - uint32_t v2 = pPixels[14 * stride]; min2_v = basisu::minimum(min2_v, v2); max2_v = basisu::maximum(max2_v, v2); - uint32_t v3 = pPixels[15 * stride]; min3_v = basisu::minimum(min3_v, v3); max3_v = basisu::maximum(max3_v, v3); + // Set PVRTC1 endpoints to floor/ceil of bounding box's coordinates. + pvrtc4_block temp; + temp.set_endpoint_floor(0, low_color); + temp.set_endpoint_ceil(1, high_color); + + temp_endpoints[x + y * num_blocks_x] = temp.m_endpoints; + } } - const uint32_t min_v = basisu::minimum(min0_v, min1_v, min2_v, min3_v); - const uint32_t max_v = basisu::maximum(max0_v, max1_v, max2_v, max3_v); + fixup_pvrtc1_4_modulation_rgba(pSrc_blocks, &temp_endpoints[0], pDst_blocks, num_blocks_x, num_blocks_y); - uint8_t* pDst_bytes = static_cast(pDst); - pDst_bytes[0] = (uint8_t)max_v; - pDst_bytes[1] = (uint8_t)min_v; + return true; + } - if (max_v == min_v) + void uastc_init() + { + for (uint32_t range = 0; range < BC7ENC_TOTAL_ASTC_RANGES; range++) { - memset(pDst_bytes + 2, 0, 6); - return; - } - - const uint32_t delta = max_v - min_v; + if (!astc_is_valid_endpoint_range(range)) + continue; - // min_v is now 0. Compute thresholds between values by scaling max_v. It's x14 because we're adding two x7 scale factors. 
- const int t0 = delta * 13; - const int t1 = delta * 11; - const int t2 = delta * 9; - const int t3 = delta * 7; - const int t4 = delta * 5; - const int t5 = delta * 3; - const int t6 = delta * 1; + const uint32_t levels = astc_get_levels(range); - // BC4 floors in its divisions, which we compensate for with the 4 bias. - // This function is optimal for all possible inputs (i.e. it outputs the same results as checking all 8 values and choosing the closest one). - const int bias = 4 - min_v * 14; + uint32_t vals[256]; + for (uint32_t i = 0; i < levels; i++) + vals[i] = (unquant_astc_endpoint_val(i, range) << 8) | i; - static const uint32_t s_tran0[8] = { 1U , 7U , 6U , 5U , 4U , 3U , 2U , 0U }; - static const uint32_t s_tran1[8] = { 1U << 3U, 7U << 3U, 6U << 3U, 5U << 3U, 4U << 3U, 3U << 3U, 2U << 3U, 0U << 3U }; - static const uint32_t s_tran2[8] = { 1U << 6U, 7U << 6U, 6U << 6U, 5U << 6U, 4U << 6U, 3U << 6U, 2U << 6U, 0U << 6U }; - static const uint32_t s_tran3[8] = { 1U << 9U, 7U << 9U, 6U << 9U, 5U << 9U, 4U << 9U, 3U << 9U, 2U << 9U, 0U << 9U }; + std::sort(vals, vals + levels); - uint64_t a0, a1, a2, a3; - { - const int v0 = pPixels[0 * stride] * 14 + bias; - const int v1 = pPixels[1 * stride] * 14 + bias; - const int v2 = pPixels[2 * stride] * 14 + bias; - const int v3 = pPixels[3 * stride] * 14 + bias; - a0 = s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]; - a1 = s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]; - a2 = s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]; - a3 = s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]; - } + for (uint32_t i = 0; i < levels; i++) + { + const uint32_t order = vals[i] & 0xFF; + const uint32_t unq = vals[i] >> 8; - { - const int v0 = pPixels[4 * stride] * 14 + bias; - const int v1 = pPixels[5 * stride] * 14 + bias; - const int 
v2 = pPixels[6 * stride] * 14 + bias; - const int v3 = pPixels[7 * stride] * 14 + bias; - a0 |= (s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)] << 12U); - a1 |= (s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)] << 12U); - a2 |= (s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)] << 12U); - a3 |= (s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)] << 12U); - } - - { - const int v0 = pPixels[8 * stride] * 14 + bias; - const int v1 = pPixels[9 * stride] * 14 + bias; - const int v2 = pPixels[10 * stride] * 14 + bias; - const int v3 = pPixels[11 * stride] * 14 + bias; - a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 24U); - a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 24U); - a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 24U); - a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 24U); + g_astc_unquant[range][order].m_unquant = (uint8_t)unq; + g_astc_unquant[range][order].m_index = (uint8_t)i; + + } // i } + // TODO: Precompute? 
+ // BC7 777.1 + for (int c = 0; c < 256; c++) { - const int v0 = pPixels[12 * stride] * 14 + bias; - const int v1 = pPixels[13 * stride] * 14 + bias; - const int v2 = pPixels[14 * stride] * 14 + bias; - const int v3 = pPixels[15 * stride] * 14 + bias; - a0 |= (((uint64_t)s_tran0[(v0 >= t0) + (v0 >= t1) + (v0 >= t2) + (v0 >= t3) + (v0 >= t4) + (v0 >= t5) + (v0 >= t6)]) << 36U); - a1 |= (((uint64_t)s_tran1[(v1 >= t0) + (v1 >= t1) + (v1 >= t2) + (v1 >= t3) + (v1 >= t4) + (v1 >= t5) + (v1 >= t6)]) << 36U); - a2 |= (((uint64_t)s_tran2[(v2 >= t0) + (v2 >= t1) + (v2 >= t2) + (v2 >= t3) + (v2 >= t4) + (v2 >= t5) + (v2 >= t6)]) << 36U); - a3 |= (((uint64_t)s_tran3[(v3 >= t0) + (v3 >= t1) + (v3 >= t2) + (v3 >= t3) + (v3 >= t4) + (v3 >= t5) + (v3 >= t6)]) << 36U); - } + for (uint32_t lp = 0; lp < 2; lp++) + { + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; - const uint64_t f = a0 | a1 | a2 | a3; - - pDst_bytes[2] = (uint8_t)f; - pDst_bytes[3] = (uint8_t)(f >> 8U); - pDst_bytes[4] = (uint8_t)(f >> 16U); - pDst_bytes[5] = (uint8_t)(f >> 24U); - pDst_bytes[6] = (uint8_t)(f >> 32U); - pDst_bytes[7] = (uint8_t)(f >> 40U); - } + for (uint32_t l = 0; l < 128; l++) + { + const uint32_t low = (l << 1) | lp; - static void bc1_find_sels(const color32 *pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16]) - { - uint32_t block_r[4], block_g[4], block_b[4]; + for (uint32_t h = 0; h < 128; h++) + { + const uint32_t high = (h << 1) | lp; - block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); - block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2); - block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3; - block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + 
block_b[0]) / 3; + const int k = (low * (64 - g_bc7_weights4[BC7ENC_MODE_6_OPTIMAL_INDEX]) + high * g_bc7_weights4[BC7ENC_MODE_6_OPTIMAL_INDEX] + 32) >> 6; - int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; + const int err = (k - c) * (k - c); + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l - int dots[4]; - for (uint32_t i = 0; i < 4; i++) - dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; - - int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; + g_bc7_mode_6_optimal_endpoints[c][lp] = best; + } // lp - ar *= 2; ag *= 2; ab *= 2; + } // c - for (uint32_t i = 0; i < 16; i++) + // BC7 777 + for (int c = 0; c < 256; c++) { - const int d = pSrc_pixels[i].r * ar + pSrc_pixels[i].g * ag + pSrc_pixels[i].b * ab; - static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; - - // Rounding matters here! - // d <= t0: <=, not <, to the later LS step "sees" a wider range of selectors. It matters for quality. 
- sels[i] = s_sels[(d <= t0) + (d < t1) + (d < t2)]; - } - } + endpoint_err best; + best.m_error = (uint16_t)UINT16_MAX; - static inline void bc1_find_sels_2(const color32* pSrc_pixels, uint32_t lr, uint32_t lg, uint32_t lb, uint32_t hr, uint32_t hg, uint32_t hb, uint8_t sels[16]) - { - uint32_t block_r[4], block_g[4], block_b[4]; + for (uint32_t l = 0; l < 128; l++) + { + const uint32_t low = (l << 1) | (l >> 6); - block_r[0] = (lr << 3) | (lr >> 2); block_g[0] = (lg << 2) | (lg >> 4); block_b[0] = (lb << 3) | (lb >> 2); - block_r[3] = (hr << 3) | (hr >> 2); block_g[3] = (hg << 2) | (hg >> 4); block_b[3] = (hb << 3) | (hb >> 2); - block_r[1] = (block_r[0] * 2 + block_r[3]) / 3; block_g[1] = (block_g[0] * 2 + block_g[3]) / 3; block_b[1] = (block_b[0] * 2 + block_b[3]) / 3; - block_r[2] = (block_r[3] * 2 + block_r[0]) / 3; block_g[2] = (block_g[3] * 2 + block_g[0]) / 3; block_b[2] = (block_b[3] * 2 + block_b[0]) / 3; + for (uint32_t h = 0; h < 128; h++) + { + const uint32_t high = (h << 1) | (h >> 6); - int ar = block_r[3] - block_r[0], ag = block_g[3] - block_g[0], ab = block_b[3] - block_b[0]; + const int k = (low * (64 - g_bc7_weights2[BC7ENC_MODE_5_OPTIMAL_INDEX]) + high * g_bc7_weights2[BC7ENC_MODE_5_OPTIMAL_INDEX] + 32) >> 6; - int dots[4]; - for (uint32_t i = 0; i < 4; i++) - dots[i] = (int)block_r[i] * ar + (int)block_g[i] * ag + (int)block_b[i] * ab; + const int err = (k - c) * (k - c); + if (err < best.m_error) + { + best.m_error = (uint16_t)err; + best.m_lo = (uint8_t)l; + best.m_hi = (uint8_t)h; + } + } // h + } // l - int t0 = dots[0] + dots[1], t1 = dots[1] + dots[2], t2 = dots[2] + dots[3]; + g_bc7_mode_5_optimal_endpoints[c] = best; - ar *= 2; ag *= 2; ab *= 2; + } // c + } - static const uint8_t s_sels[4] = { 3, 2, 1, 0 }; +#endif // #if BASISD_SUPPORT_UASTC - for (uint32_t i = 0; i < 16; i += 4) - { - const int d0 = pSrc_pixels[i+0].r * ar + pSrc_pixels[i+0].g * ag + pSrc_pixels[i+0].b * ab; - const int d1 = pSrc_pixels[i+1].r * ar + 
pSrc_pixels[i+1].g * ag + pSrc_pixels[i+1].b * ab; - const int d2 = pSrc_pixels[i+2].r * ar + pSrc_pixels[i+2].g * ag + pSrc_pixels[i+2].b * ab; - const int d3 = pSrc_pixels[i+3].r * ar + pSrc_pixels[i+3].g * ag + pSrc_pixels[i+3].b * ab; +// ------------------------------------------------------------------------------------------------------ +// KTX2 +// ------------------------------------------------------------------------------------------------------ - sels[i+0] = s_sels[(d0 <= t0) + (d0 < t1) + (d0 < t2)]; - sels[i+1] = s_sels[(d1 <= t0) + (d1 < t1) + (d1 < t2)]; - sels[i+2] = s_sels[(d2 <= t0) + (d2 < t1) + (d2 < t2)]; - sels[i+3] = s_sels[(d3 <= t0) + (d3 < t1) + (d3 < t2)]; - } +#if BASISD_SUPPORT_KTX2 + const uint8_t g_ktx2_file_identifier[12] = { 0xAB, 0x4B, 0x54, 0x58, 0x20, 0x32, 0x30, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A }; + + ktx2_transcoder::ktx2_transcoder() : + m_etc1s_transcoder() + { + clear(); } - struct vec3F { float c[3]; }; + void ktx2_transcoder::clear() + { + m_pData = nullptr; + m_data_size = 0; + + memset(&m_header, 0, sizeof(m_header)); + m_levels.clear(); + m_dfd.clear(); + m_key_values.clear(); + memset(&m_etc1s_header, 0, sizeof(m_etc1s_header)); + m_etc1s_image_descs.clear(); + m_astc_6x6_intermediate_image_descs.clear(); + + m_format = basist::basis_tex_format::cETC1S; + + m_dfd_color_model = 0; + m_dfd_color_prims = KTX2_DF_PRIMARIES_UNSPECIFIED; + m_dfd_transfer_func = 0; + m_dfd_flags = 0; + m_dfd_samples = 0; + m_dfd_chan0 = KTX2_DF_CHANNEL_UASTC_RGB; + m_dfd_chan1 = KTX2_DF_CHANNEL_UASTC_RGB; + + m_etc1s_transcoder.clear(); + + m_def_transcoder_state.clear(); - static bool compute_least_squares_endpoints_rgb(const color32* pColors, const uint8_t* pSelectors, vec3F* pXl, vec3F* pXh) + m_has_alpha = false; + m_is_video = false; + m_ldr_hdr_upconversion_nit_multiplier = 0.0f; + } + + bool ktx2_transcoder::init(const void* pData, uint32_t data_size) { - // Derived from bc7enc16's LS function. 
- // Least squares using normal equations: http://www.cs.cornell.edu/~bindel/class/cs3220-s12/notes/lec10.pdf - // I did this in matrix form first, expanded out all the ops, then optimized it a bit. - uint32_t uq00_r = 0, uq10_r = 0, ut_r = 0, uq00_g = 0, uq10_g = 0, ut_g = 0, uq00_b = 0, uq10_b = 0, ut_b = 0; + clear(); - // This table is: 9 * (w * w), 9 * ((1.0f - w) * w), 9 * ((1.0f - w) * (1.0f - w)) - // where w is [0,1/3,2/3,1]. 9 is the perfect multiplier. - static const uint32_t s_weight_vals[4] = { 0x000009, 0x010204, 0x040201, 0x090000 }; + if (!pData) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: pData is nullptr\n"); + assert(0); + return false; + } - uint32_t weight_accum = 0; - for (uint32_t i = 0; i < 16; i++) + if (data_size <= sizeof(ktx2_header)) { - const uint32_t r = pColors[i].c[0], g = pColors[i].c[1], b = pColors[i].c[2]; - const uint32_t sel = pSelectors[i]; - ut_r += r; - ut_g += g; - ut_b += b; - weight_accum += s_weight_vals[sel]; - uq00_r += sel * r; - uq00_g += sel * g; - uq00_b += sel * b; + BASISU_DEVEL_ERROR("ktx2_transcoder::init: File is impossibly too small to be a valid KTX2 file\n"); + return false; } - float q00_r = (float)uq00_r, q10_r = (float)uq10_r, t_r = (float)ut_r; - float q00_g = (float)uq00_g, q10_g = (float)uq10_g, t_g = (float)ut_g; - float q00_b = (float)uq00_b, q10_b = (float)uq10_b, t_b = (float)ut_b; + if (memcmp(pData, g_ktx2_file_identifier, sizeof(g_ktx2_file_identifier)) != 0) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file identifier is not present\n"); + return false; + } - q10_r = t_r * 3.0f - q00_r; - q10_g = t_g * 3.0f - q00_g; - q10_b = t_b * 3.0f - q00_b; + m_pData = static_cast(pData); + m_data_size = data_size; - float z00 = (float)((weight_accum >> 16) & 0xFF); - float z10 = (float)((weight_accum >> 8) & 0xFF); - float z11 = (float)(weight_accum & 0xFF); - float z01 = z10; + memcpy(&m_header, pData, sizeof(m_header)); - float det = z00 * z11 - z01 * z10; - if (fabs(det) < 1e-8f) + // 
Check for supported VK formats. We may also need to parse the DFD. + if ((m_header.m_vk_format != KTX2_VK_FORMAT_UNDEFINED) && + (m_header.m_vk_format != basist::KTX2_FORMAT_ASTC_4x4_SFLOAT_BLOCK) && + (m_header.m_vk_format != basist::KTX2_FORMAT_ASTC_6x6_SFLOAT_BLOCK)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file must be in ETC1S or UASTC LDR/HDR format\n"); return false; + } - det = 3.0f / det; + // 3.3: "When format is VK_FORMAT_UNDEFINED, typeSize must equal 1." + if (m_header.m_type_size != 1) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid type_size\n"); + return false; + } - float iz00, iz01, iz10, iz11; - iz00 = z11 * det; - iz01 = -z01 * det; - iz10 = -z10 * det; - iz11 = z00 * det; + // We only currently support 2D textures (plain, cubemapped, or texture array), which is by far the most common use case. + // The BasisU library does not support 1D or 3D textures at all. + if ((m_header.m_pixel_width < 1) || (m_header.m_pixel_height < 1) || (m_header.m_pixel_depth > 0)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Only 2D or cubemap textures are supported\n"); + return false; + } + + // Face count must be 1 or 6 + if ((m_header.m_face_count != 1) && (m_header.m_face_count != 6)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid face count, file is corrupted or invalid\n"); + return false; + } + + if (m_header.m_face_count > 1) + { + // 3.4: Make sure cubemaps are square. 
+ if (m_header.m_pixel_width != m_header.m_pixel_height) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Cubemap is not square\n"); + return false; + } + } + + // 3.7 levelCount: "levelCount=0 is allowed, except for block-compressed formats" + if (m_header.m_level_count < 1) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level count\n"); + return false; + } - pXl->c[0] = iz00 * q00_r + iz01 * q10_r; pXh->c[0] = iz10 * q00_r + iz11 * q10_r; - pXl->c[1] = iz00 * q00_g + iz01 * q10_g; pXh->c[1] = iz10 * q00_g + iz11 * q10_g; - pXl->c[2] = iz00 * q00_b + iz01 * q10_b; pXh->c[2] = iz10 * q00_b + iz11 * q10_b; + // Sanity check the level count. + if (m_header.m_level_count > KTX2_MAX_SUPPORTED_LEVEL_COUNT) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Too many levels or file is corrupted or invalid\n"); + return false; + } - // Check and fix channel singularities - might not be needed, but is in UASTC's encoder. - for (uint32_t c = 0; c < 3; c++) + if (m_header.m_supercompression_scheme > KTX2_SS_ZSTANDARD) { - if ((pXl->c[c] < 0.0f) || (pXh->c[c] > 255.0f)) + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid/unsupported supercompression or file is corrupted or invalid\n"); + return false; + } + + if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + { +#if 0 + if (m_header.m_sgd_byte_length <= sizeof(ktx2_etc1s_global_data_header)) { - uint32_t lo_v = UINT32_MAX, hi_v = 0; - for (uint32_t i = 0; i < 16; i++) - { - lo_v = basisu::minimumu(lo_v, pColors[i].c[c]); - hi_v = basisu::maximumu(hi_v, pColors[i].c[c]); - } + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data is too small\n"); + return false; + } +#endif - if (lo_v == hi_v) - { - pXl->c[c] = (float)lo_v; - pXh->c[c] = (float)hi_v; - } + if (m_header.m_sgd_byte_offset < sizeof(ktx2_header)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data offset is too low\n"); + return false; + } + + if (m_header.m_sgd_byte_offset + m_header.m_sgd_byte_length 
> m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data offset and/or length is too high\n"); + return false; } } - return true; - } + if (!m_levels.try_resize(m_header.m_level_count)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Out of memory\n"); + return false; + } - void encode_bc1_solid_block(void* pDst, uint32_t fr, uint32_t fg, uint32_t fb) - { - dxt1_block* pDst_block = static_cast(pDst); + const uint32_t level_index_size_in_bytes = basisu::maximum(1U, (uint32_t)m_header.m_level_count) * sizeof(ktx2_level_index); - uint32_t mask = 0xAA; - uint32_t max16 = (g_bc1_match5_equals_1[fr].m_hi << 11) | (g_bc1_match6_equals_1[fg].m_hi << 5) | g_bc1_match5_equals_1[fb].m_hi; - uint32_t min16 = (g_bc1_match5_equals_1[fr].m_lo << 11) | (g_bc1_match6_equals_1[fg].m_lo << 5) | g_bc1_match5_equals_1[fb].m_lo; + if ((sizeof(ktx2_header) + level_index_size_in_bytes) > m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: File is too small (can't read level index array)\n"); + return false; + } - if (min16 == max16) + memcpy(&m_levels[0], m_pData + sizeof(ktx2_header), level_index_size_in_bytes); + + // Sanity check the level offsets and byte sizes + for (uint32_t i = 0; i < m_levels.size(); i++) { - // Always forbid 3 color blocks - // This is to guarantee that BC3 blocks never use punchthrough alpha (3 color) mode, which isn't supported on some (all?) GPU's. 
- mask = 0; + if (m_levels[i].m_byte_offset < sizeof(ktx2_header)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset (too low)\n"); + return false; + } - // Make l > h - if (min16 > 0) - min16--; - else + if (!m_levels[i].m_byte_length) { - // l = h = 0 - assert(min16 == max16 && max16 == 0); + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level byte length\n"); + } - max16 = 1; - min16 = 0; - mask = 0x55; + if ((m_levels[i].m_byte_offset + m_levels[i].m_byte_length) > m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset and/or length\n"); + return false; + } + + const uint64_t MAX_SANE_LEVEL_UNCOMP_SIZE = 2048ULL * 1024ULL * 1024ULL; + + if (m_levels[i].m_uncompressed_byte_length >= MAX_SANE_LEVEL_UNCOMP_SIZE) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset (too large)\n"); + return false; } - assert(max16 > min16); + if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + { + if (m_levels[i].m_uncompressed_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid uncompressed length (0)\n"); + return false; + } + } + else if (m_header.m_supercompression_scheme >= KTX2_SS_ZSTANDARD) + { + if (!m_levels[i].m_uncompressed_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid uncompressed length (1)\n"); + return false; + } + } } - if (max16 < min16) + const uint32_t DFD_MINIMUM_SIZE = 44, DFD_MAXIMUM_SIZE = 60; + if ((m_header.m_dfd_byte_length != DFD_MINIMUM_SIZE) && (m_header.m_dfd_byte_length != DFD_MAXIMUM_SIZE)) { - std::swap(max16, min16); - mask ^= 0x55; + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Unsupported DFD size\n"); + return false; } - pDst_block->set_low_color(static_cast(max16)); - pDst_block->set_high_color(static_cast(min16)); - pDst_block->m_selectors[0] = static_cast(mask); - pDst_block->m_selectors[1] = static_cast(mask); - pDst_block->m_selectors[2] = static_cast(mask); - pDst_block->m_selectors[3] = static_cast(mask); - } + if 
(((m_header.m_dfd_byte_offset + m_header.m_dfd_byte_length) > m_data_size) || (m_header.m_dfd_byte_offset < sizeof(ktx2_header))) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid DFD offset and/or length\n"); + return false; + } + + const uint8_t* pDFD = m_pData + m_header.m_dfd_byte_offset; - static inline uint8_t to_5(uint32_t v) { v = v * 31 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } - static inline uint8_t to_6(uint32_t v) { v = v * 63 + 128; return (uint8_t)((v + (v >> 8)) >> 8); } + if (!m_dfd.try_resize(m_header.m_dfd_byte_length)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Out of memory\n"); + return false; + } - // Good references: squish library, stb_dxt. - void encode_bc1(void* pDst, const uint8_t* pPixels, uint32_t flags) - { - const color32* pSrc_pixels = (const color32*)pPixels; - dxt1_block* pDst_block = static_cast(pDst); + memcpy(m_dfd.data(), pDFD, m_header.m_dfd_byte_length); - int avg_r = -1, avg_g = 0, avg_b = 0; - int lr = 0, lg = 0, lb = 0, hr = 0, hg = 0, hb = 0; - uint8_t sels[16]; + // This is all hard coded for only ETC1S and UASTC. + uint32_t dfd_total_size = basisu::read_le_dword(pDFD); - const bool use_sels = (flags & cEncodeBC1UseSelectors) != 0; - if (use_sels) + // 3.10.3: Sanity check + if (dfd_total_size != m_header.m_dfd_byte_length) { - // Caller is jamming in their own selectors for us to try. 
- const uint32_t s = pDst_block->m_selectors[0] | (pDst_block->m_selectors[1] << 8) | (pDst_block->m_selectors[2] << 16) | (pDst_block->m_selectors[3] << 24); - - static const uint8_t s_sel_tran[4] = { 0, 3, 1, 2 }; - - for (uint32_t i = 0; i < 16; i++) - sels[i] = s_sel_tran[(s >> (i * 2)) & 3]; + BASISU_DEVEL_ERROR("ktx2_transcoder::init: DFD size validation failed (1)\n"); + return false; } - else + + // 3.10.3: More sanity checking + if (m_header.m_kvd_byte_length) { - const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b; - - uint32_t j; - for (j = 1; j < 16; j++) - if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) - break; - - if (j == 16) - { - encode_bc1_solid_block(pDst, fr, fg, fb); - return; - } - - // Select 2 colors along the principle axis. (There must be a faster/simpler way.) - int total_r = fr, total_g = fg, total_b = fb; - int max_r = fr, max_g = fg, max_b = fb; - int min_r = fr, min_g = fg, min_b = fb; - for (uint32_t i = 1; i < 16; i++) + if (dfd_total_size != m_header.m_kvd_byte_offset - m_header.m_dfd_byte_offset) { - const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; - max_r = basisu::maximum(max_r, r); max_g = basisu::maximum(max_g, g); max_b = basisu::maximum(max_b, b); - min_r = basisu::minimum(min_r, r); min_g = basisu::minimum(min_g, g); min_b = basisu::minimum(min_b, b); - total_r += r; total_g += g; total_b += b; + BASISU_DEVEL_ERROR("ktx2_transcoder::init: DFD size validation failed (2)\n"); + return false; } + } - avg_r = (total_r + 8) >> 4; - avg_g = (total_g + 8) >> 4; - avg_b = (total_b + 8) >> 4; + const uint32_t dfd_bits = basisu::read_le_dword(pDFD + 3 * sizeof(uint32_t)); + const uint32_t sample_channel0 = basisu::read_le_dword(pDFD + 7 * sizeof(uint32_t)); + + m_dfd_color_model = dfd_bits & 255; + m_dfd_color_prims = (ktx2_df_color_primaries)((dfd_bits >> 8) & 255); + m_dfd_transfer_func = (dfd_bits >> 16) & 255; + m_dfd_flags = 
(dfd_bits >> 24) & 255; - int icov[6] = { 0, 0, 0, 0, 0, 0 }; - for (uint32_t i = 0; i < 16; i++) + // See 3.10.1.Restrictions + if ((m_dfd_transfer_func != KTX2_KHR_DF_TRANSFER_LINEAR) && (m_dfd_transfer_func != KTX2_KHR_DF_TRANSFER_SRGB)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid DFD transfer function\n"); + return false; + } + + if (m_dfd_color_model == KTX2_KDF_DF_MODEL_ETC1S) + { + if (m_header.m_vk_format != basist::KTX2_VK_FORMAT_UNDEFINED) { - int r = (int)pSrc_pixels[i].r - avg_r; - int g = (int)pSrc_pixels[i].g - avg_g; - int b = (int)pSrc_pixels[i].b - avg_b; - icov[0] += r * r; - icov[1] += r * g; - icov[2] += r * b; - icov[3] += g * g; - icov[4] += g * b; - icov[5] += b * b; + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid header vkFormat\n"); + return false; } - float cov[6]; - for (uint32_t i = 0; i < 6; i++) - cov[i] = static_cast(icov[i])* (1.0f / 255.0f); + m_format = basist::basis_tex_format::cETC1S; -#if 0 - // Seems silly to use full PCA to choose 2 colors. The diff in avg. PSNR between using PCA vs. not is small (~.025 difference). - // TODO: Try 2 or 3 different normalized diagonal vectors, choose the one that results in the largest dot delta - int saxis_r = max_r - min_r; - int saxis_g = max_g - min_g; - int saxis_b = max_b - min_b; -#else - float xr = (float)(max_r - min_r); - float xg = (float)(max_g - min_g); - float xb = (float)(max_b - min_b); - //float xr = (float)(max_r - avg_r); // max-avg is nearly the same, and doesn't require computing min's - //float xg = (float)(max_g - avg_g); - //float xb = (float)(max_b - avg_b); - for (uint32_t power_iter = 0; power_iter < 4; power_iter++) + // 3.10.2: "Whether the image has 1 or 2 slices can be determined from the DFD's sample count." + // If m_has_alpha is true it may be 2-channel RRRG or 4-channel RGBA, but we let the caller deal with that. + m_has_alpha = (m_header.m_dfd_byte_length == 60); + + m_dfd_samples = m_has_alpha ? 
2 : 1; + m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); + + if (m_has_alpha) { - float r = xr * cov[0] + xg * cov[1] + xb * cov[2]; - float g = xr * cov[1] + xg * cov[3] + xb * cov[4]; - float b = xr * cov[2] + xg * cov[4] + xb * cov[5]; - xr = r; xg = g; xb = b; + const uint32_t sample_channel1 = basisu::read_le_dword(pDFD + 11 * sizeof(uint32_t)); + m_dfd_chan1 = (ktx2_df_channel_id)((sample_channel1 >> 24) & 15); } - - float k = basisu::maximum(fabsf(xr), fabsf(xg), fabsf(xb)); - int saxis_r = 306, saxis_g = 601, saxis_b = 117; - if (k >= 2) + } + else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_UASTC_LDR_4X4) + { + if (m_header.m_vk_format != basist::KTX2_VK_FORMAT_UNDEFINED) { - float m = 1024.0f / k; - saxis_r = (int)(xr * m); - saxis_g = (int)(xg * m); - saxis_b = (int)(xb * m); + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid header vkFormat\n"); + return false; } -#endif + + m_format = basist::basis_tex_format::cUASTC4x4; + + m_dfd_samples = 1; + m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); - int low_dot = INT_MAX, high_dot = INT_MIN, low_c = 0, high_c = 0; - for (uint32_t i = 0; i < 16; i++) + // We're assuming "DATA" means RGBA so it has alpha. + m_has_alpha = (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG); + } + else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_UASTC_HDR_4X4) + { + // UASTC HDR 4x4 is standard ASTC HDR 4x4 texture data. Check the header's vkFormat. 
+ if (m_header.m_vk_format != basist::KTX2_FORMAT_ASTC_4x4_SFLOAT_BLOCK) { - int dot = pSrc_pixels[i].r * saxis_r + pSrc_pixels[i].g * saxis_g + pSrc_pixels[i].b * saxis_b; - if (dot < low_dot) - { - low_dot = dot; - low_c = i; - } - if (dot > high_dot) - { - high_dot = dot; - high_c = i; - } + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid header vkFormat\n"); + return false; } - lr = to_5(pSrc_pixels[low_c].r); - lg = to_6(pSrc_pixels[low_c].g); - lb = to_5(pSrc_pixels[low_c].b); + m_format = basist::basis_tex_format::cUASTC_HDR_4x4; - hr = to_5(pSrc_pixels[high_c].r); - hg = to_6(pSrc_pixels[high_c].g); - hb = to_5(pSrc_pixels[high_c].b); - - bc1_find_sels(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); - } // if (use_sels) + m_dfd_samples = 1; + m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); - const uint32_t total_ls_passes = (flags & cEncodeBC1HigherQuality) ? 3 : (flags & cEncodeBC1HighQuality ? 2 : 1); - for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) + // We're assuming "DATA" means RGBA so it has alpha. + // [11/26/2024] - changed to always false for now + m_has_alpha = false;// (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG); + } + else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_ASTC) { - // This is where the real magic happens. We have an array of candidate selectors, so let's use least squares to compute the optimal low/high endpoint colors. - vec3F xl, xh; - if (!compute_least_squares_endpoints_rgb(pSrc_pixels, sels, &xl, &xh)) + // The DFD indicates plain ASTC texture data. We only support ASTC HDR 6x6 - check the header's vkFormat. 
+ if (m_header.m_vk_format != basist::KTX2_FORMAT_ASTC_6x6_SFLOAT_BLOCK) { - if (avg_r < 0) - { - int total_r = 0, total_g = 0, total_b = 0; - for (uint32_t i = 0; i < 16; i++) - { - total_r += pSrc_pixels[i].r; - total_g += pSrc_pixels[i].g; - total_b += pSrc_pixels[i].b; - } - - avg_r = (total_r + 8) >> 4; - avg_g = (total_g + 8) >> 4; - avg_b = (total_b + 8) >> 4; - } - - // All selectors equal - treat it as a solid block which should always be equal or better. - lr = g_bc1_match5_equals_1[avg_r].m_hi; - lg = g_bc1_match6_equals_1[avg_g].m_hi; - lb = g_bc1_match5_equals_1[avg_b].m_hi; + BASISU_DEVEL_ERROR("ktx2_transcoder::init: DVD color model is ASTC, but the header's vkFormat isn't KTX2_FORMAT_ASTC_6x6_SFLOAT_BLOCK\n"); + return false; + } - hr = g_bc1_match5_equals_1[avg_r].m_lo; - hg = g_bc1_match6_equals_1[avg_g].m_lo; - hb = g_bc1_match5_equals_1[avg_b].m_lo; + m_format = basist::basis_tex_format::cASTC_HDR_6x6; - // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. 
- } - else - { - lr = basisu::clamp((int)((xl.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); - lg = basisu::clamp((int)((xl.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); - lb = basisu::clamp((int)((xl.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); + m_dfd_samples = 1; + m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); - hr = basisu::clamp((int)((xh.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); - hg = basisu::clamp((int)((xh.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); - hb = basisu::clamp((int)((xh.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); - } - - bc1_find_sels(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); + m_has_alpha = false; } - - uint32_t lc16 = dxt1_block::pack_unscaled_color(lr, lg, lb); - uint32_t hc16 = dxt1_block::pack_unscaled_color(hr, hg, hb); - - // Always forbid 3 color blocks - if (lc16 == hc16) + else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_ASTC_HDR_6X6_INTERMEDIATE) { - uint8_t mask = 0; - - // Make l > h - if (hc16 > 0) - hc16--; - else + // Custom variable block size ASTC HDR 6x6 texture data. + if (m_header.m_vk_format != basist::KTX2_VK_FORMAT_UNDEFINED) { - // lc16 = hc16 = 0 - assert(lc16 == hc16 && hc16 == 0); - - hc16 = 0; - lc16 = 1; - mask = 0x55; // select hc16 + BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid header vkFormat\n"); + return false; } - assert(lc16 > hc16); - pDst_block->set_low_color(static_cast(lc16)); - pDst_block->set_high_color(static_cast(hc16)); + m_format = basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE; - pDst_block->m_selectors[0] = mask; - pDst_block->m_selectors[1] = mask; - pDst_block->m_selectors[2] = mask; - pDst_block->m_selectors[3] = mask; + m_dfd_samples = 1; + m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); + + m_has_alpha = false; } else { - uint8_t invert_mask = 0; - if (lc16 < hc16) + // Unsupported DFD color model. 
+ BASISU_DEVEL_ERROR("ktx2_transcoder::init: Unsupported DFD color model\n"); + return false; + } + + if (!read_key_values()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::init: read_key_values() failed\n"); + return false; + } + + // Check for a KTXanimData key + for (uint32_t i = 0; i < m_key_values.size(); i++) + { + if (strcmp(reinterpret_cast(m_key_values[i].m_key.data()), "KTXanimData") == 0) { - std::swap(lc16, hc16); - invert_mask = 0x55; + m_is_video = true; + break; } + } - assert(lc16 > hc16); - pDst_block->set_low_color((uint16_t)lc16); - pDst_block->set_high_color((uint16_t)hc16); + m_ldr_hdr_upconversion_nit_multiplier = 0.0f; - uint32_t packed_sels = 0; - static const uint8_t s_sel_trans[4] = { 0, 2, 3, 1 }; - for (uint32_t i = 0; i < 16; i++) - packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2)); + for (uint32_t i = 0; i < m_key_values.size(); i++) + { + if (strcmp(reinterpret_cast(m_key_values[i].m_key.data()), "LDRUpconversionMultiplier") == 0) + { + m_ldr_hdr_upconversion_nit_multiplier = (float)atof(reinterpret_cast(m_key_values[i].m_value.data())); - pDst_block->m_selectors[0] = (uint8_t)packed_sels ^ invert_mask; - pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask; - pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask; - pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24) ^ invert_mask; + if (std::isnan(m_ldr_hdr_upconversion_nit_multiplier) || std::isinf(m_ldr_hdr_upconversion_nit_multiplier) || (m_ldr_hdr_upconversion_nit_multiplier < 0.0f)) + m_ldr_hdr_upconversion_nit_multiplier = 0; + + break; + } } + + return true; } - - void encode_bc1_alt(void* pDst, const uint8_t* pPixels, uint32_t flags) - { - const color32* pSrc_pixels = (const color32*)pPixels; - dxt1_block* pDst_block = static_cast(pDst); - int avg_r = -1, avg_g = 0, avg_b = 0; - int lr = 0, lg = 0, lb = 0, hr = 0, hg = 0, hb = 0; - uint8_t sels[16]; + uint32_t ktx2_transcoder::get_etc1s_image_descs_image_flags(uint32_t 
level_index, uint32_t layer_index, uint32_t face_index) const + { + const uint32_t etc1s_image_index = + (level_index * basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count) + + layer_index * m_header.m_face_count + + face_index; - const bool use_sels = (flags & cEncodeBC1UseSelectors) != 0; - if (use_sels) + if (etc1s_image_index >= get_etc1s_image_descs().size()) { - // Caller is jamming in their own selectors for us to try. - const uint32_t s = pDst_block->m_selectors[0] | (pDst_block->m_selectors[1] << 8) | (pDst_block->m_selectors[2] << 16) | (pDst_block->m_selectors[3] << 24); + assert(0); + return 0; + } - static const uint8_t s_sel_tran[4] = { 0, 3, 1, 2 }; + return get_etc1s_image_descs()[etc1s_image_index].m_image_flags; + } - for (uint32_t i = 0; i < 16; i++) - sels[i] = s_sel_tran[(s >> (i * 2)) & 3]; - } - else - { - const uint32_t fr = pSrc_pixels[0].r, fg = pSrc_pixels[0].g, fb = pSrc_pixels[0].b; + const basisu::uint8_vec* ktx2_transcoder::find_key(const std::string& key_name) const + { + for (uint32_t i = 0; i < m_key_values.size(); i++) + if (strcmp((const char *)m_key_values[i].m_key.data(), key_name.c_str()) == 0) + return &m_key_values[i].m_value; - uint32_t j; - for (j = 1; j < 16; j++) - if ((pSrc_pixels[j].r != fr) || (pSrc_pixels[j].g != fg) || (pSrc_pixels[j].b != fb)) - break; + return nullptr; + } + + bool ktx2_transcoder::start_transcoding() + { + if (!m_pData) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: Must call init() first\n"); + return false; + } - if (j == 16) + if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + { + if (m_format == basis_tex_format::cETC1S) { - encode_bc1_solid_block(pDst, fr, fg, fb); - return; - } + // Check if we've already decompressed the ETC1S global data. If so don't unpack it again. 
+ if (!m_etc1s_transcoder.get_endpoints().empty()) + return true; + + if (!decompress_etc1s_global_data()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: decompress_etc1s_global_data() failed\n"); + return false; + } - // Select 2 colors along the principle axis. (There must be a faster/simpler way.) - int total_r = fr, total_g = fg, total_b = fb; - int max_r = fr, max_g = fg, max_b = fb; - int min_r = fr, min_g = fg, min_b = fb; - uint32_t grayscale_flag = (fr == fg) && (fr == fb); - for (uint32_t i = 1; i < 16; i++) - { - const int r = pSrc_pixels[i].r, g = pSrc_pixels[i].g, b = pSrc_pixels[i].b; - grayscale_flag &= ((r == g) && (r == b)); - max_r = basisu::maximum(max_r, r); max_g = basisu::maximum(max_g, g); max_b = basisu::maximum(max_b, b); - min_r = basisu::minimum(min_r, r); min_g = basisu::minimum(min_g, g); min_b = basisu::minimum(min_b, b); - total_r += r; total_g += g; total_b += b; - } - - if (grayscale_flag) - { - // Grayscale blocks are a common enough case to specialize. - if ((max_r - min_r) < 2) + if (!m_is_video) { - lr = lb = hr = hb = to_5(fr); - lg = hg = to_6(fr); + // See if there are any P-frames. If so it must be a video, even if there wasn't a KTXanimData key. + // Video cannot be a cubemap, and it must be a texture array. 
+ if ((m_header.m_face_count == 1) && (m_header.m_layer_count > 1)) + { + for (uint32_t i = 0; i < m_etc1s_image_descs.size(); i++) + { + if (m_etc1s_image_descs[i].m_image_flags & KTX2_IMAGE_IS_P_FRAME) + { + m_is_video = true; + break; + } + } + } } - else - { - lr = lb = to_5(min_r); - lg = to_6(min_r); + } + else if (m_format == basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE) + { + if (m_astc_6x6_intermediate_image_descs.size()) + return true; - hr = hb = to_5(max_r); - hg = to_6(max_r); + if (!read_astc_6x6_hdr_intermediate_global_data()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: read_astc_6x6_hdr_intermediate_global_data() failed\n"); + return false; } } else { - avg_r = (total_r + 8) >> 4; - avg_g = (total_g + 8) >> 4; - avg_b = (total_b + 8) >> 4; - - // Find the shortest vector from a AABB corner to the block's average color. - // This is to help avoid outliers. - - uint32_t dist[3][2]; - dist[0][0] = basisu::square(min_r - avg_r) << 3; dist[0][1] = basisu::square(max_r - avg_r) << 3; - dist[1][0] = basisu::square(min_g - avg_g) << 3; dist[1][1] = basisu::square(max_g - avg_g) << 3; - dist[2][0] = basisu::square(min_b - avg_b) << 3; dist[2][1] = basisu::square(max_b - avg_b) << 3; - - uint32_t min_d0 = (dist[0][0] + dist[1][0] + dist[2][0]); - uint32_t d4 = (dist[0][0] + dist[1][0] + dist[2][1]) | 4; - min_d0 = basisu::minimum(min_d0, d4); - - uint32_t min_d1 = (dist[0][1] + dist[1][0] + dist[2][0]) | 1; - uint32_t d5 = (dist[0][1] + dist[1][0] + dist[2][1]) | 5; - min_d1 = basisu::minimum(min_d1, d5); - - uint32_t d2 = (dist[0][0] + dist[1][1] + dist[2][0]) | 2; - min_d0 = basisu::minimum(min_d0, d2); - - uint32_t d3 = (dist[0][1] + dist[1][1] + dist[2][0]) | 3; - min_d1 = basisu::minimum(min_d1, d3); - - uint32_t d6 = (dist[0][0] + dist[1][1] + dist[2][1]) | 6; - min_d0 = basisu::minimum(min_d0, d6); + BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: Invalid supercompression scheme and/or format\n"); + return false; + } + } + 
else if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) + { +#if !BASISD_SUPPORT_KTX2_ZSTD + BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: File uses zstd supercompression, but zstd support was not enabled at compilation time (BASISD_SUPPORT_KTX2_ZSTD == 0)\n"); + return false; +#endif + } - uint32_t d7 = (dist[0][1] + dist[1][1] + dist[2][1]) | 7; - min_d1 = basisu::minimum(min_d1, d7); + return true; + } - uint32_t min_d = basisu::minimum(min_d0, min_d1); - uint32_t best_i = min_d & 7; + bool ktx2_transcoder::get_image_level_info(ktx2_image_level_info& level_info, uint32_t level_index, uint32_t layer_index, uint32_t face_index) const + { + if (level_index >= m_levels.size()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: level_index >= m_levels.size()\n"); + return false; + } - int delta_r = (best_i & 1) ? (max_r - avg_r) : (avg_r - min_r); - int delta_g = (best_i & 2) ? (max_g - avg_g) : (avg_g - min_g); - int delta_b = (best_i & 4) ? (max_b - avg_b) : (avg_b - min_b); + if (m_header.m_face_count > 1) + { + if (face_index >= 6) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: face_index >= 6\n"); + return false; + } + } + else if (face_index != 0) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: face_index != 0\n"); + return false; + } - // Note: if delta_r/g/b==0, we actually want to choose a single color, so the block average color optimization kicks in. - uint32_t low_c = 0, high_c = 0; - if ((delta_r | delta_g | delta_b) != 0) - { - // Now we have a smaller AABB going from the block's average color to a cornerpoint of the larger AABB. - // Project all pixels colors along the 4 vectors going from a smaller AABB cornerpoint to the opposite cornerpoint, find largest projection. - // One of these vectors will be a decent approximation of the block's PCA. 
- const int saxis0_r = delta_r, saxis0_g = delta_g, saxis0_b = delta_b; + if (layer_index >= basisu::maximum(m_header.m_layer_count, 1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: layer_index >= maximum(m_header.m_layer_count, 1)\n"); + return false; + } + + const uint32_t level_width = basisu::maximum(m_header.m_pixel_width >> level_index, 1); + const uint32_t level_height = basisu::maximum(m_header.m_pixel_height >> level_index, 1); - int low_dot0 = INT_MAX, high_dot0 = INT_MIN; - int low_dot1 = INT_MAX, high_dot1 = INT_MIN; - int low_dot2 = INT_MAX, high_dot2 = INT_MIN; - int low_dot3 = INT_MAX, high_dot3 = INT_MIN; + const uint32_t block_width = get_block_width(); + const uint32_t block_height = get_block_height(); - //int low_c0, low_c1, low_c2, low_c3; - //int high_c0, high_c1, high_c2, high_c3; + const uint32_t num_blocks_x = (level_width + block_width - 1) / block_width; + const uint32_t num_blocks_y = (level_height + block_height - 1) / block_height; - for (uint32_t i = 0; i < 16; i++) - { - const int dotx = pSrc_pixels[i].r * saxis0_r; - const int doty = pSrc_pixels[i].g * saxis0_g; - const int dotz = pSrc_pixels[i].b * saxis0_b; + level_info.m_face_index = face_index; + level_info.m_layer_index = layer_index; + level_info.m_level_index = level_index; + level_info.m_orig_width = level_width; + level_info.m_orig_height = level_height; + level_info.m_width = num_blocks_x * block_width; + level_info.m_height = num_blocks_y * block_height; + level_info.m_block_width = block_width; + level_info.m_block_height = block_height; + level_info.m_num_blocks_x = num_blocks_x; + level_info.m_num_blocks_y = num_blocks_y; + level_info.m_total_blocks = num_blocks_x * num_blocks_y; + level_info.m_alpha_flag = m_has_alpha; + level_info.m_iframe_flag = false; + + if (m_etc1s_image_descs.size()) + { + const uint32_t etc1s_image_index = + (level_index * basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count) + + layer_index * 
m_header.m_face_count + + face_index; - const int dot0 = ((dotz + dotx + doty) << 4) + i; - const int dot1 = ((dotz - dotx - doty) << 4) + i; - const int dot2 = ((dotz - dotx + doty) << 4) + i; - const int dot3 = ((dotz + dotx - doty) << 4) + i; + level_info.m_iframe_flag = (m_etc1s_image_descs[etc1s_image_index].m_image_flags & KTX2_IMAGE_IS_P_FRAME) == 0; + } - if (dot0 < low_dot0) - { - low_dot0 = dot0; - //low_c0 = i; - } - if ((dot0 ^ 15) > high_dot0) - { - high_dot0 = dot0 ^ 15; - //high_c0 = i; - } + return true; + } + + bool ktx2_transcoder::transcode_image_level( + uint32_t level_index, uint32_t layer_index, uint32_t face_index, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + basist::transcoder_texture_format fmt, + uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, uint32_t output_rows_in_pixels, int channel0, int channel1, + ktx2_transcoder_state* pState) + { + if (!m_pData) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: Must call init() first\n"); + return false; + } - if (dot1 < low_dot1) - { - low_dot1 = dot1; - //low_c1 = i; - } - if ((dot1 ^ 15) > high_dot1) - { - high_dot1 = dot1 ^ 15; - //high_c1 = i; - } + if (!pState) + pState = &m_def_transcoder_state; + + if (level_index >= m_levels.size()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: level_index >= m_levels.size()\n"); + return false; + } - if (dot2 < low_dot2) - { - low_dot2 = dot2; - //low_c2 = i; - } - if ((dot2 ^ 15) > high_dot2) - { - high_dot2 = dot2 ^ 15; - //high_c2 = i; - } + if (m_header.m_face_count > 1) + { + if (face_index >= 6) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: face_index >= 6\n"); + return false; + } + } + else if (face_index != 0) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: face_index != 0\n"); + return false; + } - if (dot3 < low_dot3) - { - low_dot3 = dot3; - //low_c3 = i; - } - if ((dot3 ^ 15) > high_dot3) - { - high_dot3 = dot3 ^ 15; - 
//high_c3 = i; - } - } + if (layer_index >= basisu::maximum(m_header.m_layer_count, 1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: layer_index >= maximum(m_header.m_layer_count, 1)\n"); + return false; + } - low_c = low_dot0 & 15; - high_c = ~high_dot0 & 15; - uint32_t r = (high_dot0 & ~15) - (low_dot0 & ~15); + const uint8_t* pComp_level_data = m_pData + m_levels[level_index].m_byte_offset; + uint64_t comp_level_data_size = m_levels[level_index].m_byte_length; + + const uint8_t* pUncomp_level_data = pComp_level_data; + uint64_t uncomp_level_data_size = comp_level_data_size; - uint32_t tr = (high_dot1 & ~15) - (low_dot1 & ~15); - if (tr > r) { - low_c = low_dot1 & 15; - high_c = ~high_dot1 & 15; - r = tr; - } + if (uncomp_level_data_size > UINT32_MAX) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: uncomp_level_data_size > UINT32_MAX\n"); + return false; + } + + if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) + { + // Check if we've already decompressed this level's supercompressed data. + if ((int)level_index != pState->m_uncomp_data_level_index) + { + // Uncompress the entire level's supercompressed data. 
+ if (!decompress_level_data(level_index, pState->m_level_uncomp_data)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: decompress_level_data() failed\n"); + return false; + } + pState->m_uncomp_data_level_index = level_index; + } - tr = (high_dot2 & ~15) - (low_dot2 & ~15); - if (tr > r) { - low_c = low_dot2 & 15; - high_c = ~high_dot2 & 15; - r = tr; - } + pUncomp_level_data = pState->m_level_uncomp_data.data(); + uncomp_level_data_size = pState->m_level_uncomp_data.size(); + } + + const uint32_t level_width = basisu::maximum(m_header.m_pixel_width >> level_index, 1); + const uint32_t level_height = basisu::maximum(m_header.m_pixel_height >> level_index, 1); + const uint32_t num_blocks4_x = (level_width + 3) >> 2; + const uint32_t num_blocks4_y = (level_height + 3) >> 2; + + if (m_format == basist::basis_tex_format::cETC1S) + { + // Ensure start_transcoding() was called. + if (m_etc1s_transcoder.get_endpoints().empty()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: must call start_transcoding() first\n"); + return false; + } - tr = (high_dot3 & ~15) - (low_dot3 & ~15); - if (tr > r) { - low_c = low_dot3 & 15; - high_c = ~high_dot3 & 15; - } - } + const uint32_t etc1s_image_index = + (level_index * basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count) + + layer_index * m_header.m_face_count + + face_index; + + // Sanity check + if (etc1s_image_index >= m_etc1s_image_descs.size()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: etc1s_image_index >= m_etc1s_image_descs.size()\n"); + assert(0); + return false; + } - lr = to_5(pSrc_pixels[low_c].r); - lg = to_6(pSrc_pixels[low_c].g); - lb = to_5(pSrc_pixels[low_c].b); + const ktx2_etc1s_image_desc& image_desc = m_etc1s_image_descs[etc1s_image_index]; - hr = to_5(pSrc_pixels[high_c].r); - hg = to_6(pSrc_pixels[high_c].g); - hb = to_5(pSrc_pixels[high_c].b); + if (!m_etc1s_transcoder.transcode_image(fmt, + pOutput_blocks, 
output_blocks_buf_size_in_blocks_or_pixels, m_pData, m_data_size, + num_blocks4_x, num_blocks4_y, level_width, level_height, + level_index, + m_levels[level_index].m_byte_offset + image_desc.m_rgb_slice_byte_offset, image_desc.m_rgb_slice_byte_length, + image_desc.m_alpha_slice_byte_length ? (m_levels[level_index].m_byte_offset + image_desc.m_alpha_slice_byte_offset) : 0, image_desc.m_alpha_slice_byte_length, + decode_flags, m_has_alpha, + m_is_video, output_row_pitch_in_blocks_or_pixels, &pState->m_transcoder_state, output_rows_in_pixels)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: ETC1S transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); + return false; + } + } + else if (m_format == basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE) + { + if (!m_astc_6x6_intermediate_image_descs.size()) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: must call start_transcoding() first\n"); + return false; } - bc1_find_sels_2(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); - } // if (use_sels) + const uint32_t num_blocks6_x = (level_width + 5) / 6; + const uint32_t num_blocks6_y = (level_height + 5) / 6; - const uint32_t total_ls_passes = (flags & cEncodeBC1HigherQuality) ? 3 : (flags & cEncodeBC1HighQuality ? 2 : 1); - for (uint32_t ls_pass = 0; ls_pass < total_ls_passes; ls_pass++) - { - int prev_lr = lr, prev_lg = lg, prev_lb = lb, prev_hr = hr, prev_hg = hg, prev_hb = hb; + const uint32_t image_index = + (level_index * basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count) + + layer_index * m_header.m_face_count + + face_index; - // This is where the real magic happens. We have an array of candidate selectors, so let's use least squares to compute the optimal low/high endpoint colors. 
- vec3F xl, xh; - if (!compute_least_squares_endpoints_rgb(pSrc_pixels, sels, &xl, &xh)) + // Sanity check + if (image_index >= m_astc_6x6_intermediate_image_descs.size()) { - if (avg_r < 0) - { - int total_r = 0, total_g = 0, total_b = 0; - for (uint32_t i = 0; i < 16; i++) - { - total_r += pSrc_pixels[i].r; - total_g += pSrc_pixels[i].g; - total_b += pSrc_pixels[i].b; - } + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: Invalid image_index\n"); + assert(0); + return false; + } - avg_r = (total_r + 8) >> 4; - avg_g = (total_g + 8) >> 4; - avg_b = (total_b + 8) >> 4; - } + const ktx2_astc_hdr_6x6_intermediate_image_desc& image_desc = m_astc_6x6_intermediate_image_descs[image_index]; + + if (!m_astc_hdr_6x6_intermediate_transcoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + m_pData, m_data_size, num_blocks6_x, num_blocks6_y, level_width, level_height, level_index, + m_levels[level_index].m_byte_offset + image_desc.m_rgb_slice_byte_offset, image_desc.m_rgb_slice_byte_length, + decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: ASTC 6x6 HDR transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); + return false; + } + } + else if (m_format == basist::basis_tex_format::cASTC_HDR_6x6) + { + const uint32_t num_blocks6_x = (level_width + 5) / 6; + const uint32_t num_blocks6_y = (level_height + 5) / 6; - // All selectors equal - treat it as a solid block which should always be equal or better. - lr = g_bc1_match5_equals_1[avg_r].m_hi; - lg = g_bc1_match6_equals_1[avg_g].m_hi; - lb = g_bc1_match5_equals_1[avg_b].m_hi; + // Compute length and offset to uncompressed 2D UASTC texture data, given the face/layer indices. 
+ assert(uncomp_level_data_size == m_levels[level_index].m_uncompressed_byte_length); + const uint32_t total_2D_image_size = num_blocks6_x * num_blocks6_y * sizeof(astc_helpers::astc_block); - hr = g_bc1_match5_equals_1[avg_r].m_lo; - hg = g_bc1_match6_equals_1[avg_g].m_lo; - hb = g_bc1_match5_equals_1[avg_b].m_lo; + const uint32_t uncomp_ofs = (layer_index * m_header.m_face_count + face_index) * total_2D_image_size; - // In high/higher quality mode, let it try again in case the optimal tables have caused the sels to diverge. - } - else + // Sanity checks + if (uncomp_ofs >= uncomp_level_data_size) { - lr = basisu::clamp((int)((xl.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); - lg = basisu::clamp((int)((xl.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); - lb = basisu::clamp((int)((xl.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); - - hr = basisu::clamp((int)((xh.c[0]) * (31.0f / 255.0f) + .5f), 0, 31); - hg = basisu::clamp((int)((xh.c[1]) * (63.0f / 255.0f) + .5f), 0, 63); - hb = basisu::clamp((int)((xh.c[2]) * (31.0f / 255.0f) + .5f), 0, 31); + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: uncomp_ofs >= total_2D_image_size\n"); + return false; } - if ((prev_lr == lr) && (prev_lg == lg) && (prev_lb == lb) && (prev_hr == hr) && (prev_hg == hg) && (prev_hb == hb)) - break; + if ((uncomp_level_data_size - uncomp_ofs) < total_2D_image_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: (uncomp_level_data_size - uncomp_ofs) < total_2D_image_size\n"); + return false; + } - bc1_find_sels_2(pSrc_pixels, lr, lg, lb, hr, hg, hb, sels); + if (!m_astc_hdr_6x6_transcoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks6_x, num_blocks6_y, level_width, level_height, level_index, + 0, (uint32_t)total_2D_image_size, + decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + 
{ + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: ASTC 6x6 HDR transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); + return false; + } } + else if ((m_format == basist::basis_tex_format::cUASTC4x4) || + (m_format == basist::basis_tex_format::cUASTC_HDR_4x4)) + { + // Compute length and offset to uncompressed 2D UASTC texture data, given the face/layer indices. + assert(uncomp_level_data_size == m_levels[level_index].m_uncompressed_byte_length); + const uint32_t total_2D_image_size = num_blocks4_x * num_blocks4_y * KTX2_UASTC_BLOCK_SIZE; + + const uint32_t uncomp_ofs = (layer_index * m_header.m_face_count + face_index) * total_2D_image_size; - uint32_t lc16 = dxt1_block::pack_unscaled_color(lr, lg, lb); - uint32_t hc16 = dxt1_block::pack_unscaled_color(hr, hg, hb); + // Sanity checks + if (uncomp_ofs >= uncomp_level_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: uncomp_ofs >= total_2D_image_size\n"); + return false; + } - // Always forbid 3 color blocks - if (lc16 == hc16) - { - uint8_t mask = 0; + if ((uncomp_level_data_size - uncomp_ofs) < total_2D_image_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: (uncomp_level_data_size - uncomp_ofs) < total_2D_image_size\n"); + return false; + } - // Make l > h - if (hc16 > 0) - hc16--; + if (m_format == basist::basis_tex_format::cUASTC_HDR_4x4) + { + if (!m_uastc_hdr_transcoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks4_x, num_blocks4_y, level_width, level_height, level_index, + 0, (uint32_t)total_2D_image_size, + decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC HDR transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); + 
return false; + } + } else { - // lc16 = hc16 = 0 - assert(lc16 == hc16 && hc16 == 0); - - hc16 = 0; - lc16 = 1; - mask = 0x55; // select hc16 + if (!m_uastc_transcoder.transcode_image(fmt, + pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, + (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks4_x, num_blocks4_y, level_width, level_height, level_index, + 0, (uint32_t)total_2D_image_size, + decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); + return false; + } } + } + else + { + // Shouldn't get here. + BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: Internal error\n"); + assert(0); + return false; + } - assert(lc16 > hc16); - pDst_block->set_low_color(static_cast(lc16)); - pDst_block->set_high_color(static_cast(hc16)); + return true; + } + + bool ktx2_transcoder::decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data) + { + const uint8_t* pComp_data = m_levels[level_index].m_byte_offset + m_pData; + const uint64_t comp_size = m_levels[level_index].m_byte_length; + + const uint64_t uncomp_size = m_levels[level_index].m_uncompressed_byte_length; - pDst_block->m_selectors[0] = mask; - pDst_block->m_selectors[1] = mask; - pDst_block->m_selectors[2] = mask; - pDst_block->m_selectors[3] = mask; + if (((size_t)comp_size) != comp_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Compressed data too large\n"); + return false; } - else + if (((size_t)uncomp_size) != uncomp_size) { - uint8_t invert_mask = 0; - if (lc16 < hc16) + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Uncompressed data too large\n"); + return false; + } + + if (!uncomp_data.try_resize((size_t)uncomp_size)) + { + 
BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Out of memory\n"); + return false; + } + + if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) + { +#if BASISD_SUPPORT_KTX2_ZSTD + size_t actualUncompSize = ZSTD_decompress(uncomp_data.data(), (size_t)uncomp_size, pComp_data, (size_t)comp_size); + if (ZSTD_isError(actualUncompSize)) { - std::swap(lc16, hc16); - invert_mask = 0x55; + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Zstd decompression failed, file is invalid or corrupted\n"); + return false; } + if (actualUncompSize != uncomp_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Zstd decompression returned too few bytes, file is invalid or corrupted\n"); + return false; + } +#else + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: File uses Zstd supercompression, but Zstd support was not enabled at compile time (BASISD_SUPPORT_KTX2_ZSTD is 0)\n"); + return false; +#endif + } - assert(lc16 > hc16); - pDst_block->set_low_color((uint16_t)lc16); - pDst_block->set_high_color((uint16_t)hc16); + return true; + } - uint32_t packed_sels = 0; - static const uint8_t s_sel_trans[4] = { 0, 2, 3, 1 }; - for (uint32_t i = 0; i < 16; i++) - packed_sels |= ((uint32_t)s_sel_trans[sels[i]] << (i * 2)); + bool ktx2_transcoder::read_astc_6x6_hdr_intermediate_global_data() + { + const uint32_t image_count = basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count * m_header.m_level_count; + assert(image_count); + + const uint8_t* pSrc = m_pData + m_header.m_sgd_byte_offset; + + if (m_header.m_sgd_byte_length != image_count * sizeof(ktx2_astc_hdr_6x6_intermediate_image_desc)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_astc_6x6_hdr_intermediate_global_data: Invalid global data length\n"); + return false; + } + + m_astc_6x6_intermediate_image_descs.resize(image_count); + + memcpy(m_astc_6x6_intermediate_image_descs.data(), pSrc, sizeof(ktx2_astc_hdr_6x6_intermediate_image_desc) * image_count); + + 
// Sanity check the image descs + for (uint32_t i = 0; i < image_count; i++) + { + // transcode_image() will validate the slice offsets/lengths before transcoding. - pDst_block->m_selectors[0] = (uint8_t)packed_sels ^ invert_mask; - pDst_block->m_selectors[1] = (uint8_t)(packed_sels >> 8) ^ invert_mask; - pDst_block->m_selectors[2] = (uint8_t)(packed_sels >> 16) ^ invert_mask; - pDst_block->m_selectors[3] = (uint8_t)(packed_sels >> 24) ^ invert_mask; + if (!m_astc_6x6_intermediate_image_descs[i].m_rgb_slice_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_astc_6x6_hdr_intermediate_global_data: image descs sanity check failed (1)\n"); + return false; + } } - } - // Scale the UASTC first subset endpoints and first plane's weight indices directly to BC1's - fastest. - void transcode_uastc_to_bc1_hint0(const unpacked_uastc_block& unpacked_src_blk, void* pDst) + return true; + } + + bool ktx2_transcoder::decompress_etc1s_global_data() { - const uint32_t mode = unpacked_src_blk.m_mode; - const astc_block_desc& astc_blk = unpacked_src_blk.m_astc; + // Note: we don't actually support 3D textures in here yet + //uint32_t layer_pixel_depth = basisu::maximum(m_header.m_pixel_depth, 1); + //for (uint32_t i = 1; i < m_header.m_level_count; i++) + // layer_pixel_depth += basisu::maximum(m_header.m_pixel_depth >> i, 1); - dxt1_block& b = *static_cast(pDst); + const uint32_t image_count = basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count * m_header.m_level_count; + assert(image_count); - const uint32_t endpoint_range = g_uastc_mode_endpoint_ranges[mode]; + const uint8_t* pSrc = m_pData + m_header.m_sgd_byte_offset; - const uint32_t total_comps = g_uastc_mode_comps[mode]; + memcpy(&m_etc1s_header, pSrc, sizeof(ktx2_etc1s_global_data_header)); + pSrc += sizeof(ktx2_etc1s_global_data_header); - if (total_comps == 2) + if ((!m_etc1s_header.m_endpoints_byte_length) || (!m_etc1s_header.m_selectors_byte_length) || (!m_etc1s_header.m_tables_byte_length)) 
{ - const uint32_t l = g_astc_unquant[endpoint_range][astc_blk.m_endpoints[0]].m_unquant; - const uint32_t h = g_astc_unquant[endpoint_range][astc_blk.m_endpoints[1]].m_unquant; - - b.set_low_color(dxt1_block::pack_color(color32(l, l, l, 255), true, 127)); - b.set_high_color(dxt1_block::pack_color(color32(h, h, h, 255), true, 127)); + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: Invalid ETC1S global data\n"); + return false; } - else + + if ((!m_etc1s_header.m_endpoint_count) || (!m_etc1s_header.m_selector_count)) { - b.set_low_color(dxt1_block::pack_color( - color32(g_astc_unquant[endpoint_range][astc_blk.m_endpoints[0]].m_unquant, - g_astc_unquant[endpoint_range][astc_blk.m_endpoints[2]].m_unquant, - g_astc_unquant[endpoint_range][astc_blk.m_endpoints[4]].m_unquant, - 255), true, 127) - ); + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: endpoint and/or selector count is 0, file is invalid or corrupted\n"); + return false; + } - b.set_high_color(dxt1_block::pack_color( - color32(g_astc_unquant[endpoint_range][astc_blk.m_endpoints[1]].m_unquant, - g_astc_unquant[endpoint_range][astc_blk.m_endpoints[3]].m_unquant, - g_astc_unquant[endpoint_range][astc_blk.m_endpoints[5]].m_unquant, - 255), true, 127) - ); + // Sanity check the ETC1S header. 
+ if ((sizeof(ktx2_etc1s_global_data_header) + + sizeof(ktx2_etc1s_image_desc) * image_count + + m_etc1s_header.m_endpoints_byte_length + + m_etc1s_header.m_selectors_byte_length + + m_etc1s_header.m_tables_byte_length + + m_etc1s_header.m_extended_byte_length) > m_header.m_sgd_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: SGD byte length is too small, file is invalid or corrupted\n"); + return false; + } + + if (!m_etc1s_image_descs.try_resize(image_count)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: Out of memory\n"); + return false; } + + memcpy(m_etc1s_image_descs.data(), pSrc, sizeof(ktx2_etc1s_image_desc) * image_count); + pSrc += sizeof(ktx2_etc1s_image_desc) * image_count; - if (b.get_low_color() == b.get_high_color()) + // Sanity check the ETC1S image descs + for (uint32_t i = 0; i < image_count; i++) { - // Always forbid 3 color blocks - uint16_t lc16 = (uint16_t)b.get_low_color(); - uint16_t hc16 = (uint16_t)b.get_high_color(); - - uint8_t mask = 0; + // m_etc1s_transcoder.transcode_image() will validate the slice offsets/lengths before transcoding. 
- // Make l > h - if (hc16 > 0) - hc16--; - else + if (!m_etc1s_image_descs[i].m_rgb_slice_byte_length) { - // lc16 = hc16 = 0 - assert(lc16 == hc16 && hc16 == 0); + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: ETC1S image descs sanity check failed (1)\n"); + return false; + } - hc16 = 0; - lc16 = 1; - mask = 0x55; // select hc16 + if (m_has_alpha) + { + if (!m_etc1s_image_descs[i].m_alpha_slice_byte_length) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: ETC1S image descs sanity check failed (2)\n"); + return false; + } } + } - assert(lc16 > hc16); - b.set_low_color(static_cast(lc16)); - b.set_high_color(static_cast(hc16)); + const uint8_t* pEndpoint_data = pSrc; + const uint8_t* pSelector_data = pSrc + m_etc1s_header.m_endpoints_byte_length; + const uint8_t* pTables_data = pSrc + m_etc1s_header.m_endpoints_byte_length + m_etc1s_header.m_selectors_byte_length; - b.m_selectors[0] = mask; - b.m_selectors[1] = mask; - b.m_selectors[2] = mask; - b.m_selectors[3] = mask; + if (!m_etc1s_transcoder.decode_tables(pTables_data, m_etc1s_header.m_tables_byte_length)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: decode_tables() failed, file is invalid or corrupted\n"); + return false; } - else + + if (!m_etc1s_transcoder.decode_palettes( + m_etc1s_header.m_endpoint_count, pEndpoint_data, m_etc1s_header.m_endpoints_byte_length, + m_etc1s_header.m_selector_count, pSelector_data, m_etc1s_header.m_selectors_byte_length)) { - bool invert = false; - if (b.get_low_color() < b.get_high_color()) + BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: decode_palettes() failed, file is likely corrupted\n"); + return false; + } + + return true; + } + + bool ktx2_transcoder::read_key_values() + { + if (!m_header.m_kvd_byte_length) + { + if (m_header.m_kvd_byte_offset) { - std::swap(b.m_low_color[0], b.m_high_color[0]); - std::swap(b.m_low_color[1], b.m_high_color[1]); - invert = true; + 
BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset (it should be zero when the length is zero)\n"); + return false; } - const uint8_t* pTran = s_uastc_to_bc1_weights[g_uastc_mode_weight_bits[mode]]; + return true; + } - const uint32_t plane_shift = g_uastc_mode_planes[mode] - 1; + if (m_header.m_kvd_byte_offset < sizeof(ktx2_header)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset\n"); + return false; + } - uint32_t sels = 0; - for (int i = 15; i >= 0; --i) - { - uint32_t s = pTran[astc_blk.m_weights[i << plane_shift]]; + if ((m_header.m_kvd_byte_offset + m_header.m_kvd_byte_length) > m_data_size) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset and/or length\n"); + return false; + } - if (invert) - s ^= 1; + const uint8_t* pSrc = m_pData + m_header.m_kvd_byte_offset; + uint32_t src_left = m_header.m_kvd_byte_length; - sels = (sels << 2) | s; - } - b.m_selectors[0] = sels & 0xFF; - b.m_selectors[1] = (sels >> 8) & 0xFF; - b.m_selectors[2] = (sels >> 16) & 0xFF; - b.m_selectors[3] = (sels >> 24) & 0xFF; + if (!m_key_values.try_reserve(8)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; } - } - // Scale the UASTC first plane's weight indices to BC1, use 1 or 2 least squares passes to compute endpoints - no PCA needed. 
- void transcode_uastc_to_bc1_hint1(const unpacked_uastc_block& unpacked_src_blk, const color32 block_pixels[4][4], void* pDst, bool high_quality) - { - const uint32_t mode = unpacked_src_blk.m_mode; + while (src_left > sizeof(uint32_t)) + { + uint32_t l = basisu::read_le_dword(pSrc); + + pSrc += sizeof(uint32_t); + src_left -= sizeof(uint32_t); - const astc_block_desc& astc_blk = unpacked_src_blk.m_astc; + if (l < 2) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (0)\n"); + return false; + } - dxt1_block& b = *static_cast(pDst); + if (src_left < l) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (1)\n"); + return false; + } - b.set_low_color(1); - b.set_high_color(0); + if (!m_key_values.try_resize(m_key_values.size() + 1)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } + + basisu::uint8_vec& key_data = m_key_values.back().m_key; + basisu::uint8_vec& value_data = m_key_values.back().m_value; - const uint8_t* pTran = s_uastc_to_bc1_weights[g_uastc_mode_weight_bits[mode]]; + do + { + if (!l) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (2)\n"); + return false; + } - const uint32_t plane_shift = g_uastc_mode_planes[mode] - 1; + if (!key_data.try_push_back(*pSrc++)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } - uint32_t sels = 0; - for (int i = 15; i >= 0; --i) - { - sels <<= 2; - sels |= pTran[astc_blk.m_weights[i << plane_shift]]; - } + src_left--; + l--; - b.m_selectors[0] = sels & 0xFF; - b.m_selectors[1] = (sels >> 8) & 0xFF; - b.m_selectors[2] = (sels >> 16) & 0xFF; - b.m_selectors[3] = (sels >> 24) & 0xFF; + } while (key_data.back()); - encode_bc1(&b, (const uint8_t*)&block_pixels[0][0].c[0], (high_quality ? 
cEncodeBC1HighQuality : 0) | cEncodeBC1UseSelectors); - } + // Ensure key and value are definitely 0 terminated + if (!key_data.try_push_back('\0')) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } + + if (!value_data.try_resize(l)) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } - bool transcode_uastc_to_bc1(const uastc_block& src_blk, void* pDst, bool high_quality) - { - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) - return false; + if (l) + { + memcpy(value_data.data(), pSrc, l); + pSrc += l; + src_left -= l; + } - const uint32_t mode = unpacked_src_blk.m_mode; + // Ensure key and value are definitely 0 terminated + if (!value_data.try_push_back('\0')) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); + return false; + } - if (mode == UASTC_MODE_INDEX_SOLID_COLOR) - { - encode_bc1_solid_block(pDst, unpacked_src_blk.m_solid_color.r, unpacked_src_blk.m_solid_color.g, unpacked_src_blk.m_solid_color.b); - return true; - } + uint32_t ofs = (uint32_t)(pSrc - m_pData) & 3; + uint32_t alignment_bytes = (4 - ofs) & 3; - if ((!high_quality) && (unpacked_src_blk.m_bc1_hint0)) - transcode_uastc_to_bc1_hint0(unpacked_src_blk, pDst); - else - { - color32 block_pixels[4][4]; - const bool unpack_srgb = false; - if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) + if (src_left < alignment_bytes) + { + BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (3)\n"); return false; + } - if (unpacked_src_blk.m_bc1_hint1) - transcode_uastc_to_bc1_hint1(unpacked_src_blk, block_pixels, pDst, high_quality); - else - encode_bc1(pDst, &block_pixels[0][0].r, high_quality ? 
cEncodeBC1HighQuality : 0); + pSrc += alignment_bytes; + src_left -= alignment_bytes; } return true; } + +#endif // BASISD_SUPPORT_KTX2 - static void write_bc4_solid_block(uint8_t* pDst, uint32_t a) + bool basisu_transcoder_supports_ktx2() { - pDst[0] = (uint8_t)a; - pDst[1] = (uint8_t)a; - memset(pDst + 2, 0, 6); +#if BASISD_SUPPORT_KTX2 + return true; +#else + return false; +#endif } - bool transcode_uastc_to_bc3(const uastc_block& src_blk, void* pDst, bool high_quality) + bool basisu_transcoder_supports_ktx2_zstd() { - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) - return false; +#if BASISD_SUPPORT_KTX2_ZSTD + return true; +#else + return false; +#endif + } - const uint32_t mode = unpacked_src_blk.m_mode; + //------------------------------- - void* pBC4_block = pDst; - dxt1_block* pBC1_block = &static_cast(pDst)[1]; +#if BASISD_SUPPORT_UASTC_HDR + // This float->half conversion matches how "F32TO16" works on Intel GPU's. + basist::half_float float_to_half(float val) + { + union { float f; int32_t i; uint32_t u; } fi = { val }; + const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF, flt_s = (fi.i >> 31) & 0x1; + int s = flt_s, e = 0, m = 0; - if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + // inf/NaN + if (flt_e == 0xff) { - write_bc4_solid_block(static_cast(pBC4_block), unpacked_src_blk.m_solid_color.a); - encode_bc1_solid_block(pBC1_block, unpacked_src_blk.m_solid_color.r, unpacked_src_blk.m_solid_color.g, unpacked_src_blk.m_solid_color.b); - return true; + e = 31; + if (flt_m != 0) // NaN + m = 1; } - - color32 block_pixels[4][4]; - const bool unpack_srgb = false; - if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) - return false; - - basist::encode_bc4(pBC4_block, &block_pixels[0][0].a, sizeof(color32)); - - if ((!high_quality) && (unpacked_src_blk.m_bc1_hint0)) - transcode_uastc_to_bc1_hint0(unpacked_src_blk, pBC1_block); - else + // not zero or denormal + else if (flt_e != 0) { - if 
(unpacked_src_blk.m_bc1_hint1) - transcode_uastc_to_bc1_hint1(unpacked_src_blk, block_pixels, pBC1_block, high_quality); + int new_exp = flt_e - 127; + if (new_exp > 15) + e = 31; + else if (new_exp < -14) + m = lrintf((1 << 24) * fabsf(fi.f)); else - encode_bc1(pBC1_block, &block_pixels[0][0].r, high_quality ? cEncodeBC1HighQuality : 0); + { + e = new_exp + 15; + m = lrintf(flt_m * (1.0f / ((float)(1 << 13)))); + } } - return true; - } - - bool transcode_uastc_to_bc4(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0) - { - BASISU_NOTE_UNUSED(high_quality); - - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) - return false; - - const uint32_t mode = unpacked_src_blk.m_mode; - - void* pBC4_block = pDst; - - if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + assert((0 <= m) && (m <= 1024)); + if (m == 1024) { - write_bc4_solid_block(static_cast(pBC4_block), unpacked_src_blk.m_solid_color.c[chan0]); - return true; + e++; + m = 0; } - color32 block_pixels[4][4]; - const bool unpack_srgb = false; - if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) - return false; - - basist::encode_bc4(pBC4_block, &block_pixels[0][0].c[chan0], sizeof(color32)); + assert((s >= 0) && (s <= 1)); + assert((e >= 0) && (e <= 31)); + assert((m >= 0) && (m <= 1023)); - return true; + basist::half_float result = (basist::half_float)((s << 15) | (e << 10) | m); + return result; } + + //------------------------------------------------------------------------------------------------ + // HDR support + // + // Originally from bc6h_enc.cpp + // BC6H decoder fuzzed vs. 
DirectXTex's for unsigned/signed - bool transcode_uastc_to_bc5(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1) + const uint8_t g_bc6h_mode_sig_bits[NUM_BC6H_MODES][4] = // base bits, r, g, b { - BASISU_NOTE_UNUSED(high_quality); - - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) - return false; - - const uint32_t mode = unpacked_src_blk.m_mode; - - void* pBC4_block0 = pDst; - void* pBC4_block1 = (uint8_t*)pDst + 8; - - if (mode == UASTC_MODE_INDEX_SOLID_COLOR) - { - write_bc4_solid_block(static_cast(pBC4_block0), unpacked_src_blk.m_solid_color.c[chan0]); - write_bc4_solid_block(static_cast(pBC4_block1), unpacked_src_blk.m_solid_color.c[chan1]); - return true; - } - - color32 block_pixels[4][4]; - const bool unpack_srgb = false; - if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) - return false; + // 2 subsets + { 10, 5, 5, 5, }, // 0, mode 1 in MS/D3D docs + { 7, 6, 6, 6, }, // 1 + { 11, 5, 4, 4, }, // 2 + { 11, 4, 5, 4, }, // 3 + { 11, 4, 4, 5, }, // 4 + { 9, 5, 5, 5, }, // 5 + { 8, 6, 5, 5, }, // 6 + { 8, 5, 6, 5, }, // 7 + { 8, 5, 5, 6, }, // 8 + { 6, 6, 6, 6, }, // 9, endpoints not delta encoded, mode 10 in MS/D3D docs + // 1 subset + { 10, 10, 10, 10, }, // 10, endpoints not delta encoded, mode 11 in MS/D3D docs + { 11, 9, 9, 9, }, // 11 + { 12, 8, 8, 8, }, // 12 + { 16, 4, 4, 4, } // 13, also useful for solid blocks + }; - basist::encode_bc4(pBC4_block0, &block_pixels[0][0].c[chan0], sizeof(color32)); - basist::encode_bc4(pBC4_block1, &block_pixels[0][0].c[chan1], sizeof(color32)); + const int8_t g_bc6h_mode_lookup[32] = { 0, 1, 2, 10, 0, 1, 3, 11, 0, 1, 4, 12, 0, 1, 5, 13, 0, 1, 6, -1, 0, 1, 7, -1, 0, 1, 8, -1, 0, 1, 9, -1 }; - return true; - } + const bc6h_bit_layout g_bc6h_bit_layouts[NUM_BC6H_MODES][MAX_BC6H_LAYOUT_INDEX] = + { + // comp_index, subset*2+lh_index, last_bit, first_bit + //------------------------ mode 0: 2 subsets, Weight bits: 46 
bits, Endpoint bits: 75 bits (10.555, 10.555, 10.555), delta + { { 1, 2, 4, -1 }, { 2, 2, 4, -1 }, { 2, 3, 4, -1 }, { 0, 0, 9, 0 }, { 1, 0, 9, 0 }, { 2, 0, 9, 0 }, { 0, 1, 4, 0 }, + { 1, 3, 4, -1 }, { 1, 2, 3, 0 }, { 1, 1, 4, 0 }, { 2, 3, 0, -1 }, { 1, 3, 3, 0 }, { 2, 1, 4, 0 }, { 2, 3, 1, -1 }, + { 2, 2, 3, 0 }, { 0, 2, 4, 0 }, { 2, 3, 2, -1 }, { 0, 3, 4, 0 }, { 2, 3, 3, -1 }, { 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 1: 2 subsets, Weight bits: 46 bits, Endpoint bits: 75 bits (7.666, 7.666, 7.666), delta + { { 1, 2, 5, -1 },{ 1, 3, 4, -1 },{ 1, 3, 5, -1 },{ 0, 0, 6, 0 },{ 2, 3, 0, -1 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 }, + { 1, 0, 6, 0 },{ 2, 2, 5, -1 },{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 6, 0 },{ 2, 3, 3, -1 },{ 2, 3, 5, -1 }, + { 2, 3, 4, -1 },{ 0, 1, 5, 0 },{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },{ 2, 2, 3, 0 },{ 0, 2, 5, 0 }, + { 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 2: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.555, 11.444, 11.444), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 4, 0 },{ 0, 0, 10, -1 },{ 1, 2, 3, 0 },{ 1, 1, 3, 0 },{ 1, 0, 10, -1 }, + { 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 3, 0 },{ 2, 0, 10, -1 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 }, + { 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 3: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.444, 11.555, 11.444), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, -1 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 }, + { 1, 0, 10, -1 },{ 1, 3, 3, 0 },{ 2, 1, 3, 0 },{ 2, 0, 10, -1 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 3, 0 },{ 2, 3, 0, -1 }, + { 2, 3, 2, -1 },{ 0, 3, 3, 0 },{ 1, 2, 4, -1 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 4: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.444, 11.444, 11.555), delta + { 
{ 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, -1 },{ 2, 2, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 3, 0 }, + { 1, 0, 10, -1 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 0, 10, -1 },{ 2, 2, 3, 0 },{ 0, 2, 3, 0 },{ 2, 3, 1, -1 }, + { 2, 3, 2, -1 },{ 0, 3, 3, 0 },{ 2, 3, 4, -1 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 5: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (9.555, 9.555, 9.555), delta + { { 0, 0, 8, 0 },{ 2, 2, 4, -1 },{ 1, 0, 8, 0 },{ 1, 2, 4, -1 },{ 2, 0, 8, 0 },{ 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 }, + { 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 4, 0 }, + { 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 6: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.666, 8.555, 8.555), delta + { { 0, 0, 7, 0 },{ 1, 3, 4, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 2, 3, 3, -1 }, + { 2, 3, 4, -1 },{ 0, 1, 5, 0 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 }, + { 2, 2, 3, 0 },{ 0, 2, 5, 0 },{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 7: 2 subsets, Weight bits: 46 bits, Endpoints bits: 72 bits (8.555, 8.666, 8.555), delta + { { 0, 0, 7, 0 },{ 2, 3, 0, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 1, 2, 5, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 1, 3, 5, -1 }, + { 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 }, + { 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 8: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.555, 8.555, 8.666), delta + { { 0, 0, 7, 0 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 2, 2, 5, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 
2, 3, 5, -1 }, + { 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 }, + { 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 9: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (6.6.6.6, 6.6.6.6, 6.6.6.6), NO delta + { { 0, 0, 5, 0 },{ 1, 3, 4, -1 },{ 2, 3, 0, -1 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },{ 1, 0, 5, 0 },{ 1, 2, 5, -1 },{ 2, 2, 5, -1 }, + { 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 5, 0 },{ 1, 3, 5, -1 },{ 2, 3, 3, -1 },{ 2, 3, 5, -1 },{ 2, 3, 4, -1 },{ 0, 1, 5, 0 }, + { 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },{ 2, 2, 3, 0 },{ 0, 2, 5, 0 },{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 10: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (10.10, 10.10, 10.10), NO delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 9, 0 },{ 1, 1, 9, 0 },{ 2, 1, 9, 0 }, {-1, 0, 0, 0} }, + //------------------------ mode 11: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (11.9, 11.9, 11.9), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 8, 0 },{ 0, 0, 10, -1 },{ 1, 1, 8, 0 },{ 1, 0, 10, -1 },{ 2, 1, 8, 0 },{ 2, 0, 10, -1 }, {-1, 0, 0, 0} }, + //------------------------ mode 12: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (12.8, 12.8, 12.8), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 7, 0 },{ 0, 0, 10, 11 },{ 1, 1, 7, 0 },{ 1, 0, 10, 11 },{ 2, 1, 7, 0 },{ 2, 0, 10, 11 }, {-1, 0, 0, 0} }, + //------------------------ mode 13: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (16.4, 16.4, 16.4), delta + { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, 15 },{ 1, 1, 3, 0 },{ 1, 0, 10, 15 },{ 2, 1, 3, 0 },{ 2, 0, 10, 15 }, {-1, 0, 0, 0} } + }; - static const uint8_t s_etc2_eac_bit_ofs[16] = { 45, 33, 21, 9, 42, 30, 18, 6, 39, 27, 15, 3, 36, 24, 12, 0 }; + // The same 
as the first 32 2-subset patterns in BC7. + // Bit 7 is a flag indicating that the weight uses 1 less bit than usual. + const uint8_t g_bc6h_2subset_patterns[TOTAL_BC6H_PARTITION_PATTERNS][4][4] = // [pat][y][x] + { + { {0x80, 0, 1, 1}, { 0, 0, 1, 1 }, { 0, 0, 1, 1 }, { 0, 0, 1, 0x81 }}, { {0x80, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0x81} }, + { {0x80, 1, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 0x81} }, { {0x80, 0, 0, 1}, {0, 0, 1, 1}, {0, 0, 1, 1}, {0, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 1, 0x81} }, { {0x80, 0, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, + { {0x80, 0, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 1, 0x81} }, { {0x80, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 1, 1, 0x81} }, + { {0x80, 0, 0, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 0x81} }, + { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {1, 1, 1, 0}, {1, 1, 1, 0x81} }, { {0x80, 1, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}, {0, 0, 0, 0} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 1, 0} }, { {0x80, 1, 0x81, 1}, {0, 0, 1, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} }, + { {0x80, 0, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}, {0, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {0x81, 1, 0, 0}, {1, 1, 1, 0} }, + { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 0, 0} }, { {0x80, 1, 1, 1}, {0, 0, 1, 1}, { 0, 0, 1, 1}, {0, 0, 0, 0x81} }, + { {0x80, 0, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 0, 0} }, + { {0x80, 1, 0x81, 0}, {0, 1, 1, 
0}, {0, 1, 1, 0}, {0, 1, 1, 0} }, { {0x80, 0, 0x81, 1}, {0, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 0, 0} }, + { {0x80, 0, 0, 1}, {0, 1, 1, 1}, {0x81, 1, 1, 0}, {1, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 1, 1, 1}, {0x81, 1, 1, 1}, {0, 0, 0, 0} }, + { {0x80, 1, 0x81, 1}, {0, 0, 0, 1}, {1, 0, 0, 0}, {1, 1, 1, 0} }, { {0x80, 0, 0x81, 1}, {1, 0, 0, 1}, {1, 0, 0, 1}, {1, 1, 0, 0} } + }; - static void pack_eac_solid_block(eac_block& blk, uint32_t a) + const uint8_t g_bc6h_weight3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; + const uint8_t g_bc6h_weight4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + + static inline void write_bits(uint64_t val, uint32_t num_bits, uint32_t& bit_pos, uint64_t& l, uint64_t& h) { - blk.m_base = static_cast(a); - blk.m_table = 13; - blk.m_multiplier = 0; - - memcpy(blk.m_selectors, g_etc2_eac_a8_sel4, sizeof(g_etc2_eac_a8_sel4)); - - return; - } + assert((num_bits) && (num_bits < 64) && (bit_pos < 128)); + assert(val < (1ULL << num_bits)); - // Only checks 4 tables. 
- static void pack_eac(eac_block& blk, const uint8_t* pPixels, uint32_t stride) - { - uint32_t min_alpha = 255, max_alpha = 0; - for (uint32_t i = 0; i < 16; i++) + if (bit_pos < 64) { - const uint32_t a = pPixels[i * stride]; - if (a < min_alpha) min_alpha = a; - if (a > max_alpha) max_alpha = a; - } + l |= (val << bit_pos); - if (min_alpha == max_alpha) + if ((bit_pos + num_bits) > 64) + h |= (val >> (64 - bit_pos)); + } + else { - pack_eac_solid_block(blk, min_alpha); - return; + h |= (val << (bit_pos - 64)); } - const uint32_t alpha_range = max_alpha - min_alpha; - - const uint32_t SINGLE_TABLE_THRESH = 5; - if (alpha_range <= SINGLE_TABLE_THRESH) - { - // If alpha_range <= 5 table 13 is lossless - int base = clamp255((int)max_alpha - 2); + bit_pos += num_bits; + assert(bit_pos <= 128); + } - blk.m_base = base; - blk.m_multiplier = 1; - blk.m_table = 13; + static inline void write_rev_bits(uint64_t val, uint32_t num_bits, uint32_t& bit_pos, uint64_t& l, uint64_t& h) + { + assert((num_bits) && (num_bits < 64) && (bit_pos < 128)); + assert(val < (1ULL << num_bits)); - base -= 3; + for (uint32_t i = 0; i < num_bits; i++) + write_bits((val >> (num_bits - 1u - i)) & 1, 1, bit_pos, l, h); + } - uint64_t packed_sels = 0; - for (uint32_t i = 0; i < 16; i++) - { - const int a = pPixels[i * stride]; + void pack_bc6h_block(bc6h_block& dst_blk, bc6h_logical_block& log_blk) + { + const uint8_t s_mode_bits[NUM_BC6H_MODES] = { 0b00, 0b01, 0b00010, 0b00110, 0b01010, 0b01110, 0b10010, 0b10110, 0b11010, 0b11110, 0b00011, 0b00111, 0b01011, 0b01111 }; - static const uint8_t s_sels[6] = { 2, 1, 0, 4, 5, 6 }; + const uint32_t mode = log_blk.m_mode; + assert(mode < NUM_BC6H_MODES); - int sel = a - base; - assert(sel >= 0 && sel <= 5); + uint64_t l = s_mode_bits[mode], h = 0; + uint32_t bit_pos = (mode >= 2) ? 5 : 2; - packed_sels |= (static_cast(s_sels[sel]) << s_etc2_eac_bit_ofs[i]); - } + const uint32_t num_subsets = (mode >= BC6H_FIRST_1SUBSET_MODE_INDEX) ? 
1 : 2; - blk.set_selector_bits(packed_sels); + assert(((num_subsets == 2) && (log_blk.m_partition_pattern < TOTAL_BC6H_PARTITION_PATTERNS)) || + ((num_subsets == 1) && (!log_blk.m_partition_pattern))); - return; + // Sanity checks + for (uint32_t c = 0; c < 3; c++) + { + assert(log_blk.m_endpoints[c][0] < (1u << g_bc6h_mode_sig_bits[mode][0])); // 1st subset l, base bits + assert(log_blk.m_endpoints[c][1] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 1st subset h, these are deltas except for modes 9,10 + assert(log_blk.m_endpoints[c][2] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 2nd subset l + assert(log_blk.m_endpoints[c][3] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 2nd subset h } - const uint32_t T0 = 2, T1 = 8, T2 = 11, T3 = 13; - static const uint8_t s_tables[4] = { T0, T1, T2, T3 }; + const bc6h_bit_layout* pLayout = &g_bc6h_bit_layouts[mode][0]; - int base[4], mul[4]; - uint32_t mul_or = 0; - for (uint32_t i = 0; i < 4; i++) + while (pLayout->m_comp != -1) { - const uint32_t table = s_tables[i]; + uint32_t v = (pLayout->m_comp == 3) ? 
log_blk.m_partition_pattern : log_blk.m_endpoints[pLayout->m_comp][pLayout->m_index]; - const float range = (float)(g_eac_modifier_table[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]); + if (pLayout->m_first_bit == -1) + { + write_bits((v >> pLayout->m_last_bit) & 1, 1, bit_pos, l, h); + } + else + { + const uint32_t total_bits = basisu::iabs(pLayout->m_last_bit - pLayout->m_first_bit) + 1; - base[i] = clamp255((int)roundf(basisu::lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range))); - mul[i] = clampi((int)roundf(alpha_range / range), 1, 15); - mul_or |= mul[i]; + v >>= basisu::minimum(pLayout->m_first_bit, pLayout->m_last_bit); + v &= ((1 << total_bits) - 1); + + if (pLayout->m_first_bit > pLayout->m_last_bit) + write_rev_bits(v, total_bits, bit_pos, l, h); + else + write_bits(v, total_bits, bit_pos, l, h); + } + + pLayout++; } - uint32_t total_err[4] = { 0, 0, 0, 0 }; - uint8_t sels[4][16]; + const uint32_t num_mode_sel_bits = (num_subsets == 1) ? 
4 : 3; + const uint8_t* pPat = &g_bc6h_2subset_patterns[log_blk.m_partition_pattern][0][0]; for (uint32_t i = 0; i < 16; i++) { - const int a = pPixels[i * stride]; - - uint32_t l0 = UINT32_MAX, l1 = UINT32_MAX, l2 = UINT32_MAX, l3 = UINT32_MAX; + const uint32_t sel = log_blk.m_weights[i]; - if ((a < 7) || (a > (255 - 7))) + uint32_t num_bits = num_mode_sel_bits; + if (num_subsets == 2) { - for (uint32_t s = 0; s < 8; s++) - { - const int v0 = clamp255(mul[0] * g_eac_modifier_table[T0][s] + base[0]); - const int v1 = clamp255(mul[1] * g_eac_modifier_table[T1][s] + base[1]); - const int v2 = clamp255(mul[2] * g_eac_modifier_table[T2][s] + base[2]); - const int v3 = clamp255(mul[3] * g_eac_modifier_table[T3][s] + base[3]); - - l0 = basisu::minimum(l0, (basisu::iabs(v0 - a) << 3) | s); - l1 = basisu::minimum(l1, (basisu::iabs(v1 - a) << 3) | s); - l2 = basisu::minimum(l2, (basisu::iabs(v2 - a) << 3) | s); - l3 = basisu::minimum(l3, (basisu::iabs(v3 - a) << 3) | s); - } + const uint32_t subset_index = pPat[i]; + num_bits -= (subset_index >> 7); } - else if (mul_or == 1) + else if (!i) { - const int a0 = base[0] - a, a1 = base[1] - a, a2 = base[2] - a, a3 = base[3] - a; + num_bits--; + } - for (uint32_t s = 0; s < 8; s++) - { - const int v0 = g_eac_modifier_table[T0][s] + a0; - const int v1 = g_eac_modifier_table[T1][s] + a1; - const int v2 = g_eac_modifier_table[T2][s] + a2; - const int v3 = g_eac_modifier_table[T3][s] + a3; + assert(sel < (1u << num_bits)); - l0 = basisu::minimum(l0, (basisu::iabs(v0) << 3) | s); - l1 = basisu::minimum(l1, (basisu::iabs(v1) << 3) | s); - l2 = basisu::minimum(l2, (basisu::iabs(v2) << 3) | s); - l3 = basisu::minimum(l3, (basisu::iabs(v3) << 3) | s); - } - } - else - { - const int a0 = base[0] - a, a1 = base[1] - a, a2 = base[2] - a, a3 = base[3] - a; + write_bits(sel, num_bits, bit_pos, l, h); + } - for (uint32_t s = 0; s < 8; s++) - { - const int v0 = mul[0] * g_eac_modifier_table[T0][s] + a0; - const int v1 = mul[1] * 
g_eac_modifier_table[T1][s] + a1; - const int v2 = mul[2] * g_eac_modifier_table[T2][s] + a2; - const int v3 = mul[3] * g_eac_modifier_table[T3][s] + a3; + assert(bit_pos == 128); - l0 = basisu::minimum(l0, (basisu::iabs(v0) << 3) | s); - l1 = basisu::minimum(l1, (basisu::iabs(v1) << 3) | s); - l2 = basisu::minimum(l2, (basisu::iabs(v2) << 3) | s); - l3 = basisu::minimum(l3, (basisu::iabs(v3) << 3) | s); - } - } + basisu::write_le_dword(&dst_blk.m_bytes[0], (uint32_t)l); + basisu::write_le_dword(&dst_blk.m_bytes[4], (uint32_t)(l >> 32u)); + basisu::write_le_dword(&dst_blk.m_bytes[8], (uint32_t)h); + basisu::write_le_dword(&dst_blk.m_bytes[12], (uint32_t)(h >> 32u)); + } - sels[0][i] = l0 & 7; - sels[1][i] = l1 & 7; - sels[2][i] = l2 & 7; - sels[3][i] = l3 & 7; +#if 0 + static inline uint32_t bc6h_blog_dequantize_to_blog16(uint32_t comp, uint32_t bits_per_comp) + { + int unq; - total_err[0] += basisu::square(l0 >> 3); - total_err[1] += basisu::square(l1 >> 3); - total_err[2] += basisu::square(l2 >> 3); - total_err[3] += basisu::square(l3 >> 3); - } + if (bits_per_comp >= 15) + unq = comp; + else if (comp == 0) + unq = 0; + else if (comp == ((1u << bits_per_comp) - 1u)) + unq = 0xFFFFu; + else + unq = ((comp << 16u) + 0x8000u) >> bits_per_comp; - uint32_t min_err = total_err[0], min_index = 0; - for (uint32_t i = 1; i < 4; i++) + return unq; + } +#endif + + // 6,7,8,9,10,11,12 + const uint32_t BC6H_BLOG_TAB_MIN = 6; + const uint32_t BC6H_BLOG_TAB_MAX = 12; + //const uint32_t BC6H_BLOG_TAB_NUM = BC6H_BLOG_TAB_MAX - BC6H_BLOG_TAB_MIN + 1; + + // Handles 16, or 6-12 bits. Others assert. 
+ static inline uint32_t half_to_blog_tab(half_float h, uint32_t num_bits) + { + assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT); + assert((num_bits == 16) || ((num_bits >= BC6H_BLOG_TAB_MIN) && (num_bits <= BC6H_BLOG_TAB_MAX))); + + return bc6h_half_to_blog(h, num_bits); +#if 0 + BASISU_NOTE_UNUSED(BC6H_BLOG_TAB_MIN); + BASISU_NOTE_UNUSED(BC6H_BLOG_TAB_MAX); + + if (num_bits == 16) { - if (total_err[i] < min_err) - { - min_err = total_err[i]; - min_index = i; - } + return bc6h_half_to_blog(h, 16); + } + else + { + assert((num_bits >= BC6H_BLOG_TAB_MIN) && (num_bits <= BC6H_BLOG_TAB_MAX)); + + // Note: This used to be done using a table lookup, but it required ~224KB of tables. This isn't quite as accurate, but the error is very slight (+-1 half values as ints). + return bc6h_half_to_blog(h, num_bits); } +#endif + } - blk.m_base = base[min_index]; - blk.m_multiplier = mul[min_index]; - blk.m_table = s_tables[min_index]; + bool g_bc6h_enc_initialized; - uint64_t packed_sels = 0; - const uint8_t* pSels = &sels[min_index][0]; - for (uint32_t i = 0; i < 16; i++) - packed_sels |= (static_cast(pSels[i]) << s_etc2_eac_bit_ofs[i]); + void bc6h_enc_init() + { + if (g_bc6h_enc_initialized) + return; - blk.set_selector_bits(packed_sels); + g_bc6h_enc_initialized = true; } - - // Checks all 16 tables. Around ~2 dB better vs. pack_eac(), ~1.2 dB less than near-optimal. 
- static void pack_eac_high_quality(eac_block& blk, const uint8_t* pPixels, uint32_t stride) + + // mode 10, 4-bit weights + void bc6h_enc_block_mode10(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) { - uint32_t min_alpha = 255, max_alpha = 0; + assert(g_bc6h_enc_initialized); + for (uint32_t i = 0; i < 16; i++) { - const uint32_t a = pPixels[i * stride]; - if (a < min_alpha) min_alpha = a; - if (a > max_alpha) max_alpha = a; + assert(pWeights[i] <= 15); } - if (min_alpha == max_alpha) + bc6h_logical_block log_blk; + log_blk.clear(); + + // Convert half endpoints to blog10 (mode 10 doesn't use delta encoding) + for (uint32_t c = 0; c < 3; c++) { - pack_eac_solid_block(blk, min_alpha); - return; + log_blk.m_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], 10); + log_blk.m_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], 10); } - const uint32_t alpha_range = max_alpha - min_alpha; + memcpy(log_blk.m_weights, pWeights, 16); - const uint32_t SINGLE_TABLE_THRESH = 5; - if (alpha_range <= SINGLE_TABLE_THRESH) + if (log_blk.m_weights[0] & 8) { - // If alpha_range <= 5 table 13 is lossless - int base = clamp255((int)max_alpha - 2); - - blk.m_base = base; - blk.m_multiplier = 1; - blk.m_table = 13; - - base -= 3; - - uint64_t packed_sels = 0; for (uint32_t i = 0; i < 16; i++) - { - const int a = pPixels[i * stride]; - - static const uint8_t s_sels[6] = { 2, 1, 0, 4, 5, 6 }; - - int sel = a - base; - assert(sel >= 0 && sel <= 5); + log_blk.m_weights[i] = 15 - log_blk.m_weights[i]; - packed_sels |= (static_cast(s_sels[sel]) << s_etc2_eac_bit_ofs[i]); + for (uint32_t c = 0; c < 3; c++) + { + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); } + } - blk.set_selector_bits(packed_sels); + log_blk.m_mode = BC6H_FIRST_1SUBSET_MODE_INDEX; + pack_bc6h_block(*pPacked_block, log_blk); + } - return; + // Tries modes 11-13 (delta endpoint) encoding, falling back to mode 10 only when necessary, 4-bit weights + void 
bc6h_enc_block_1subset_4bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); + + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 15); } - int base[16], mul[16]; - for (uint32_t table = 0; table < 16; table++) + bc6h_logical_block log_blk; + log_blk.clear(); + + for (uint32_t mode = BC6H_LAST_MODE_INDEX; mode > BC6H_FIRST_1SUBSET_MODE_INDEX; mode--) { - const float range = (float)(g_eac_modifier_table[table][ETC2_EAC_MAX_VALUE_SELECTOR] - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]); + const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0], num_delta_bits = g_bc6h_mode_sig_bits[mode][1]; + const int base_bitmask = (1 << num_base_bits) - 1; + const int delta_bitmask = (1 << num_delta_bits) - 1; + BASISU_NOTE_UNUSED(base_bitmask); - base[table] = clamp255((int)roundf(basisu::lerp((float)min_alpha, (float)max_alpha, (float)(0 - g_eac_modifier_table[table][ETC2_EAC_MIN_VALUE_SELECTOR]) / range))); - mul[table] = clampi((int)roundf(alpha_range / range), 1, 15); - } + assert(num_delta_bits < num_base_bits); + assert((num_delta_bits == g_bc6h_mode_sig_bits[mode][2]) && (num_delta_bits == g_bc6h_mode_sig_bits[mode][3])); - uint32_t total_err[16]; - memset(total_err, 0, sizeof(total_err)); + uint32_t blog_endpoints[3][2]; - uint8_t sels[16][16]; + // Convert half endpoints to blog 16, 12, or 11 + for (uint32_t c = 0; c < 3; c++) + { + blog_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], num_base_bits); + assert((int)blog_endpoints[c][0] <= base_bitmask); - for (uint32_t table = 0; table < 16; table++) - { - const int8_t* pTable = &g_eac_modifier_table[table][0]; - const int m = mul[table], b = base[table]; + blog_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], num_base_bits); + assert((int)blog_endpoints[c][1] <= base_bitmask); + } - uint32_t prev_l = 0, prev_a = UINT32_MAX; + // Copy weights + memcpy(log_blk.m_weights, pWeights, 16); - for (uint32_t 
i = 0; i < 16; i++) + // Ensure first weight MSB is 0 + if (log_blk.m_weights[0] & 8) { - const int a = pPixels[i * stride]; + // Invert weights + for (uint32_t i = 0; i < 16; i++) + log_blk.m_weights[i] = 15 - log_blk.m_weights[i]; - if ((uint32_t)a == prev_a) + // Swap blog quantized endpoints + for (uint32_t c = 0; c < 3; c++) { - sels[table][i] = prev_l & 7; - total_err[table] += basisu::square(prev_l >> 3); + std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); } - else - { - uint32_t l = basisu::iabs(clamp255(m * pTable[0] + b) - a) << 3; - l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[1] + b) - a) << 3) | 1); - l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[2] + b) - a) << 3) | 2); - l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[3] + b) - a) << 3) | 3); - l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[4] + b) - a) << 3) | 4); - l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[5] + b) - a) << 3) | 5); - l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[6] + b) - a) << 3) | 6); - l = basisu::minimum(l, (basisu::iabs(clamp255(m * pTable[7] + b) - a) << 3) | 7); + } - sels[table][i] = l & 7; - total_err[table] += basisu::square(l >> 3); + const int max_delta = (1 << (num_delta_bits - 1)) - 1; + const int min_delta = -(max_delta + 1); + assert((max_delta - min_delta) == delta_bitmask); - prev_l = l; - prev_a = a; + bool failed_flag = false; + for (uint32_t c = 0; c < 3; c++) + { + log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; + + int delta = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; + if ((delta < min_delta) || (delta > max_delta)) + { + failed_flag = true; + break; } - } - } - uint32_t min_err = total_err[0], min_index = 0; - for (uint32_t i = 1; i < 16; i++) - { - if (total_err[i] < min_err) - { - min_err = total_err[i]; - min_index = i; + log_blk.m_endpoints[c][1] = delta & delta_bitmask; } - } - blk.m_base = base[min_index]; - blk.m_multiplier = mul[min_index]; - blk.m_table = 
min_index; + if (failed_flag) + continue; - uint64_t packed_sels = 0; - const uint8_t* pSels = &sels[min_index][0]; - for (uint32_t i = 0; i < 16; i++) - packed_sels |= (static_cast(pSels[i]) << s_etc2_eac_bit_ofs[i]); + log_blk.m_mode = mode; + pack_bc6h_block(*pPacked_block, log_blk); + + return; + } - blk.set_selector_bits(packed_sels); + // Worst case fall back to mode 10, which can handle any endpoints + bc6h_enc_block_mode10(pPacked_block, pEndpoints, pWeights); } - bool transcode_uastc_to_etc2_eac_r11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0) + // Mode 9 (direct endpoint encoding), 3-bit weights, but only 1 subset + void bc6h_enc_block_1subset_mode9_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) { - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) - return false; - - const uint32_t mode = unpacked_src_blk.m_mode; + assert(g_bc6h_enc_initialized); - if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + for (uint32_t i = 0; i < 16; i++) { - pack_eac_solid_block(*static_cast(pDst), unpacked_src_blk.m_solid_color.c[chan0]); - return true; + assert(pWeights[i] <= 7); } - color32 block_pixels[4][4]; - const bool unpack_srgb = false; - if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) - return false; + bc6h_logical_block log_blk; + log_blk.clear(); - if (chan0 == 3) - transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, pDst); - else - (high_quality ? 
pack_eac_high_quality : pack_eac)(*static_cast(pDst), &block_pixels[0][0].c[chan0], sizeof(color32)); + // Convert half endpoints to blog6 (mode 9 doesn't use delta encoding) + for (uint32_t c = 0; c < 3; c++) + { + log_blk.m_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], 6); + log_blk.m_endpoints[c][2] = log_blk.m_endpoints[c][0]; - return true; - } + log_blk.m_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], 6); + log_blk.m_endpoints[c][3] = log_blk.m_endpoints[c][1]; + } - bool transcode_uastc_to_etc2_eac_rg11(const uastc_block& src_blk, void* pDst, bool high_quality, uint32_t chan0, uint32_t chan1) - { - unpacked_uastc_block unpacked_src_blk; - if (!unpack_uastc(src_blk, unpacked_src_blk, false)) - return false; + memcpy(log_blk.m_weights, pWeights, 16); - const uint32_t mode = unpacked_src_blk.m_mode; + const uint32_t pat_index = 0; + const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; - if (mode == UASTC_MODE_INDEX_SOLID_COLOR) + if (log_blk.m_weights[0] & 4) { - pack_eac_solid_block(static_cast(pDst)[0], unpacked_src_blk.m_solid_color.c[chan0]); - pack_eac_solid_block(static_cast(pDst)[1], unpacked_src_blk.m_solid_color.c[chan1]); - return true; - } - - color32 block_pixels[4][4]; - const bool unpack_srgb = false; - if (!unpack_uastc(unpacked_src_blk, &block_pixels[0][0], unpack_srgb)) - return false; + for (uint32_t c = 0; c < 3; c++) + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); - if (chan0 == 3) - transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &static_cast(pDst)[0]); - else - (high_quality ? pack_eac_high_quality : pack_eac)(static_cast(pDst)[0], &block_pixels[0][0].c[chan0], sizeof(color32)); + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 0) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } - if (chan1 == 3) - transcode_uastc_to_etc2_eac_a8(unpacked_src_blk, block_pixels, &static_cast(pDst)[1]); - else - (high_quality ? 
pack_eac_high_quality : pack_eac)(static_cast(pDst)[1], &block_pixels[0][0].c[chan1], sizeof(color32)); - return true; - } + if (log_blk.m_weights[15] & 4) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(log_blk.m_endpoints[c][2], log_blk.m_endpoints[c][3]); - // PVRTC1 - static void fixup_pvrtc1_4_modulation_rgb( - const uastc_block* pSrc_blocks, - const uint32_t* pPVRTC_endpoints, - void* pDst_blocks, - uint32_t num_blocks_x, uint32_t num_blocks_y, bool from_alpha) - { - const uint32_t x_mask = num_blocks_x - 1; - const uint32_t y_mask = num_blocks_y - 1; - const uint32_t x_bits = basisu::total_bits(x_mask); - const uint32_t y_bits = basisu::total_bits(y_mask); - const uint32_t min_bits = basisu::minimum(x_bits, y_bits); - //const uint32_t max_bits = basisu::maximum(x_bits, y_bits); - const uint32_t swizzle_mask = (1 << (min_bits * 2)) - 1; + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 1) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } - uint32_t block_index = 0; + log_blk.m_mode = 9; + log_blk.m_partition_pattern = pat_index; + pack_bc6h_block(*pPacked_block, log_blk); + } - // really 3x3 - int e0[4][4], e1[4][4]; + // Tries modes 0-8, falls back to mode 9 + void bc6h_enc_block_1subset_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); - for (int y = 0; y < static_cast(num_blocks_y); y++) + for (uint32_t i = 0; i < 16; i++) { - const uint32_t* pE_rows[3]; - - for (int ey = 0; ey < 3; ey++) - { - int by = y + ey - 1; - - const uint32_t* pE = &pPVRTC_endpoints[(by & y_mask) * num_blocks_x]; + assert(pWeights[i] <= 7); + } - pE_rows[ey] = pE; + bc6h_logical_block log_blk; + log_blk.clear(); - for (int ex = 0; ex < 3; ex++) - { - int bx = 0 + ex - 1; + for (uint32_t mode_iter = 0; mode_iter <= 8; mode_iter++) + { + static const int s_mode_order[9] = { 2, 3, 4, 0, 5, 6, 7, 8, 1 }; // ordered from largest base bits to least + const uint32_t mode = 
s_mode_order[mode_iter]; - const uint32_t e = pE[bx & x_mask]; + const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0]; + const int base_bitmask = (1 << num_base_bits) - 1; + BASISU_NOTE_UNUSED(base_bitmask); - e0[ex][ey] = (get_opaque_endpoint_l0(e) * 255) / 31; - e1[ex][ey] = (get_opaque_endpoint_l1(e) * 255) / 31; - } - } + const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] }; + const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 }; - const uint32_t y_swizzle = (g_pvrtc_swizzle_table[y >> 8] << 16) | g_pvrtc_swizzle_table[y & 0xFF]; + uint32_t blog_endpoints[3][4]; - for (int x = 0; x < static_cast(num_blocks_x); x++, block_index++) + // Convert half endpoints to blog 7-11 + for (uint32_t c = 0; c < 3; c++) { - const uastc_block& src_block = pSrc_blocks[block_index]; + blog_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], num_base_bits); + blog_endpoints[c][2] = blog_endpoints[c][0]; + assert((int)blog_endpoints[c][0] <= base_bitmask); - color32 block_pixels[4][4]; - unpack_uastc(src_block, &block_pixels[0][0], false); - if (from_alpha) - { - // Just set RGB to alpha to avoid adding complexity below. 
- for (uint32_t i = 0; i < 16; i++) - { - const uint8_t a = ((color32*)block_pixels)[i].a; - ((color32*)block_pixels)[i].set(a, a, a, 255); - } - } + blog_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], num_base_bits); + blog_endpoints[c][3] = blog_endpoints[c][1]; + assert((int)blog_endpoints[c][1] <= base_bitmask); + } - const uint32_t x_swizzle = (g_pvrtc_swizzle_table[x >> 8] << 17) | (g_pvrtc_swizzle_table[x & 0xFF] << 1); + const uint32_t pat_index = 0; + const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; - uint32_t swizzled = x_swizzle | y_swizzle; - if (num_blocks_x != num_blocks_y) - { - swizzled &= swizzle_mask; + memcpy(log_blk.m_weights, pWeights, 16); - if (num_blocks_x > num_blocks_y) - swizzled |= ((x >> min_bits) << (min_bits * 2)); - else - swizzled |= ((y >> min_bits) << (min_bits * 2)); - } + if (log_blk.m_weights[0] & 4) + { + // Swap part 0's endpoints/weights + for (uint32_t c = 0; c < 3; c++) + std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); - pvrtc4_block* pDst_block = static_cast(pDst_blocks) + swizzled; - pDst_block->m_endpoints = pPVRTC_endpoints[block_index]; + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 0) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } - { - const uint32_t ex = 2; - int bx = x + ex - 1; - bx &= x_mask; + if (log_blk.m_weights[15] & 4) + { + // Swap part 1's endpoints/weights + for (uint32_t c = 0; c < 3; c++) + std::swap(blog_endpoints[c][2], blog_endpoints[c][3]); -#define DO_ROW(ey) \ - { \ - const uint32_t e = pE_rows[ey][bx]; \ - e0[ex][ey] = (get_opaque_endpoint_l0(e) * 255) / 31; \ - e1[ex][ey] = (get_opaque_endpoint_l1(e) * 255) / 31; \ - } + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 1) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } - DO_ROW(0); - DO_ROW(1); - DO_ROW(2); -#undef DO_ROW - } + bool failed_flag = false; - uint32_t mod = 0; + for (uint32_t c = 0; c < 3; c++) + { + const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1; 
-#define DO_PIX(lx, ly, w0, w1, w2, w3) \ - { \ - int ca_l = a0 * w0 + a1 * w1 + a2 * w2 + a3 * w3; \ - int cb_l = b0 * w0 + b1 * w1 + b2 * w2 + b3 * w3; \ - int cl = (block_pixels[ly][lx].r + block_pixels[ly][lx].g + block_pixels[ly][lx].b) * 16; \ - int dl = cb_l - ca_l; \ - int vl = cl - ca_l; \ - int p = vl * 16; \ - if (ca_l > cb_l) { p = -p; dl = -dl; } \ - uint32_t m = 0; \ - if (p > 3 * dl) m = (uint32_t)(1 << ((ly) * 8 + (lx) * 2)); \ - if (p > 8 * dl) m = (uint32_t)(2 << ((ly) * 8 + (lx) * 2)); \ - if (p > 13 * dl) m = (uint32_t)(3 << ((ly) * 8 + (lx) * 2)); \ - mod |= m; \ - } + const int min_delta = -(max_delta + 1); + assert((max_delta - min_delta) == delta_bitmasks[c]); - { - const uint32_t ex = 0, ey = 0; - const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; - const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; - DO_PIX(0, 0, 4, 4, 4, 4); - DO_PIX(1, 0, 2, 6, 2, 6); - DO_PIX(0, 1, 2, 2, 6, 6); - DO_PIX(1, 1, 1, 3, 3, 9); - } + log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; - { - const uint32_t ex = 1, ey = 0; - const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; - const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; - DO_PIX(2, 0, 8, 0, 8, 0); - DO_PIX(3, 0, 6, 2, 6, 2); - DO_PIX(2, 1, 4, 0, 12, 0); - DO_PIX(3, 1, 3, 1, 9, 3); - } + int delta0 = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; + int delta1 = (int)blog_endpoints[c][2] - (int)blog_endpoints[c][0]; + int delta2 = (int)blog_endpoints[c][3] - (int)blog_endpoints[c][0]; + if ((delta0 < min_delta) || (delta0 > max_delta) || + (delta1 < min_delta) || (delta1 > max_delta) || + (delta2 < min_delta) || (delta2 > max_delta)) { - const uint32_t ex = 0, ey = 1; - const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; - const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], 
b3 = e1[ex + 1][ey + 1]; - DO_PIX(0, 2, 8, 8, 0, 0); - DO_PIX(1, 2, 4, 12, 0, 0); - DO_PIX(0, 3, 6, 6, 2, 2); - DO_PIX(1, 3, 3, 9, 1, 3); + failed_flag = true; + break; } - { - const uint32_t ex = 1, ey = 1; - const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; - const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; - DO_PIX(2, 2, 16, 0, 0, 0); - DO_PIX(3, 2, 12, 4, 0, 0); - DO_PIX(2, 3, 12, 0, 4, 0); - DO_PIX(3, 3, 9, 3, 3, 1); - } -#undef DO_PIX + log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c]; + log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c]; + log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c]; + } - pDst_block->m_modulation = mod; + if (failed_flag) + continue; - e0[0][0] = e0[1][0]; e0[1][0] = e0[2][0]; - e0[0][1] = e0[1][1]; e0[1][1] = e0[2][1]; - e0[0][2] = e0[1][2]; e0[1][2] = e0[2][2]; + log_blk.m_mode = mode; + log_blk.m_partition_pattern = pat_index; + pack_bc6h_block(*pPacked_block, log_blk); - e1[0][0] = e1[1][0]; e1[1][0] = e1[2][0]; - e1[0][1] = e1[1][1]; e1[1][1] = e1[2][1]; - e1[0][2] = e1[1][2]; e1[1][2] = e1[2][2]; + return; - } // x - } // y + } // mode_iter + + bc6h_enc_block_1subset_mode9_3bit_weights(pPacked_block, pEndpoints, pWeights); } - static void fixup_pvrtc1_4_modulation_rgba( - const uastc_block* pSrc_blocks, - const uint32_t* pPVRTC_endpoints, - void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y) + // pEndpoints[subset][comp][lh_index] + void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights) { - const uint32_t x_mask = num_blocks_x - 1; - const uint32_t y_mask = num_blocks_y - 1; - const uint32_t x_bits = basisu::total_bits(x_mask); - const uint32_t y_bits = basisu::total_bits(y_mask); - const uint32_t min_bits = basisu::minimum(x_bits, y_bits); - //const uint32_t max_bits = basisu::maximum(x_bits, y_bits); - 
const uint32_t swizzle_mask = (1 << (min_bits * 2)) - 1; + assert(g_bc6h_enc_initialized); + assert(common_part_index < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2); - uint32_t block_index = 0; + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 7); + } - // really 3x3 - int e0[4][4], e1[4][4]; + bc6h_logical_block log_blk; + log_blk.clear(); + + // Convert half endpoints to blog6 (mode 9 doesn't use delta encoding) + for (uint32_t s = 0; s < 2; s++) + { + for (uint32_t c = 0; c < 3; c++) + { + log_blk.m_endpoints[c][0 + s * 2] = half_to_blog_tab(pEndpoints[s][c][0], 6); + log_blk.m_endpoints[c][1 + s * 2] = half_to_blog_tab(pEndpoints[s][c][1], 6); + } + } + + memcpy(log_blk.m_weights, pWeights, 16); + + //const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_astc; + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_bc7; + + const bool invert_flag = basist::g_astc_bc7_common_partitions2[common_part_index].m_invert; + if (invert_flag) + { + for (uint32_t c = 0; c < 3; c++) + { + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][2]); + std::swap(log_blk.m_endpoints[c][1], log_blk.m_endpoints[c][3]); + } + } + + const uint32_t pat_index = bc7_pattern; + assert(pat_index < 32); + const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; - for (int y = 0; y < static_cast(num_blocks_y); y++) + bool swap_flags[2] = { false, false }; + for (uint32_t i = 0; i < 16; i++) { - const uint32_t* pE_rows[3]; + if ((pPat[i] & 0x80) == 0) + continue; - for (int ey = 0; ey < 3; ey++) + if (log_blk.m_weights[i] & 4) { - int by = y + ey - 1; + const uint32_t p = pPat[i] & 1; + swap_flags[p] = true; + } + } - const uint32_t* pE = &pPVRTC_endpoints[(by & y_mask) * num_blocks_x]; + if (swap_flags[0]) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); - pE_rows[ey] = pE; + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 0) + 
log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } - for (int ex = 0; ex < 3; ex++) - { - int bx = 0 + ex - 1; + if (swap_flags[1]) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(log_blk.m_endpoints[c][2], log_blk.m_endpoints[c][3]); - const uint32_t e = pE[bx & x_mask]; + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 1) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } - e0[ex][ey] = get_endpoint_l8(e, 0); - e1[ex][ey] = get_endpoint_l8(e, 1); - } - } + log_blk.m_mode = 9; + log_blk.m_partition_pattern = pat_index; + pack_bc6h_block(*pPacked_block, log_blk); + } - const uint32_t y_swizzle = (g_pvrtc_swizzle_table[y >> 8] << 16) | g_pvrtc_swizzle_table[y & 0xFF]; + void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights) + { + assert(g_bc6h_enc_initialized); - for (int x = 0; x < static_cast(num_blocks_x); x++, block_index++) - { - const uastc_block& src_block = pSrc_blocks[block_index]; + for (uint32_t i = 0; i < 16; i++) + { + assert(pWeights[i] <= 7); + } - color32 block_pixels[4][4]; - unpack_uastc(src_block, &block_pixels[0][0], false); + bc6h_logical_block log_blk; + log_blk.clear(); - const uint32_t x_swizzle = (g_pvrtc_swizzle_table[x >> 8] << 17) | (g_pvrtc_swizzle_table[x & 0xFF] << 1); + for (uint32_t mode_iter = 0; mode_iter <= 8; mode_iter++) + { + static const int s_mode_order[9] = { 2, 3, 4, 0, 5, 6, 7, 8, 1 }; // ordered from largest base bits to least + const uint32_t mode = s_mode_order[mode_iter]; - uint32_t swizzled = x_swizzle | y_swizzle; - if (num_blocks_x != num_blocks_y) - { - swizzled &= swizzle_mask; + const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0]; + const int base_bitmask = (1 << num_base_bits) - 1; + BASISU_NOTE_UNUSED(base_bitmask); - if (num_blocks_x > num_blocks_y) - swizzled |= ((x >> min_bits) << (min_bits * 2)); - else - swizzled |= ((y >> min_bits) << (min_bits * 2)); - } + const uint32_t 
num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] }; + const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 }; - pvrtc4_block* pDst_block = static_cast(pDst_blocks) + swizzled; - pDst_block->m_endpoints = pPVRTC_endpoints[block_index]; + uint32_t blog_endpoints[3][4]; + // Convert half endpoints to blog 7-11 + for (uint32_t s = 0; s < 2; s++) + { + for (uint32_t c = 0; c < 3; c++) { - const uint32_t ex = 2; - int bx = x + ex - 1; - bx &= x_mask; - -#define DO_ROW(ey) \ - { \ - const uint32_t e = pE_rows[ey][bx]; \ - e0[ex][ey] = get_endpoint_l8(e, 0); \ - e1[ex][ey] = get_endpoint_l8(e, 1); \ - } - - DO_ROW(0); - DO_ROW(1); - DO_ROW(2); -#undef DO_ROW + blog_endpoints[c][0 + s * 2] = half_to_blog_tab(pEndpoints[s][c][0], num_base_bits); + blog_endpoints[c][1 + s * 2] = half_to_blog_tab(pEndpoints[s][c][1], num_base_bits); } + } - uint32_t mod = 0; + memcpy(log_blk.m_weights, pWeights, 16); -#define DO_PIX(lx, ly, w0, w1, w2, w3) \ - { \ - int ca_l = a0 * w0 + a1 * w1 + a2 * w2 + a3 * w3; \ - int cb_l = b0 * w0 + b1 * w1 + b2 * w2 + b3 * w3; \ - int cl = 16 * (block_pixels[ly][lx].r + block_pixels[ly][lx].g + block_pixels[ly][lx].b + block_pixels[ly][lx].a); \ - int dl = cb_l - ca_l; \ - int vl = cl - ca_l; \ - int p = vl * 16; \ - if (ca_l > cb_l) { p = -p; dl = -dl; } \ - uint32_t m = 0; \ - if (p > 3 * dl) m = (uint32_t)(1 << ((ly) * 8 + (lx) * 2)); \ - if (p > 8 * dl) m = (uint32_t)(2 << ((ly) * 8 + (lx) * 2)); \ - if (p > 13 * dl) m = (uint32_t)(3 << ((ly) * 8 + (lx) * 2)); \ - mod |= m; \ - } + //const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_astc; + const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_bc7; + const bool invert_flag = basist::g_astc_bc7_common_partitions2[common_part_index].m_invert; + if (invert_flag) + { + for (uint32_t c = 0; c < 3; 
c++) { - const uint32_t ex = 0, ey = 0; - const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; - const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; - DO_PIX(0, 0, 4, 4, 4, 4); - DO_PIX(1, 0, 2, 6, 2, 6); - DO_PIX(0, 1, 2, 2, 6, 6); - DO_PIX(1, 1, 1, 3, 3, 9); + std::swap(blog_endpoints[c][0], blog_endpoints[c][2]); + std::swap(blog_endpoints[c][1], blog_endpoints[c][3]); } + } - { - const uint32_t ex = 1, ey = 0; - const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; - const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; - DO_PIX(2, 0, 8, 0, 8, 0); - DO_PIX(3, 0, 6, 2, 6, 2); - DO_PIX(2, 1, 4, 0, 12, 0); - DO_PIX(3, 1, 3, 1, 9, 3); - } + const uint32_t pat_index = bc7_pattern; + assert(pat_index < 32); + const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; - { - const uint32_t ex = 0, ey = 1; - const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; - const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; - DO_PIX(0, 2, 8, 8, 0, 0); - DO_PIX(1, 2, 4, 12, 0, 0); - DO_PIX(0, 3, 6, 6, 2, 2); - DO_PIX(1, 3, 3, 9, 1, 3); - } + bool swap_flags[2] = { false, false }; + for (uint32_t i = 0; i < 16; i++) + { + if ((pPat[i] & 0x80) == 0) + continue; + if (log_blk.m_weights[i] & 4) { - const uint32_t ex = 1, ey = 1; - const int a0 = e0[ex][ey], a1 = e0[ex + 1][ey], a2 = e0[ex][ey + 1], a3 = e0[ex + 1][ey + 1]; - const int b0 = e1[ex][ey], b1 = e1[ex + 1][ey], b2 = e1[ex][ey + 1], b3 = e1[ex + 1][ey + 1]; - DO_PIX(2, 2, 16, 0, 0, 0); - DO_PIX(3, 2, 12, 4, 0, 0); - DO_PIX(2, 3, 12, 0, 4, 0); - DO_PIX(3, 3, 9, 3, 3, 1); + const uint32_t p = pPat[i] & 1; + swap_flags[p] = true; } -#undef DO_PIX - - pDst_block->m_modulation = mod; + } - e0[0][0] = e0[1][0]; e0[1][0] = e0[2][0]; - e0[0][1] = e0[1][1]; e0[1][1] = e0[2][1]; - 
e0[0][2] = e0[1][2]; e0[1][2] = e0[2][2]; + if (swap_flags[0]) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); - e1[0][0] = e1[1][0]; e1[1][0] = e1[2][0]; - e1[0][1] = e1[1][1]; e1[1][1] = e1[2][1]; - e1[0][2] = e1[1][2]; e1[1][2] = e1[2][2]; + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 0) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } - } // x - } // y - } + if (swap_flags[1]) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(blog_endpoints[c][2], blog_endpoints[c][3]); - bool transcode_uastc_to_pvrtc1_4_rgb(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality, bool from_alpha) - { - BASISU_NOTE_UNUSED(high_quality); + for (uint32_t i = 0; i < 16; i++) + if ((pPat[i] & 0x7F) == 1) + log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + } - if ((!num_blocks_x) || (!num_blocks_y)) - return false; + // Try packing the endpoints + bool failed_flag = false; - const uint32_t width = num_blocks_x * 4; - const uint32_t height = num_blocks_y * 4; - if (!basisu::is_pow2(width) || !basisu::is_pow2(height)) - return false; + for (uint32_t c = 0; c < 3; c++) + { + const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1; - basisu::vector temp_endpoints(num_blocks_x * num_blocks_y); + const int min_delta = -(max_delta + 1); + assert((max_delta - min_delta) == delta_bitmasks[c]); - for (uint32_t y = 0; y < num_blocks_y; y++) - { - for (uint32_t x = 0; x < num_blocks_x; x++) - { - color32 block_pixels[16]; - if (!unpack_uastc(pSrc_blocks[x + y * num_blocks_x], block_pixels, false)) - return false; + log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; - // Get block's RGB bounding box - color32 low_color(255, 255, 255, 255), high_color(0, 0, 0, 0); + int delta0 = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; + int delta1 = (int)blog_endpoints[c][2] - (int)blog_endpoints[c][0]; + int delta2 = (int)blog_endpoints[c][3] - 
(int)blog_endpoints[c][0]; - if (from_alpha) - { - uint32_t low_a = 255, high_a = 0; - for (uint32_t i = 0; i < 16; i++) - { - low_a = basisu::minimum(low_a, block_pixels[i].a); - high_a = basisu::maximum(high_a, block_pixels[i].a); - } - low_color.set(low_a, low_a, low_a, 255); - high_color.set(high_a, high_a, high_a, 255); - } - else + if ((delta0 < min_delta) || (delta0 > max_delta) || + (delta1 < min_delta) || (delta1 > max_delta) || + (delta2 < min_delta) || (delta2 > max_delta)) { - for (uint32_t i = 0; i < 16; i++) - { - low_color = color32::comp_min(low_color, block_pixels[i]); - high_color = color32::comp_max(high_color, block_pixels[i]); - } - } - - // Set PVRTC1 endpoints to floor/ceil of bounding box's coordinates. - pvrtc4_block temp; - temp.set_opaque_endpoint_floor(0, low_color); - temp.set_opaque_endpoint_ceil(1, high_color); + failed_flag = true; + break; + } - temp_endpoints[x + y * num_blocks_x] = temp.m_endpoints; + log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c]; + log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c]; + log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c]; } - } - fixup_pvrtc1_4_modulation_rgb(pSrc_blocks, &temp_endpoints[0], pDst_blocks, num_blocks_x, num_blocks_y, from_alpha); - - return true; - } + if (failed_flag) + continue; - bool transcode_uastc_to_pvrtc1_4_rgba(const uastc_block* pSrc_blocks, void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, bool high_quality) - { - BASISU_NOTE_UNUSED(high_quality); + log_blk.m_mode = mode; + log_blk.m_partition_pattern = pat_index; + pack_bc6h_block(*pPacked_block, log_blk); - if ((!num_blocks_x) || (!num_blocks_y)) - return false; + //half_float blk[16 * 3]; + //unpack_bc6h(pPacked_block, blk, false); - const uint32_t width = num_blocks_x * 4; - const uint32_t height = num_blocks_y * 4; - if (!basisu::is_pow2(width) || !basisu::is_pow2(height)) - return false; + return; + } - basisu::vector temp_endpoints(num_blocks_x * num_blocks_y); + 
bc6h_enc_block_2subset_mode9_3bit_weights(pPacked_block, common_part_index, pEndpoints, pWeights); + } - for (uint32_t y = 0; y < num_blocks_y; y++) - { - for (uint32_t x = 0; x < num_blocks_x; x++) - { - color32 block_pixels[16]; - if (!unpack_uastc(pSrc_blocks[x + y * num_blocks_x], block_pixels, false)) - return false; + bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]) + { + assert(g_bc6h_enc_initialized); - // Get block's RGBA bounding box - color32 low_color(255, 255, 255, 255), high_color(0, 0, 0, 0); + if ((pColor[0] | pColor[1] | pColor[2]) & 0x8000) + return false; - for (uint32_t i = 0; i < 16; i++) - { - low_color = color32::comp_min(low_color, block_pixels[i]); - high_color = color32::comp_max(high_color, block_pixels[i]); - } + // ASTC block unpacker won't allow Inf/NaN's to come through. + //if (is_half_inf_or_nan(pColor[0]) || is_half_inf_or_nan(pColor[1]) || is_half_inf_or_nan(pColor[2])) + // return false; - // Set PVRTC1 endpoints to floor/ceil of bounding box's coordinates. 
- pvrtc4_block temp; - temp.set_endpoint_floor(0, low_color); - temp.set_endpoint_ceil(1, high_color); + uint8_t weights[16]; + memset(weights, 0, sizeof(weights)); - temp_endpoints[x + y * num_blocks_x] = temp.m_endpoints; - } - } + half_float endpoints[3][2]; + endpoints[0][0] = pColor[0]; + endpoints[0][1] = pColor[0]; + + endpoints[1][0] = pColor[1]; + endpoints[1][1] = pColor[1]; - fixup_pvrtc1_4_modulation_rgba(pSrc_blocks, &temp_endpoints[0], pDst_blocks, num_blocks_x, num_blocks_y); + endpoints[2][0] = pColor[2]; + endpoints[2][1] = pColor[2]; + + bc6h_enc_block_1subset_4bit_weights(pPacked_block, endpoints, weights); return true; } - void uastc_init() - { - for (uint32_t range = 0; range < BC7ENC_TOTAL_ASTC_RANGES; range++) - { - if (!astc_is_valid_endpoint_range(range)) - continue; + //-------------------------------------------------------------------------------------------------------------------------- + // basisu_astc_hdr_core.cpp - const uint32_t levels = astc_get_levels(range); + static bool g_astc_hdr_core_initialized; + static int8_t g_astc_partition_id_to_common_bc7_pat_index[1024]; - uint32_t vals[256]; - for (uint32_t i = 0; i < levels; i++) - vals[i] = (unquant_astc_endpoint_val(i, range) << 8) | i; + //-------------------------------------------------------------------------------------------------------------------------- - std::sort(vals, vals + levels); + void astc_hdr_core_init() + { + if (g_astc_hdr_core_initialized) + return; - for (uint32_t i = 0; i < levels; i++) - { - const uint32_t order = vals[i] & 0xFF; - const uint32_t unq = vals[i] >> 8; + memset(g_astc_partition_id_to_common_bc7_pat_index, 0xFF, sizeof(g_astc_partition_id_to_common_bc7_pat_index)); - g_astc_unquant[range][order].m_unquant = (uint8_t)unq; - g_astc_unquant[range][order].m_index = (uint8_t)i; + for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; ++part_index) + { + const uint32_t astc_pattern = 
basist::g_astc_bc7_common_partitions2[part_index].m_astc; + //const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; - } // i + assert(astc_pattern < 1024); + g_astc_partition_id_to_common_bc7_pat_index[astc_pattern] = (int8_t)part_index; } - // TODO: Precompute? - // BC7 777.1 - for (int c = 0; c < 256; c++) - { - for (uint32_t lp = 0; lp < 2; lp++) - { - endpoint_err best; - best.m_error = (uint16_t)UINT16_MAX; + g_astc_hdr_core_initialized = true; + } - for (uint32_t l = 0; l < 128; l++) - { - const uint32_t low = (l << 1) | lp; + //-------------------------------------------------------------------------------------------------------------------------- - for (uint32_t h = 0; h < 128; h++) - { - const uint32_t high = (h << 1) | lp; + static inline int astc_hdr_sign_extend(int src, int num_src_bits) + { + assert(basisu::in_range(num_src_bits, 2, 31)); - const int k = (low * (64 - g_bc7_weights4[BC7ENC_MODE_6_OPTIMAL_INDEX]) + high * g_bc7_weights4[BC7ENC_MODE_6_OPTIMAL_INDEX] + 32) >> 6; + const bool negative = (src & (1 << (num_src_bits - 1))) != 0; + if (negative) + return src | ~((1 << num_src_bits) - 1); + else + return src & ((1 << num_src_bits) - 1); + } - const int err = (k - c) * (k - c); - if (err < best.m_error) - { - best.m_error = (uint16_t)err; - best.m_lo = (uint8_t)l; - best.m_hi = (uint8_t)h; - } - } // h - } // l + static inline void astc_hdr_pack_bit( + int& dst, int dst_bit, + int src_val, int src_bit = 0) + { + assert(dst_bit >= 0 && dst_bit <= 31); + int bit = basisu::get_bit(src_val, src_bit); + dst |= (bit << dst_bit); + } - g_bc7_mode_6_optimal_endpoints[c][lp] = best; - } // lp + //-------------------------------------------------------------------------------------------------------------------------- - } // c + void decode_mode7_to_qlog12_ise20( + const uint8_t* pEndpoints, + int e[2][3], + int* pScale) + { + assert(g_astc_hdr_core_initialized); - // BC7 777 - for (int c = 0; c < 256; c++) + for (uint32_t 
i = 0; i < NUM_MODE7_ENDPOINTS; i++) { - endpoint_err best; - best.m_error = (uint16_t)UINT16_MAX; + assert(pEndpoints[i] <= 255); + } - for (uint32_t l = 0; l < 128; l++) - { - const uint32_t low = (l << 1) | (l >> 6); + const int v0 = pEndpoints[0], v1 = pEndpoints[1], v2 = pEndpoints[2], v3 = pEndpoints[3]; - for (uint32_t h = 0; h < 128; h++) - { - const uint32_t high = (h << 1) | (h >> 6); + // Extract mode bits and unpack to major component and mode. + const int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4); - const int k = (low * (64 - g_bc7_weights2[BC7ENC_MODE_5_OPTIMAL_INDEX]) + high * g_bc7_weights2[BC7ENC_MODE_5_OPTIMAL_INDEX] + 32) >> 6; + int majcomp, mode; + if ((modeval & 0xC) != 0xC) + { + majcomp = modeval >> 2; + mode = modeval & 3; + } + else if (modeval != 0xF) + { + majcomp = modeval & 3; + mode = 4; + } + else + { + majcomp = 0; + mode = 5; + } - const int err = (k - c) * (k - c); - if (err < best.m_error) - { - best.m_error = (uint16_t)err; - best.m_lo = (uint8_t)l; - best.m_hi = (uint8_t)h; - } - } // h - } // l + // Extract low-order bits of r, g, b, and s. + int red = v0 & 0x3f; + int green = v1 & 0x1f; + int blue = v2 & 0x1f; + int scale = v3 & 0x1f; - g_bc7_mode_5_optimal_endpoints[c] = best; + // Extract high-order bits, which may be assigned depending on mode + int x0 = (v1 >> 6) & 1; + int x1 = (v1 >> 5) & 1; + int x2 = (v2 >> 6) & 1; + int x3 = (v2 >> 5) & 1; + int x4 = (v3 >> 7) & 1; + int x5 = (v3 >> 6) & 1; + int x6 = (v3 >> 5) & 1; - } // c - } + // Now move the high-order xs into the right place. 
+ const int ohm = 1 << mode; + if (ohm & 0x30) green |= x0 << 6; + if (ohm & 0x3A) green |= x1 << 5; + if (ohm & 0x30) blue |= x2 << 6; + if (ohm & 0x3A) blue |= x3 << 5; + if (ohm & 0x3D) scale |= x6 << 5; + if (ohm & 0x2D) scale |= x5 << 6; + if (ohm & 0x04) scale |= x4 << 7; + if (ohm & 0x3B) red |= x4 << 6; + if (ohm & 0x04) red |= x3 << 6; + if (ohm & 0x10) red |= x5 << 7; + if (ohm & 0x0F) red |= x2 << 7; + if (ohm & 0x05) red |= x1 << 8; + if (ohm & 0x0A) red |= x0 << 8; + if (ohm & 0x05) red |= x0 << 9; + if (ohm & 0x02) red |= x6 << 9; + if (ohm & 0x01) red |= x3 << 10; + if (ohm & 0x02) red |= x5 << 10; -#endif // #if BASISD_SUPPORT_UASTC + // Shift the bits to the top of the 12-bit result. + static const int s_shamts[6] = { 1,1,2,3,4,5 }; -// ------------------------------------------------------------------------------------------------------ -// KTX2 -// ------------------------------------------------------------------------------------------------------ + const int shamt = s_shamts[mode]; + red <<= shamt; + green <<= shamt; + blue <<= shamt; + scale <<= shamt; -#if BASISD_SUPPORT_KTX2 - const uint8_t g_ktx2_file_identifier[12] = { 0xAB, 0x4B, 0x54, 0x58, 0x20, 0x32, 0x30, 0xBB, 0x0D, 0x0A, 0x1A, 0x0A }; + // Minor components are stored as differences + if (mode != 5) + { + green = red - green; + blue = red - blue; + } - ktx2_transcoder::ktx2_transcoder() : - m_etc1s_transcoder() - { - clear(); - } + // Swizzle major component into place + if (majcomp == 1) + std::swap(red, green); - void ktx2_transcoder::clear() - { - m_pData = nullptr; - m_data_size = 0; + if (majcomp == 2) + std::swap(red, blue); - memset(&m_header, 0, sizeof(m_header)); - m_levels.clear(); - m_dfd.clear(); - m_key_values.clear(); - memset(&m_etc1s_header, 0, sizeof(m_etc1s_header)); - m_etc1s_image_descs.clear(); - - m_format = basist::basis_tex_format::cETC1S; + // Clamp output values, set alpha to 1.0 + e[1][0] = basisu::clamp(red, 0, 0xFFF); + e[1][1] = basisu::clamp(green, 0, 
0xFFF); + e[1][2] = basisu::clamp(blue, 0, 0xFFF); - m_dfd_color_model = 0; - m_dfd_color_prims = KTX2_DF_PRIMARIES_UNSPECIFIED; - m_dfd_transfer_func = 0; - m_dfd_flags = 0; - m_dfd_samples = 0; - m_dfd_chan0 = KTX2_DF_CHANNEL_UASTC_RGB; - m_dfd_chan1 = KTX2_DF_CHANNEL_UASTC_RGB; + e[0][0] = basisu::clamp(red - scale, 0, 0xFFF); + e[0][1] = basisu::clamp(green - scale, 0, 0xFFF); + e[0][2] = basisu::clamp(blue - scale, 0, 0xFFF); - m_etc1s_transcoder.clear(); - - m_def_transcoder_state.clear(); - - m_has_alpha = false; - m_is_video = false; + if (pScale) + *pScale = scale; } - bool ktx2_transcoder::init(const void* pData, uint32_t data_size) + //-------------------------------------------------------------------------------------------------------------------------- + + bool decode_mode7_to_qlog12( + const uint8_t* pEndpoints, + int e[2][3], + int* pScale, + uint32_t ise_endpoint_range) { - clear(); + assert(g_astc_hdr_core_initialized); - if (!pData) + if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: pData is nullptr\n"); - assert(0); - return false; + decode_mode7_to_qlog12_ise20(pEndpoints, e, pScale); } - - if (data_size <= sizeof(ktx2_header)) + else { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: File is impossibly too small to be a valid KTX2 file\n"); - return false; + uint8_t dequantized_endpoints[NUM_MODE7_ENDPOINTS]; + + for (uint32_t i = 0; i < NUM_MODE7_ENDPOINTS; i++) + dequantized_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_val[pEndpoints[i]]; + + decode_mode7_to_qlog12_ise20(dequantized_endpoints, e, pScale); } - if (memcmp(pData, g_ktx2_file_identifier, sizeof(g_ktx2_file_identifier)) != 0) + for (uint32_t i = 0; i < 2; i++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file identifier is not present\n"); - return false; + if (e[i][0] > (int)MAX_QLOG12) + return false; + + if (e[i][1] > (int)MAX_QLOG12) + return false; + + if (e[i][2] > 
(int)MAX_QLOG12) + return false; } - m_pData = static_cast(pData); - m_data_size = data_size; + return true; + } - memcpy(&m_header, pData, sizeof(m_header)); + //-------------------------------------------------------------------------------------------------------------------------- - // We only support UASTC LDR, UASTC HDR and ETC1S. - // Note the DFD's contents are what we are guided by for decoding the KTX2 file, not this format field (currently). - if ((m_header.m_vk_format != KTX2_VK_FORMAT_UNDEFINED) && - (m_header.m_vk_format != basist::KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK)) + void decode_mode11_to_qlog12_ise20( + const uint8_t* pEndpoints, + int e[2][3]) + { +#ifdef _DEBUG + for (uint32_t i = 0; i < NUM_MODE11_ENDPOINTS; i++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: KTX2 file must be in ETC1S or UASTC LDR/HDR format\n"); - return false; + assert(pEndpoints[i] <= 255); } +#endif - // 3.3: "When format is VK_FORMAT_UNDEFINED, typeSize must equal 1." - if (m_header.m_type_size != 1) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid type_size\n"); - return false; - } + const uint32_t maj_comp = basisu::get_bit(pEndpoints[4], 7) | (basisu::get_bit(pEndpoints[5], 7) << 1); - // We only currently support 2D textures (plain, cubemapped, or texture array), which is by far the most common use case. - // The BasisU library does not support 1D or 3D textures at all. 
- if ((m_header.m_pixel_width < 1) || (m_header.m_pixel_height < 1) || (m_header.m_pixel_depth > 0)) + if (maj_comp == 3) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Only 2D or cubemap textures are supported\n"); - return false; - } + // Direct, qlog8 and qlog7 + e[0][0] = pEndpoints[0] << 4; + e[1][0] = pEndpoints[1] << 4; - // Face count must be 1 or 6 - if ((m_header.m_face_count != 1) && (m_header.m_face_count != 6)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid face count, file is corrupted or invalid\n"); - return false; - } + e[0][1] = pEndpoints[2] << 4; + e[1][1] = pEndpoints[3] << 4; - if (m_header.m_face_count > 1) + e[0][2] = (pEndpoints[4] & 127) << 5; + e[1][2] = (pEndpoints[5] & 127) << 5; + } + else { - // 3.4: Make sure cubemaps are square. - if (m_header.m_pixel_width != m_header.m_pixel_height) + int v0 = pEndpoints[0]; + int v1 = pEndpoints[1]; + int v2 = pEndpoints[2]; + int v3 = pEndpoints[3]; + int v4 = pEndpoints[4]; + int v5 = pEndpoints[5]; + + int mode = 0; + astc_hdr_pack_bit(mode, 0, v1, 7); + astc_hdr_pack_bit(mode, 1, v2, 7); + astc_hdr_pack_bit(mode, 2, v3, 7); + + int va = v0; + astc_hdr_pack_bit(va, 8, v1, 6); + + int vb0 = v2 & 63; + int vb1 = v3 & 63; + int vc = v1 & 63; + + int vd0 = v4 & 0x7F; // this takes more bits than is sometimes needed + int vd1 = v5 & 0x7F; // this takes more bits than is sometimes needed + static const int8_t dbitstab[8] = { 7,6,7,6,5,6,5,6 }; + vd0 = astc_hdr_sign_extend(vd0, dbitstab[mode]); + vd1 = astc_hdr_sign_extend(vd1, dbitstab[mode]); + + int x0 = basisu::get_bit(v2, 6); + int x1 = basisu::get_bit(v3, 6); + int x2 = basisu::get_bit(v4, 6); + int x3 = basisu::get_bit(v5, 6); + int x4 = basisu::get_bit(v4, 5); + int x5 = basisu::get_bit(v5, 5); + + const uint32_t ohm = 1U << mode; + if (ohm & 0xA4) va |= (x0 << 9); + if (ohm & 0x08) va |= (x2 << 9); + if (ohm & 0x50) va |= (x4 << 9); + if (ohm & 0x50) va |= (x5 << 10); + if (ohm & 0xA0) va |= (x1 << 10); + if (ohm & 0xC0) va |= (x2 
<< 11); + if (ohm & 0x04) vc |= (x1 << 6); + if (ohm & 0xE8) vc |= (x3 << 6); + if (ohm & 0x20) vc |= (x2 << 7); + if (ohm & 0x5B) vb0 |= (x0 << 6); + if (ohm & 0x5B) vb1 |= (x1 << 6); + if (ohm & 0x12) vb0 |= (x2 << 7); + if (ohm & 0x12) vb1 |= (x3 << 7); + + const int shamt = (mode >> 1) ^ 3; + + va = (uint32_t)va << shamt; + vb0 = (uint32_t)vb0 << shamt; + vb1 = (uint32_t)vb1 << shamt; + vc = (uint32_t)vc << shamt; + vd0 = (uint32_t)vd0 << shamt; + vd1 = (uint32_t)vd1 << shamt; + + // qlog12 + e[1][0] = basisu::clamp(va, 0, 0xFFF); + e[1][1] = basisu::clamp(va - vb0, 0, 0xFFF); + e[1][2] = basisu::clamp(va - vb1, 0, 0xFFF); + + e[0][0] = basisu::clamp(va - vc, 0, 0xFFF); + e[0][1] = basisu::clamp(va - vb0 - vc - vd0, 0, 0xFFF); + e[0][2] = basisu::clamp(va - vb1 - vc - vd1, 0, 0xFFF); + + if (maj_comp) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Cubemap is not square\n"); - return false; + std::swap(e[0][0], e[0][maj_comp]); + std::swap(e[1][0], e[1][maj_comp]); } } - - // 3.7 levelCount: "levelCount=0 is allowed, except for block-compressed formats" - if (m_header.m_level_count < 1) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level count\n"); - return false; - } + } - // Sanity check the level count. 
- if (m_header.m_level_count > KTX2_MAX_SUPPORTED_LEVEL_COUNT) + //-------------------------------------------------------------------------------------------------------------------------- + + bool decode_mode11_to_qlog12( + const uint8_t* pEndpoints, + int e[2][3], + uint32_t ise_endpoint_range) + { + assert(g_astc_hdr_core_initialized); + assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + + if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Too many levels or file is corrupted or invalid\n"); - return false; + decode_mode11_to_qlog12_ise20(pEndpoints, e); } - - if (m_header.m_supercompression_scheme > KTX2_SS_ZSTANDARD) + else { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid/unsupported supercompression or file is corrupted or invalid\n"); - return false; + uint8_t dequantized_endpoints[NUM_MODE11_ENDPOINTS]; + + for (uint32_t i = 0; i < NUM_MODE11_ENDPOINTS; i++) + dequantized_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_val[pEndpoints[i]]; + + decode_mode11_to_qlog12_ise20(dequantized_endpoints, e); } - if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + for (uint32_t i = 0; i < 2; i++) { - if (m_header.m_sgd_byte_length <= sizeof(ktx2_etc1s_global_data_header)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data is too small\n"); + if (e[i][0] > (int)MAX_QLOG12) return false; - } - if (m_header.m_sgd_byte_offset < sizeof(ktx2_header)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data offset is too low\n"); + if (e[i][1] > (int)MAX_QLOG12) return false; - } - if (m_header.m_sgd_byte_offset + m_header.m_sgd_byte_length > m_data_size) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Supercompression global data offset and/or length is too high\n"); + if (e[i][2] > (int)MAX_QLOG12) return false; - } 
} - if (!m_levels.try_resize(m_header.m_level_count)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Out of memory\n"); - return false; - } + return true; + } - const uint32_t level_index_size_in_bytes = basisu::maximum(1U, (uint32_t)m_header.m_level_count) * sizeof(ktx2_level_index); + //-------------------------------------------------------------------------------------------------------------------------- - if ((sizeof(ktx2_header) + level_index_size_in_bytes) > m_data_size) + bool transcode_bc6h_1subset(half_float h_e[3][2], const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk) + { + assert(g_astc_hdr_core_initialized); + assert((best_blk.m_weight_ise_range >= 1) && (best_blk.m_weight_ise_range <= 8)); + + if (best_blk.m_weight_ise_range == 5) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: File is too small (can't read level index array)\n"); - return false; + // Use 3-bit BC6H weights which are a perfect match for 3-bit ASTC weights, but encode 1-subset as 2 equal subsets + bc6h_enc_block_1subset_3bit_weights(&transcoded_bc6h_blk, h_e, best_blk.m_weights); } - - memcpy(&m_levels[0], m_pData + sizeof(ktx2_header), level_index_size_in_bytes); - - // Sanity check the level offsets and byte sizes - for (uint32_t i = 0; i < m_levels.size(); i++) + else { - if (m_levels[i].m_byte_offset < sizeof(ktx2_header)) + uint8_t bc6h_weights[16]; + + if (best_blk.m_weight_ise_range == 1) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset (too low)\n"); - return false; - } + // weight ISE 1: 3 levels + static const uint8_t s_astc1_to_bc6h_3[3] = { 0, 8, 15 }; - if (!m_levels[i].m_byte_length) + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc1_to_bc6h_3[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 2) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level byte length\n"); + // weight ISE 2: 4 levels + static const uint8_t s_astc2_to_bc6h_4[4] = { 0, 5, 10, 15 }; + + for (uint32_t i = 0; i < 
16; i++) + bc6h_weights[i] = s_astc2_to_bc6h_4[best_blk.m_weights[i]]; } + else if (best_blk.m_weight_ise_range == 3) + { + // weight ISE 3: 5 levels + static const uint8_t s_astc3_to_bc6h_4[5] = { 0, 4, 7, 11, 15 }; - if ((m_levels[i].m_byte_offset + m_levels[i].m_byte_length) > m_data_size) + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc3_to_bc6h_4[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 4) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset and/or length\n"); - return false; + // weight ISE 4: 6 levels + static const uint8_t s_astc4_to_bc6h_4[6] = { 0, 15, 3, 12, 6, 9 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc4_to_bc6h_4[best_blk.m_weights[i]]; } - - const uint64_t MAX_SANE_LEVEL_UNCOMP_SIZE = 2048ULL * 1024ULL * 1024ULL; - - if (m_levels[i].m_uncompressed_byte_length >= MAX_SANE_LEVEL_UNCOMP_SIZE) + else if (best_blk.m_weight_ise_range == 6) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid level offset (too large)\n"); - return false; + // weight ISE 6: 10 levels + static const uint8_t s_astc6_to_bc6h_4[10] = { 0, 15, 2, 13, 3, 12, 5, 10, 6, 9 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc6_to_bc6h_4[best_blk.m_weights[i]]; } + else if (best_blk.m_weight_ise_range == 7) + { + // weight ISE 7: 12 levels + static const uint8_t s_astc7_to_bc6h_4[12] = { 0, 15, 4, 11, 1, 14, 5, 10, 2, 13, 6, 9 }; - if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc7_to_bc6h_4[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 8) { - if (m_levels[i].m_uncompressed_byte_length) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid uncompressed length (0)\n"); - return false; - } + // 16 levels + memcpy(bc6h_weights, best_blk.m_weights, 16); } - else if (m_header.m_supercompression_scheme >= KTX2_SS_ZSTANDARD) + else { - if (!m_levels[i].m_uncompressed_byte_length) - { - 
BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid uncompressed length (1)\n"); - return false; - } + assert(0); + return false; } - } - const uint32_t DFD_MINIMUM_SIZE = 44, DFD_MAXIMUM_SIZE = 60; - if ((m_header.m_dfd_byte_length != DFD_MINIMUM_SIZE) && (m_header.m_dfd_byte_length != DFD_MAXIMUM_SIZE)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Unsupported DFD size\n"); - return false; + bc6h_enc_block_1subset_4bit_weights(&transcoded_bc6h_blk, h_e, bc6h_weights); } - if (((m_header.m_dfd_byte_offset + m_header.m_dfd_byte_length) > m_data_size) || (m_header.m_dfd_byte_offset < sizeof(ktx2_header))) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid DFD offset and/or length\n"); - return false; - } - - const uint8_t* pDFD = m_pData + m_header.m_dfd_byte_offset; + return true; + } - if (!m_dfd.try_resize(m_header.m_dfd_byte_length)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Out of memory\n"); - return false; - } + //-------------------------------------------------------------------------------------------------------------------------- - memcpy(m_dfd.data(), pDFD, m_header.m_dfd_byte_length); - - // This is all hard coded for only ETC1S and UASTC. - uint32_t dfd_total_size = basisu::read_le_dword(pDFD); + bool transcode_bc6h_2subsets(uint32_t common_part_index, const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk) + { + assert(g_astc_hdr_core_initialized); + assert(best_blk.m_num_partitions == 2); + assert(common_part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); - // 3.10.3: Sanity check - if (dfd_total_size != m_header.m_dfd_byte_length) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: DFD size validation failed (1)\n"); + half_float bc6h_endpoints[2][3][2]; // [subset][comp][lh_index] + + // UASTC HDR checks + // Both CEM's must be equal in 2-subset UASTC HDR. 
+ if (best_blk.m_color_endpoint_modes[0] != best_blk.m_color_endpoint_modes[1]) + return false; + if ((best_blk.m_color_endpoint_modes[0] != 7) && (best_blk.m_color_endpoint_modes[0] != 11)) return false; - } - // 3.10.3: More sanity checking - if (m_header.m_kvd_byte_length) + if (best_blk.m_color_endpoint_modes[0] == 7) { - if (dfd_total_size != m_header.m_kvd_byte_offset - m_header.m_dfd_byte_offset) + if (!(((best_blk.m_weight_ise_range == 1) && (best_blk.m_endpoint_ise_range == 20)) || + ((best_blk.m_weight_ise_range == 2) && (best_blk.m_endpoint_ise_range == 20)) || + ((best_blk.m_weight_ise_range == 3) && (best_blk.m_endpoint_ise_range == 19)) || + ((best_blk.m_weight_ise_range == 4) && (best_blk.m_endpoint_ise_range == 17)) || + ((best_blk.m_weight_ise_range == 5) && (best_blk.m_endpoint_ise_range == 15)))) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: DFD size validation failed (2)\n"); return false; } } - - const uint32_t dfd_bits = basisu::read_le_dword(pDFD + 3 * sizeof(uint32_t)); - const uint32_t sample_channel0 = basisu::read_le_dword(pDFD + 7 * sizeof(uint32_t)); - - m_dfd_color_model = dfd_bits & 255; - m_dfd_color_prims = (ktx2_df_color_primaries)((dfd_bits >> 8) & 255); - m_dfd_transfer_func = (dfd_bits >> 16) & 255; - m_dfd_flags = (dfd_bits >> 24) & 255; - - // See 3.10.1.Restrictions - if ((m_dfd_transfer_func != KTX2_KHR_DF_TRANSFER_LINEAR) && (m_dfd_transfer_func != KTX2_KHR_DF_TRANSFER_SRGB)) + else { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: Invalid DFD transfer function\n"); - return false; + if (!(((best_blk.m_weight_ise_range == 1) && (best_blk.m_endpoint_ise_range == 14)) || + ((best_blk.m_weight_ise_range == 2) && (best_blk.m_endpoint_ise_range == 12)))) + { + return false; + } } - if (m_dfd_color_model == KTX2_KDF_DF_MODEL_ETC1S) + for (uint32_t s = 0; s < 2; s++) { - m_format = basist::basis_tex_format::cETC1S; - - // 3.10.2: "Whether the image has 1 or 2 slices can be determined from the DFD's sample count." 
- // If m_has_alpha is true it may be 2-channel RRRG or 4-channel RGBA, but we let the caller deal with that. - m_has_alpha = (m_header.m_dfd_byte_length == 60); - - m_dfd_samples = m_has_alpha ? 2 : 1; - m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); + int e[2][3]; + if (best_blk.m_color_endpoint_modes[0] == 7) + { + bool success = decode_mode7_to_qlog12(best_blk.m_endpoints + s * NUM_MODE7_ENDPOINTS, e, nullptr, best_blk.m_endpoint_ise_range); + if (!success) + return false; + } + else + { + bool success = decode_mode11_to_qlog12(best_blk.m_endpoints + s * NUM_MODE11_ENDPOINTS, e, best_blk.m_endpoint_ise_range); + if (!success) + return false; + } - if (m_has_alpha) + for (uint32_t c = 0; c < 3; c++) { - const uint32_t sample_channel1 = basisu::read_le_dword(pDFD + 11 * sizeof(uint32_t)); - m_dfd_chan1 = (ktx2_df_channel_id)((sample_channel1 >> 24) & 15); + bc6h_endpoints[s][c][0] = qlog_to_half(e[0][c], 12); + if (is_half_inf_or_nan(bc6h_endpoints[s][c][0])) + return false; + + bc6h_endpoints[s][c][1] = qlog_to_half(e[1][c], 12); + if (is_half_inf_or_nan(bc6h_endpoints[s][c][1])) + return false; } } - else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_UASTC) + + uint8_t bc6h_weights[16]; + if (best_blk.m_weight_ise_range == 1) { - m_format = basist::basis_tex_format::cUASTC4x4; + static const uint8_t s_astc1_to_bc6h_3[3] = { 0, 4, 7 }; - m_dfd_samples = 1; - m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); - - // We're assuming "DATA" means RGBA so it has alpha. 
- m_has_alpha = (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG); + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc1_to_bc6h_3[best_blk.m_weights[i]]; } - else if (m_dfd_color_model == KTX2_KDF_DF_MODEL_UASTC_HDR) + else if (best_blk.m_weight_ise_range == 2) { - m_format = basist::basis_tex_format::cUASTC_HDR_4x4; + static const uint8_t s_astc2_to_bc6h_3[4] = { 0, 2, 5, 7 }; - m_dfd_samples = 1; - m_dfd_chan0 = (ktx2_df_channel_id)((sample_channel0 >> 24) & 15); + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc2_to_bc6h_3[best_blk.m_weights[i]]; + } + else if (best_blk.m_weight_ise_range == 3) + { + static const uint8_t s_astc3_to_bc6h_3[5] = { 0, 2, 4, 5, 7 }; - // We're assuming "DATA" means RGBA so it has alpha. - m_has_alpha = (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RGBA) || (m_dfd_chan0 == KTX2_DF_CHANNEL_UASTC_RRRG); + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc3_to_bc6h_3[best_blk.m_weights[i]]; } - else + else if (best_blk.m_weight_ise_range == 4) { - // Unsupported DFD color model. 
- BASISU_DEVEL_ERROR("ktx2_transcoder::init: Unsupported DFD color model\n"); - return false; + static const uint8_t s_astc4_to_bc6h_3[6] = { 0, 7, 1, 6, 3, 4 }; + + for (uint32_t i = 0; i < 16; i++) + bc6h_weights[i] = s_astc4_to_bc6h_3[best_blk.m_weights[i]]; } - - if (!read_key_values()) + else if (best_blk.m_weight_ise_range == 5) { - BASISU_DEVEL_ERROR("ktx2_transcoder::init: read_key_values() failed\n"); - return false; + memcpy(bc6h_weights, best_blk.m_weights, 16); } - - // Check for a KTXanimData key - for (uint32_t i = 0; i < m_key_values.size(); i++) + else { - if (strcmp(reinterpret_cast(m_key_values[i].m_key.data()), "KTXanimData") == 0) - { - m_is_video = true; - break; - } + assert(0); + return false; } + bc6h_enc_block_2subset_3bit_weights(&transcoded_bc6h_blk, common_part_index, bc6h_endpoints, bc6h_weights); + return true; } - uint32_t ktx2_transcoder::get_etc1s_image_descs_image_flags(uint32_t level_index, uint32_t layer_index, uint32_t face_index) const + //-------------------------------------------------------------------------------------------------------------------------- + // Transcodes an UASTC HDR block to BC6H. Must have been encoded to UASTC HDR, or this fails. 
+ bool astc_hdr_transcode_to_bc6h(const astc_blk& src_blk, bc6h_block& dst_blk) { - const uint32_t etc1s_image_index = - (level_index * basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count) + - layer_index * m_header.m_face_count + - face_index; - - if (etc1s_image_index >= get_etc1s_image_descs().size()) + assert(g_astc_hdr_core_initialized); + if (!g_astc_hdr_core_initialized) { assert(0); - return 0; + return false; } - return get_etc1s_image_descs()[etc1s_image_index].m_image_flags; - } + astc_helpers::log_astc_block log_blk; - const basisu::uint8_vec* ktx2_transcoder::find_key(const std::string& key_name) const - { - for (uint32_t i = 0; i < m_key_values.size(); i++) - if (strcmp((const char *)m_key_values[i].m_key.data(), key_name.c_str()) == 0) - return &m_key_values[i].m_value; + if (!astc_helpers::unpack_block(&src_blk, log_blk, 4, 4)) + { + // Failed unpacking ASTC data + return false; + } - return nullptr; + return astc_hdr_transcode_to_bc6h(log_blk, dst_blk); } - - bool ktx2_transcoder::start_transcoding() + + //-------------------------------------------------------------------------------------------------------------------------- + // Transcodes an UASTC HDR block to BC6H. Must have been encoded to UASTC HDR, or this fails. + bool astc_hdr_transcode_to_bc6h(const astc_helpers::log_astc_block& log_blk, bc6h_block& dst_blk) { - if (!m_pData) + assert(g_astc_hdr_core_initialized); + if (!g_astc_hdr_core_initialized) { - BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: Must call init() first\n"); + assert(0); + return false; + } + + if (log_blk.m_solid_color_flag_ldr) + { + // Don't support LDR solid colors. return false; } - if (m_header.m_supercompression_scheme == KTX2_SS_BASISLZ) + if (log_blk.m_solid_color_flag_hdr) { - // Check if we've already decompressed the ETC1S global data. If so don't unpack it again. 
- if (!m_etc1s_transcoder.get_endpoints().empty()) - return true; + // Solid color HDR block + return bc6h_enc_block_solid_color(&dst_blk, log_blk.m_solid_color); + } - if (!decompress_etc1s_global_data()) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: decompress_etc1s_global_data() failed\n"); + // Only support 4x4 grid sizes + if ((log_blk.m_grid_width != 4) || (log_blk.m_grid_height != 4)) + return false; + + // Don't support dual plane encoding + if (log_blk.m_dual_plane) + return false; + + if (log_blk.m_num_partitions == 1) + { + // Handle 1 partition (or subset) + + // UASTC HDR checks + if ((log_blk.m_weight_ise_range < 1) || (log_blk.m_weight_ise_range > 8)) return false; + + int e[2][3]; + bool success; + + if (log_blk.m_color_endpoint_modes[0] == 7) + { + if (log_blk.m_endpoint_ise_range != 20) + return false; + + success = decode_mode7_to_qlog12(log_blk.m_endpoints, e, nullptr, log_blk.m_endpoint_ise_range); } - - if (!m_is_video) + else if (log_blk.m_color_endpoint_modes[0] == 11) { - // See if there are any P-frames. If so it must be a video, even if there wasn't a KTXanimData key. - // Video cannot be a cubemap, and it must be a texture array. 
- if ((m_header.m_face_count == 1) && (m_header.m_layer_count > 1)) + // UASTC HDR checks + if (log_blk.m_weight_ise_range <= 7) { - for (uint32_t i = 0; i < m_etc1s_image_descs.size(); i++) - { - if (m_etc1s_image_descs[i].m_image_flags & KTX2_IMAGE_IS_P_FRAME) - { - m_is_video = true; - break; - } - } + if (log_blk.m_endpoint_ise_range != 20) + return false; + } + else if (log_blk.m_endpoint_ise_range != 19) + { + return false; } - } - } - else if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) - { -#if !BASISD_SUPPORT_KTX2_ZSTD - BASISU_DEVEL_ERROR("ktx2_transcoder::start_transcoding: File uses zstd supercompression, but zstd support was not enabled at compilation time (BASISD_SUPPORT_KTX2_ZSTD == 0)\n"); - return false; -#endif - } - return true; - } + success = decode_mode11_to_qlog12(log_blk.m_endpoints, e, log_blk.m_endpoint_ise_range); + } + else + { + return false; + } - bool ktx2_transcoder::get_image_level_info(ktx2_image_level_info& level_info, uint32_t level_index, uint32_t layer_index, uint32_t face_index) const - { - if (level_index >= m_levels.size()) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: level_index >= m_levels.size()\n"); - return false; - } + if (!success) + return false; - if (m_header.m_face_count > 1) - { - if (face_index >= 6) + // Transform endpoints to half float + half_float h_e[3][2] = { - BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: face_index >= 6\n"); + { qlog_to_half(e[0][0], 12), qlog_to_half(e[1][0], 12) }, + { qlog_to_half(e[0][1], 12), qlog_to_half(e[1][1], 12) }, + { qlog_to_half(e[0][2], 12), qlog_to_half(e[1][2], 12) } + }; + + // Sanity check for NaN/Inf + for (uint32_t i = 0; i < 2; i++) + if (is_half_inf_or_nan(h_e[0][i]) || is_half_inf_or_nan(h_e[1][i]) || is_half_inf_or_nan(h_e[2][i])) + return false; + + // Transcode to bc6h + if (!transcode_bc6h_1subset(h_e, log_blk, dst_blk)) return false; - } } - else if (face_index != 0) + else if (log_blk.m_num_partitions == 2) { - 
BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: face_index != 0\n"); - return false; - } + // Handle 2 partition (or subset) + int common_bc7_pat_index = g_astc_partition_id_to_common_bc7_pat_index[log_blk.m_partition_id]; + if (common_bc7_pat_index < 0) + return false; - if (layer_index >= basisu::maximum(m_header.m_layer_count, 1)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::get_image_level_info: layer_index >= maximum(m_header.m_layer_count, 1)\n"); - return false; + assert(common_bc7_pat_index < (int)basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); + + if (!transcode_bc6h_2subsets(common_bc7_pat_index, log_blk, dst_blk)) + return false; } - - const uint32_t level_width = basisu::maximum(m_header.m_pixel_width >> level_index, 1); - const uint32_t level_height = basisu::maximum(m_header.m_pixel_height >> level_index, 1); - const uint32_t num_blocks_x = (level_width + 3) >> 2; - const uint32_t num_blocks_y = (level_height + 3) >> 2; - - level_info.m_face_index = face_index; - level_info.m_layer_index = layer_index; - level_info.m_level_index = level_index; - level_info.m_orig_width = level_width; - level_info.m_orig_height = level_height; - level_info.m_width = num_blocks_x * 4; - level_info.m_height = num_blocks_y * 4; - level_info.m_num_blocks_x = num_blocks_x; - level_info.m_num_blocks_y = num_blocks_y; - level_info.m_total_blocks = num_blocks_x * num_blocks_y; - level_info.m_alpha_flag = m_has_alpha; - level_info.m_iframe_flag = false; - if (m_etc1s_image_descs.size()) + else { - const uint32_t etc1s_image_index = - (level_index * basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count) + - layer_index * m_header.m_face_count + - face_index; - - level_info.m_iframe_flag = (m_etc1s_image_descs[etc1s_image_index].m_image_flags & KTX2_IMAGE_IS_P_FRAME) == 0; + // Only supports 1 or 2 partitions (or subsets) + return false; } return true; } - - bool ktx2_transcoder::transcode_image_level( - uint32_t level_index, uint32_t layer_index, uint32_t 
face_index, - void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, - basist::transcoder_texture_format fmt, - uint32_t decode_flags, uint32_t output_row_pitch_in_blocks_or_pixels, uint32_t output_rows_in_pixels, int channel0, int channel1, - ktx2_transcoder_state* pState) + + // ASTC 6x6 support + namespace astc_6x6_hdr { - if (!m_pData) + const block_mode_desc g_block_mode_descs[TOTAL_BLOCK_MODE_DECS] = { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: Must call init() first\n"); - return false; - } + // ------ mode 11 + { false, 11, 1, 6, 6, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 11, 1, 6, 6, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, - if (!pState) - pState = &m_def_transcoder_state; - - if (level_index >= m_levels.size()) + { false, 11, 1, 6, 5, astc_helpers::BISE_96_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_96_LEVELS, astc_helpers::BISE_5_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 11, 1, 5, 6, astc_helpers::BISE_96_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_96_LEVELS, astc_helpers::BISE_5_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + { false, 11, 1, 6, 4, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_8_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_8_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 11, 1, 4, 6, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_8_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_8_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + { false, 11, 1, 6, 3, 
astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_16_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 11, 1, 3, 6, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_16_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + { false, 11, 1, 5, 5, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_8_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_8_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 11, 1, 4, 4, astc_helpers::BISE_192_LEVELS, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_192_LEVELS, astc_helpers::BISE_16_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + { false, 11, 1, 3, 3, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_16_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + // ------ mode 7 + { false, 7, 1, 6, 6, astc_helpers::BISE_96_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_96_LEVELS, astc_helpers::BISE_5_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + { false, 7, 1, 6, 6, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 1, 6, 6, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + { false, 7, 1, 5, 6, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_6_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_6_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 1, 6, 5, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_6_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_6_LEVELS, 
BASIST_HDR_6X6_LEVEL2, 0 }, + + { false, 7, 1, 3, 6, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_20_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_20_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 1, 6, 3, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_20_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_20_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + // ------ mode 11, 2 subset + { false, 11, 2, 6, 6, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + // 6x3/3x6 + { false, 11, 2, 6, 3, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 11, 2, 3, 6, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + // 3x6/6x3 + { false, 11, 2, 3, 6, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 11, 2, 6, 3, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + // 3x6/6x3 + { false, 11, 2, 4, 6, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, 0, 0 }, + { false, 11, 2, 6, 4, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, 0, 0 }, + + // ------ mode 7, 2 subset + + // 6x5/5x6 + { false, 7, 2, 5, 6, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 
2, 6, 5, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + // 6x4/4x6 mode 7 + { false, 7, 2, 4, 6, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 2, 6, 4, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + // 6x6 + { false, 7, 2, 6, 6, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + // 6x6 + { false, 7, 2, 6, 6, astc_helpers::BISE_192_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_192_LEVELS, astc_helpers::BISE_2_LEVELS, 0, 0 }, + + // 5x5 + { false, 7, 2, 5, 5, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, 0, 0 }, + + // 6x3/3x6 mode 7 + { false, 7, 2, 3, 6, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_8_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_8_LEVELS, 0, 0 }, + { false, 7, 2, 6, 3, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_8_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_8_LEVELS, 0, 0 }, + + // 6x3/3x6 mode 7 + { false, 7, 2, 3, 6, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_6_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_6_LEVELS, 0, 0 }, + { false, 7, 2, 6, 3, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_6_LEVELS, astc_helpers::BISE_80_LEVELS, astc_helpers::BISE_6_LEVELS, 0, 0 }, + + // ------ dual plane + + // 3x6 + { true, 11, 1, 3, 6, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { true, 11, 1, 3, 6, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, 
astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 1 }, + { true, 11, 1, 3, 6, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 2 }, + + // 6x3 + { true, 11, 1, 6, 3, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { true, 11, 1, 6, 3, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 1 }, + { true, 11, 1, 6, 3, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 2 }, + + // 3x3 + { true, 11, 1, 3, 3, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_16_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { true, 11, 1, 3, 3, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_16_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 1 }, + { true, 11, 1, 3, 3, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_16_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_16_LEVELS, BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 2 }, + + // 4x4 + { true, 11, 1, 4, 4, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_5_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { true, 11, 1, 4, 4, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_5_LEVELS, BASIST_HDR_6X6_LEVEL2, 1 }, + { true, 11, 1, 4, 4, 
astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_5_LEVELS, BASIST_HDR_6X6_LEVEL2, 2 }, + + // 5x5 + { true, 11, 1, 5, 5, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { true, 11, 1, 5, 5, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 1 }, + { true, 11, 1, 5, 5, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 2 }, + + // ------ 2x2 modes for RDO + // note 2x2 modes will be upsampled to 4x4 during transcoding (the min # of weight bits is 7 in ASTC) + { true, 11, 1, 2, 2, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_8_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + { true, 11, 1, 2, 2, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_8_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 1 }, + { true, 11, 1, 2, 2, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_8_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 2 }, + { false, 11, 1, 2, 2, astc_helpers::BISE_128_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_256_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL0 | BASIST_HDR_6X6_LEVEL1 | BASIST_HDR_6X6_LEVEL2, 0 }, + + // ------ 3 subsets + + // 6x6 + { false, 7, 3, 6, 6, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + // 5x5 + { false, 7, 3, 5, 5, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_2_LEVELS, 
astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + // 4x4 + { false, 7, 3, 4, 4, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_3_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 3, 4, 4, astc_helpers::BISE_40_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_40_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 3, 4, 4, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_5_LEVELS, 0, 0 }, + + // 3x3 + { false, 7, 3, 3, 3, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_8_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_8_LEVELS, 0, 0 }, + + // 6x4 + { false, 7, 3, 6, 4, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 3, 4, 6, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_64_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + // 6x4 + { false, 7, 3, 6, 4, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, 0, 0 }, + { false, 7, 3, 4, 6, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_3_LEVELS, 0, 0 }, + + // 6x5 + { false, 7, 3, 6, 5, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 3, 5, 6, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_2_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_2_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + // 6x3 + { false, 7, 3, 6, 3, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_3_LEVELS, 0, 0 }, + { false, 7, 3, 3, 6, astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_3_LEVELS, 
astc_helpers::BISE_48_LEVELS, astc_helpers::BISE_3_LEVELS, 0, 0 }, + + // 6x3 + { false, 7, 3, 6, 3, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + { false, 7, 3, 3, 6, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_4_LEVELS, astc_helpers::BISE_32_LEVELS, astc_helpers::BISE_4_LEVELS, BASIST_HDR_6X6_LEVEL2, 0 }, + + // 6x3 + { false, 7, 3, 6, 3, astc_helpers::BISE_24_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_24_LEVELS, astc_helpers::BISE_5_LEVELS, 0, 0 }, + { false, 7, 3, 3, 6, astc_helpers::BISE_24_LEVELS, astc_helpers::BISE_5_LEVELS, astc_helpers::BISE_24_LEVELS, astc_helpers::BISE_5_LEVELS, 0, 0 }, + + // 5x4 + { false, 7, 3, 5, 4, astc_helpers::BISE_40_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_40_LEVELS, astc_helpers::BISE_3_LEVELS, 0, 0 }, + { false, 7, 3, 4, 5, astc_helpers::BISE_40_LEVELS, astc_helpers::BISE_3_LEVELS, astc_helpers::BISE_40_LEVELS, astc_helpers::BISE_3_LEVELS, 0, 0 }, + }; + + const reuse_xy_delta g_reuse_xy_deltas[NUM_REUSE_XY_DELTAS] = { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: level_index >= m_levels.size()\n"); - return false; - } + { -1, 0 }, { -2, 0 }, { -3, 0 }, { -4, 0 }, + { 3, -1 }, { 2, -1 }, { 1, -1 }, { 0, -1 }, { -1, -1 }, { -2, -1 }, { -3, -1 }, { -4, -1 }, + { 3, -2 }, { 2, -2 }, { 1, -2 }, { 0, -2 }, { -1, -2 }, { -2, -2 }, { -3, -2 }, { -4, -2 }, + { 3, -3 }, { 2, -3 }, { 1, -3 }, { 0, -3 }, { -1, -3 }, { -2, -3 }, { -3, -3 }, { -4, -3 }, + { 3, -4 }, { 2, -4 }, { 1, -4 }, { 0, -4 } + }; - if (m_header.m_face_count > 1) + //-------------------------------------------------------------------------------------------------------------------------- + + void requantize_astc_weights(uint32_t n, const uint8_t* pSrc_ise_vals, uint32_t from_ise_range, uint8_t* pDst_ise_vals, uint32_t to_ise_range) { - if (face_index >= 6) + if (from_ise_range == to_ise_range) { - 
BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: face_index >= 6\n"); - return false; + if (pDst_ise_vals != pSrc_ise_vals) + memcpy(pDst_ise_vals, pSrc_ise_vals, n); + return; } + + const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(from_ise_range).m_ISE_to_val; + const auto& quant_tab = astc_helpers::g_dequant_tables.get_weight_tab(to_ise_range).m_val_to_ise; + + for (uint32_t i = 0; i < n; i++) + pDst_ise_vals[i] = quant_tab[dequant_tab[pSrc_ise_vals[i]]]; } - else if (face_index != 0) + + //-------------------------------------------------------------------------------------------------------------------------- + + inline int get_bit( + int src_val, int src_bit) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: face_index != 0\n"); - return false; + assert(src_bit >= 0 && src_bit <= 31); + int bit = (src_val >> src_bit) & 1; + return bit; } - if (layer_index >= basisu::maximum(m_header.m_layer_count, 1)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: layer_index >= maximum(m_header.m_layer_count, 1)\n"); - return false; + inline void pack_bit( + int& dst, int dst_bit, + int src_val, int src_bit = 0) + { + assert(dst_bit >= 0 && dst_bit <= 31); + int bit = get_bit(src_val, src_bit); + dst |= (bit << dst_bit); } - const uint8_t* pComp_level_data = m_pData + m_levels[level_index].m_byte_offset; - uint64_t comp_level_data_size = m_levels[level_index].m_byte_length; - - const uint8_t* pUncomp_level_data = pComp_level_data; - uint64_t uncomp_level_data_size = comp_level_data_size; + // Valid for weight ISE ranges 12-192 levels. Preserves upper 2 or 3 bits post-quantization. 
+ static uint8_t g_quantize_tables_preserve2[astc_helpers::TOTAL_ISE_RANGES - 1][256]; + static uint8_t g_quantize_tables_preserve3[astc_helpers::TOTAL_ISE_RANGES - 1][256]; - if (uncomp_level_data_size > UINT32_MAX) + const uint32_t g_part2_unique_index_to_seed[NUM_UNIQUE_PARTITIONS2] = { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: uncomp_level_data_size > UINT32_MAX\n"); - return false; - } - - if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) + 86, 959, 936, 476, 1007, 672, 447, 423, 488, 422, 273, 65, 267, 786, 585, 195, 108, 731, 878, 812, 264, 125, 868, 581, 258, 390, 549, 872, 661, 352, 645, 543, 988, + 906, 903, 616, 482, 529, 3, 286, 272, 303, 151, 504, 498, 260, 79, 66, 608, 769, 305, 610, 1014, 967, 835, 789, 7, 951, 691, 15, 763, 976, 438, 314, 601, 673, 177, + 252, 615, 436, 220, 899, 623, 433, 674, 278, 797, 107, 847, 114, 470, 760, 821, 490, 329, 945, 387, 471, 225, 172, 83, 418, 966, 439, 316, 247, 43, 343, 625, 798, + 1, 61, 73, 307, 136, 474, 42, 664, 1013, 249, 389, 227, 374, 121, 48, 538, 226, 309, 554, 802, 834, 335, 495, 10, 955, 461, 293, 508, 153, 101, 63, 139, 31, 687, + 132, 174, 324, 545, 289, 39, 178, 594, 963, 854, 222, 323, 998, 964, 598, 475, 720, 1019, 983, 91, 703, 614, 394, 612, 281, 207, 930, 758, 586, 128, 517, 426, 306, + 168, 713, 36, 458, 876, 368, 780, 5, 9, 214, 109, 553, 726, 175, 103, 753, 684, 44, 665, 53, 500, 367, 611, 119, 732, 639, 326, 203, 156, 686, 910, 255, 62, 392, 591, + 112, 88, 213, 19, 1022, 478, 90, 486, 799, 702, 730, 414, 99, 1008, 142, 886, 373, 216, 69, 393, 299, 648, 415, 822, 912, 110, 567, 550, 693, 2, 138, 59, 271, 562, 295, + 714, 719, 199, 893, 831, 1006, 662, 235, 262, 78, 51, 902, 298, 190, 169, 583, 347, 890, 958, 909, 49, 987, 696, 633, 480, 50, 764, 826, 1023, 1016, 437, 891, 774, 257, + 724, 791, 526, 593, 690, 638, 858, 895, 794, 995, 130, 87, 877, 819, 318, 649, 376, 211, 284, 937, 370, 688, 229, 994, 115, 842, 60, 521, 95, 694, 804, 146, 754, 487, 55, + 17, 770, 
450, 223, 4, 137, 911, 236, 683, 523, 47, 181, 24, 270, 602, 736, 11, 355, 148, 351, 762, 1009, 16, 210, 619, 805, 874, 807, 887, 403, 999, 810, 27, 402, 551, 135, + 778, 33, 409, 993, 71, 363, 159, 183, 77, 596, 670, 380, 968, 811, 404, 348, 539, 158, 578, 196, 621, 68, 530, 193, 100, 167, 919, 353, 366, 327, 643, 948, 518, 756, 801, 558, + 28, 705, 116, 94, 898, 453, 622, 647, 231, 445, 652, 230, 191, 277, 292, 254, 198, 766, 386, 232, 29, 70, 942, 740, 291, 607, 411, 496, 839, 8, 675, 319, 742, 21, 547, 627, 716, + 663, 23, 914, 631, 595, 499, 685, 950, 510, 54, 587, 432, 45, 646, 25, 122, 947, 171, 862, 441, 808, 722, 14, 74, 658, 129, 266, 1001, 534, 395, 527, 250, 206, 237, 67, 897, 634, + 572, 569, 533, 37, 341, 89, 463, 419, 75, 134, 283, 943, 519, 362, 144, 681, 407, 954, 131, 455, 934, 46, 513, 339, 194, 361, 606, 852, 546, 655, 1015, 147, 506, 240, 56, 836, 76, + 98, 600, 430, 388, 980, 695, 817, 279, 58, 215, 149, 170, 531, 870, 18, 727, 154, 26, 938, 929, 302, 697, 452, 218, 700, 524, 828, 751, 869, 217, 440, 354 + }; + + const uint32_t g_part3_unique_index_to_seed[NUM_UNIQUE_PARTITIONS3] = + { + 0, 8, 11, 14, 15, 17, 18, 19, 26, 31, 34, 35, 36, 38, 44, 47, 48, 49, 51, 56, + 59, 61, 70, 74, 76, 82, 88, 90, 96, 100, 103, 104, 108, 110, 111, 117, 122, 123, + 126, 127, 132, 133, 135, 139, 147, 150, 151, 152, 156, 157, 163, 166, 168, 171, + 175, 176, 179, 181, 182, 183, 186, 189, 192, 199, 203, 205, 207, 210, 214, 216, + 222, 247, 249, 250, 252, 254, 260, 261, 262, 263, 266, 272, 273, 275, 276, 288, + 291, 292, 293, 294, 297, 302, 309, 310, 313, 314, 318, 327, 328, 331, 335, 337, + 346, 356, 357, 358, 363, 365, 368, 378, 381, 384, 386, 390, 391, 392, 396, 397, + 398, 399, 401, 410, 411, 419, 427, 430, 431, 437, 439, 440, 451, 455, 457, 458, + 459, 460, 462, 468, 470, 471, 472, 474, 475, 477, 479, 482, 483, 488, 493, 495, + 496, 502, 503, 504, 507, 510, 511, 512, 515, 516, 518, 519, 522, 523, 525, 526, + 527, 538, 543, 544, 546, 547, 549, 550, 552, 553, 
554, 562, 570, 578, 579, 581, + 582, 588, 589, 590, 593, 595, 600, 606, 611, 613, 618, 623, 625, 632, 637, 638, + 645, 646, 650, 651, 658, 659, 662, 666, 667, 669, 670, 678, 679, 685, 686, 687, + 688, 691, 694, 696, 698, 699, 700, 701, 703, 704, 707, 713, 714, 715, 717, 719, + 722, 724, 727, 730, 731, 734, 738, 739, 743, 747, 748, 750, 751, 753, 758, 760, + 764, 766, 769, 775, 776, 783, 784, 785, 787, 791, 793, 798, 799, 802, 804, 805, + 806, 807, 808, 809, 810, 813, 822, 823, 825, 831, 835, 837, 838, 839, 840, 842, + 845, 846, 848, 853, 854, 858, 859, 860, 866, 874, 882, 884, 887, 888, 892, 894, + 898, 902, 907, 914, 915, 918, 919, 922, 923, 925, 927, 931, 932, 937, 938, 940, + 943, 944, 945, 953, 955, 958, 959, 963, 966, 971, 974, 979, 990, 991, 998, 999, + 1007, 1010, 1011, 1012, 1015, 1020, 1023 + }; + + static void init_quantize_tables() { - // Check if we've already decompressed this level's supercompressed data. - if ((int)level_index != pState->m_uncomp_data_level_index) + for (uint32_t ise_range = astc_helpers::BISE_192_LEVELS; ise_range >= astc_helpers::BISE_12_LEVELS; ise_range--) { - // Uncompress the entire level's supercompressed data. 
- if (!decompress_level_data(level_index, pState->m_level_uncomp_data)) + const uint32_t num_levels = astc_helpers::get_ise_levels(ise_range); + const auto& ise_to_val_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_range).m_ISE_to_val; + + for (uint32_t desired_val = 0; desired_val < 256; desired_val++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: decompress_level_data() failed\n"); - return false; + { + uint32_t best_err = UINT32_MAX; + int best_ise_val = -1; + + for (uint32_t ise_val = 0; ise_val < num_levels; ise_val++) + { + const uint32_t quant_val = ise_to_val_tab[ise_val]; + + if ((quant_val & 0b11000000) != (desired_val & 0b11000000)) + continue; + + uint32_t err = basisu::squarei((int)quant_val - (int)desired_val); + if (err < best_err) + { + best_err = err; + best_ise_val = ise_val; + } + + } // ise_val + + assert(best_ise_val != -1); + + g_quantize_tables_preserve2[ise_range][desired_val] = (uint8_t)best_ise_val; + } + + { + uint32_t best_err = UINT32_MAX; + int best_ise_val = -1; + + for (uint32_t ise_val = 0; ise_val < num_levels; ise_val++) + { + const uint32_t quant_val = ise_to_val_tab[ise_val]; + + if ((quant_val & 0b11100000) != (desired_val & 0b11100000)) + continue; + + uint32_t err = basisu::squarei((int)quant_val - (int)desired_val); + if (err < best_err) + { + best_err = err; + best_ise_val = ise_val; + } + + } // ise_val + + assert(best_ise_val != -1); + + g_quantize_tables_preserve3[ise_range][desired_val] = (uint8_t)best_ise_val; + } + + } // desired_val + +#if 0 + for (uint32_t i = 0; i < 256; i++) + { + if (g_quantize_tables_preserve2[ise_range][i] != astc_helpers::g_dequant_tables.get_endpoint_tab(ise_range).m_val_to_ise[i]) + { + fmt_printf("P2, Range: {}, {} vs. 
{}\n", ise_range, g_quantize_tables_preserve2[ise_range][i], astc_helpers::g_dequant_tables.get_endpoint_tab(ise_range).m_val_to_ise[i]); + } + + if (g_quantize_tables_preserve3[ise_range][i] != astc_helpers::g_dequant_tables.get_endpoint_tab(ise_range).m_val_to_ise[i]) + { + fmt_printf("P3, Range: {}, {} vs. {}\n", ise_range, g_quantize_tables_preserve3[ise_range][i], astc_helpers::g_dequant_tables.get_endpoint_tab(ise_range).m_val_to_ise[i]); + } } - pState->m_uncomp_data_level_index = level_index; - } +#endif - pUncomp_level_data = pState->m_level_uncomp_data.data(); - uncomp_level_data_size = pState->m_level_uncomp_data.size(); + } // ise_range } - - const uint32_t level_width = basisu::maximum(m_header.m_pixel_width >> level_index, 1); - const uint32_t level_height = basisu::maximum(m_header.m_pixel_height >> level_index, 1); - const uint32_t num_blocks_x = (level_width + 3) >> 2; - const uint32_t num_blocks_y = (level_height + 3) >> 2; - - if (m_format == basist::basis_tex_format::cETC1S) + + void requantize_ise_endpoints(uint32_t cem, uint32_t src_ise_endpoint_range, const uint8_t* pSrc_endpoints, uint32_t dst_ise_endpoint_range, uint8_t* pDst_endpoints) { - // Ensure start_transcoding() was called. - if (m_etc1s_transcoder.get_endpoints().empty()) + assert(pSrc_endpoints != pDst_endpoints); + assert((src_ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (src_ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + assert((dst_ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (dst_ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + + // must be >=12 ISE levels for g_quantize_tables_preserve2 etc. + assert(dst_ise_endpoint_range >= astc_helpers::BISE_12_LEVELS); + + const uint32_t n = (cem == 11) ? 
basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS; + + if (src_ise_endpoint_range == dst_ise_endpoint_range) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: must call start_transcoding() first\n"); - return false; + memcpy(pDst_endpoints, pSrc_endpoints, n); + return; } - const uint32_t etc1s_image_index = - (level_index * basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count) + - layer_index * m_header.m_face_count + - face_index; - - // Sanity check - if (etc1s_image_index >= m_etc1s_image_descs.size()) + uint8_t temp_endpoints[basist::NUM_MODE11_ENDPOINTS]; + if (src_ise_endpoint_range != astc_helpers::BISE_256_LEVELS) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: etc1s_image_index >= m_etc1s_image_descs.size()\n"); - assert(0); - return false; + assert(n <= basist::NUM_MODE11_ENDPOINTS); + + const auto& endpoint_dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(src_ise_endpoint_range).m_ISE_to_val; + + for (uint32_t i = 0; i < n; i++) + temp_endpoints[i] = endpoint_dequant_tab[pSrc_endpoints[i]]; + + pSrc_endpoints = temp_endpoints; } - if (static_cast(m_data_size) != m_data_size) + if (dst_ise_endpoint_range == astc_helpers::BISE_256_LEVELS) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: File is too large\n"); - return false; + memcpy(pDst_endpoints, pSrc_endpoints, n); + return; } - const ktx2_etc1s_image_desc& image_desc = m_etc1s_image_descs[etc1s_image_index]; + const auto& quant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(dst_ise_endpoint_range).m_val_to_ise; - if (!m_etc1s_transcoder.transcode_image(fmt, - pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, m_pData, static_cast(m_data_size), - num_blocks_x, num_blocks_y, level_width, level_height, - level_index, - m_levels[level_index].m_byte_offset + image_desc.m_rgb_slice_byte_offset, image_desc.m_rgb_slice_byte_length, - image_desc.m_alpha_slice_byte_length ? 
(m_levels[level_index].m_byte_offset + image_desc.m_alpha_slice_byte_offset) : 0, image_desc.m_alpha_slice_byte_length, - decode_flags, m_has_alpha, - m_is_video, output_row_pitch_in_blocks_or_pixels, &pState->m_transcoder_state, output_rows_in_pixels)) + const auto& dequant_tab = astc_helpers::g_dequant_tables.get_endpoint_tab(dst_ise_endpoint_range).m_ISE_to_val; + BASISU_NOTE_UNUSED(dequant_tab); + +#if 1 + // A smarter value quantization that preserves the key upper bits. (If these bits get corrupted, the entire meaning of the encoding can get lost.) + if (cem == 11) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: ETC1S transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); - return false; + assert(n == 6); + + int maj_comp = 0; + pack_bit(maj_comp, 0, pSrc_endpoints[4], 7); + pack_bit(maj_comp, 1, pSrc_endpoints[5], 7); + + if (maj_comp == 3) + { + // Direct + pDst_endpoints[0] = quant_tab[pSrc_endpoints[0]]; + pDst_endpoints[1] = quant_tab[pSrc_endpoints[1]]; + pDst_endpoints[2] = quant_tab[pSrc_endpoints[2]]; + pDst_endpoints[3] = quant_tab[pSrc_endpoints[3]]; + // No need for preserve1 tables, we can use the regular quantization tables because they preserve the MSB. 
+ pDst_endpoints[4] = quant_tab[pSrc_endpoints[4]]; + pDst_endpoints[5] = quant_tab[pSrc_endpoints[5]]; + + assert((dequant_tab[pDst_endpoints[4]] & 128) == (pSrc_endpoints[4] & 128)); + assert((dequant_tab[pDst_endpoints[5]] & 128) == (pSrc_endpoints[5] & 128)); + } + else + { + pDst_endpoints[0] = quant_tab[pSrc_endpoints[0]]; + pDst_endpoints[1] = g_quantize_tables_preserve2[dst_ise_endpoint_range][pSrc_endpoints[1]]; + pDst_endpoints[2] = g_quantize_tables_preserve2[dst_ise_endpoint_range][pSrc_endpoints[2]]; + pDst_endpoints[3] = g_quantize_tables_preserve2[dst_ise_endpoint_range][pSrc_endpoints[3]]; + pDst_endpoints[4] = g_quantize_tables_preserve3[dst_ise_endpoint_range][pSrc_endpoints[4]]; + pDst_endpoints[5] = g_quantize_tables_preserve3[dst_ise_endpoint_range][pSrc_endpoints[5]]; + + assert((dequant_tab[pDst_endpoints[1]] & 0b11000000) == (pSrc_endpoints[1] & 0b11000000)); + assert((dequant_tab[pDst_endpoints[2]] & 0b11000000) == (pSrc_endpoints[2] & 0b11000000)); + assert((dequant_tab[pDst_endpoints[3]] & 0b11000000) == (pSrc_endpoints[3] & 0b11000000)); + assert((dequant_tab[pDst_endpoints[4]] & 0b11100000) == (pSrc_endpoints[4] & 0b11100000)); + assert((dequant_tab[pDst_endpoints[5]] & 0b11100000) == (pSrc_endpoints[5] & 0b11100000)); + } } - } - else if ((m_format == basist::basis_tex_format::cUASTC4x4) || - (m_format == basist::basis_tex_format::cUASTC_HDR_4x4)) - { - // Compute length and offset to uncompressed 2D UASTC texture data, given the face/layer indices. 
- assert(uncomp_level_data_size == m_levels[level_index].m_uncompressed_byte_length); - const uint32_t total_2D_image_size = num_blocks_x * num_blocks_y * KTX2_UASTC_BLOCK_SIZE; - - const uint32_t uncomp_ofs = (layer_index * m_header.m_face_count + face_index) * total_2D_image_size; + else if (cem == 7) + { + assert(n == 4); - // Sanity checks - if (uncomp_ofs >= uncomp_level_data_size) + pDst_endpoints[0] = g_quantize_tables_preserve2[dst_ise_endpoint_range][pSrc_endpoints[0]]; + pDst_endpoints[1] = g_quantize_tables_preserve3[dst_ise_endpoint_range][pSrc_endpoints[1]]; + pDst_endpoints[2] = g_quantize_tables_preserve3[dst_ise_endpoint_range][pSrc_endpoints[2]]; + pDst_endpoints[3] = g_quantize_tables_preserve3[dst_ise_endpoint_range][pSrc_endpoints[3]]; + + assert((dequant_tab[pDst_endpoints[0]] & 0b11000000) == (pSrc_endpoints[0] & 0b11000000)); + assert((dequant_tab[pDst_endpoints[1]] & 0b11100000) == (pSrc_endpoints[1] & 0b11100000)); + assert((dequant_tab[pDst_endpoints[2]] & 0b11100000) == (pSrc_endpoints[2] & 0b11100000)); + assert((dequant_tab[pDst_endpoints[3]] & 0b11100000) == (pSrc_endpoints[3] & 0b11100000)); + } + else { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: uncomp_ofs >= total_2D_image_size\n"); - return false; + assert(0); } - - if ((uncomp_level_data_size - uncomp_ofs) < total_2D_image_size) +#else + for (uint32_t i = 0; i < n; i++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: (uncomp_level_data_size - uncomp_ofs) < total_2D_image_size\n"); - return false; + uint32_t v = pSrc_endpoints[i]; + assert(v <= 255); + + pDst_endpoints[i] = quant_tab[v]; } +#endif + } - if (m_format == basist::basis_tex_format::cUASTC_HDR_4x4) + void copy_weight_grid(bool dual_plane, uint32_t grid_x, uint32_t grid_y, const uint8_t* transcode_weights, astc_helpers::log_astc_block& decomp_blk) + { + assert(decomp_blk.m_weight_ise_range >= astc_helpers::BISE_2_LEVELS); + assert(decomp_blk.m_weight_ise_range <= 
astc_helpers::BISE_32_LEVELS); + + // Special case for 2x2 which isn't typically valid ASTC (too few weight bits without dual plane). Upsample to 4x4. + if ((!dual_plane) && (grid_x == 2) && (grid_y == 2)) { - if (!m_uastc_hdr_transcoder.transcode_image(fmt, - pOutput_blocks, output_blocks_buf_size_in_blocks_or_pixels, - (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index, - 0, (uint32_t)total_2D_image_size, - decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) + decomp_blk.m_grid_width = 4; + decomp_blk.m_grid_height = 4; + + //const uint32_t total_weight_levels = astc_helpers::bise_levels(decomp_blk.m_weight_ise_range); + const auto& dequant_weight = astc_helpers::g_dequant_tables.get_weight_tab(decomp_blk.m_weight_ise_range).m_ISE_to_val; + const auto& quant_weight = astc_helpers::g_dequant_tables.get_weight_tab(decomp_blk.m_weight_ise_range).m_val_to_ise; + + astc_helpers::weighted_sample weights[16]; + + compute_upsample_weights(4, 4, 2, 2, weights); + + for (uint32_t y = 0; y < 4; y++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC HDR transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); - return false; + for (uint32_t x = 0; x < 4; x++) + { + const astc_helpers::weighted_sample& sample = weights[x + y * 4]; + + uint32_t total_weight = 8; + + for (uint32_t yo = 0; yo < 2; yo++) + { + for (uint32_t xo = 0; xo < 2; xo++) + { + if (!sample.m_weights[yo][xo]) + continue; + + total_weight += dequant_weight[transcode_weights[basisu::in_bounds((x + xo) + (y + yo) * grid_x, 0, grid_x * grid_y)]] * sample.m_weights[yo][xo]; + } // x + } // y + + total_weight >>= 4; + + assert(total_weight <= 64); + + decomp_blk.m_weights[x + y * 4] = quant_weight[total_weight]; + } } } else { - if (!m_uastc_transcoder.transcode_image(fmt, - pOutput_blocks, 
output_blocks_buf_size_in_blocks_or_pixels, - (const uint8_t*)pUncomp_level_data + uncomp_ofs, (uint32_t)total_2D_image_size, num_blocks_x, num_blocks_y, level_width, level_height, level_index, - 0, (uint32_t)total_2D_image_size, - decode_flags, m_has_alpha, m_is_video, output_row_pitch_in_blocks_or_pixels, nullptr, output_rows_in_pixels, channel0, channel1)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: UASTC transcode_image() failed, this is either a bug or the file is corrupted/invalid\n"); - return false; - } + const uint32_t num_planes = dual_plane ? 2 : 1; + + decomp_blk.m_grid_width = (uint8_t)grid_x; + decomp_blk.m_grid_height = (uint8_t)grid_y; + memcpy(decomp_blk.m_weights, transcode_weights, grid_x * grid_y * num_planes); } } - else + + // cur_y is the current destination row + // prev_y is the row we want to access + static inline int calc_row_index(int cur_y, int prev_y, int cur_row_index) { - // Shouldn't get here. - BASISU_DEVEL_ERROR("ktx2_transcoder::transcode_image_2D: Internal error\n"); - assert(0); - return false; - } + assert((cur_y >= 0) && (prev_y >= 0)); + assert((cur_row_index >= 0) && (cur_row_index < REUSE_MAX_BUFFER_ROWS)); - return true; - } - - bool ktx2_transcoder::decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data) - { - const uint8_t* pComp_data = m_levels[level_index].m_byte_offset + m_pData; - const uint64_t comp_size = m_levels[level_index].m_byte_length; - - const uint64_t uncomp_size = m_levels[level_index].m_uncompressed_byte_length; + int delta_y = prev_y - cur_y; + assert((delta_y > -REUSE_MAX_BUFFER_ROWS) && (delta_y <= 0)); - if (((size_t)comp_size) != comp_size) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Compressed data too large\n"); - return false; - } - if (((size_t)uncomp_size) != uncomp_size) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Uncompressed data too large\n"); - return false; - } + cur_row_index += delta_y; + if 
(cur_row_index < 0) + cur_row_index += REUSE_MAX_BUFFER_ROWS; - if (!uncomp_data.try_resize((size_t)uncomp_size)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Out of memory\n"); - return false; + assert((cur_row_index >= 0) && (cur_row_index < REUSE_MAX_BUFFER_ROWS)); + + return cur_row_index; } - - if (m_header.m_supercompression_scheme == KTX2_SS_ZSTANDARD) + + bool decode_values(basist::bitwise_decoder& decoder, uint32_t total_values, uint32_t ise_range, uint8_t* pValues) { -#if BASISD_SUPPORT_KTX2_ZSTD - size_t actualUncompSize = ZSTD_decompress(uncomp_data.data(), (size_t)uncomp_size, pComp_data, (size_t)comp_size); - if (ZSTD_isError(actualUncompSize)) + assert(ise_range <= astc_helpers::BISE_256_LEVELS); + + const uint32_t ep_bits = astc_helpers::g_ise_range_table[ise_range][0]; + const uint32_t ep_trits = astc_helpers::g_ise_range_table[ise_range][1]; + const uint32_t ep_quints = astc_helpers::g_ise_range_table[ise_range][2]; + + uint32_t total_tqs = 0; + uint32_t bundle_size = 0, mul = 0; + if (ep_trits) { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Zstd decompression failed, file is invalid or corrupted\n"); - return false; + total_tqs = (total_values + 4) / 5; + bundle_size = 5; + mul = 3; } - if (actualUncompSize != uncomp_size) + else if (ep_quints) { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: Zstd decompression returned too few bytes, file is invalid or corrupted\n"); - return false; + total_tqs = (total_values + 2) / 3; + bundle_size = 3; + mul = 5; } -#else - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_level_data: File uses Zstd supercompression, but Zstd support was not enabled at compile time (BASISD_SUPPORT_KTX2_ZSTD is 0)\n"); - return false; -#endif - } - return true; - } - - bool ktx2_transcoder::decompress_etc1s_global_data() - { - // Note: we don't actually support 3D textures in here yet - //uint32_t layer_pixel_depth = basisu::maximum(m_header.m_pixel_depth, 1); - //for 
(uint32_t i = 1; i < m_header.m_level_count; i++) - // layer_pixel_depth += basisu::maximum(m_header.m_pixel_depth >> i, 1); + const uint32_t MAX_TQ_VALUES = 32; + assert(total_tqs <= MAX_TQ_VALUES); + uint32_t tq_values[MAX_TQ_VALUES]; - const uint32_t image_count = basisu::maximum(m_header.m_layer_count, 1) * m_header.m_face_count * m_header.m_level_count; - assert(image_count); + for (uint32_t i = 0; i < total_tqs; i++) + { + uint32_t num_bits = ep_trits ? 8 : 7; - const uint8_t* pSrc = m_pData + m_header.m_sgd_byte_offset; + if (i == (total_tqs - 1)) + { + uint32_t num_remaining = total_values - (total_tqs - 1) * bundle_size; + if (ep_trits) + { + switch (num_remaining) + { + case 1: num_bits = 2; break; + case 2: num_bits = 4; break; + case 3: num_bits = 5; break; + case 4: num_bits = 7; break; + default: break; + } + } + else if (ep_quints) + { + switch (num_remaining) + { + case 1: num_bits = 3; break; + case 2: num_bits = 5; break; + default: break; + } + } + } - memcpy(&m_etc1s_header, pSrc, sizeof(ktx2_etc1s_global_data_header)); - pSrc += sizeof(ktx2_etc1s_global_data_header); + tq_values[i] = (uint32_t)decoder.get_bits(num_bits); + } // i - if ((!m_etc1s_header.m_endpoints_byte_length) || (!m_etc1s_header.m_selectors_byte_length) || (!m_etc1s_header.m_tables_byte_length)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: Invalid ETC1S global data\n"); - return false; - } + uint32_t accum = 0; + uint32_t accum_remaining = 0; + uint32_t next_tq_index = 0; + + for (uint32_t i = 0; i < total_values; i++) + { + uint32_t value = (uint32_t)decoder.get_bits(ep_bits); + + if (total_tqs) + { + if (!accum_remaining) + { + assert(next_tq_index < total_tqs); + accum = tq_values[next_tq_index++]; + accum_remaining = bundle_size; + } + + uint32_t v = accum % mul; + accum /= mul; + accum_remaining--; + + value |= (v << ep_bits); + } + + pValues[i] = (uint8_t)value; + } - if ((!m_etc1s_header.m_endpoint_count) || 
(!m_etc1s_header.m_selector_count)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: endpoint and/or selector count is 0, file is invalid or corrupted\n"); - return false; + return true; } - // Sanity check the ETC1S header. - if ((sizeof(ktx2_etc1s_global_data_header) + - sizeof(ktx2_etc1s_image_desc) * image_count + - m_etc1s_header.m_endpoints_byte_length + - m_etc1s_header.m_selectors_byte_length + - m_etc1s_header.m_tables_byte_length + - m_etc1s_header.m_extended_byte_length) > m_header.m_sgd_byte_length) + static inline uint32_t get_num_endpoint_vals(uint32_t cem) { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: SGD byte length is too small, file is invalid or corrupted\n"); - return false; + assert((cem == 7) || (cem == 11)); + return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS; } - if (!m_etc1s_image_descs.try_resize(image_count)) + const uint32_t g_bc6h_weights4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; + +#if 0 + static BASISU_FORCE_INLINE int pos_lrintf(float x) { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: Out of memory\n"); - return false; + assert(x >= 0.0f); + return (int)(x + .5f); } - - memcpy(m_etc1s_image_descs.data(), pSrc, sizeof(ktx2_etc1s_image_desc) * image_count); - pSrc += sizeof(ktx2_etc1s_image_desc) * image_count; - // Sanity check the ETC1S image descs - for (uint32_t i = 0; i < image_count; i++) + static BASISU_FORCE_INLINE basist::half_float fast_float_to_half_non_neg_no_nan_inf(float val) { - // m_etc1s_transcoder.transcode_image() will validate the slice offsets/lengths before transcoding. 
+ union { float f; int32_t i; uint32_t u; } fi = { val }; + const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF; + int e = 0, m = 0; - if (!m_etc1s_image_descs[i].m_rgb_slice_byte_length) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: ETC1S image descs sanity check failed (1)\n"); - return false; - } + assert(((fi.i >> 31) == 0) && (flt_e != 0xFF)); - if (m_has_alpha) + // not zero or denormal + if (flt_e != 0) { - if (!m_etc1s_image_descs[i].m_alpha_slice_byte_length) + int new_exp = flt_e - 127; + if (new_exp > 15) + e = 31; + else if (new_exp < -14) + m = pos_lrintf((1 << 24) * fabsf(fi.f)); + else { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: ETC1S image descs sanity check failed (2)\n"); - return false; + e = new_exp + 15; + m = pos_lrintf(flt_m * (1.0f / ((float)(1 << 13)))); } } - } - const uint8_t* pEndpoint_data = pSrc; - const uint8_t* pSelector_data = pSrc + m_etc1s_header.m_endpoints_byte_length; - const uint8_t* pTables_data = pSrc + m_etc1s_header.m_endpoints_byte_length + m_etc1s_header.m_selectors_byte_length; + assert((0 <= m) && (m <= 1024)); + if (m == 1024) + { + e++; + m = 0; + } - if (!m_etc1s_transcoder.decode_tables(pTables_data, m_etc1s_header.m_tables_byte_length)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: decode_tables() failed, file is invalid or corrupted\n"); - return false; + assert((e >= 0) && (e <= 31)); + assert((m >= 0) && (m <= 1023)); + + basist::half_float result = (basist::half_float)((e << 10) | m); + return result; } - - if (!m_etc1s_transcoder.decode_palettes( - m_etc1s_header.m_endpoint_count, pEndpoint_data, m_etc1s_header.m_endpoints_byte_length, - m_etc1s_header.m_selector_count, pSelector_data, m_etc1s_header.m_selectors_byte_length)) +#endif + + union fu32 { - BASISU_DEVEL_ERROR("ktx2_transcoder::decompress_etc1s_global_data: decode_palettes() failed, file is likely corrupted\n"); - return false; - } - - return true; - } + 
uint32_t u; + float f; + }; - bool ktx2_transcoder::read_key_values() - { - if (!m_header.m_kvd_byte_length) + static BASISU_FORCE_INLINE basist::half_float fast_float_to_half_no_clamp_neg_nan_or_inf(float f) { - if (m_header.m_kvd_byte_offset) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset (it should be zero when the length is zero)\n"); - return false; - } + assert(!isnan(f) && !isinf(f)); + assert((f >= 0.0f) && (f <= basist::MAX_HALF_FLOAT)); - return true; - } + // Sutract 112 from the exponent, to change the bias from 127 to 15. + static const fu32 g_f_to_h{ 0x7800000 }; - if (m_header.m_kvd_byte_offset < sizeof(ktx2_header)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset\n"); - return false; + fu32 fu; + + fu.f = f * g_f_to_h.f; + + uint32_t h = (basist::half_float)((fu.u >> (23 - 10)) & 0x7FFF); + + // round to even + uint32_t mant = fu.u & 8191; // examine lowest 13 bits + h += (mant > 4096); + + if (h > basist::MAX_HALF_FLOAT_AS_INT_BITS) + h = basist::MAX_HALF_FLOAT_AS_INT_BITS; + + return (basist::half_float)h; } - if ((m_header.m_kvd_byte_offset + m_header.m_kvd_byte_length) > m_data_size) + static BASISU_FORCE_INLINE float ftoh(float f) { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Invalid KVD byte offset and/or length\n"); - return false; + //float res = (float)fast_float_to_half_non_neg_no_nan_inf(fabsf(f)) * ((f < 0.0f) ? -1.0f : 1.0f); + float res = (float)fast_float_to_half_no_clamp_neg_nan_or_inf(fabsf(f)) * ((f < 0.0f) ? -1.0f : 1.0f); + return res; } + + // Supports positive and denormals only. No NaN or Inf. 
+ static BASISU_FORCE_INLINE float fast_half_to_float_pos_not_inf_or_nan(basist::half_float h) + { + assert(!basist::half_is_signed(h) && !basist::is_half_inf_or_nan(h)); - const uint8_t* pSrc = m_pData + m_header.m_kvd_byte_offset; - uint32_t src_left = m_header.m_kvd_byte_length; + // add 112 to the exponent (112+half float's exp bias of 15=float32's bias of 127) + static const fu32 K = { 0x77800000 }; - if (!m_key_values.try_reserve(8)) + fu32 o; + o.u = h << 13; + o.f *= K.f; + + return o.f; + } + + static BASISU_FORCE_INLINE float inv_sqrt(float v) { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); - return false; + union + { + float flt; + uint32_t ui; + } un; + + un.flt = v; + un.ui = 0x5F1FFFF9UL - (un.ui >> 1); + + return 0.703952253f * un.flt * (2.38924456f - v * (un.flt * un.flt)); } - while (src_left > sizeof(uint32_t)) + static const int FAST_BC6H_STD_DEV_THRESH = 256; + static const int FAST_BC6H_COMPLEX_STD_DEV_THRESH = 512; + static const int FAST_BC6H_VERY_COMPLEX_STD_DEV_THRESH = 2048; + + static void assign_weights_simple_4( + const basist::half_float* pPixels, + uint8_t* pWeights, + int min_r, int min_g, int min_b, + int max_r, int max_g, int max_b, int64_t block_max_var) { - uint32_t l = basisu::read_le_dword(pSrc); + BASISU_NOTE_UNUSED(block_max_var); - pSrc += sizeof(uint32_t); - src_left -= sizeof(uint32_t); + float fmin_r = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)min_r); + float fmin_g = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)min_g); + float fmin_b = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)min_b); - if (l < 2) + float fmax_r = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)max_r); + float fmax_g = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)max_g); + float fmax_b = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)max_b); + + float fdir_r = fmax_r - fmin_r; + float fdir_g = fmax_g - fmin_g; + float fdir_b = fmax_b - fmin_b; + + 
float l = inv_sqrt(fdir_r * fdir_r + fdir_g * fdir_g + fdir_b * fdir_b); + if (l != 0.0f) { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (0)\n"); - return false; + fdir_r *= l; + fdir_g *= l; + fdir_b *= l; } - if (src_left < l) + float lr = ftoh(fmin_r * fdir_r + fmin_g * fdir_g + fmin_b * fdir_b); + float hr = ftoh(fmax_r * fdir_r + fmax_g * fdir_g + fmax_b * fdir_b); + + float frr = (hr == lr) ? 0.0f : (14.93333f / (float)(hr - lr)); + + lr = (-lr * frr) + 0.53333f; + for (uint32_t i = 0; i < 16; i++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (1)\n"); - return false; + const float r = fast_half_to_float_pos_not_inf_or_nan(pPixels[i * 3 + 0]); + const float g = fast_half_to_float_pos_not_inf_or_nan(pPixels[i * 3 + 1]); + const float b = fast_half_to_float_pos_not_inf_or_nan(pPixels[i * 3 + 2]); + const float w = ftoh(r * fdir_r + g * fdir_g + b * fdir_b); + + pWeights[i] = (uint8_t)basisu::clamp((int)(w * frr + lr), 0, 15); } + } + + static double assign_weights_4( + const vec3F* pFloat_pixels, const float* pPixel_scales, + uint8_t* pWeights, + int min_r, int min_g, int min_b, + int max_r, int max_g, int max_b, int64_t block_max_var, bool try_2subsets_flag, + const fast_bc6h_params& params) + { + float cr[16], cg[16], cb[16]; - if (!m_key_values.try_resize(m_key_values.size() + 1)) + for (uint32_t i = 0; i < 16; i++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); - return false; + const uint32_t w = g_bc6h_weights4[i]; + + cr[i] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_r * (64 - w) + max_r * w + 32) >> 6)); + cg[i] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_g * (64 - w) + max_g * w + 32) >> 6)); + cb[i] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_b * (64 - w) + max_b * w + 32) >> 6)); } - - basisu::uint8_vec& key_data = m_key_values.back().m_key; - basisu::uint8_vec& 
value_data = m_key_values.back().m_value; - do + double total_err = 0.0f; + + if (params.m_brute_force_weight4_assignment) { - if (!l) + for (uint32_t i = 0; i < 16; i++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (2)\n"); - return false; + const float qr = pFloat_pixels[i].c[0], qg = pFloat_pixels[i].c[1], qb = pFloat_pixels[i].c[2]; + + float best_err = basisu::squaref(cr[0] - qr) + basisu::squaref(cg[0] - qg) + basisu::squaref(cb[0] - qb); + uint32_t best_idx = 0; + + for (uint32_t j = 1; j < 16; j++) + { + float rd = cr[j] - qr, gd = cg[j] - qg, bd = cb[j] - qb; + float e = rd * rd + gd * gd + bd * bd; + + if (e < best_err) + { + best_err = e; + best_idx = j; + } + } + + pWeights[i] = (uint8_t)best_idx; + + total_err += best_err * pPixel_scales[i]; } + } + else + { + const float dir_r = cr[15] - cr[0], dir_g = cg[15] - cg[0], dir_b = cb[15] - cb[0]; - if (!key_data.try_push_back(*pSrc++)) + float dots[16]; + for (uint32_t i = 0; i < 16; i++) + dots[i] = cr[i] * dir_r + cg[i] * dir_g + cb[i] * dir_b; + + float mid_dots[15]; + bool monotonically_increasing = true; + for (uint32_t i = 0; i < 15; i++) { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); - return false; + mid_dots[i] = (dots[i] + dots[i + 1]) * .5f; + + if (dots[i] > dots[i + 1]) + monotonically_increasing = false; } - src_left--; - l--; + const bool check_more_colors = block_max_var > (FAST_BC6H_VERY_COMPLEX_STD_DEV_THRESH * FAST_BC6H_VERY_COMPLEX_STD_DEV_THRESH * 16); // watch prec - } while (key_data.back()); - - if (!value_data.try_resize(l)) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Out of memory\n"); - return false; + if (!monotonically_increasing) + { + // Seems very rare, not worth optimizing the other cases + for (uint32_t i = 0; i < 16; i++) + { + const float qr = pFloat_pixels[i].c[0], qg = pFloat_pixels[i].c[1], qb = pFloat_pixels[i].c[2]; + + float d = qr * dir_r + qg * dir_g + qb * dir_b; + + float 
best_e = fabsf(d - dots[0]); + int best_idx = 0; + + for (int j = 1; j < 16; j++) + { + float e = fabsf(d - dots[j]); + if (e < best_e) + { + best_e = e; + best_idx = j; + } + } + + assert((best_idx >= 0) && (best_idx <= 15)); + + pWeights[i] = (uint8_t)best_idx; + + float err = basisu::squaref(qr - cr[best_idx]) + basisu::squaref(qg - cg[best_idx]) + basisu::squaref(qb - cb[best_idx]); + total_err += err * pPixel_scales[i]; + } + } + else if ((!try_2subsets_flag) || (!check_more_colors)) + { + for (uint32_t i = 0; i < 16; i++) + { + const float qr = pFloat_pixels[i].c[0], qg = pFloat_pixels[i].c[1], qb = pFloat_pixels[i].c[2]; + + uint32_t best_idx = 0; + + float d = qr * dir_r + qg * dir_g + qb * dir_b; + + int low = 0; + + int mid = low + 7; + if (d >= mid_dots[mid]) low = mid + 1; + mid = low + 3; + if (d >= mid_dots[mid]) low = mid + 1; + mid = low + 1; + if (d >= mid_dots[mid]) low = mid + 1; + mid = low; + if (d >= mid_dots[mid]) low = mid + 1; + + best_idx = low; + assert((best_idx >= 0) && (best_idx <= 15)); + + pWeights[i] = (uint8_t)best_idx; + + // Giesen's MRSSE (Mean Relative Sum of Squared Errors). + // Our ASTC HDR encoder uses slightly slower approx. MSLE, and it's too late/risky to eval the difference vs. MRSSE on the larger ASTC HDR blocks. 
+ float err = basisu::squaref(qr - cr[best_idx]) + basisu::squaref(qg - cg[best_idx]) + basisu::squaref(qb - cb[best_idx]); + total_err += err * pPixel_scales[i]; + } + } + else + { + for (uint32_t i = 0; i < 16; i++) + { + const float qr = pFloat_pixels[i].c[0], qg = pFloat_pixels[i].c[1], qb = pFloat_pixels[i].c[2]; + + uint32_t best_idx = 0; + + float d = qr * dir_r + qg * dir_g + qb * dir_b; + + int low = 0; + + int mid = low + 7; + if (d >= mid_dots[mid]) low = mid + 1; + mid = low + 3; + if (d >= mid_dots[mid]) low = mid + 1; + mid = low + 1; + if (d >= mid_dots[mid]) low = mid + 1; + mid = low; + if (d >= mid_dots[mid]) low = mid + 1; + + best_idx = low; + assert((best_idx >= 0) && (best_idx <= 15)); + + float err = basisu::squaref(qr - cr[best_idx]) + basisu::squaref(qg - cg[best_idx]) + basisu::squaref(qb - cb[best_idx]); + + { + int alt_idx = best_idx + 1; + if (alt_idx > 15) + alt_idx = 13; + + float alt_err = basisu::squaref(qr - cr[alt_idx]) + basisu::squaref(qg - cg[alt_idx]) + basisu::squaref(qb - cb[alt_idx]); + if (alt_err < err) + { + err = alt_err; + best_idx = alt_idx; + } + } + + { + int alt_idx2 = best_idx - 1; + if (alt_idx2 < 0) + alt_idx2 = 2; + float alt_err2 = basisu::squaref(qr - cr[alt_idx2]) + basisu::squaref(qg - cg[alt_idx2]) + basisu::squaref(qb - cb[alt_idx2]); + if (alt_err2 < err) + { + err = alt_err2; + best_idx = alt_idx2; + } + } + + pWeights[i] = (uint8_t)best_idx; + + total_err += err * pPixel_scales[i]; + } + } } - if (l) + return total_err; + } + + static void assign_weights3(uint8_t trial_weights[16], + uint32_t best_pat_bits, + uint32_t subset_min_r[2], uint32_t subset_min_g[2], uint32_t subset_min_b[2], + uint32_t subset_max_r[2], uint32_t subset_max_g[2], uint32_t subset_max_b[2], + const vec3F* pFloat_pixels) + { + float subset_cr[2][8], subset_cg[2][8], subset_cb[2][8]; + + for (uint32_t subset = 0; subset < 2; subset++) { - memcpy(value_data.data(), pSrc, l); - pSrc += l; - src_left -= l; - } + const uint32_t min_r 
= subset_min_r[subset], min_g = subset_min_g[subset], min_b = subset_min_b[subset]; + const uint32_t max_r = subset_max_r[subset], max_g = subset_max_g[subset], max_b = subset_max_b[subset]; + + for (uint32_t j = 0; j < 8; j++) + { + const uint32_t w = g_bc7_weights3[j]; + + subset_cr[subset][j] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_r * (64 - w) + max_r * w + 32) >> 6)); + subset_cg[subset][j] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_g * (64 - w) + max_g * w + 32) >> 6)); + subset_cb[subset][j] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_b * (64 - w) + max_b * w + 32) >> 6)); + } // j + + } // subset + + // TODO: Plane optimization? + + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset = (best_pat_bits >> i) & 1; + const float qr = pFloat_pixels[i].c[0], qg = pFloat_pixels[i].c[1], qb = pFloat_pixels[i].c[2]; + + float best_error = basisu::squaref(subset_cr[subset][0] - qr) + basisu::squaref(subset_cg[subset][0] - qg) + basisu::squaref(subset_cb[subset][0] - qb); + uint32_t best_idx = 0; + + for (uint32_t j = 1; j < 8; j++) + { + float e = basisu::squaref(subset_cr[subset][j] - qr) + basisu::squaref(subset_cg[subset][j] - qg) + basisu::squaref(subset_cb[subset][j] - qb); + if (e < best_error) + { + best_error = e; + best_idx = j; + } + } + + trial_weights[i] = (uint8_t)best_idx; + + } // i + } + + static double assign_weights_error_3(uint8_t trial_weights[16], + uint32_t best_pat_bits, + uint32_t subset_min_r[2], uint32_t subset_min_g[2], uint32_t subset_min_b[2], + uint32_t subset_max_r[2], uint32_t subset_max_g[2], uint32_t subset_max_b[2], + const vec3F* pFloat_pixels, const float* pPixel_scales) + { + float subset_cr[2][8], subset_cg[2][8], subset_cb[2][8]; + + for (uint32_t subset = 0; subset < 2; subset++) + { + const uint32_t min_r = subset_min_r[subset], min_g = subset_min_g[subset], min_b = subset_min_b[subset]; + const uint32_t max_r = subset_max_r[subset], max_g = 
subset_max_g[subset], max_b = subset_max_b[subset]; + + for (uint32_t j = 0; j < 8; j++) + { + const uint32_t w = g_bc7_weights3[j]; + + subset_cr[subset][j] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_r * (64 - w) + max_r * w + 32) >> 6)); + subset_cg[subset][j] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_g * (64 - w) + max_g * w + 32) >> 6)); + subset_cb[subset][j] = fast_half_to_float_pos_not_inf_or_nan((basist::half_float)((min_b * (64 - w) + max_b * w + 32) >> 6)); + } // j + + } // subset + + double trial_error = 0.0f; + + // TODO: Plane optimization? + + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset = (best_pat_bits >> i) & 1; + const float qr = pFloat_pixels[i].c[0], qg = pFloat_pixels[i].c[1], qb = pFloat_pixels[i].c[2]; + + float best_error = basisu::squaref(subset_cr[subset][0] - qr) + basisu::squaref(subset_cg[subset][0] - qg) + basisu::squaref(subset_cb[subset][0] - qb); + uint32_t best_idx = 0; + + for (uint32_t j = 1; j < 8; j++) + { + float e = basisu::squaref(subset_cr[subset][j] - qr) + basisu::squaref(subset_cg[subset][j] - qg) + basisu::squaref(subset_cb[subset][j] - qb); + if (e < best_error) + { + best_error = e; + best_idx = j; + } + } - uint32_t ofs = (uint32_t)(pSrc - m_pData) & 3; - uint32_t alignment_bytes = (4 - ofs) & 3; + trial_weights[i] = (uint8_t)best_idx; - if (src_left < alignment_bytes) - { - BASISU_DEVEL_ERROR("ktx2_transcoder::read_key_values: Failed reading key value fields (3)\n"); - return false; - } + trial_error += best_error * pPixel_scales[i]; - pSrc += alignment_bytes; - src_left -= alignment_bytes; + } // i + + return trial_error; } - return true; - } - -#endif // BASISD_SUPPORT_KTX2 + static basist::vec4F g_bc6h_ls_weights_3[8]; + static basist::vec4F g_bc6h_ls_weights_4[16]; + + const uint32_t BC6H_NUM_PATS = 32; + static uint32_t g_bc6h_pats2[BC6H_NUM_PATS]; - bool basisu_transcoder_supports_ktx2() - { -#if BASISD_SUPPORT_KTX2 - return true; -#else - return 
false; -#endif - } + static void fast_encode_bc6h_init() + { + for (uint32_t i = 0; i < 8; i++) + { + const float w = (float)g_bc7_weights3[i] * (1.0f / 64.0f); + g_bc6h_ls_weights_3[i].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w); + } - bool basisu_transcoder_supports_ktx2_zstd() - { -#if BASISD_SUPPORT_KTX2_ZSTD - return true; -#else - return false; -#endif - } + for (uint32_t i = 0; i < 16; i++) + { + const float w = (float)g_bc6h_weights4[i] * (1.0f / 64.0f); + g_bc6h_ls_weights_4[i].set(w * w, (1.0f - w) * w, (1.0f - w) * (1.0f - w), w); + } - //------------------------------- + for (uint32_t pat_index = 0; pat_index < BC6H_NUM_PATS; pat_index++) + { + uint32_t pat_bits = 0; -#ifdef BASISD_SUPPORT_UASTC_HDR - // This float->half conversion matches how "F32TO16" works on Intel GPU's. - basist::half_float float_to_half(float val) - { - union { float f; int32_t i; uint32_t u; } fi = { val }; - const int flt_m = fi.i & 0x7FFFFF, flt_e = (fi.i >> 23) & 0xFF, flt_s = (fi.i >> 31) & 0x1; - int s = flt_s, e = 0, m = 0; + for (uint32_t j = 0; j < 16; j++) + pat_bits |= (g_bc7_partition2[pat_index * 16 + j] << j); - // inf/NaN - if (flt_e == 0xff) - { - e = 31; - if (flt_m != 0) // NaN - m = 1; + g_bc6h_pats2[pat_index] = pat_bits; + } } - // not zero or denormal - else if (flt_e != 0) + + static int bc6h_dequantize(int val, int bits) { - int new_exp = flt_e - 127; - if (new_exp > 15) - e = 31; - else if (new_exp < -14) - m = lrintf((1 << 24) * fabsf(fi.f)); + assert(val < (1 << bits)); + + int result; + if (bits >= 15) + result = val; + else if (!val) + result = 0; + else if (val == ((1 << bits) - 1)) + result = 0xFFFF; else - { - e = new_exp + 15; - m = lrintf(flt_m * (1.0f / ((float)(1 << 13)))); - } + result = ((val << 16) + 0x8000) >> bits; + return result; } - assert((0 <= m) && (m <= 1024)); - if (m == 1024) + static inline basist::half_float bc6h_convert_to_half(int val) { - e++; - m = 0; + assert(val < 65536); + + // scale by 31/64 + return 
(basist::half_float)((val * 31) >> 6); } - assert((s >= 0) && (s <= 1)); - assert((e >= 0) && (e <= 31)); - assert((m >= 0) && (m <= 1023)); + static void bc6h_quant_dequant_endpoints(uint32_t& min_r, uint32_t& min_g, uint32_t& min_b, uint32_t& max_r, uint32_t& max_g, uint32_t& max_b, int bits) // bits=10 + { + min_r = bc6h_convert_to_half(bc6h_dequantize(basist::bc6h_half_to_blog((basist::half_float)min_r, bits), bits)); + min_g = bc6h_convert_to_half(bc6h_dequantize(basist::bc6h_half_to_blog((basist::half_float)min_g, bits), bits)); + min_b = bc6h_convert_to_half(bc6h_dequantize(basist::bc6h_half_to_blog((basist::half_float)min_b, bits), bits)); - basist::half_float result = (basist::half_float)((s << 15) | (e << 10) | m); - return result; - } - - //------------------------------------------------------------------------------------------------ - // HDR support - // - // Originally from bc6h_enc.cpp - // BC6H decoder fuzzed vs. DirectXTex's for unsigned/signed + max_r = bc6h_convert_to_half(bc6h_dequantize(basist::bc6h_half_to_blog((basist::half_float)max_r, bits), bits)); + max_g = bc6h_convert_to_half(bc6h_dequantize(basist::bc6h_half_to_blog((basist::half_float)max_g, bits), bits)); + max_b = bc6h_convert_to_half(bc6h_dequantize(basist::bc6h_half_to_blog((basist::half_float)max_b, bits), bits)); + } - const uint8_t g_bc6h_mode_sig_bits[NUM_BC6H_MODES][4] = // base bits, r, g, b - { - // 2 subsets - { 10, 5, 5, 5, }, // 0, mode 1 in MS/D3D docs - { 7, 6, 6, 6, }, // 1 - { 11, 5, 4, 4, }, // 2 - { 11, 4, 5, 4, }, // 3 - { 11, 4, 4, 5, }, // 4 - { 9, 5, 5, 5, }, // 5 - { 8, 6, 5, 5, }, // 6 - { 8, 5, 6, 5, }, // 7 - { 8, 5, 5, 6, }, // 8 - { 6, 6, 6, 6, }, // 9, endpoints not delta encoded, mode 10 in MS/D3D docs - // 1 subset - { 10, 10, 10, 10, }, // 10, endpoints not delta encoded, mode 11 in MS/D3D docs - { 11, 9, 9, 9, }, // 11 - { 12, 8, 8, 8, }, // 12 - { 16, 4, 4, 4, } // 13, also useful for solid blocks - }; + static void bc6h_quant_endpoints( + uint32_t 
min_hr, uint32_t min_hg, uint32_t min_hb, uint32_t max_hr, uint32_t max_hg, uint32_t max_hb, + uint32_t& min_r, uint32_t& min_g, uint32_t& min_b, uint32_t& max_r, uint32_t& max_g, uint32_t& max_b, + int bits) + { + min_r = basist::bc6h_half_to_blog((basist::half_float)min_hr, bits); + min_g = basist::bc6h_half_to_blog((basist::half_float)min_hg, bits); + min_b = basist::bc6h_half_to_blog((basist::half_float)min_hb, bits); - const int8_t g_bc6h_mode_lookup[32] = { 0, 1, 2, 10, 0, 1, 3, 11, 0, 1, 4, 12, 0, 1, 5, 13, 0, 1, 6, -1, 0, 1, 7, -1, 0, 1, 8, -1, 0, 1, 9, -1 }; + max_r = basist::bc6h_half_to_blog((basist::half_float)max_hr, bits); + max_g = basist::bc6h_half_to_blog((basist::half_float)max_hg, bits); + max_b = basist::bc6h_half_to_blog((basist::half_float)max_hb, bits); + } - const bc6h_bit_layout g_bc6h_bit_layouts[NUM_BC6H_MODES][MAX_BC6H_LAYOUT_INDEX] = - { - // comp_index, subset*2+lh_index, last_bit, first_bit - //------------------------ mode 0: 2 subsets, Weight bits: 46 bits, Endpoint bits: 75 bits (10.555, 10.555, 10.555), delta - { { 1, 2, 4, -1 }, { 2, 2, 4, -1 }, { 2, 3, 4, -1 }, { 0, 0, 9, 0 }, { 1, 0, 9, 0 }, { 2, 0, 9, 0 }, { 0, 1, 4, 0 }, - { 1, 3, 4, -1 }, { 1, 2, 3, 0 }, { 1, 1, 4, 0 }, { 2, 3, 0, -1 }, { 1, 3, 3, 0 }, { 2, 1, 4, 0 }, { 2, 3, 1, -1 }, - { 2, 2, 3, 0 }, { 0, 2, 4, 0 }, { 2, 3, 2, -1 }, { 0, 3, 4, 0 }, { 2, 3, 3, -1 }, { 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 1: 2 subsets, Weight bits: 46 bits, Endpoint bits: 75 bits (7.666, 7.666, 7.666), delta - { { 1, 2, 5, -1 },{ 1, 3, 4, -1 },{ 1, 3, 5, -1 },{ 0, 0, 6, 0 },{ 2, 3, 0, -1 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 }, - { 1, 0, 6, 0 },{ 2, 2, 5, -1 },{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 6, 0 },{ 2, 3, 3, -1 },{ 2, 3, 5, -1 }, - { 2, 3, 4, -1 },{ 0, 1, 5, 0 },{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },{ 2, 2, 3, 0 },{ 0, 2, 5, 0 }, - { 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 2: 2 subsets, Weight 
bits: 46 bits, Endpoint bits: 72 bits (11.555, 11.444, 11.444), delta - { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 4, 0 },{ 0, 0, 10, -1 },{ 1, 2, 3, 0 },{ 1, 1, 3, 0 },{ 1, 0, 10, -1 }, - { 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 3, 0 },{ 2, 0, 10, -1 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 }, - { 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 3: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.444, 11.555, 11.444), delta - { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, -1 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 }, - { 1, 0, 10, -1 },{ 1, 3, 3, 0 },{ 2, 1, 3, 0 },{ 2, 0, 10, -1 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 3, 0 },{ 2, 3, 0, -1 }, - { 2, 3, 2, -1 },{ 0, 3, 3, 0 },{ 1, 2, 4, -1 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 4: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (11.444, 11.444, 11.555), delta - { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, -1 },{ 2, 2, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 3, 0 }, - { 1, 0, 10, -1 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 0, 10, -1 },{ 2, 2, 3, 0 },{ 0, 2, 3, 0 },{ 2, 3, 1, -1 }, - { 2, 3, 2, -1 },{ 0, 3, 3, 0 },{ 2, 3, 4, -1 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 5: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (9.555, 9.555, 9.555), delta - { { 0, 0, 8, 0 },{ 2, 2, 4, -1 },{ 1, 0, 8, 0 },{ 1, 2, 4, -1 },{ 2, 0, 8, 0 },{ 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 }, - { 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 },{ 2, 2, 3, 0 },{ 0, 2, 4, 0 }, - { 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 6: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.666, 8.555, 8.555), delta - { { 0, 0, 7, 0 },{ 1, 3, 4, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 
0 },{ 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 2, 3, 3, -1 }, - { 2, 3, 4, -1 },{ 0, 1, 5, 0 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 }, - { 2, 2, 3, 0 },{ 0, 2, 5, 0 },{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 7: 2 subsets, Weight bits: 46 bits, Endpoints bits: 72 bits (8.555, 8.666, 8.555), delta - { { 0, 0, 7, 0 },{ 2, 3, 0, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 1, 2, 5, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 1, 3, 5, -1 }, - { 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 4, 0 },{ 2, 3, 1, -1 }, - { 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 8: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (8.555, 8.555, 8.666), delta - { { 0, 0, 7, 0 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },{ 1, 0, 7, 0 },{ 2, 2, 5, -1 },{ 1, 2, 4, -1 },{ 2, 0, 7, 0 },{ 2, 3, 5, -1 }, - { 2, 3, 4, -1 },{ 0, 1, 4, 0 },{ 1, 3, 4, -1 },{ 1, 2, 3, 0 },{ 1, 1, 4, 0 },{ 2, 3, 0, -1 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 }, - { 2, 2, 3, 0 },{ 0, 2, 4, 0 },{ 2, 3, 2, -1 },{ 0, 3, 4, 0 },{ 2, 3, 3, -1 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 9: 2 subsets, Weight bits: 46 bits, Endpoint bits: 72 bits (6.6.6.6, 6.6.6.6, 6.6.6.6), NO delta - { { 0, 0, 5, 0 },{ 1, 3, 4, -1 },{ 2, 3, 0, -1 },{ 2, 3, 1, -1 },{ 2, 2, 4, -1 },{ 1, 0, 5, 0 },{ 1, 2, 5, -1 },{ 2, 2, 5, -1 }, - { 2, 3, 2, -1 },{ 1, 2, 4, -1 },{ 2, 0, 5, 0 },{ 1, 3, 5, -1 },{ 2, 3, 3, -1 },{ 2, 3, 5, -1 },{ 2, 3, 4, -1 },{ 0, 1, 5, 0 }, - { 1, 2, 3, 0 },{ 1, 1, 5, 0 },{ 1, 3, 3, 0 },{ 2, 1, 5, 0 },{ 2, 2, 3, 0 },{ 0, 2, 5, 0 },{ 0, 3, 5, 0 },{ 3, -1, 4, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 10: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (10.10, 10.10, 10.10), NO delta - { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 9, 0 },{ 1, 1, 9, 0 },{ 2, 
1, 9, 0 }, {-1, 0, 0, 0} }, - //------------------------ mode 11: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (11.9, 11.9, 11.9), delta - { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 8, 0 },{ 0, 0, 10, -1 },{ 1, 1, 8, 0 },{ 1, 0, 10, -1 },{ 2, 1, 8, 0 },{ 2, 0, 10, -1 }, {-1, 0, 0, 0} }, - //------------------------ mode 12: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (12.8, 12.8, 12.8), delta - { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 7, 0 },{ 0, 0, 10, 11 },{ 1, 1, 7, 0 },{ 1, 0, 10, 11 },{ 2, 1, 7, 0 },{ 2, 0, 10, 11 }, {-1, 0, 0, 0} }, - //------------------------ mode 13: 1 subset, Weight bits: 63 bits, Endpoint bits: 60 bits (16.4, 16.4, 16.4), delta - { { 0, 0, 9, 0 },{ 1, 0, 9, 0 },{ 2, 0, 9, 0 },{ 0, 1, 3, 0 },{ 0, 0, 10, 15 },{ 1, 1, 3, 0 },{ 1, 0, 10, 15 },{ 2, 1, 3, 0 },{ 2, 0, 10, 15 }, {-1, 0, 0, 0} } - }; + static void bc6h_dequant_endpoints( + uint32_t min_br, uint32_t min_bg, uint32_t min_bb, uint32_t max_br, uint32_t max_bg, uint32_t max_bb, + uint32_t& min_hr, uint32_t& min_hg, uint32_t& min_hb, uint32_t& max_hr, uint32_t& max_hg, uint32_t& max_hb, + int bits) + { + min_hr = bc6h_convert_to_half(bc6h_dequantize(min_br, bits)); + min_hg = bc6h_convert_to_half(bc6h_dequantize(min_bg, bits)); + min_hb = bc6h_convert_to_half(bc6h_dequantize(min_bb, bits)); - // The same as the first 32 2-subset patterns in BC7. - // Bit 7 is a flag indicating that the weight uses 1 less bit than usual. 
- const uint8_t g_bc6h_2subset_patterns[TOTAL_BC6H_PARTITION_PATTERNS][4][4] = // [pat][y][x] - { - { {0x80, 0, 1, 1}, { 0, 0, 1, 1 }, { 0, 0, 1, 1 }, { 0, 0, 1, 0x81 }}, { {0x80, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0x81} }, - { {0x80, 1, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 0x81} }, { {0x80, 0, 0, 1}, {0, 0, 1, 1}, {0, 0, 1, 1}, {0, 1, 1, 0x81} }, - { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 1, 0x81} }, { {0x80, 0, 1, 1}, {0, 1, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, - { {0x80, 0, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 1, 1}, {0, 1, 1, 0x81} }, - { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 0, 1, 0x81} }, { {0x80, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, - { {0x80, 0, 0, 0}, {0, 0, 0, 1}, {0, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 1}, {0, 1, 1, 0x81} }, - { {0x80, 0, 0, 1}, {0, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, - { {0x80, 0, 0, 0}, {1, 1, 1, 1}, {1, 1, 1, 1}, {1, 1, 1, 0x81} }, { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}, {1, 1, 1, 0x81} }, - { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {1, 1, 1, 0}, {1, 1, 1, 0x81} }, { {0x80, 1, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}, {0, 0, 0, 0} }, - { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 1, 0} }, { {0x80, 1, 0x81, 1}, {0, 0, 1, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} }, - { {0x80, 0, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 0}, {0, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {0x81, 1, 0, 0}, {1, 1, 1, 0} }, - { {0x80, 0, 0, 0}, {0, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 0, 0} }, { {0x80, 1, 1, 1}, {0, 0, 1, 1}, { 0, 0, 1, 1}, {0, 0, 0, 0x81} }, - { {0x80, 0, 0x81, 1}, {0, 0, 0, 1}, {0, 0, 0, 1}, {0, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 0, 0, 0}, {0x81, 0, 0, 0}, {1, 1, 0, 0} }, - { {0x80, 1, 0x81, 0}, {0, 1, 1, 0}, {0, 1, 1, 0}, {0, 1, 1, 0} }, { {0x80, 0, 0x81, 1}, {0, 1, 1, 0}, {0, 1, 1, 0}, {1, 1, 0, 0} }, - { {0x80, 0, 0, 
1}, {0, 1, 1, 1}, {0x81, 1, 1, 0}, {1, 0, 0, 0} }, { {0x80, 0, 0, 0}, {1, 1, 1, 1}, {0x81, 1, 1, 1}, {0, 0, 0, 0} }, - { {0x80, 1, 0x81, 1}, {0, 0, 0, 1}, {1, 0, 0, 0}, {1, 1, 1, 0} }, { {0x80, 0, 0x81, 1}, {1, 0, 0, 1}, {1, 0, 0, 1}, {1, 1, 0, 0} } - }; + max_hr = bc6h_convert_to_half(bc6h_dequantize(max_br, bits)); + max_hg = bc6h_convert_to_half(bc6h_dequantize(max_bg, bits)); + max_hb = bc6h_convert_to_half(bc6h_dequantize(max_bb, bits)); + } - const uint8_t g_bc6h_weight3[8] = { 0, 9, 18, 27, 37, 46, 55, 64 }; - const uint8_t g_bc6h_weight4[16] = { 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64 }; - - struct bc6h_logical_block - { - uint32_t m_mode; - uint32_t m_partition_pattern; // must be 0 if 1 subset - uint32_t m_endpoints[3][4]; // [comp][subset*2+lh_index] - must be already properly packed - uint8_t m_weights[16]; // weights must be of the proper size, taking into account skipped MSB's which must be 0 + static BASISU_FORCE_INLINE int popcount32(uint32_t x) + { +#if defined(__EMSCRIPTEN__) || defined(__clang__) || defined(__GNUC__) + return __builtin_popcount(x); +#elif defined(_MSC_VER) + return __popcnt(x); +#else + int count = 0; + while (x) + { + x &= (x - 1); + ++count; + } + return count; +#endif + } - void clear() + static BASISU_FORCE_INLINE int fast_roundf_int(float x) { - basisu::clear_obj(*this); + return (x >= 0.0f) ? 
(int)(x + 0.5f) : (int)(x - 0.5f); } - }; + + static void fast_encode_bc6h_2subsets_pattern( + uint32_t best_pat_index, uint32_t best_pat_bits, + const basist::half_float* pPixels, const vec3F* pFloat_pixels, const float* pPixel_scales, + double& cur_error, basist::bc6h_logical_block& log_blk, + int64_t block_max_var, + int mean_r, int mean_g, int mean_b, + const fast_bc6h_params& params) + { + BASISU_NOTE_UNUSED(block_max_var); + + uint32_t subset_means[2][3] = { { 0 } }; + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset_index = (best_pat_bits >> i) & 1; + const uint32_t r = pPixels[i * 3 + 0], g = pPixels[i * 3 + 1], b = pPixels[i * 3 + 2]; + + subset_means[subset_index][0] += r; + subset_means[subset_index][1] += g; + subset_means[subset_index][2] += b; + } - static inline void write_bits(uint64_t val, uint32_t num_bits, uint32_t& bit_pos, uint64_t& l, uint64_t& h) - { - assert((num_bits) && (num_bits < 64) && (bit_pos < 128)); - assert(val < (1ULL << num_bits)); + for (uint32_t s = 0; s < 2; s++) + for (uint32_t c = 0; c < 3; c++) + subset_means[s][c] = (subset_means[s][c] + 8) / 16; - if (bit_pos < 64) - { - l |= (val << bit_pos); + int64_t subset_icov[2][6] = { { 0 } }; - if ((bit_pos + num_bits) > 64) - h |= (val >> (64 - bit_pos)); - } - else - { - h |= (val << (bit_pos - 64)); - } + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset_index = (best_pat_bits >> i) & 1; + const int r = (int)pPixels[i * 3 + 0] - mean_r, g = (int)pPixels[i * 3 + 1] - mean_g, b = (int)pPixels[i * 3 + 2] - mean_b; - bit_pos += num_bits; - assert(bit_pos <= 128); - } + subset_icov[subset_index][0] += r * r; + subset_icov[subset_index][1] += r * g; + subset_icov[subset_index][2] += r * b; + subset_icov[subset_index][3] += g * g; + subset_icov[subset_index][4] += g * b; + subset_icov[subset_index][5] += b * b; + } - static inline void write_rev_bits(uint64_t val, uint32_t num_bits, uint32_t& bit_pos, uint64_t& l, uint64_t& h) - { - assert((num_bits) && 
(num_bits < 64) && (bit_pos < 128)); - assert(val < (1ULL << num_bits)); + vec3F subset_axis[2]; - for (uint32_t i = 0; i < num_bits; i++) - write_bits((val >> (num_bits - 1u - i)) & 1, 1, bit_pos, l, h); - } + for (uint32_t subset_index = 0; subset_index < 2; subset_index++) + { + float cov[6]; + for (uint32_t i = 0; i < 6; i++) + cov[i] = (float)subset_icov[subset_index][i]; - static void pack_bc6h_block(bc6h_block& dst_blk, bc6h_logical_block& log_blk) - { - const uint8_t s_mode_bits[NUM_BC6H_MODES] = { 0b00, 0b01, 0b00010, 0b00110, 0b01010, 0b01110, 0b10010, 0b10110, 0b11010, 0b11110, 0b00011, 0b00111, 0b01011, 0b01111 }; + const float sc = 1.0f / (basisu::maximum(cov[0], cov[3], cov[5]) + basisu::REALLY_SMALL_FLOAT_VAL); + const float wx = sc * cov[0], wy = sc * cov[3], wz = sc * cov[5]; - const uint32_t mode = log_blk.m_mode; - assert(mode < NUM_BC6H_MODES); + const float alt_xr = cov[0] * wx + cov[1] * wy + cov[2] * wz; + const float alt_xg = cov[1] * wx + cov[3] * wy + cov[4] * wz; + const float alt_xb = cov[2] * wx + cov[4] * wy + cov[5] * wz; - uint64_t l = s_mode_bits[mode], h = 0; - uint32_t bit_pos = (mode >= 2) ? 5 : 2; + float l = basisu::squaref(alt_xr) + basisu::squaref(alt_xg) + basisu::squaref(alt_xb); - const uint32_t num_subsets = (mode >= BC6H_FIRST_1SUBSET_MODE_INDEX) ? 
1 : 2; + float axis_r = 0.57735027f, axis_g = 0.57735027f, axis_b = 0.57735027f; + if (fabs(l) >= basisu::SMALL_FLOAT_VAL) + { + const float inv_l = inv_sqrt(l); + axis_r = alt_xr * inv_l; + axis_g = alt_xg * inv_l; + axis_b = alt_xb * inv_l; + } - assert(((num_subsets == 2) && (log_blk.m_partition_pattern < TOTAL_BC6H_PARTITION_PATTERNS)) || - ((num_subsets == 1) && (!log_blk.m_partition_pattern))); + subset_axis[subset_index].set(axis_r, axis_g, axis_b); + } // s + + float subset_min_dot[2] = { basisu::BIG_FLOAT_VAL, basisu::BIG_FLOAT_VAL }; + float subset_max_dot[2] = { -basisu::BIG_FLOAT_VAL, -basisu::BIG_FLOAT_VAL }; + int subset_min_idx[2] = { 0 }, subset_max_idx[2] = { 0 }; - // Sanity checks - for (uint32_t c = 0; c < 3; c++) - { - assert(log_blk.m_endpoints[c][0] < (1u << g_bc6h_mode_sig_bits[mode][0])); // 1st subset l, base bits - assert(log_blk.m_endpoints[c][1] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 1st subset h, these are deltas except for modes 9,10 - assert(log_blk.m_endpoints[c][2] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 2nd subset l - assert(log_blk.m_endpoints[c][3] < (1u << g_bc6h_mode_sig_bits[mode][c + 1])); // 2nd subset h - } + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset_index = (best_pat_bits >> i) & 1; + const float r = (float)pPixels[i * 3 + 0], g = (float)pPixels[i * 3 + 1], b = (float)pPixels[i * 3 + 2]; + const float dot = r * subset_axis[subset_index].c[0] + g * subset_axis[subset_index].c[1] + b * subset_axis[subset_index].c[2]; + + if (dot < subset_min_dot[subset_index]) + { + subset_min_dot[subset_index] = dot; + subset_min_idx[subset_index] = i; + } - const bc6h_bit_layout* pLayout = &g_bc6h_bit_layouts[mode][0]; + if (dot > subset_max_dot[subset_index]) + { + subset_max_dot[subset_index] = dot; + subset_max_idx[subset_index] = i; + } + } // i - while (pLayout->m_comp != -1) - { - uint32_t v = (pLayout->m_comp == 3) ? 
log_blk.m_partition_pattern : log_blk.m_endpoints[pLayout->m_comp][pLayout->m_index]; + uint32_t subset_min_r[2], subset_min_g[2], subset_min_b[2]; + uint32_t subset_max_r[2], subset_max_g[2], subset_max_b[2]; - if (pLayout->m_first_bit == -1) - { - write_bits((v >> pLayout->m_last_bit) & 1, 1, bit_pos, l, h); - } - else + for (uint32_t subset_index = 0; subset_index < 2; subset_index++) { - const uint32_t total_bits = basisu::iabs(pLayout->m_last_bit - pLayout->m_first_bit) + 1; + const uint32_t min_index = subset_min_idx[subset_index] * 3, max_index = subset_max_idx[subset_index] * 3; - v >>= basisu::minimum(pLayout->m_first_bit, pLayout->m_last_bit); - v &= ((1 << total_bits) - 1); + subset_min_r[subset_index] = pPixels[min_index + 0]; + subset_min_g[subset_index] = pPixels[min_index + 1]; + subset_min_b[subset_index] = pPixels[min_index + 2]; - if (pLayout->m_first_bit > pLayout->m_last_bit) - write_rev_bits(v, total_bits, bit_pos, l, h); - else - write_bits(v, total_bits, bit_pos, l, h); - } + subset_max_r[subset_index] = pPixels[max_index + 0]; + subset_max_g[subset_index] = pPixels[max_index + 1]; + subset_max_b[subset_index] = pPixels[max_index + 2]; - pLayout++; - } + } // subset_index - const uint32_t num_mode_sel_bits = (num_subsets == 1) ? 
4 : 3; - const uint8_t* pPat = &g_bc6h_2subset_patterns[log_blk.m_partition_pattern][0][0]; + // least squares with unquantized endpoints + const bool use_ls = true; + if (use_ls) + { + uint8_t trial_weights[16]; + assign_weights3(trial_weights, best_pat_bits, subset_min_r, subset_min_g, subset_min_b, subset_max_r, subset_max_g, subset_max_b, pFloat_pixels); - for (uint32_t i = 0; i < 16; i++) - { - const uint32_t sel = log_blk.m_weights[i]; + float z00[2] = { 0.0f }, z01[2] = { 0.0f }, z10[2] = { 0.0f }, z11[2] = { 0.0f }; + float q00_r[2] = { 0.0f }, q10_r[2] = { 0.0f }, t_r[2] = { 0.0f }; + float q00_g[2] = { 0.0f }, q10_g[2] = { 0.0f }, t_g[2] = { 0.0f }; + float q00_b[2] = { 0.0f }, q10_b[2] = { 0.0f }, t_b[2] = { 0.0f }; - uint32_t num_bits = num_mode_sel_bits; - if (num_subsets == 2) - { - const uint32_t subset_index = pPat[i]; - num_bits -= (subset_index >> 7); - } - else if (!i) - { - num_bits--; - } + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset = (best_pat_bits >> i) & 1; - assert(sel < (1u << num_bits)); + float r = (float)pPixels[i * 3 + 0]; + float g = (float)pPixels[i * 3 + 1]; + float b = (float)pPixels[i * 3 + 2]; - write_bits(sel, num_bits, bit_pos, l, h); - } + const uint32_t sel = trial_weights[i]; - assert(bit_pos == 128); + z00[subset] += g_bc6h_ls_weights_3[sel][0]; + z10[subset] += g_bc6h_ls_weights_3[sel][1]; + z11[subset] += g_bc6h_ls_weights_3[sel][2]; - basisu::write_le_dword(&dst_blk.m_bytes[0], (uint32_t)l); - basisu::write_le_dword(&dst_blk.m_bytes[4], (uint32_t)(l >> 32u)); - basisu::write_le_dword(&dst_blk.m_bytes[8], (uint32_t)h); - basisu::write_le_dword(&dst_blk.m_bytes[12], (uint32_t)(h >> 32u)); - } + float w = g_bc6h_ls_weights_3[sel][3]; -#if 0 - static inline uint32_t bc6h_blog_dequantize_to_blog16(uint32_t comp, uint32_t bits_per_comp) - { - int unq; + q00_r[subset] += w * r; + t_r[subset] += r; - if (bits_per_comp >= 15) - unq = comp; - else if (comp == 0) - unq = 0; - else if (comp == ((1u << 
bits_per_comp) - 1u)) - unq = 0xFFFFu; - else - unq = ((comp << 16u) + 0x8000u) >> bits_per_comp; + q00_g[subset] += w * g; + t_g[subset] += g; - return unq; - } -#endif + q00_b[subset] += w * b; + t_b[subset] += b; + } - // Suboptimal, but very close. - static inline uint32_t bc6h_half_to_blog(half_float h, uint32_t num_bits) - { - assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT); - return (h * 64 + 30) / (31 * (1 << (16 - num_bits))); - } + for (uint32_t subset = 0; subset < 2; subset++) + { + q10_r[subset] = t_r[subset] - q00_r[subset]; + q10_g[subset] = t_g[subset] - q00_g[subset]; + q10_b[subset] = t_b[subset] - q00_b[subset]; - // 6,7,8,9,10,11,12 - const uint32_t BC6H_BLOG_TAB_MIN = 6; - const uint32_t BC6H_BLOG_TAB_MAX = 12; - //const uint32_t BC6H_BLOG_TAB_NUM = BC6H_BLOG_TAB_MAX - BC6H_BLOG_TAB_MIN + 1; - - // Handles 16, or 6-12 bits. Others assert. - static inline uint32_t half_to_blog_tab(half_float h, uint32_t num_bits) - { - BASISU_NOTE_UNUSED(BC6H_BLOG_TAB_MIN); - BASISU_NOTE_UNUSED(BC6H_BLOG_TAB_MAX); + z01[subset] = z10[subset]; - assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT); + float det = z00[subset] * z11[subset] - z01[subset] * z10[subset]; + if (fabs(det) >= basisu::SMALL_FLOAT_VAL) + { + det = 1.0f / det; - if (num_bits == 16) - { - return bc6h_half_to_blog(h, 16); - } - else - { - assert((num_bits >= BC6H_BLOG_TAB_MIN) && (num_bits <= BC6H_BLOG_TAB_MAX)); - - // Note: This used to be done using a table lookup, but it required ~224KB of tables. This isn't quite as accurate, but the error is very slight (+-1 half values as ints). 
- return bc6h_half_to_blog(h, num_bits); - } - } + float iz00 = z11[subset] * det; + float iz01 = -z01[subset] * det; + float iz10 = -z10[subset] * det; + float iz11 = z00[subset] * det; - bool g_bc6h_enc_initialized; + subset_max_r[subset] = basisu::clamp(fast_roundf_int(iz00 * q00_r[subset] + iz01 * q10_r[subset]), 0, (int)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); + subset_min_r[subset] = basisu::clamp(fast_roundf_int(iz10 * q00_r[subset] + iz11 * q10_r[subset]), 0, (int)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); - void bc6h_enc_init() - { - if (g_bc6h_enc_initialized) - return; + subset_max_g[subset] = basisu::clamp(fast_roundf_int(iz00 * q00_g[subset] + iz01 * q10_g[subset]), 0, (int)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); + subset_min_g[subset] = basisu::clamp(fast_roundf_int(iz10 * q00_g[subset] + iz11 * q10_g[subset]), 0, (int)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); - g_bc6h_enc_initialized = true; - } + subset_max_b[subset] = basisu::clamp(fast_roundf_int(iz00 * q00_b[subset] + iz01 * q10_b[subset]), 0, (int)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); + subset_min_b[subset] = basisu::clamp(fast_roundf_int(iz10 * q00_b[subset] + iz11 * q10_b[subset]), 0, (int)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); + } + } // subset + } - // mode 10, 4-bit weights - void bc6h_enc_block_mode10(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) - { - assert(g_bc6h_enc_initialized); + const int BC6H_2SUBSET_ABS_ENDPOINT_MODE = 9; - for (uint32_t i = 0; i < 16; i++) - { - assert(pWeights[i] <= 15); - } + int bc6h_mode_index = BC6H_2SUBSET_ABS_ENDPOINT_MODE, num_endpoint_bits = 6; + uint32_t abs_blog_endpoints[3][4]; - bc6h_logical_block log_blk; - log_blk.clear(); + if (params.m_num_diff_endpoint_modes_to_try) + { + // ordered from largest base bits to least + static const int s_bc6h_mode_order2[2] = { 5, 1 }; + static const int s_bc6h_mode_order4[4] = { 0, 5, 7, 1 }; + static const int s_bc6h_mode_order9[9] = { 2, 3, 4, 0, 5, 6, 7, 8, 1 }; - // Convert half 
endpoints to blog10 (mode 10 doesn't use delta encoding) - for (uint32_t c = 0; c < 3; c++) - { - log_blk.m_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], 10); - log_blk.m_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], 10); - } + uint32_t num_endpoint_modes = 2; + const int* pBC6H_mode_order = s_bc6h_mode_order2; - memcpy(log_blk.m_weights, pWeights, 16); + if (params.m_num_diff_endpoint_modes_to_try >= 9) + { + num_endpoint_modes = 9; + pBC6H_mode_order = s_bc6h_mode_order9; + } + else if (params.m_num_diff_endpoint_modes_to_try >= 4) + { + num_endpoint_modes = 4; + pBC6H_mode_order = s_bc6h_mode_order4; + } - if (log_blk.m_weights[0] & 8) - { - for (uint32_t i = 0; i < 16; i++) - log_blk.m_weights[i] = 15 - log_blk.m_weights[i]; + // Find the BC6H mode that will conservatively encode our trial endpoints. The mode chosen will handle any endpoint swaps. + for (uint32_t bc6h_mode_iter = 0; bc6h_mode_iter < num_endpoint_modes; bc6h_mode_iter++) + { + const uint32_t mode = pBC6H_mode_order[bc6h_mode_iter]; - for (uint32_t c = 0; c < 3; c++) - { - std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); - } - } + const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0]; + const int base_bitmask = (1 << num_base_bits) - 1; + BASISU_NOTE_UNUSED(base_bitmask); - log_blk.m_mode = BC6H_FIRST_1SUBSET_MODE_INDEX; - pack_bc6h_block(*pPacked_block, log_blk); - } + const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] }; + const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 }; - // Tries modes 11-13 (delta endpoint) encoding, falling back to mode 10 only when necessary, 4-bit weights - void bc6h_enc_block_1subset_4bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) - { - assert(g_bc6h_enc_initialized); + for (uint32_t subset_index = 0; subset_index < 2; 
subset_index++) + { + bc6h_quant_endpoints( + subset_min_r[subset_index], subset_min_g[subset_index], subset_min_b[subset_index], subset_max_r[subset_index], subset_max_g[subset_index], subset_max_b[subset_index], + abs_blog_endpoints[0][subset_index * 2 + 0], abs_blog_endpoints[1][subset_index * 2 + 0], abs_blog_endpoints[2][subset_index * 2 + 0], + abs_blog_endpoints[0][subset_index * 2 + 1], abs_blog_endpoints[1][subset_index * 2 + 1], abs_blog_endpoints[2][subset_index * 2 + 1], + num_base_bits); + } - for (uint32_t i = 0; i < 16; i++) - { - assert(pWeights[i] <= 15); - } + uint32_t c; + for (c = 0; c < 3; c++) + { + // a very conservative check because we don't have the weight indices yet, so we don't know how to swap end point values + // purposely enforcing a symmetric limit here so we can invert any endpoints later if needed + const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1; + const int min_delta = -max_delta; - bc6h_logical_block log_blk; - log_blk.clear(); + int delta0 = (int)abs_blog_endpoints[c][1] - (int)abs_blog_endpoints[c][0]; + if ((delta0 < min_delta) || (delta0 > max_delta)) + break; - for (uint32_t mode = BC6H_LAST_MODE_INDEX; mode > BC6H_FIRST_1SUBSET_MODE_INDEX; mode--) - { - const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0], num_delta_bits = g_bc6h_mode_sig_bits[mode][1]; - const int base_bitmask = (1 << num_base_bits) - 1; - const int delta_bitmask = (1 << num_delta_bits) - 1; - BASISU_NOTE_UNUSED(base_bitmask); + int delta1 = (int)abs_blog_endpoints[c][2] - (int)abs_blog_endpoints[c][0]; + if ((delta1 < min_delta) || (delta1 > max_delta)) + break; - assert(num_delta_bits < num_base_bits); - assert((num_delta_bits == g_bc6h_mode_sig_bits[mode][2]) && (num_delta_bits == g_bc6h_mode_sig_bits[mode][3])); + int delta2 = (int)abs_blog_endpoints[c][3] - (int)abs_blog_endpoints[c][0]; + if ((delta2 < min_delta) || (delta2 > max_delta)) + break; - uint32_t blog_endpoints[3][2]; + // in case the endpoints are swapped + int delta3 
= (int)abs_blog_endpoints[c][2] - (int)abs_blog_endpoints[c][1]; + if ((delta3 < min_delta) || (delta3 > max_delta)) + break; - // Convert half endpoints to blog 16, 12, or 11 - for (uint32_t c = 0; c < 3; c++) - { - blog_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], num_base_bits); - assert((int)blog_endpoints[c][0] <= base_bitmask); + int delta4 = (int)abs_blog_endpoints[c][3] - (int)abs_blog_endpoints[c][1]; + if ((delta4 < min_delta) || (delta4 > max_delta)) + break; + } - blog_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], num_base_bits); - assert((int)blog_endpoints[c][1] <= base_bitmask); + if (c == 3) + { + bc6h_mode_index = mode; + num_endpoint_bits = num_base_bits; + break; + } + } } - // Copy weights - memcpy(log_blk.m_weights, pWeights, 16); - - // Ensure first weight MSB is 0 - if (log_blk.m_weights[0] & 8) + if (bc6h_mode_index == BC6H_2SUBSET_ABS_ENDPOINT_MODE) { - // Invert weights - for (uint32_t i = 0; i < 16; i++) - log_blk.m_weights[i] = 15 - log_blk.m_weights[i]; - - // Swap blog quantized endpoints - for (uint32_t c = 0; c < 3; c++) + for (uint32_t subset_index = 0; subset_index < 2; subset_index++) { - std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); + bc6h_quant_endpoints( + subset_min_r[subset_index], subset_min_g[subset_index], subset_min_b[subset_index], subset_max_r[subset_index], subset_max_g[subset_index], subset_max_b[subset_index], + abs_blog_endpoints[0][subset_index * 2 + 0], abs_blog_endpoints[1][subset_index * 2 + 0], abs_blog_endpoints[2][subset_index * 2 + 0], + abs_blog_endpoints[0][subset_index * 2 + 1], abs_blog_endpoints[1][subset_index * 2 + 1], abs_blog_endpoints[2][subset_index * 2 + 1], + num_endpoint_bits); } } - const int max_delta = (1 << (num_delta_bits - 1)) - 1; - const int min_delta = -(max_delta + 1); - assert((max_delta - min_delta) == delta_bitmask); + for (uint32_t subset_index = 0; subset_index < 2; subset_index++) + { + bc6h_dequant_endpoints( + abs_blog_endpoints[0][subset_index * 2 + 
0], abs_blog_endpoints[1][subset_index * 2 + 0], abs_blog_endpoints[2][subset_index * 2 + 0], + abs_blog_endpoints[0][subset_index * 2 + 1], abs_blog_endpoints[1][subset_index * 2 + 1], abs_blog_endpoints[2][subset_index * 2 + 1], + subset_min_r[subset_index], subset_min_g[subset_index], subset_min_b[subset_index], + subset_max_r[subset_index], subset_max_g[subset_index], subset_max_b[subset_index], num_endpoint_bits); + } - bool failed_flag = false; - for (uint32_t c = 0; c < 3; c++) + uint8_t trial_weights[16]; + double trial_error = assign_weights_error_3(trial_weights, best_pat_bits, subset_min_r, subset_min_g, subset_min_b, subset_max_r, subset_max_g, subset_max_b, pFloat_pixels, pPixel_scales); + + if (trial_error < cur_error) { - log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; + basist::bc6h_logical_block trial_log_blk; - int delta = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; - if ((delta < min_delta) || (delta > max_delta)) + trial_log_blk.m_mode = bc6h_mode_index; + trial_log_blk.m_partition_pattern = best_pat_index; + + memcpy(trial_log_blk.m_endpoints, abs_blog_endpoints, sizeof(trial_log_blk.m_endpoints)); + memcpy(trial_log_blk.m_weights, trial_weights, 16); + + if (trial_log_blk.m_weights[0] & 4) { - failed_flag = true; - break; + for (uint32_t c = 0; c < 3; c++) + std::swap(trial_log_blk.m_endpoints[c][0], trial_log_blk.m_endpoints[c][1]); + + for (uint32_t i = 0; i < 16; i++) + { + const uint32_t subset_index = (best_pat_bits >> i) & 1; + if (subset_index == 0) + trial_log_blk.m_weights[i] = 7 - trial_log_blk.m_weights[i]; + } } - log_blk.m_endpoints[c][1] = delta & delta_bitmask; - } + const uint32_t subset2_anchor_index = g_bc7_table_anchor_index_second_subset[best_pat_index]; + if (trial_log_blk.m_weights[subset2_anchor_index] & 4) + { + for (uint32_t c = 0; c < 3; c++) + std::swap(trial_log_blk.m_endpoints[c][2], trial_log_blk.m_endpoints[c][3]); - if (failed_flag) - continue; + for (uint32_t i = 0; i < 16; i++) + { + const 
uint32_t subset_index = (best_pat_bits >> i) & 1; + if (subset_index == 1) + trial_log_blk.m_weights[i] = 7 - trial_log_blk.m_weights[i]; + } + } + + if (bc6h_mode_index != BC6H_2SUBSET_ABS_ENDPOINT_MODE) + { + const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[bc6h_mode_index][1], g_bc6h_mode_sig_bits[bc6h_mode_index][2], g_bc6h_mode_sig_bits[bc6h_mode_index][3] }; + const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 }; - log_blk.m_mode = mode; - pack_bc6h_block(*pPacked_block, log_blk); - - return; - } + for (uint32_t c = 0; c < 3; c++) + { + const int delta0 = (int)trial_log_blk.m_endpoints[c][1] - (int)trial_log_blk.m_endpoints[c][0]; + const int delta1 = (int)trial_log_blk.m_endpoints[c][2] - (int)trial_log_blk.m_endpoints[c][0]; + const int delta2 = (int)trial_log_blk.m_endpoints[c][3] - (int)trial_log_blk.m_endpoints[c][0]; - // Worst case fall back to mode 10, which can handle any endpoints - bc6h_enc_block_mode10(pPacked_block, pEndpoints, pWeights); - } +#ifdef _DEBUG + // sanity check the final endpoints + const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1; + const int min_delta = -(max_delta + 1); + assert((max_delta - min_delta) == delta_bitmasks[c]); - // Mode 9 (direct endpoint encoding), 3-bit weights, but only 1 subset - void bc6h_enc_block_1subset_mode9_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) - { - assert(g_bc6h_enc_initialized); + if ((delta0 < min_delta) || (delta0 > max_delta) || (delta1 < min_delta) || (delta1 > max_delta) || (delta2 < min_delta) || (delta2 > max_delta)) + { + assert(0); + break; + } +#endif - for (uint32_t i = 0; i < 16; i++) - { - assert(pWeights[i] <= 7); + trial_log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c]; + trial_log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c]; + trial_log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c]; + + } // c + } + + cur_error = 
trial_error; + log_blk = trial_log_blk; + } } - bc6h_logical_block log_blk; - log_blk.clear(); + static void fast_encode_bc6h_2subsets( + const basist::half_float* pPixels, const vec3F* pFloat_pixels, const float* pPixel_scales, + double& cur_error, basist::bc6h_logical_block& log_blk, + int64_t block_max_var, + int mean_r, int mean_g, int mean_b, float block_axis_r, float block_axis_g, float block_axis_b, + const fast_bc6h_params& params) + { + assert((params.m_max_2subset_pats_to_try > 0) && (params.m_max_2subset_pats_to_try <= BC6H_NUM_PATS)); + + if (params.m_max_2subset_pats_to_try == BC6H_NUM_PATS) + { + for (uint32_t i = 0; i < BC6H_NUM_PATS; i++) + { + const uint32_t best_pat_index = i; + const uint32_t best_pat_bits = g_bc6h_pats2[best_pat_index]; + + fast_encode_bc6h_2subsets_pattern( + best_pat_index, best_pat_bits, + pPixels, pFloat_pixels, pPixel_scales, + cur_error, log_blk, + block_max_var, + mean_r, mean_g, mean_b, params); + } + return; + } + + uint32_t desired_pat_bits = 0; + for (uint32_t i = 0; i < 16; i++) + { + float f = (float)(pPixels[i * 3 + 0] - mean_r) * block_axis_r + + (float)(pPixels[i * 3 + 1] - mean_g) * block_axis_g + + (float)(pPixels[i * 3 + 2] - mean_b) * block_axis_b; + + desired_pat_bits |= (((f >= 0.0f) ? 
1 : 0) << i); + } // i - // Convert half endpoints to blog6 (mode 9 doesn't use delta encoding) - for (uint32_t c = 0; c < 3; c++) - { - log_blk.m_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], 6); - log_blk.m_endpoints[c][2] = log_blk.m_endpoints[c][0]; + if (params.m_max_2subset_pats_to_try == 1) + { + uint32_t best_diff = UINT32_MAX; + for (uint32_t p = 0; p < BC6H_NUM_PATS; p++) + { + const uint32_t bc6h_pat_bits = g_bc6h_pats2[p]; - log_blk.m_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], 6); - log_blk.m_endpoints[c][3] = log_blk.m_endpoints[c][1]; - } + int diff = popcount32(bc6h_pat_bits ^ desired_pat_bits); + int diff_inv = 16 - diff; - memcpy(log_blk.m_weights, pWeights, 16); + uint32_t min_diff = (basisu::minimum(diff, diff_inv) << 8) | p; + if (min_diff < best_diff) + best_diff = min_diff; + } // p - const uint32_t pat_index = 0; - const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; + const uint32_t best_pat_index = best_diff & 0xFF; + const uint32_t best_pat_bits = g_bc6h_pats2[best_pat_index]; - if (log_blk.m_weights[0] & 4) - { - for (uint32_t c = 0; c < 3; c++) - std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); + fast_encode_bc6h_2subsets_pattern( + best_pat_index, best_pat_bits, + pPixels, pFloat_pixels, pPixel_scales, + cur_error, log_blk, + block_max_var, + mean_r, mean_g, mean_b, params); + } + else + { + assert(params.m_max_2subset_pats_to_try <= BC6H_NUM_PATS); + uint32_t pat_diffs[BC6H_NUM_PATS]; - for (uint32_t i = 0; i < 16; i++) - if ((pPat[i] & 0x7F) == 0) - log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; - } + for (uint32_t p = 0; p < BC6H_NUM_PATS; p++) + { + const uint32_t bc6h_pat_bits = g_bc6h_pats2[p]; - if (log_blk.m_weights[15] & 4) - { - for (uint32_t c = 0; c < 3; c++) - std::swap(log_blk.m_endpoints[c][2], log_blk.m_endpoints[c][3]); + int diff = popcount32(bc6h_pat_bits ^ desired_pat_bits); + int diff_inv = 16 - diff; - for (uint32_t i = 0; i < 16; i++) - if ((pPat[i] & 0x7F) == 1) - 
log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; - } + pat_diffs[p] = (basisu::minimum(diff, diff_inv) << 8) | p; + } // p - log_blk.m_mode = 9; - log_blk.m_partition_pattern = pat_index; - pack_bc6h_block(*pPacked_block, log_blk); - } + std::sort(pat_diffs, pat_diffs + BC6H_NUM_PATS); - // Tries modes 0-8, falls back to mode 9 - void bc6h_enc_block_1subset_3bit_weights(bc6h_block* pPacked_block, const half_float pEndpoints[3][2], const uint8_t* pWeights) - { - assert(g_bc6h_enc_initialized); + for (uint32_t pat_iter = 0; pat_iter < params.m_max_2subset_pats_to_try; pat_iter++) + { + const uint32_t best_pat_index = pat_diffs[pat_iter] & 0xFF; + const uint32_t best_pat_bits = g_bc6h_pats2[best_pat_index]; - for (uint32_t i = 0; i < 16; i++) - { - assert(pWeights[i] <= 7); + fast_encode_bc6h_2subsets_pattern( + best_pat_index, best_pat_bits, + pPixels, pFloat_pixels, pPixel_scales, + cur_error, log_blk, + block_max_var, + mean_r, mean_g, mean_b, params); + } + } } - bc6h_logical_block log_blk; - log_blk.clear(); - - for (uint32_t mode_iter = 0; mode_iter <= 8; mode_iter++) + void fast_encode_bc6h(const basist::half_float* pPixels, basist::bc6h_block* pBlock, const fast_bc6h_params ¶ms) { - static const int s_mode_order[9] = { 2, 3, 4, 0, 5, 6, 7, 8, 1 }; // ordered from largest base bits to least - const uint32_t mode = s_mode_order[mode_iter]; + basist::bc6h_logical_block log_blk; + log_blk.clear(); - const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0]; - const int base_bitmask = (1 << num_base_bits) - 1; - BASISU_NOTE_UNUSED(base_bitmask); + log_blk.m_mode = basist::BC6H_FIRST_1SUBSET_MODE_INDEX; - const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] }; - const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 }; + uint32_t omin_r = UINT32_MAX, omin_g = UINT32_MAX, omin_b = UINT32_MAX; + uint32_t omax_r = 0, omax_g = 
0, omax_b = 0; + uint32_t total_r = 0, total_g = 0, total_b = 0; + + for (uint32_t i = 0; i < 16; i++) + { + uint32_t r = pPixels[i * 3 + 0]; + uint32_t g = pPixels[i * 3 + 1]; + uint32_t b = pPixels[i * 3 + 2]; + + total_r += r; + total_g += g; + total_b += b; - uint32_t blog_endpoints[3][4]; + omin_r = basisu::minimum(omin_r, r); + omin_g = basisu::minimum(omin_g, g); + omin_b = basisu::minimum(omin_b, b); - // Convert half endpoints to blog 7-11 - for (uint32_t c = 0; c < 3; c++) + omax_r = basisu::maximum(omax_r, r); + omax_g = basisu::maximum(omax_g, g); + omax_b = basisu::maximum(omax_b, b); + } + + if ((omin_r == omax_r) && (omin_g == omax_g) && (omin_b == omax_b)) { - blog_endpoints[c][0] = half_to_blog_tab(pEndpoints[c][0], num_base_bits); - blog_endpoints[c][2] = blog_endpoints[c][0]; - assert((int)blog_endpoints[c][0] <= base_bitmask); + // Solid block + log_blk.m_endpoints[0][0] = basist::bc6h_half_to_blog16((basist::half_float)omin_r); + log_blk.m_endpoints[0][1] = 0; - blog_endpoints[c][1] = half_to_blog_tab(pEndpoints[c][1], num_base_bits); - blog_endpoints[c][3] = blog_endpoints[c][1]; - assert((int)blog_endpoints[c][1] <= base_bitmask); + log_blk.m_endpoints[1][0] = basist::bc6h_half_to_blog16((basist::half_float)omin_g); + log_blk.m_endpoints[1][1] = 0; + + log_blk.m_endpoints[2][0] = basist::bc6h_half_to_blog16((basist::half_float)omin_b); + log_blk.m_endpoints[2][1] = 0; + + log_blk.m_mode = 13; + pack_bc6h_block(*pBlock, log_blk); + + return; } + + uint32_t min_r, min_g, min_b, max_r, max_g, max_b; - const uint32_t pat_index = 0; - const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; + int mean_r = (total_r + 8) / 16; + int mean_g = (total_g + 8) / 16; + int mean_b = (total_b + 8) / 16; - memcpy(log_blk.m_weights, pWeights, 16); + int64_t icov[6] = { 0, 0, 0, 0, 0, 0 }; - if (log_blk.m_weights[0] & 4) + for (uint32_t i = 0; i < 16; i++) { - // Swap part 0's endpoints/weights - for (uint32_t c = 0; c < 3; c++) - 
std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); + int r = (int)pPixels[i * 3 + 0] - mean_r; + int g = (int)pPixels[i * 3 + 1] - mean_g; + int b = (int)pPixels[i * 3 + 2] - mean_b; - for (uint32_t i = 0; i < 16; i++) - if ((pPat[i] & 0x7F) == 0) - log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; + icov[0] += r * r; + icov[1] += r * g; + icov[2] += r * b; + icov[3] += g * g; + icov[4] += g * b; + icov[5] += b * b; } - - if (log_blk.m_weights[15] & 4) + + int64_t block_max_var = basisu::maximum(icov[0], icov[3], icov[5]); // not divided by 16, i.e. scaled by 16 + + if (block_max_var < (FAST_BC6H_STD_DEV_THRESH * FAST_BC6H_STD_DEV_THRESH * 16)) { - // Swap part 1's endpoints/weights - for (uint32_t c = 0; c < 3; c++) - std::swap(blog_endpoints[c][2], blog_endpoints[c][3]); + // Simple block + min_r = (omax_r - omin_r) / 32 + omin_r; + min_g = (omax_g - omin_g) / 32 + omin_g; + min_b = (omax_b - omin_b) / 32 + omin_b; - for (uint32_t i = 0; i < 16; i++) - if ((pPat[i] & 0x7F) == 1) - log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; - } + max_r = ((omax_r - omin_r) * 31) / 32 + omin_r; + max_g = ((omax_g - omin_g) * 31) / 32 + omin_g; + max_b = ((omax_b - omin_b) * 31) / 32 + omin_b; - bool failed_flag = false; + assert((max_r < MAX_HALF_FLOAT_AS_INT_BITS) && (max_g < MAX_HALF_FLOAT_AS_INT_BITS) && (max_b < MAX_HALF_FLOAT_AS_INT_BITS)); - for (uint32_t c = 0; c < 3; c++) - { - const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1; + bc6h_quant_dequant_endpoints(min_r, min_g, min_b, max_r, max_g, max_b, 10); - const int min_delta = -(max_delta + 1); - assert((max_delta - min_delta) == delta_bitmasks[c]); + assign_weights_simple_4(pPixels, log_blk.m_weights, min_r, min_g, min_b, max_r, max_g, max_b, block_max_var); - log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; + log_blk.m_endpoints[0][0] = basist::bc6h_half_to_blog((basist::half_float)min_r, 10); + log_blk.m_endpoints[0][1] = basist::bc6h_half_to_blog((basist::half_float)max_r, 10); - int delta0 = 
(int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; - int delta1 = (int)blog_endpoints[c][2] - (int)blog_endpoints[c][0]; - int delta2 = (int)blog_endpoints[c][3] - (int)blog_endpoints[c][0]; + log_blk.m_endpoints[1][0] = basist::bc6h_half_to_blog((basist::half_float)min_g, 10); + log_blk.m_endpoints[1][1] = basist::bc6h_half_to_blog((basist::half_float)max_g, 10); - if ((delta0 < min_delta) || (delta0 > max_delta) || - (delta1 < min_delta) || (delta1 > max_delta) || - (delta2 < min_delta) || (delta2 > max_delta)) + log_blk.m_endpoints[2][0] = basist::bc6h_half_to_blog((basist::half_float)min_b, 10); + log_blk.m_endpoints[2][1] = basist::bc6h_half_to_blog((basist::half_float)max_b, 10); + + if (log_blk.m_weights[0] & 8) { - failed_flag = true; - break; + for (uint32_t i = 0; i < 16; i++) + log_blk.m_weights[i] = 15 - log_blk.m_weights[i]; + + for (uint32_t c = 0; c < 3; c++) + { + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); + } } - log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c]; - log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c]; - log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c]; + pack_bc6h_block(*pBlock, log_blk); - if (failed_flag) - break; + return; } - if (failed_flag) - continue; - - log_blk.m_mode = mode; - log_blk.m_partition_pattern = pat_index; - pack_bc6h_block(*pPacked_block, log_blk); - return; + // block_max_var cannot be 0 here, also trace cannot be 0 - } // mode_iter + // Complex block (edges/strong gradients) + bool try_2subsets = false; + double cur_err = 0.0f; + vec3F float_pixels[16]; + float pixel_scales[16]; - bc6h_enc_block_1subset_mode9_3bit_weights(pPacked_block, pEndpoints, pWeights); - } + // covar rows are: + // 0, 1, 2 + // 1, 3, 4 + // 2, 4, 5 + float cov[6]; + for (uint32_t i = 0; i < 6; i++) + cov[i] = (float)icov[i]; - // pEndpoints[subset][comp][lh_index] - void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float 
pEndpoints[2][3][2], const uint8_t* pWeights) - { - assert(g_bc6h_enc_initialized); - assert(common_part_index < basist::TOTAL_ASTC_BC7_COMMON_PARTITIONS2); + const float sc = 1.0f / (float)block_max_var; + const float wx = sc * cov[0], wy = sc * cov[3], wz = sc * cov[5]; - for (uint32_t i = 0; i < 16; i++) - { - assert(pWeights[i] <= 7); - } + const float alt_xr = cov[0] * wx + cov[1] * wy + cov[2] * wz; + const float alt_xg = cov[1] * wx + cov[3] * wy + cov[4] * wz; + const float alt_xb = cov[2] * wx + cov[4] * wy + cov[5] * wz; - bc6h_logical_block log_blk; - log_blk.clear(); + float l = basisu::squaref(alt_xr) + basisu::squaref(alt_xg) + basisu::squaref(alt_xb); - // Convert half endpoints to blog6 (mode 9 doesn't use delta encoding) - for (uint32_t s = 0; s < 2; s++) - { - for (uint32_t c = 0; c < 3; c++) + float axis_r = 0.57735027f, axis_g = 0.57735027f, axis_b = 0.57735027f; + if (fabs(l) >= basisu::SMALL_FLOAT_VAL) { - log_blk.m_endpoints[c][0 + s * 2] = half_to_blog_tab(pEndpoints[s][c][0], 6); - log_blk.m_endpoints[c][1 + s * 2] = half_to_blog_tab(pEndpoints[s][c][1], 6); + const float inv_l = inv_sqrt(l); + axis_r = alt_xr * inv_l; + axis_g = alt_xg * inv_l; + axis_b = alt_xb * inv_l; } - } - memcpy(log_blk.m_weights, pWeights, 16); + const float tr = axis_r * cov[0] + axis_g * cov[1] + axis_b * cov[2]; + const float tg = axis_r * cov[1] + axis_g * cov[3] + axis_b * cov[4]; + const float tb = axis_r * cov[2] + axis_g * cov[4] + axis_b * cov[5]; + const float principle_axis_var = tr * axis_r + tg * axis_g + tb * axis_b; - //const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_astc; - const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_bc7; + const float inv_principle_axis_var = 1.0f / (principle_axis_var + basisu::REALLY_SMALL_FLOAT_VAL); + axis_r = tr * inv_principle_axis_var; + axis_g = tg * inv_principle_axis_var; + axis_b = tb * inv_principle_axis_var; - const bool invert_flag = 
basist::g_astc_bc7_common_partitions2[common_part_index].m_invert; - if (invert_flag) - { - for (uint32_t c = 0; c < 3; c++) + float total_var = cov[0] + cov[3] + cov[5]; + + // If the principle axis variance vs. the block's total variance accounts for less than this threshold, it's a "very complex" block that may benefit from 2 subsets. + const float COMPLEX_BLOCK_PRINCIPLE_AXIS_FRACT_THRESH = .995f; + try_2subsets = principle_axis_var < (total_var * COMPLEX_BLOCK_PRINCIPLE_AXIS_FRACT_THRESH); + + uint32_t min_idx = 0, max_idx = 0; + float min_dot = basisu::BIG_FLOAT_VAL, max_dot = -basisu::BIG_FLOAT_VAL; + + for (uint32_t i = 0; i < 16; i++) { - std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][2]); - std::swap(log_blk.m_endpoints[c][1], log_blk.m_endpoints[c][3]); + float r = (float)pPixels[i * 3 + 0]; + float g = (float)pPixels[i * 3 + 1]; + float b = (float)pPixels[i * 3 + 2]; + + float_pixels[i].c[0] = fast_half_to_float_pos_not_inf_or_nan((half_float)r); + float_pixels[i].c[1] = fast_half_to_float_pos_not_inf_or_nan((half_float)g); + float_pixels[i].c[2] = fast_half_to_float_pos_not_inf_or_nan((half_float)b); + + pixel_scales[i] = 1.0f / (basisu::squaref(float_pixels[i].c[0]) + basisu::squaref(float_pixels[i].c[1]) + basisu::squaref(float_pixels[i].c[2]) + (float)MIN_HALF_FLOAT); + + float dot = r * axis_r + g * axis_g + b * axis_b; + + if (dot < min_dot) + { + min_dot = dot; + min_idx = i; + } + + if (dot > max_dot) + { + max_dot = dot; + max_idx = i; + } } - } - const uint32_t pat_index = bc7_pattern; - assert(pat_index < 32); - const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; + min_r = pPixels[min_idx * 3 + 0]; + min_g = pPixels[min_idx * 3 + 1]; + min_b = pPixels[min_idx * 3 + 2]; - bool swap_flags[2] = { false, false }; - for (uint32_t i = 0; i < 16; i++) - { - if ((pPat[i] & 0x80) == 0) - continue; + max_r = pPixels[max_idx * 3 + 0]; + max_g = pPixels[max_idx * 3 + 1]; + max_b = pPixels[max_idx * 3 + 2]; - if 
(log_blk.m_weights[i] & 4) + assert((max_r < MAX_HALF_FLOAT_AS_INT_BITS) && (max_g < MAX_HALF_FLOAT_AS_INT_BITS) && (max_b < MAX_HALF_FLOAT_AS_INT_BITS)); + + bc6h_quant_dequant_endpoints(min_r, min_g, min_b, max_r, max_g, max_b, 10); + + cur_err = assign_weights_4(float_pixels, pixel_scales, log_blk.m_weights, min_r, min_g, min_b, max_r, max_g, max_b, block_max_var, try_2subsets, params); + + const uint32_t MAX_LS_PASSES = params.m_hq_ls ? 2 : 1; + for (uint32_t pass = 0; pass < MAX_LS_PASSES; pass++) { - const uint32_t p = pPat[i] & 1; - swap_flags[p] = true; - } - } + float z00 = 0.0f, z01 = 0.0f, z10 = 0.0f, z11 = 0.0f; + float q00_r = 0.0f, q10_r = 0.0f, t_r = 0.0f; + float q00_g = 0.0f, q10_g = 0.0f, t_g = 0.0f; + float q00_b = 0.0f, q10_b = 0.0f, t_b = 0.0f; - if (swap_flags[0]) - { - for (uint32_t c = 0; c < 3; c++) - std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); + for (uint32_t i = 0; i < 16; i++) + { + float r = (float)pPixels[i * 3 + 0]; + float g = (float)pPixels[i * 3 + 1]; + float b = (float)pPixels[i * 3 + 2]; - for (uint32_t i = 0; i < 16; i++) - if ((pPat[i] & 0x7F) == 0) - log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; - } + const uint32_t sel = log_blk.m_weights[i]; - if (swap_flags[1]) - { - for (uint32_t c = 0; c < 3; c++) - std::swap(log_blk.m_endpoints[c][2], log_blk.m_endpoints[c][3]); + z00 += g_bc6h_ls_weights_4[sel][0]; + z10 += g_bc6h_ls_weights_4[sel][1]; + z11 += g_bc6h_ls_weights_4[sel][2]; - for (uint32_t i = 0; i < 16; i++) - if ((pPat[i] & 0x7F) == 1) - log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; - } + float w = g_bc6h_ls_weights_4[sel][3]; - log_blk.m_mode = 9; - log_blk.m_partition_pattern = pat_index; - pack_bc6h_block(*pPacked_block, log_blk); - } + q00_r += w * r; + t_r += r; - void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights) - { - assert(g_bc6h_enc_initialized); + q00_g += w * g; + t_g += g; 
- for (uint32_t i = 0; i < 16; i++) - { - assert(pWeights[i] <= 7); - } + q00_b += w * b; + t_b += b; + } - bc6h_logical_block log_blk; - log_blk.clear(); + q10_r = t_r - q00_r; + q10_g = t_g - q00_g; + q10_b = t_b - q00_b; - for (uint32_t mode_iter = 0; mode_iter <= 8; mode_iter++) - { - static const int s_mode_order[9] = { 2, 3, 4, 0, 5, 6, 7, 8, 1 }; // ordered from largest base bits to least - const uint32_t mode = s_mode_order[mode_iter]; + z01 = z10; - const uint32_t num_base_bits = g_bc6h_mode_sig_bits[mode][0]; - const int base_bitmask = (1 << num_base_bits) - 1; - BASISU_NOTE_UNUSED(base_bitmask); + float det = z00 * z11 - z01 * z10; + if (fabs(det) < basisu::SMALL_FLOAT_VAL) + break; - const uint32_t num_delta_bits[3] = { g_bc6h_mode_sig_bits[mode][1], g_bc6h_mode_sig_bits[mode][2], g_bc6h_mode_sig_bits[mode][3] }; - const int delta_bitmasks[3] = { (1 << num_delta_bits[0]) - 1, (1 << num_delta_bits[1]) - 1, (1 << num_delta_bits[2]) - 1 }; + det = 1.0f / det; - uint32_t blog_endpoints[3][4]; + float iz00 = z11 * det; + float iz01 = -z01 * det; + float iz10 = -z10 * det; + float iz11 = z00 * det; - // Convert half endpoints to blog 7-11 - for (uint32_t s = 0; s < 2; s++) - { - for (uint32_t c = 0; c < 3; c++) - { - blog_endpoints[c][0 + s * 2] = half_to_blog_tab(pEndpoints[s][c][0], num_base_bits); - blog_endpoints[c][1 + s * 2] = half_to_blog_tab(pEndpoints[s][c][1], num_base_bits); - } - } + uint32_t trial_max_r = (int)basisu::clamp(std::round(iz00 * q00_r + iz01 * q10_r), 0, (float)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); + uint32_t trial_min_r = (int)basisu::clamp(std::round(iz10 * q00_r + iz11 * q10_r), 0, (float)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); - memcpy(log_blk.m_weights, pWeights, 16); + uint32_t trial_max_g = (int)basisu::clamp(std::round(iz00 * q00_g + iz01 * q10_g), 0, (float)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); + uint32_t trial_min_g = (int)basisu::clamp(std::round(iz10 * q00_g + iz11 * q10_g), 0, (float)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); - 
//const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_astc; - const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[common_part_index].m_bc7; + uint32_t trial_max_b = (int)basisu::clamp(std::round(iz00 * q00_b + iz01 * q10_b), 0, (float)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); + uint32_t trial_min_b = (int)basisu::clamp(std::round(iz10 * q00_b + iz11 * q10_b), 0, (float)basist::MAX_BC6H_HALF_FLOAT_AS_UINT); - const bool invert_flag = basist::g_astc_bc7_common_partitions2[common_part_index].m_invert; - if (invert_flag) - { - for (uint32_t c = 0; c < 3; c++) + bc6h_quant_dequant_endpoints(trial_min_r, trial_min_g, trial_min_b, trial_max_r, trial_max_g, trial_max_b, 10); + + uint8_t trial_weights[16]; + double trial_err = assign_weights_4(float_pixels, pixel_scales, trial_weights, trial_min_r, trial_min_g, trial_min_b, trial_max_r, trial_max_g, trial_max_b, block_max_var, try_2subsets, params); + + if (trial_err < cur_err) { - std::swap(blog_endpoints[c][0], blog_endpoints[c][2]); - std::swap(blog_endpoints[c][1], blog_endpoints[c][3]); - } - } + cur_err = trial_err; - const uint32_t pat_index = bc7_pattern; - assert(pat_index < 32); - const uint8_t* pPat = &g_bc6h_2subset_patterns[pat_index][0][0]; + min_r = trial_min_r; + max_r = trial_max_r; - bool swap_flags[2] = { false, false }; - for (uint32_t i = 0; i < 16; i++) - { - if ((pPat[i] & 0x80) == 0) - continue; + min_g = trial_min_g; + max_g = trial_max_g; - if (log_blk.m_weights[i] & 4) + min_b = trial_min_b; + max_b = trial_max_b; + + memcpy(log_blk.m_weights, trial_weights, 16); + } + else { - const uint32_t p = pPat[i] & 1; - swap_flags[p] = true; + break; } - } - if (swap_flags[0]) - { - for (uint32_t c = 0; c < 3; c++) - std::swap(blog_endpoints[c][0], blog_endpoints[c][1]); + } // pass - for (uint32_t i = 0; i < 16; i++) - if ((pPat[i] & 0x7F) == 0) - log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; +#if 0 + //if (full_flag) + if ((try_2subsets) && 
(block_max_var > (FAST_BC6H_COMPLEX_STD_DEV_THRESH * FAST_BC6H_COMPLEX_STD_DEV_THRESH * 16))) + { + min_r = 0; + max_r = 0; + min_g = 0; + max_g = 0; + min_b = 0; + max_b = 0; } +#endif - if (swap_flags[1]) - { - for (uint32_t c = 0; c < 3; c++) - std::swap(blog_endpoints[c][2], blog_endpoints[c][3]); + log_blk.m_endpoints[0][0] = basist::bc6h_half_to_blog((basist::half_float)min_r, 10); + log_blk.m_endpoints[0][1] = basist::bc6h_half_to_blog((basist::half_float)max_r, 10); - for (uint32_t i = 0; i < 16; i++) - if ((pPat[i] & 0x7F) == 1) - log_blk.m_weights[i] = 7 - log_blk.m_weights[i]; - } + log_blk.m_endpoints[1][0] = basist::bc6h_half_to_blog((basist::half_float)min_g, 10); + log_blk.m_endpoints[1][1] = basist::bc6h_half_to_blog((basist::half_float)max_g, 10); - // Try packing the endpoints - bool failed_flag = false; + log_blk.m_endpoints[2][0] = basist::bc6h_half_to_blog((basist::half_float)min_b, 10); + log_blk.m_endpoints[2][1] = basist::bc6h_half_to_blog((basist::half_float)max_b, 10); - for (uint32_t c = 0; c < 3; c++) + if (log_blk.m_weights[0] & 8) { - const int max_delta = (1 << (num_delta_bits[c] - 1)) - 1; + for (uint32_t i = 0; i < 16; i++) + log_blk.m_weights[i] = 15 - log_blk.m_weights[i]; - const int min_delta = -(max_delta + 1); - assert((max_delta - min_delta) == delta_bitmasks[c]); + for (uint32_t c = 0; c < 3; c++) + { + std::swap(log_blk.m_endpoints[c][0], log_blk.m_endpoints[c][1]); + } + } + + if ((params.m_max_2subset_pats_to_try > 0) && ((try_2subsets) && (block_max_var > (FAST_BC6H_COMPLEX_STD_DEV_THRESH * FAST_BC6H_COMPLEX_STD_DEV_THRESH * 16)))) + { + fast_encode_bc6h_2subsets(pPixels, float_pixels, pixel_scales, cur_err, log_blk, block_max_var, mean_r, mean_g, mean_b, axis_r, axis_g, axis_b, params); + } - log_blk.m_endpoints[c][0] = blog_endpoints[c][0]; + pack_bc6h_block(*pBlock, log_blk); + } - int delta0 = (int)blog_endpoints[c][1] - (int)blog_endpoints[c][0]; - int delta1 = (int)blog_endpoints[c][2] - (int)blog_endpoints[c][0]; 
- int delta2 = (int)blog_endpoints[c][3] - (int)blog_endpoints[c][0]; + bool decode_6x6_hdr(const uint8_t *pComp_data, uint32_t comp_data_size, basisu::vector2D& decoded_blocks, uint32_t& width, uint32_t& height) + { + const uint32_t BLOCK_W = 6, BLOCK_H = 6; - if ((delta0 < min_delta) || (delta0 > max_delta) || - (delta1 < min_delta) || (delta1 > max_delta) || - (delta2 < min_delta) || (delta2 > max_delta)) - { - failed_flag = true; - break; - } + //interval_timer tm; + //tm.start(); - log_blk.m_endpoints[c][1] = delta0 & delta_bitmasks[c]; - log_blk.m_endpoints[c][2] = delta1 & delta_bitmasks[c]; - log_blk.m_endpoints[c][3] = delta2 & delta_bitmasks[c]; + width = 0; + height = 0; - if (failed_flag) - break; - } - if (failed_flag) - continue; + if (comp_data_size <= (2 * 3 + 1)) + return false; - log_blk.m_mode = mode; - log_blk.m_partition_pattern = pat_index; - pack_bc6h_block(*pPacked_block, log_blk); + basist::bitwise_decoder decoder; + if (!decoder.init(pComp_data, comp_data_size)) + return false; - //half_float blk[16 * 3]; - //unpack_bc6h(pPacked_block, blk, false); + if (decoder.get_bits(16) != 0xABCD) + return false; - return; - } + width = decoder.get_bits(16); + height = decoder.get_bits(16); - bc6h_enc_block_2subset_mode9_3bit_weights(pPacked_block, common_part_index, pEndpoints, pWeights); - } + if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM)) + return false; - bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]) - { - assert(g_bc6h_enc_initialized); + const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W; + const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H; - if ((pColor[0] | pColor[1] | pColor[2]) & 0x8000) - return false; + const uint32_t total_blocks = num_blocks_x * num_blocks_y; - // ASTC block unpacker won't allow Inf/NaN's to come through. 
- //if (is_half_inf_or_nan(pColor[0]) || is_half_inf_or_nan(pColor[1]) || is_half_inf_or_nan(pColor[2])) - // return false; + decoded_blocks.resize(num_blocks_x, num_blocks_y); + //memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes()); - uint8_t weights[16]; - memset(weights, 0, sizeof(weights)); + // These are the decoded log blocks, NOT the output log blocks. + basisu::vector2D decoded_log_blocks(num_blocks_x, REUSE_MAX_BUFFER_ROWS); + memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes()); - half_float endpoints[3][2]; - endpoints[0][0] = pColor[0]; - endpoints[0][1] = pColor[0]; - - endpoints[1][0] = pColor[1]; - endpoints[1][1] = pColor[1]; + uint32_t cur_bx = 0, cur_by = 0; + int cur_row_index = 0; - endpoints[2][0] = pColor[2]; - endpoints[2][1] = pColor[2]; - - bc6h_enc_block_1subset_4bit_weights(pPacked_block, endpoints, weights); + uint32_t step_counter = 0; + BASISU_NOTE_UNUSED(step_counter); - return true; - } + while (cur_by < num_blocks_y) + { + step_counter++; - //-------------------------------------------------------------------------------------------------------------------------- - // basisu_astc_hdr_core.cpp + //if ((cur_bx == 9) && (cur_by == 13)) + // printf("!"); - static bool g_astc_hdr_core_initialized; - static int8_t g_astc_partition_id_to_common_bc7_pat_index[1024]; +#if SYNC_MARKERS + uint32_t mk = decoder.get_bits(16); + if (mk != 0xDEAD) + { + printf("!"); + assert(0); + return false; + } +#endif + if (decoder.get_bits_remaining() < 1) + return false; - //-------------------------------------------------------------------------------------------------------------------------- + encoding_type et = encoding_type::cBlock; - void astc_hdr_core_init() - { - if (g_astc_hdr_core_initialized) - return; + uint32_t b0 = decoder.get_bits(1); + if (!b0) + { + uint32_t b1 = decoder.get_bits(1); + if (b1) + et = encoding_type::cReuse; + else + { + uint32_t b2 = decoder.get_bits(1); + if (b2) + et = 
encoding_type::cSolid; + else + et = encoding_type::cRun; + } + } - memset(g_astc_partition_id_to_common_bc7_pat_index, 0xFF, sizeof(g_astc_partition_id_to_common_bc7_pat_index)); + switch (et) + { + case encoding_type::cRun: + { + if (!cur_bx && !cur_by) + return false; - for (uint32_t part_index = 0; part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2; ++part_index) - { - const uint32_t astc_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_astc; - //const uint32_t bc7_pattern = basist::g_astc_bc7_common_partitions2[part_index].m_bc7; + const uint32_t run_len = decoder.decode_vlc(5) + 1; - assert(astc_pattern < 1024); - g_astc_partition_id_to_common_bc7_pat_index[astc_pattern] = (int8_t)part_index; - } + uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x); + if (run_len > num_blocks_remaining) + return false; - g_astc_hdr_core_initialized = true; - } + uint32_t prev_bx = cur_bx, prev_by = cur_by; - //-------------------------------------------------------------------------------------------------------------------------- + if (cur_bx) + prev_bx--; + else + { + prev_bx = num_blocks_x - 1; + prev_by--; + } - static inline int astc_hdr_sign_extend(int src, int num_src_bits) - { - assert(basisu::in_range(num_src_bits, 2, 31)); + const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, calc_row_index(cur_by, prev_by, cur_row_index)); + const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by); - const bool negative = (src & (1 << (num_src_bits - 1))) != 0; - if (negative) - return src | ~((1 << num_src_bits) - 1); - else - return src & ((1 << num_src_bits) - 1); - } + assert((prev_log_blk.m_user_mode == 255) || (prev_log_blk.m_user_mode < TOTAL_BLOCK_MODE_DECS)); - static inline void astc_hdr_pack_bit( - int& dst, int dst_bit, - int src_val, int src_bit = 0) - { - assert(dst_bit >= 0 && dst_bit <= 31); - int bit = basisu::get_bit(src_val, src_bit); - dst |= (bit << dst_bit); - } + 
for (uint32_t i = 0; i < run_len; i++) + { + decoded_log_blocks(cur_bx, calc_row_index(cur_by, cur_by, cur_row_index)) = prev_log_blk; + decoded_blocks(cur_bx, cur_by) = prev_phys_blk; - //-------------------------------------------------------------------------------------------------------------------------- + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + cur_row_index = (cur_row_index + 1) % REUSE_MAX_BUFFER_ROWS; + } + } - void decode_mode7_to_qlog12_ise20( - const uint8_t* pEndpoints, - int e[2][3], - int* pScale) - { - assert(g_astc_hdr_core_initialized); + break; + } + case encoding_type::cSolid: + { + const basist::half_float rh = (basist::half_float)decoder.get_bits(15); + const basist::half_float gh = (basist::half_float)decoder.get_bits(15); + const basist::half_float bh = (basist::half_float)decoder.get_bits(15); - for (uint32_t i = 0; i < NUM_MODE7_ENDPOINTS; i++) - { - assert(pEndpoints[i] <= 255); - } + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, calc_row_index(cur_by, cur_by, cur_row_index)); - const int v0 = pEndpoints[0], v1 = pEndpoints[1], v2 = pEndpoints[2], v3 = pEndpoints[3]; + log_blk.clear(); + log_blk.m_user_mode = 255; + log_blk.m_solid_color_flag_hdr = true; + log_blk.m_solid_color[0] = rh; + log_blk.m_solid_color[1] = gh; + log_blk.m_solid_color[2] = bh; + log_blk.m_solid_color[3] = basist::float_to_half(1.0f); - // Extract mode bits and unpack to major component and mode. 
- const int modeval = ((v0 & 0xC0) >> 6) | ((v1 & 0x80) >> 5) | ((v2 & 0x80) >> 4); + bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk); + if (!status) + return false; - int majcomp, mode; - if ((modeval & 0xC) != 0xC) - { - majcomp = modeval >> 2; - mode = modeval & 3; - } - else if (modeval != 0xF) - { - majcomp = modeval & 3; - mode = 4; - } - else - { - majcomp = 0; - mode = 5; - } + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + cur_row_index = (cur_row_index + 1) % REUSE_MAX_BUFFER_ROWS; + } - // Extract low-order bits of r, g, b, and s. - int red = v0 & 0x3f; - int green = v1 & 0x1f; - int blue = v2 & 0x1f; - int scale = v3 & 0x1f; + break; + } + case encoding_type::cReuse: + { + if (!cur_bx && !cur_by) + return false; - // Extract high-order bits, which may be assigned depending on mode - int x0 = (v1 >> 6) & 1; - int x1 = (v1 >> 5) & 1; - int x2 = (v2 >> 6) & 1; - int x3 = (v2 >> 5) & 1; - int x4 = (v3 >> 7) & 1; - int x5 = (v3 >> 6) & 1; - int x6 = (v3 >> 5) & 1; + const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS); - // Now move the high-order xs into the right place. - const int ohm = 1 << mode; - if (ohm & 0x30) green |= x0 << 6; - if (ohm & 0x3A) green |= x1 << 5; - if (ohm & 0x30) blue |= x2 << 6; - if (ohm & 0x3A) blue |= x3 << 5; - if (ohm & 0x3D) scale |= x6 << 5; - if (ohm & 0x2D) scale |= x5 << 6; - if (ohm & 0x04) scale |= x4 << 7; - if (ohm & 0x3B) red |= x4 << 6; - if (ohm & 0x04) red |= x3 << 6; - if (ohm & 0x10) red |= x5 << 7; - if (ohm & 0x0F) red |= x2 << 7; - if (ohm & 0x05) red |= x1 << 8; - if (ohm & 0x0A) red |= x0 << 8; - if (ohm & 0x05) red |= x0 << 9; - if (ohm & 0x02) red |= x6 << 9; - if (ohm & 0x01) red |= x3 << 10; - if (ohm & 0x02) red |= x5 << 10; + const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x; + const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y; - // Shift the bits to the top of the 12-bit result. 
- static const int s_shamts[6] = { 1,1,2,3,4,5 }; + const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y; + if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x)) + return false; + if (prev_by < 0) + return false; - const int shamt = s_shamts[mode]; - red <<= shamt; - green <<= shamt; - blue <<= shamt; - scale <<= shamt; + const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, calc_row_index(cur_by, prev_by, cur_row_index)); - // Minor components are stored as differences - if (mode != 5) - { - green = red - green; - blue = red - blue; - } + if (prev_log_blk.m_solid_color_flag_hdr) + return false; + assert(prev_log_blk.m_user_mode < TOTAL_BLOCK_MODE_DECS); - // Swizzle major component into place - if (majcomp == 1) - std::swap(red, green); + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, calc_row_index(cur_by, cur_by, cur_row_index)); + astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); - if (majcomp == 2) - std::swap(red, blue); + log_blk = prev_log_blk; - // Clamp output values, set alpha to 1.0 - e[1][0] = basisu::clamp(red, 0, 0xFFF); - e[1][1] = basisu::clamp(green, 0, 0xFFF); - e[1][2] = basisu::clamp(blue, 0, 0xFFF); + const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 
2 : 1); - e[0][0] = basisu::clamp(red - scale, 0, 0xFFF); - e[0][1] = basisu::clamp(green - scale, 0, 0xFFF); - e[0][2] = basisu::clamp(blue - scale, 0, 0xFFF); + bool status = decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights); + if (!status) + return false; - if (pScale) - *pScale = scale; - } +#if 0 + const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by); - //-------------------------------------------------------------------------------------------------------------------------- + astc_helpers::log_astc_block decomp_blk; + status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H); + if (!status) + return false; - bool decode_mode7_to_qlog12( - const uint8_t* pEndpoints, - int e[2][3], - int* pScale, - uint32_t ise_endpoint_range) - { - assert(g_astc_hdr_core_initialized); + uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; + requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range); - if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) - { - decode_mode7_to_qlog12_ise20(pEndpoints, e, pScale); - } - else - { - uint8_t dequantized_endpoints[NUM_MODE7_ENDPOINTS]; + copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk); +#else + assert(log_blk.m_user_mode < TOTAL_BLOCK_MODE_DECS); + const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)log_blk.m_user_mode]; + const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); - for (uint32_t i = 0; i < NUM_MODE7_ENDPOINTS; i++) - dequantized_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_val[pEndpoints[i]]; + assert(bmd.m_grid_x == log_blk.m_grid_width && bmd.m_grid_y == log_blk.m_grid_height); + assert(bmd.m_dp == log_blk.m_dual_plane); + assert(bmd.m_cem == log_blk.m_color_endpoint_modes[0]); + 
assert(bmd.m_num_partitions == log_blk.m_num_partitions); + assert(bmd.m_dp_channel == log_blk.m_color_component_selector); - decode_mode7_to_qlog12_ise20(dequantized_endpoints, e, pScale); - } + // important: bmd.m_weight_ise_range/m_endpoint_ise_range may not match the logical block's due to deltas. - for (uint32_t i = 0; i < 2; i++) - { - if (e[i][0] > (int)MAX_QLOG12) - return false; + astc_helpers::log_astc_block decomp_blk; + decomp_blk.clear(); + decomp_blk.m_dual_plane = bmd.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + decomp_blk.m_partition_id = log_blk.m_partition_id; - if (e[i][1] > (int)MAX_QLOG12) - return false; + decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; - if (e[i][2] > (int)MAX_QLOG12) - return false; - } + for (uint32_t p = 0; p < bmd.m_num_partitions; p++) + decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; - return true; - } + decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; - //-------------------------------------------------------------------------------------------------------------------------- + for (uint32_t p = 0; p < bmd.m_num_partitions; p++) + requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p); - void decode_mode11_to_qlog12_ise20( - const uint8_t* pEndpoints, - int e[2][3]) - { -#ifdef _DEBUG - for (uint32_t i = 0; i < NUM_MODE11_ENDPOINTS; i++) - { - assert(pEndpoints[i] <= 255); - } + uint8_t transcode_weights[BLOCK_W * BLOCK_H * 2]; + requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); + + copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk); #endif + status = astc_helpers::pack_astc_block(phys_blk, 
decomp_blk); + if (!status) + return false; - const uint32_t maj_comp = basisu::get_bit(pEndpoints[4], 7) | (basisu::get_bit(pEndpoints[5], 7) << 1); + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + cur_row_index = (cur_row_index + 1) % REUSE_MAX_BUFFER_ROWS; + } - if (maj_comp == 3) - { - // Direct, qlog8 and qlog7 - e[0][0] = pEndpoints[0] << 4; - e[1][0] = pEndpoints[1] << 4; + break; + } + case encoding_type::cBlock: + { + const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes); + const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal); - e[0][1] = pEndpoints[2] << 4; - e[1][1] = pEndpoints[3] << 4; + switch (em) + { + case endpoint_mode::cUseLeft: + case endpoint_mode::cUseUpper: + { + int neighbor_bx = cur_bx, neighbor_by = cur_by; - e[0][2] = (pEndpoints[4] & 127) << 5; - e[1][2] = (pEndpoints[5] & 127) << 5; - } - else - { - int v0 = pEndpoints[0]; - int v1 = pEndpoints[1]; - int v2 = pEndpoints[2]; - int v3 = pEndpoints[3]; - int v4 = pEndpoints[4]; - int v5 = pEndpoints[5]; + if (em == endpoint_mode::cUseLeft) + neighbor_bx--; + else + neighbor_by--; - int mode = 0; - astc_hdr_pack_bit(mode, 0, v1, 7); - astc_hdr_pack_bit(mode, 1, v2, 7); - astc_hdr_pack_bit(mode, 2, v3, 7); + if ((neighbor_bx < 0) || (neighbor_by < 0)) + return false; - int va = v0; - astc_hdr_pack_bit(va, 8, v1, 6); + const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, calc_row_index(cur_by, neighbor_by, cur_row_index)); + if (!neighbor_blk.m_color_endpoint_modes[0]) + return false; - int vb0 = v2 & 63; - int vb1 = v3 & 63; - int vc = v1 & 63; + const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; + const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); - int vd0 = v4 & 0x7F; // this takes more bits than is sometimes needed - int vd1 = v5 & 0x7F; // this takes more bits than is sometimes needed - static const 
int8_t dbitstab[8] = { 7,6,7,6,5,6,5,6 }; - vd0 = astc_hdr_sign_extend(vd0, dbitstab[mode]); - vd1 = astc_hdr_sign_extend(vd1, dbitstab[mode]); + if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0]) + return false; - int x0 = basisu::get_bit(v2, 6); - int x1 = basisu::get_bit(v3, 6); - int x2 = basisu::get_bit(v4, 6); - int x3 = basisu::get_bit(v5, 6); - int x4 = basisu::get_bit(v4, 5); - int x5 = basisu::get_bit(v5, 5); + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, calc_row_index(cur_by, cur_by, cur_row_index)); + astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); + + log_blk.clear(); + assert((uint32_t)bm <= UINT8_MAX); + log_blk.m_user_mode = (uint8_t)bm; + log_blk.m_num_partitions = 1; + log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; + // Important: Notice how we're copying the neighbor's endpoint ISE range. Not using the mode's endpoint ISE range here. + // This is to avoid introducing more quantization error. + log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range; + log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; + log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; + log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; + log_blk.m_dual_plane = (uint8_t)bmd.m_dp; + log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + + memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values); + + const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 
2 : 1); + + bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); + if (!status) + return false; - const uint32_t ohm = 1U << mode; - if (ohm & 0xA4) va |= (x0 << 9); - if (ohm & 0x08) va |= (x2 << 9); - if (ohm & 0x50) va |= (x4 << 9); - if (ohm & 0x50) va |= (x5 << 10); - if (ohm & 0xA0) va |= (x1 << 10); - if (ohm & 0xC0) va |= (x2 << 11); - if (ohm & 0x04) vc |= (x1 << 6); - if (ohm & 0xE8) vc |= (x3 << 6); - if (ohm & 0x20) vc |= (x2 << 7); - if (ohm & 0x5B) vb0 |= (x0 << 6); - if (ohm & 0x5B) vb1 |= (x1 << 6); - if (ohm & 0x12) vb0 |= (x2 << 7); - if (ohm & 0x12) vb1 |= (x3 << 7); + astc_helpers::log_astc_block decomp_blk; + decomp_blk.clear(); - const int shamt = (mode >> 1) ^ 3; - - va = (uint32_t)va << shamt; - vb0 = (uint32_t)vb0 << shamt; - vb1 = (uint32_t)vb1 << shamt; - vc = (uint32_t)vc << shamt; - vd0 = (uint32_t)vd0 << shamt; - vd1 = (uint32_t)vd1 << shamt; + decomp_blk.m_num_partitions = 1; + decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; + decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; + decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; - // qlog12 - e[1][0] = basisu::clamp(va, 0, 0xFFF); - e[1][1] = basisu::clamp(va - vb0, 0, 0xFFF); - e[1][2] = basisu::clamp(va - vb1, 0, 0xFFF); + requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); - e[0][0] = basisu::clamp(va - vc, 0, 0xFFF); - e[0][1] = basisu::clamp(va - vb0 - vc - vd0, 0, 0xFFF); - e[0][2] = basisu::clamp(va - vb1 - vc - vd1, 0, 0xFFF); + uint8_t transcode_weights[BLOCK_W * BLOCK_H * 2]; + requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); - if (maj_comp) - { - std::swap(e[0][0], 
e[0][maj_comp]); - std::swap(e[1][0], e[1][maj_comp]); - } - } - } + copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk); - //-------------------------------------------------------------------------------------------------------------------------- + status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); + if (!status) + return false; - bool decode_mode11_to_qlog12( - const uint8_t* pEndpoints, - int e[2][3], - uint32_t ise_endpoint_range) - { - assert(g_astc_hdr_core_initialized); - assert((ise_endpoint_range >= astc_helpers::FIRST_VALID_ENDPOINT_ISE_RANGE) && (ise_endpoint_range <= astc_helpers::LAST_VALID_ENDPOINT_ISE_RANGE)); + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + cur_row_index = (cur_row_index + 1) % REUSE_MAX_BUFFER_ROWS; + } - if (ise_endpoint_range == astc_helpers::BISE_256_LEVELS) - { - decode_mode11_to_qlog12_ise20(pEndpoints, e); - } - else - { - uint8_t dequantized_endpoints[NUM_MODE11_ENDPOINTS]; + break; + } + case endpoint_mode::cUseLeftDelta: + case endpoint_mode::cUseUpperDelta: + { + int neighbor_bx = cur_bx, neighbor_by = cur_by; - for (uint32_t i = 0; i < NUM_MODE11_ENDPOINTS; i++) - dequantized_endpoints[i] = astc_helpers::g_dequant_tables.get_endpoint_tab(ise_endpoint_range).m_ISE_to_val[pEndpoints[i]]; + if (em == endpoint_mode::cUseLeftDelta) + neighbor_bx--; + else + neighbor_by--; - decode_mode11_to_qlog12_ise20(dequantized_endpoints, e); - } + if ((neighbor_bx < 0) || (neighbor_by < 0)) + return false; - for (uint32_t i = 0; i < 2; i++) - { - if (e[i][0] > (int)MAX_QLOG12) - return false; + const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, calc_row_index(cur_by, neighbor_by, cur_row_index)); + if (!neighbor_blk.m_color_endpoint_modes[0]) + return false; - if (e[i][1] > (int)MAX_QLOG12) - return false; + const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; + const uint32_t num_endpoint_values = 
get_num_endpoint_vals(bmd.m_cem); - if (e[i][2] > (int)MAX_QLOG12) - return false; - } + if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0]) + return false; - return true; - } + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, calc_row_index(cur_by, cur_by, cur_row_index)); + astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); - //-------------------------------------------------------------------------------------------------------------------------- + log_blk.clear(); + assert((uint32_t)bm <= UINT8_MAX); + log_blk.m_user_mode = (uint8_t)bm; + log_blk.m_num_partitions = 1; + log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; + log_blk.m_dual_plane = bmd.m_dp; + log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; - bool transcode_bc6h_1subset(half_float h_e[3][2], const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk) - { - assert(g_astc_hdr_core_initialized); - assert((best_blk.m_weight_ise_range >= 1) && (best_blk.m_weight_ise_range <= 8)); - - if (best_blk.m_weight_ise_range == 5) - { - // Use 3-bit BC6H weights which are a perfect match for 3-bit ASTC weights, but encode 1-subset as 2 equal subsets - bc6h_enc_block_1subset_3bit_weights(&transcoded_bc6h_blk, h_e, best_blk.m_weights); - } - else - { - uint8_t bc6h_weights[16]; + log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range; + requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints); - if (best_blk.m_weight_ise_range == 1) - { - // weight ISE 1: 3 levels - static const uint8_t s_astc1_to_bc6h_3[3] = { 0, 8, 15 }; + const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS; + const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1; - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc1_to_bc6h_3[best_blk.m_weights[i]]; - } - else if 
(best_blk.m_weight_ise_range == 2) - { - // weight ISE 2: 4 levels - static const uint8_t s_astc2_to_bc6h_4[4] = { 0, 5, 10, 15 }; + const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank; + const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE; + const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range); - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc2_to_bc6h_4[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 3) - { - // weight ISE 3: 5 levels - static const uint8_t s_astc3_to_bc6h_4[5] = { 0, 4, 7, 11, 15 }; + for (uint32_t i = 0; i < num_endpoint_values; i++) + { + int cur_val = ise_to_rank[log_blk.m_endpoints[i]]; - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc3_to_bc6h_4[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 4) - { - // weight ISE 4: 6 levels - static const uint8_t s_astc4_to_bc6h_4[6] = { 0, 15, 3, 12, 6, 9 }; + int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit; - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc4_to_bc6h_4[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 6) - { - // weight ISE 6: 10 levels - static const uint8_t s_astc6_to_bc6h_4[10] = { 0, 15, 2, 13, 3, 12, 5, 10, 6, 9 }; + cur_val += delta; + if ((cur_val < 0) || (cur_val >= total_endpoint_levels)) + return false; + + log_blk.m_endpoints[i] = rank_to_ise[cur_val]; + } + + log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; + log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; + log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; + + const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 
2 : 1); + + bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); + if (!status) + return false; - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc6_to_bc6h_4[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 7) - { - // weight ISE 7: 12 levels - static const uint8_t s_astc7_to_bc6h_4[12] = { 0, 15, 4, 11, 1, 14, 5, 10, 2, 13, 6, 9 }; + astc_helpers::log_astc_block decomp_blk; + decomp_blk.clear(); - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc7_to_bc6h_4[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 8) - { - // 16 levels - memcpy(bc6h_weights, best_blk.m_weights, 16); - } - else - { - assert(0); - return false; - } + decomp_blk.m_num_partitions = 1; + decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; + decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; + decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; - bc6h_enc_block_1subset_4bit_weights(&transcoded_bc6h_blk, h_e, bc6h_weights); - } + requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); - return true; - } + uint8_t transcode_weights[BLOCK_W * BLOCK_H * 2]; + requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); - //-------------------------------------------------------------------------------------------------------------------------- + copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk); - bool transcode_bc6h_2subsets(uint32_t common_part_index, const astc_helpers::log_astc_block& best_blk, bc6h_block& transcoded_bc6h_blk) - { - assert(g_astc_hdr_core_initialized); - assert(best_blk.m_num_partitions == 2); 
- assert(common_part_index < basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); - - half_float bc6h_endpoints[2][3][2]; // [subset][comp][lh_index] + status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); + if (!status) + return false; - // UASTC HDR checks - // Both CEM's must be equal in 2-subset UASTC HDR. - if (best_blk.m_color_endpoint_modes[0] != best_blk.m_color_endpoint_modes[1]) - return false; - if ((best_blk.m_color_endpoint_modes[0] != 7) && (best_blk.m_color_endpoint_modes[0] != 11)) - return false; - - if (best_blk.m_color_endpoint_modes[0] == 7) - { - if (!(((best_blk.m_weight_ise_range == 1) && (best_blk.m_endpoint_ise_range == 20)) || - ((best_blk.m_weight_ise_range == 2) && (best_blk.m_endpoint_ise_range == 20)) || - ((best_blk.m_weight_ise_range == 3) && (best_blk.m_endpoint_ise_range == 19)) || - ((best_blk.m_weight_ise_range == 4) && (best_blk.m_endpoint_ise_range == 17)) || - ((best_blk.m_weight_ise_range == 5) && (best_blk.m_endpoint_ise_range == 15)))) - { - return false; - } - } - else - { - if (!(((best_blk.m_weight_ise_range == 1) && (best_blk.m_endpoint_ise_range == 14)) || - ((best_blk.m_weight_ise_range == 2) && (best_blk.m_endpoint_ise_range == 12)))) - { - return false; - } - } + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + cur_by++; + cur_row_index = (cur_row_index + 1) % REUSE_MAX_BUFFER_ROWS; + } - for (uint32_t s = 0; s < 2; s++) - { - int e[2][3]; - if (best_blk.m_color_endpoint_modes[0] == 7) - { - bool success = decode_mode7_to_qlog12(best_blk.m_endpoints + s * NUM_MODE7_ENDPOINTS, e, nullptr, best_blk.m_endpoint_ise_range); - if (!success) - return false; - } - else - { - bool success = decode_mode11_to_qlog12(best_blk.m_endpoints + s * NUM_MODE11_ENDPOINTS, e, best_blk.m_endpoint_ise_range); - if (!success) - return false; - } + break; + } + case endpoint_mode::cRaw: + { + const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; - for (uint32_t c = 0; c < 3; c++) - { - bc6h_endpoints[s][c][0] = 
qlog_to_half_slow(e[0][c], 12); - if (is_half_inf_or_nan(bc6h_endpoints[s][c][0])) - return false; + const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); - bc6h_endpoints[s][c][1] = qlog_to_half_slow(e[1][c], 12); - if (is_half_inf_or_nan(bc6h_endpoints[s][c][1])) - return false; - } - } + astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, calc_row_index(cur_by, cur_by, cur_row_index)); + astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); - uint8_t bc6h_weights[16]; - if (best_blk.m_weight_ise_range == 1) - { - static const uint8_t s_astc1_to_bc6h_3[3] = { 0, 4, 7 }; + log_blk.clear(); - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc1_to_bc6h_3[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 2) - { - static const uint8_t s_astc2_to_bc6h_3[4] = { 0, 2, 5, 7 }; + assert((uint32_t)bm <= UINT8_MAX); + log_blk.m_user_mode = (uint8_t)bm; - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc2_to_bc6h_3[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 3) - { - static const uint8_t s_astc3_to_bc6h_3[5] = { 0, 2, 4, 5, 7 }; + log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc3_to_bc6h_3[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 4) - { - static const uint8_t s_astc4_to_bc6h_3[6] = { 0, 7, 1, 6, 3, 4 }; + for (uint32_t p = 0; p < bmd.m_num_partitions; p++) + log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; - for (uint32_t i = 0; i < 16; i++) - bc6h_weights[i] = s_astc4_to_bc6h_3[best_blk.m_weights[i]]; - } - else if (best_blk.m_weight_ise_range == 5) - { - memcpy(bc6h_weights, best_blk.m_weights, 16); - } - else - { - assert(0); - return false; - } + log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range; + log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; - bc6h_enc_block_2subset_3bit_weights(&transcoded_bc6h_blk, common_part_index, 
bc6h_endpoints, bc6h_weights); + log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; + log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; + log_blk.m_dual_plane = (uint8_t)bmd.m_dp; + log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; - return true; - } + if (bmd.m_num_partitions == 2) + { + const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2); + log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index]; + } + else if (bmd.m_num_partitions == 3) + { + const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3); + log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index]; + } - //-------------------------------------------------------------------------------------------------------------------------- - // Transcodes an UASTC HDR block to BC6H. Must have been encoded to UASTC HDR, or this fails. - bool astc_hdr_transcode_to_bc6h(const astc_blk& src_blk, bc6h_block& dst_blk) - { - assert(g_astc_hdr_core_initialized); - if (!g_astc_hdr_core_initialized) - { - assert(0); - return false; - } + bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints); + if (!status) + return false; - astc_helpers::log_astc_block log_blk; + const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 
2 : 1); - if (!astc_helpers::unpack_block(&src_blk, log_blk, 4, 4)) - { - // Failed unpacking ASTC data - return false; - } + status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); + if (!status) + return false; - return astc_hdr_transcode_to_bc6h(log_blk, dst_blk); - } + astc_helpers::log_astc_block decomp_blk; + decomp_blk.clear(); + decomp_blk.m_dual_plane = bmd.m_dp; + decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; + decomp_blk.m_partition_id = log_blk.m_partition_id; - //-------------------------------------------------------------------------------------------------------------------------- - // Transcodes an UASTC HDR block to BC6H. Must have been encoded to UASTC HDR, or this fails. - bool astc_hdr_transcode_to_bc6h(const astc_helpers::log_astc_block& log_blk, bc6h_block& dst_blk) - { - assert(g_astc_hdr_core_initialized); - if (!g_astc_hdr_core_initialized) - { - assert(0); - return false; - } - - if (log_blk.m_solid_color_flag_ldr) - { - // Don't support LDR solid colors. 
- return false; - } + decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; - if (log_blk.m_solid_color_flag_hdr) - { - // Solid color HDR block - return bc6h_enc_block_solid_color(&dst_blk, log_blk.m_solid_color); - } + for (uint32_t p = 0; p < bmd.m_num_partitions; p++) + decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; - // Only support 4x4 grid sizes - if ((log_blk.m_grid_width != 4) || (log_blk.m_grid_height != 4)) - return false; - - // Don't support dual plane encoding - if (log_blk.m_dual_plane) - return false; + decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; + decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; - if (log_blk.m_num_partitions == 1) - { - // Handle 1 partition (or subset) - - // UASTC HDR checks - if ((log_blk.m_weight_ise_range < 1) || (log_blk.m_weight_ise_range > 8)) - return false; - - int e[2][3]; - bool success; + for (uint32_t p = 0; p < bmd.m_num_partitions; p++) + requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p); - if (log_blk.m_color_endpoint_modes[0] == 7) - { - if (log_blk.m_endpoint_ise_range != 20) - return false; + uint8_t transcode_weights[BLOCK_W * BLOCK_H * 2]; + requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); - success = decode_mode7_to_qlog12(log_blk.m_endpoints, e, nullptr, log_blk.m_endpoint_ise_range); - } - else if (log_blk.m_color_endpoint_modes[0] == 11) - { - // UASTC HDR checks - if (log_blk.m_weight_ise_range <= 7) - { - if (log_blk.m_endpoint_ise_range != 20) + copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk); + + status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); + if (!status) + return false; + + cur_bx++; + if (cur_bx == num_blocks_x) + { + cur_bx = 0; + 
cur_by++; + cur_row_index = (cur_row_index + 1) % REUSE_MAX_BUFFER_ROWS; + } + + break; + } + default: + { + assert(0); return false; + } + } + + break; } - else if (log_blk.m_endpoint_ise_range != 19) + default: { + assert(0); return false; } - - success = decode_mode11_to_qlog12(log_blk.m_endpoints, e, log_blk.m_endpoint_ise_range); + } } - else + + if (decoder.get_bits(16) != 0xA742) { + //fmt_error_printf("End marker not found!\n"); return false; } - if (!success) - return false; - - // Transform endpoints to half float - half_float h_e[3][2] = - { - { qlog_to_half_slow(e[0][0], 12), qlog_to_half_slow(e[1][0], 12) }, - { qlog_to_half_slow(e[0][1], 12), qlog_to_half_slow(e[1][1], 12) }, - { qlog_to_half_slow(e[0][2], 12), qlog_to_half_slow(e[1][2], 12) } - }; + //fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs()); - // Sanity check for NaN/Inf - for (uint32_t i = 0; i < 2; i++) - if (is_half_inf_or_nan(h_e[0][i]) || is_half_inf_or_nan(h_e[1][i]) || is_half_inf_or_nan(h_e[2][i])) - return false; - - // Transcode to bc6h - if (!transcode_bc6h_1subset(h_e, log_blk, dst_blk)) - return false; + return true; } - else if (log_blk.m_num_partitions == 2) - { - // Handle 2 partition (or subset) - int common_bc7_pat_index = g_astc_partition_id_to_common_bc7_pat_index[log_blk.m_partition_id]; - if (common_bc7_pat_index < 0) - return false; - assert(common_bc7_pat_index < (int)basist::TOTAL_ASTC_BC6H_COMMON_PARTITIONS2); - - if (!transcode_bc6h_2subsets(common_bc7_pat_index, log_blk, dst_blk)) - return false; - } - else - { - // Only supports 1 or 2 partitions (or subsets) - return false; - } + } // namespace astc_6x6_hdr - return true; - } #endif // BASISD_SUPPORT_UASTC_HDR } // namespace basist diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder.h b/thirdparty/basis_universal/transcoder/basisu_transcoder.h index 8324e996989c..4667943e4610 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder.h +++ 
b/thirdparty/basis_universal/transcoder/basisu_transcoder.h @@ -79,30 +79,32 @@ namespace basist // Punch-through alpha is relatively easy to support, but full alpha is harder. This format is only here for completeness so opaque-only is fine for now. // See the BASISU_USE_ORIGINAL_3DFX_FXT1_ENCODING macro in basisu_transcoder_internal.h. - cTFPVRTC2_4_RGB = 18, // Opaque-only, almost BC1 quality, much faster to transcode and supports arbitrary texture dimensions (unlike PVRTC1 RGB). - cTFPVRTC2_4_RGBA = 19, // Opaque+alpha, slower to encode than cTFPVRTC2_4_RGB. Premultiplied alpha is highly recommended, otherwise the color channel can leak into the alpha channel on transparent blocks. + cTFPVRTC2_4_RGB = 18, // Opaque-only, almost BC1 quality, much faster to transcode and supports arbitrary texture dimensions (unlike PVRTC1 RGB). + cTFPVRTC2_4_RGBA = 19, // Opaque+alpha, slower to encode than cTFPVRTC2_4_RGB. Premultiplied alpha is highly recommended, otherwise the color channel can leak into the alpha channel on transparent blocks. - cTFETC2_EAC_R11 = 20, // R only (ETC2 EAC R11 unsigned) - cTFETC2_EAC_RG11 = 21, // RG only (ETC2 EAC RG11 unsigned), R=opaque.r, G=alpha - for tangent space normal maps + cTFETC2_EAC_R11 = 20, // R only (ETC2 EAC R11 unsigned) + cTFETC2_EAC_RG11 = 21, // RG only (ETC2 EAC RG11 unsigned), R=opaque.r, G=alpha - for tangent space normal maps - cTFBC6H = 22, // HDR, RGB only, unsigned - cTFASTC_HDR_4x4_RGBA = 23, // HDR, RGBA (currently UASTC HDR is only RGB), unsigned + cTFBC6H = 22, // HDR, RGB only, unsigned + cTFASTC_HDR_4x4_RGBA = 23, // HDR, RGBA (currently UASTC HDR 4x4 encoders are only RGB), unsigned // Uncompressed (raw pixel) formats // Note these uncompressed formats (RGBA32, 565, and 4444) can only be transcoded to from LDR input files (ETC1S or UASTC LDR). - cTFRGBA32 = 13, // 32bpp RGBA image stored in raster (not block) order in memory, R is first byte, A is last byte. 
- cTFRGB565 = 14, // 16bpp RGB image stored in raster (not block) order in memory, R at bit position 11 - cTFBGR565 = 15, // 16bpp RGB image stored in raster (not block) order in memory, R at bit position 0 - cTFRGBA4444 = 16, // 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0 + cTFRGBA32 = 13, // 32bpp RGBA image stored in raster (not block) order in memory, R is first byte, A is last byte. + cTFRGB565 = 14, // 16bpp RGB image stored in raster (not block) order in memory, R at bit position 11 + cTFBGR565 = 15, // 16bpp RGB image stored in raster (not block) order in memory, R at bit position 0 + cTFRGBA4444 = 16, // 16bpp RGBA image stored in raster (not block) order in memory, R at bit position 12, A at bit position 0 - // Note these uncompressed formats (HALF and 9E5) can only be transcoded to from HDR input files (UASTC HDR). - cTFRGB_HALF = 24, // 48bpp RGB half (16-bits/component, 3 components) - cTFRGBA_HALF = 25, // 64bpp RGBA half (16-bits/component, 4 components) (A will always currently 1.0, UASTC_HDR doesn't support alpha) - cTFRGB_9E5 = 26, // 32bpp RGB 9E5 (shared exponent, positive only, see GL_EXT_texture_shared_exponent) + // Note these uncompressed formats (HALF and 9E5) can only be transcoded to from HDR input files (UASTC HDR 4x4 or ASTC HDR 6x6). 
+ cTFRGB_HALF = 24, // 48bpp RGB half (16-bits/component, 3 components) + cTFRGBA_HALF = 25, // 64bpp RGBA half (16-bits/component, 4 components) (A will always currently 1.0, UASTC_HDR doesn't support alpha) + cTFRGB_9E5 = 26, // 32bpp RGB 9E5 (shared exponent, positive only, see GL_EXT_texture_shared_exponent) - cTFTotalTextureFormats = 27, + cTFASTC_HDR_6x6_RGBA = 27, // HDR, RGBA (currently our ASTC HDR 6x6 encodes are only RGB), unsigned - // Old enums for compatibility with code compiled against previous versions + cTFTotalTextureFormats = 28, + + // ----- The following are old/legacy enums for compatibility with code compiled against previous versions cTFETC1 = cTFETC1_RGB, cTFETC2 = cTFETC2_RGBA, cTFBC1 = cTFBC1_RGB, @@ -111,8 +113,8 @@ namespace basist cTFBC5 = cTFBC5_RG, // Previously, the caller had some control over which BC7 mode the transcoder output. We've simplified this due to UASTC, which supports numerous modes. - cTFBC7_M6_RGB = cTFBC7_RGBA, // Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. Highest quality of all the non-ETC1 formats. - cTFBC7_M5_RGBA = cTFBC7_RGBA, // Opaque+alpha, alpha channel will be opaque for opaque .basis files + cTFBC7_M6_RGB = cTFBC7_RGBA, // Opaque only, RGB or alpha if cDecodeFlagsTranscodeAlphaDataToOpaqueFormats flag is specified. Highest quality of all the non-ETC1 formats. + cTFBC7_M5_RGBA = cTFBC7_RGBA, // Opaque+alpha, alpha channel will be opaque for opaque .basis files cTFBC7_M6_OPAQUE_ONLY = cTFBC7_RGBA, cTFBC7_M5 = cTFBC7_RGBA, cTFBC7_ALT = 7, @@ -138,6 +140,9 @@ namespace basist // Returns true if the format is HDR. bool basis_transcoder_format_is_hdr(transcoder_texture_format fmt); + // Returns true if the format is LDR. + inline bool basis_transcoder_format_is_ldr(transcoder_texture_format fmt) { return !basis_transcoder_format_is_hdr(fmt); } + // Returns the basisu::texture_format corresponding to the specified transcoder_texture_format. 
basisu::texture_format basis_get_basisu_texture_format(transcoder_texture_format fmt); @@ -159,14 +164,25 @@ namespace basist // Returns true if the specified format was enabled at compile time, and is supported for the specific basis/ktx2 texture format (ETC1S, UASTC, or UASTC HDR). bool basis_is_format_supported(transcoder_texture_format tex_type, basis_tex_format fmt = basis_tex_format::cETC1S); + // Returns the block width/height for the specified basis texture file format. + uint32_t basis_tex_format_get_block_width(basis_tex_format fmt); + uint32_t basis_tex_format_get_block_height(basis_tex_format fmt); + + bool basis_tex_format_is_hdr(basis_tex_format fmt); + inline bool basis_tex_format_is_ldr(basis_tex_format fmt) { return !basis_tex_format_is_hdr(fmt); } + // Validates that the output buffer is large enough to hold the entire transcoded texture. // For uncompressed texture formats, most input parameters are in pixels, not blocks. Blocks are 4x4 pixels. bool basis_validate_output_buffer_size(transcoder_texture_format target_format, uint32_t output_blocks_buf_size_in_blocks_or_pixels, uint32_t orig_width, uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels, - uint32_t output_rows_in_pixels, - uint32_t total_slice_blocks); + uint32_t output_rows_in_pixels); + + // Computes the size in bytes of a transcoded image or texture, taking into account the format's block width/height and any minimum size PVRTC1 requirements required by OpenGL. + // Note the returned value is not necessarily the # of bytes a transcoder could write to the output buffer due to these minimum PVRTC1 requirements. + // (These PVRTC1 requirements are not ours, but OpenGL's.) + uint32_t basis_compute_transcoded_image_size_in_bytes(transcoder_texture_format target_format, uint32_t orig_width, uint32_t orig_height); class basisu_transcoder; @@ -197,7 +213,9 @@ namespace basist } }; - // Low-level helper class that does the actual transcoding. 
+ // Low-level helper classes that do the actual transcoding. + + // ETC1S class basisu_lowlevel_etc1s_transcoder { friend class basisu_transcoder; @@ -216,18 +234,18 @@ namespace basist bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const bool is_video, const bool is_alpha_slice, const uint32_t level_index, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, - basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0); + basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0, uint32_t decode_flags = 0); bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0, - basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0) + basisu_transcoder_state* pState = nullptr, bool astc_transcode_alpha = false, void* pAlpha_blocks = nullptr, uint32_t output_rows_in_pixels = 0, uint32_t decode_flags = 0) { return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt, output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, header.m_tex_type == cBASISTexTypeVideoFrames, (slice_desc.m_flags & cSliceDescFlagsHasAlpha) != 0, slice_desc.m_level_index, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels, 
pState, astc_transcode_alpha, pAlpha_blocks, - output_rows_in_pixels); + output_rows_in_pixels, decode_flags); } // Container independent transcoding @@ -292,15 +310,92 @@ namespace basist // Used internally when decoding formats like ASTC that require both color and alpha data to be available when transcoding to the output format. cDecodeFlagsOutputHasAlphaIndices = 16, - cDecodeFlagsHighQuality = 32 + cDecodeFlagsHighQuality = 32, + + cDecodeFlagsNoETC1SChromaFiltering = 64 + }; + + // UASTC LDR 4x4 + class basisu_lowlevel_uastc_ldr_4x4_transcoder + { + friend class basisu_transcoder; + + public: + basisu_lowlevel_uastc_ldr_4x4_transcoder(); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0) + { + return transcode_slice(pDst_blocks, num_blocks_x, num_blocks_y, pImage_data, image_data_size, fmt, + output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, (header.m_flags & cBASISHeaderFlagHasAlphaSlices) != 0, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels, 
+ pState, output_rows_in_pixels, channel0, channel1, decode_flags); + } + + // Container independent transcoding + bool transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t slice_offset, uint32_t slice_length, + uint32_t decode_flags = 0, + bool has_alpha = false, + bool is_video = false, + uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, + uint32_t output_rows_in_pixels = 0, + int channel0 = -1, int channel1 = -1); + }; + + // UASTC HDR 4x4 + class basisu_lowlevel_uastc_hdr_4x4_transcoder + { + friend class basisu_transcoder; + + public: + basisu_lowlevel_uastc_hdr_4x4_transcoder(); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0); + + bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, + uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, const basis_file_header& header, const basis_slice_desc& slice_desc, uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, uint32_t output_rows_in_pixels = 0, int channel0 = -1, int channel1 = -1, uint32_t decode_flags = 0) + { + return transcode_slice(pDst_blocks, num_blocks_x, 
num_blocks_y, pImage_data, image_data_size, fmt, + output_block_or_pixel_stride_in_bytes, bc1_allow_threecolor_blocks, (header.m_flags & cBASISHeaderFlagHasAlphaSlices) != 0, slice_desc.m_orig_width, slice_desc.m_orig_height, output_row_pitch_in_blocks_or_pixels, + pState, output_rows_in_pixels, channel0, channel1, decode_flags); + } + + // Container independent transcoding + bool transcode_image( + transcoder_texture_format target_format, + void* pOutput_blocks, uint32_t output_blocks_buf_size_in_blocks_or_pixels, + const uint8_t* pCompressed_data, uint32_t compressed_data_length, + uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t orig_width, uint32_t orig_height, uint32_t level_index, + uint32_t slice_offset, uint32_t slice_length, + uint32_t decode_flags = 0, + bool has_alpha = false, + bool is_video = false, + uint32_t output_row_pitch_in_blocks_or_pixels = 0, + basisu_transcoder_state* pState = nullptr, + uint32_t output_rows_in_pixels = 0, + int channel0 = -1, int channel1 = -1); }; - class basisu_lowlevel_uastc_transcoder + // ASTC HDR 6x6 + class basisu_lowlevel_astc_hdr_6x6_transcoder { friend class basisu_transcoder; public: - basisu_lowlevel_uastc_transcoder(); + basisu_lowlevel_astc_hdr_6x6_transcoder(); bool transcode_slice(void* pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, @@ -331,12 +426,13 @@ namespace basist int channel0 = -1, int channel1 = -1); }; - class basisu_lowlevel_uastc_hdr_transcoder + // ASTC HDR 6x6 intermediate + class basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder { friend class basisu_transcoder; public: - basisu_lowlevel_uastc_hdr_transcoder(); + basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder(); bool transcode_slice(void* 
pDst_blocks, uint32_t num_blocks_x, uint32_t num_blocks_y, const uint8_t* pImage_data, uint32_t image_data_size, block_format fmt, uint32_t output_block_or_pixel_stride_in_bytes, bool bc1_allow_threecolor_blocks, bool has_alpha, const uint32_t orig_width, const uint32_t orig_height, uint32_t output_row_pitch_in_blocks_or_pixels = 0, @@ -379,6 +475,9 @@ namespace basist uint32_t m_num_blocks_y; uint32_t m_total_blocks; + uint32_t m_block_width; + uint32_t m_block_height; + uint32_t m_compressed_size; uint32_t m_slice_index; // the slice index in the .basis file @@ -400,10 +499,13 @@ namespace basist uint32_t m_orig_width; uint32_t m_orig_height; - + uint32_t m_width; uint32_t m_height; + uint32_t m_block_width; + uint32_t m_block_height; + uint32_t m_num_blocks_x; uint32_t m_num_blocks_y; uint32_t m_total_blocks; @@ -425,6 +527,9 @@ namespace basist uint32_t m_width; uint32_t m_height; + uint32_t m_block_width; + uint32_t m_block_height; + uint32_t m_num_blocks_x; uint32_t m_num_blocks_y; uint32_t m_total_blocks; @@ -474,6 +579,9 @@ namespace basist basis_tex_format m_tex_format; // ETC1S, UASTC, etc. + uint32_t m_block_width; + uint32_t m_block_height; + bool m_y_flipped; // true if the image was Y flipped bool m_etc1s; // true if the file is ETC1S bool m_has_alpha_slices; // true if the texture has alpha slices (for ETC1S: even slices RGB, odd slices alpha) @@ -502,7 +610,7 @@ namespace basist // Note that the number of mipmap levels for each image may differ, and that images may have different resolutions. uint32_t get_total_images(const void* pData, uint32_t data_size) const; - basis_tex_format get_tex_format(const void* pData, uint32_t data_size) const; + basis_tex_format get_basis_tex_format(const void* pData, uint32_t data_size) const; // Returns the number of mipmap levels in an image. 
uint32_t get_total_image_levels(const void* pData, uint32_t data_size, uint32_t image_index) const; @@ -532,7 +640,7 @@ namespace basist // It'll first find the slice(s) to transcode, then call transcode_slice() one or two times to decode both the color and alpha texture data (or RG texture data from two slices for BC5). // If the .basis file doesn't have alpha slices, the output alpha blocks will be set to fully opaque (all 255's). // Currently, to decode to PVRTC1 the basis texture's dimensions in pixels must be a power of 2, due to PVRTC1 format requirements. - // output_blocks_buf_size_in_blocks_or_pixels should be at least the image level's total_blocks (num_blocks_x * num_blocks_y), or the total number of output pixels if fmt==cTFRGBA32. + // output_blocks_buf_size_in_blocks_or_pixels should be at least the image level's total_blocks (num_blocks_x * num_blocks_y), or the total number of output pixels if fmt==cTFRGBA32 etc. // output_row_pitch_in_blocks_or_pixels: Number of blocks or pixels per row. If 0, the transcoder uses the slice's num_blocks_x or orig_width (NOT num_blocks_x * 4). Ignored for PVRTC1 (due to texture swizzling). // output_rows_in_pixels: Ignored unless fmt is uncompressed (cRGBA32, etc.). The total number of output rows in the output buffer. If 0, the transcoder assumes the slice's orig_height (NOT num_blocks_y * 4). 
// Notes: @@ -574,13 +682,15 @@ namespace basist const basisu_lowlevel_etc1s_transcoder& get_lowlevel_etc1s_decoder() const { return m_lowlevel_etc1s_decoder; } basisu_lowlevel_etc1s_transcoder& get_lowlevel_etc1s_decoder() { return m_lowlevel_etc1s_decoder; } - const basisu_lowlevel_uastc_transcoder& get_lowlevel_uastc_decoder() const { return m_lowlevel_uastc_decoder; } - basisu_lowlevel_uastc_transcoder& get_lowlevel_uastc_decoder() { return m_lowlevel_uastc_decoder; } + const basisu_lowlevel_uastc_ldr_4x4_transcoder& get_lowlevel_uastc_decoder() const { return m_lowlevel_uastc_decoder; } + basisu_lowlevel_uastc_ldr_4x4_transcoder& get_lowlevel_uastc_decoder() { return m_lowlevel_uastc_decoder; } private: mutable basisu_lowlevel_etc1s_transcoder m_lowlevel_etc1s_decoder; - mutable basisu_lowlevel_uastc_transcoder m_lowlevel_uastc_decoder; - mutable basisu_lowlevel_uastc_hdr_transcoder m_lowlevel_uastc_hdr_decoder; + mutable basisu_lowlevel_uastc_ldr_4x4_transcoder m_lowlevel_uastc_decoder; + mutable basisu_lowlevel_uastc_hdr_4x4_transcoder m_lowlevel_uastc_4x4_hdr_decoder; + mutable basisu_lowlevel_astc_hdr_6x6_transcoder m_lowlevel_astc_6x6_hdr_decoder; + mutable basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder m_lowlevel_astc_6x6_hdr_intermediate_decoder; bool m_ready_to_transcode; @@ -654,6 +764,12 @@ namespace basist basisu::packed_uint<4> m_alpha_slice_byte_length; }; + struct ktx2_astc_hdr_6x6_intermediate_image_desc + { + basisu::packed_uint<4> m_rgb_slice_byte_offset; + basisu::packed_uint<4> m_rgb_slice_byte_length; + }; + struct ktx2_animdata { basisu::packed_uint<4> m_duration; @@ -663,10 +779,22 @@ namespace basist #pragma pack(pop) const uint32_t KTX2_VK_FORMAT_UNDEFINED = 0; - const uint32_t KTX2_FORMAT_UASTC_4x4_SFLOAT_BLOCK = 1000066000; // TODO, is this correct? 
- const uint32_t KTX2_KDF_DF_MODEL_UASTC = 166; - const uint32_t KTX2_KDF_DF_MODEL_UASTC_HDR = 167; - const uint32_t KTX2_KDF_DF_MODEL_ETC1S = 163; + + // These are standard Vulkan texture VkFormat ID's, see https://registry.khronos.org/vulkan/specs/1.3-extensions/man/html/VkFormat.html + const uint32_t KTX2_FORMAT_ASTC_4x4_SFLOAT_BLOCK = 1000066000; + const uint32_t KTX2_FORMAT_ASTC_5x4_SFLOAT_BLOCK = 1000066001; + const uint32_t KTX2_FORMAT_ASTC_5x5_SFLOAT_BLOCK = 1000066002; + const uint32_t KTX2_FORMAT_ASTC_6x5_SFLOAT_BLOCK = 1000066003; + const uint32_t KTX2_FORMAT_ASTC_6x6_SFLOAT_BLOCK = 1000066004; + const uint32_t KTX2_FORMAT_ASTC_8x5_SFLOAT_BLOCK = 1000066005; + const uint32_t KTX2_FORMAT_ASTC_8x6_SFLOAT_BLOCK = 1000066006; + + const uint32_t KTX2_KDF_DF_MODEL_ASTC = 162; // 0xA2 + const uint32_t KTX2_KDF_DF_MODEL_ETC1S = 163; // 0xA3 + const uint32_t KTX2_KDF_DF_MODEL_UASTC_LDR_4X4 = 166; // 0xA6 + const uint32_t KTX2_KDF_DF_MODEL_UASTC_HDR_4X4 = 167; // 0xA7 + const uint32_t KTX2_KDF_DF_MODEL_ASTC_HDR_6X6_INTERMEDIATE = 168; // 0xA8, TODO - coordinate with Khronos on this + const uint32_t KTX2_IMAGE_IS_P_FRAME = 2; const uint32_t KTX2_UASTC_BLOCK_SIZE = 16; // also the block size for UASTC_HDR const uint32_t KTX2_MAX_SUPPORTED_LEVEL_COUNT = 16; // this is an implementation specific constraint and can be increased @@ -679,7 +807,8 @@ namespace basist { KTX2_SS_NONE = 0, KTX2_SS_BASISLZ = 1, - KTX2_SS_ZSTANDARD = 2 + KTX2_SS_ZSTANDARD = 2, + KTX2_SS_BASIS }; extern const uint8_t g_ktx2_file_identifier[12]; @@ -779,11 +908,15 @@ namespace basist // The image's physical width/height, which will always be divisible by 4 pixels. uint32_t m_width; uint32_t m_height; - - // The texture's dimensions in 4x4 texel blocks. + + // The texture's dimensions in 4x4 or 6x6 texel blocks. uint32_t m_num_blocks_x; uint32_t m_num_blocks_y; + // The format's block width/height (currently either 4 or 6). 
+ uint32_t m_block_width; + uint32_t m_block_height; + // The total number of blocks uint32_t m_total_blocks; @@ -853,14 +986,38 @@ namespace basist // Returns 0 or the number of layers in the texture array or texture video. Valid after init(). uint32_t get_layers() const { return m_header.m_layer_count; } - // Returns cETC1S, cUASTC4x4, or cUASTC_HDR_4x4. Valid after init(). - basist::basis_tex_format get_format() const { return m_format; } - - bool is_etc1s() const { return get_format() == basist::basis_tex_format::cETC1S; } + // Returns cETC1S, cUASTC4x4, cUASTC_HDR_4x4, cASTC_HDR_6x6, cASTC_HDR_6x6_INTERMEDIATE. Valid after init(). + basist::basis_tex_format get_basis_tex_format() const { return m_format; } + + // ETC1S LDR 4x4 + bool is_etc1s() const { return get_basis_tex_format() == basist::basis_tex_format::cETC1S; } - bool is_uastc() const { return get_format() == basist::basis_tex_format::cUASTC4x4; } + // UASTC LDR 4x4 (only) + bool is_uastc() const { return get_basis_tex_format() == basist::basis_tex_format::cUASTC4x4; } + + // Is ASTC HDR 4x4 or 6x6 + bool is_hdr() const + { + return basis_tex_format_is_hdr(get_basis_tex_format()); + } - bool is_hdr() const { return get_format() == basist::basis_tex_format::cUASTC_HDR_4x4; } + bool is_ldr() const + { + return !is_hdr(); + } + + bool is_hdr_4x4() const + { + return (get_basis_tex_format() == basist::basis_tex_format::cUASTC_HDR_4x4); + } + + bool is_hdr_6x6() const + { + return (get_basis_tex_format() == basist::basis_tex_format::cASTC_HDR_6x6) || (get_basis_tex_format() == basist::basis_tex_format::cASTC_HDR_6x6_INTERMEDIATE); + } + + uint32_t get_block_width() const { return basis_tex_format_get_block_width(get_basis_tex_format()); } + uint32_t get_block_height() const { return basis_tex_format_get_block_height(get_basis_tex_format()); } // Returns true if the ETC1S file has two planes (typically RGBA, or RRRG), or true if the UASTC file has alpha data. Valid after init(). 
uint32_t get_has_alpha() const { return m_has_alpha; } @@ -893,10 +1050,12 @@ namespace basist // Key value field data. struct key_value { - // The key field is UTF8 and always zero terminated. + // The key field is UTF8 and always zero terminated. + // In memory we always append a zero terminator to the key. basisu::uint8_vec m_key; - // The value may be empty. It consists of raw bytes which may or may not be zero terminated. + // The value may be empty. In the KTX2 file it consists of raw bytes which may or may not be zero terminated. + // In memory we always append a zero terminator to the value. basisu::uint8_vec m_value; bool operator< (const key_value& rhs) const { return strcmp((const char*)m_key.data(), (const char *)rhs.m_key.data()) < 0; } @@ -917,12 +1076,17 @@ namespace basist // Returns the array of ETC1S image descriptors, which is only valid after get_etc1s_image_descs() is called. const basisu::vector& get_etc1s_image_descs() const { return m_etc1s_image_descs; } + const basisu::vector& get_astc_hdr_6x6_intermediate_image_descs() const { return m_astc_6x6_intermediate_image_descs; } + // Must have called startTranscoding() first uint32_t get_etc1s_image_descs_image_flags(uint32_t level_index, uint32_t layer_index, uint32_t face_index) const; // is_video() is only valid after start_transcoding() is called. // For ETC1S data, if this returns true you must currently transcode the file from first to last frame, in order, without skipping any frames. bool is_video() const { return m_is_video; } + + // Defaults to 0, only non-zero if the key existed in the source KTX2 file. + float get_ldr_hdr_upconversion_nit_multiplier() const { return m_ldr_hdr_upconversion_nit_multiplier; } // start_transcoding() MUST be called before calling transcode_image(). // This method decompresses the ETC1S global endpoint/selector codebooks, which is not free, so try to avoid calling it excessively. 
@@ -956,6 +1120,7 @@ namespace basist ktx2_etc1s_global_data_header m_etc1s_header; basisu::vector m_etc1s_image_descs; + basisu::vector m_astc_6x6_intermediate_image_descs; basist::basis_tex_format m_format; @@ -967,19 +1132,54 @@ namespace basist ktx2_df_channel_id m_dfd_chan0, m_dfd_chan1; basist::basisu_lowlevel_etc1s_transcoder m_etc1s_transcoder; - basist::basisu_lowlevel_uastc_transcoder m_uastc_transcoder; - basist::basisu_lowlevel_uastc_hdr_transcoder m_uastc_hdr_transcoder; + basist::basisu_lowlevel_uastc_ldr_4x4_transcoder m_uastc_transcoder; + basist::basisu_lowlevel_uastc_hdr_4x4_transcoder m_uastc_hdr_transcoder; + basist::basisu_lowlevel_astc_hdr_6x6_transcoder m_astc_hdr_6x6_transcoder; + basist::basisu_lowlevel_astc_hdr_6x6_intermediate_transcoder m_astc_hdr_6x6_intermediate_transcoder; ktx2_transcoder_state m_def_transcoder_state; bool m_has_alpha; bool m_is_video; + float m_ldr_hdr_upconversion_nit_multiplier; bool decompress_level_data(uint32_t level_index, basisu::uint8_vec& uncomp_data); + bool read_astc_6x6_hdr_intermediate_global_data(); bool decompress_etc1s_global_data(); bool read_key_values(); }; + // Replaces if the key already exists + inline void ktx2_add_key_value(ktx2_transcoder::key_value_vec& key_values, const std::string& key, const std::string& val) + { + assert(key.size()); + + basist::ktx2_transcoder::key_value* p = nullptr; + + // Try to find an existing key + for (size_t i = 0; i < key_values.size(); i++) + { + if (strcmp((const char*)key_values[i].m_key.data(), key.c_str()) == 0) + { + p = &key_values[i]; + break; + } + } + + if (!p) + p = key_values.enlarge(1); + + p->m_key.resize(0); + p->m_value.resize(0); + + p->m_key.resize(key.size() + 1); + memcpy(p->m_key.data(), key.c_str(), key.size()); + + p->m_value.resize(val.size() + 1); + if (val.size()) + memcpy(p->m_value.data(), val.c_str(), val.size()); + } + #endif // BASISD_SUPPORT_KTX2 // Returns true if the transcoder was compiled with KTX2 support. 
diff --git a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h index 17c9dc7c8c9d..8bf4abad6ca5 100644 --- a/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h +++ b/thirdparty/basis_universal/transcoder/basisu_transcoder_internal.h @@ -20,9 +20,10 @@ #pragma warning (disable: 4127) // conditional expression is constant #endif -// v1.50: Added UASTC HDR support -#define BASISD_LIB_VERSION 150 -#define BASISD_VERSION_STRING "01.50" +// v1.50: Added UASTC HDR 4x4 support +// v1.60: Added RDO ASTC HDR 6x6 and intermediate support +#define BASISD_LIB_VERSION 160 +#define BASISD_VERSION_STRING "01.60" #ifdef _DEBUG #define BASISD_BUILD_DEBUG @@ -91,10 +92,37 @@ namespace basist cUASTC_HDR_4x4, // HDR, transcodes only to 4x4 HDR ASTC, BC6H, or uncompressed cBC6H, cASTC_HDR_4x4, + cASTC_HDR_6x6, cTotalBlockFormats }; + inline uint32_t get_block_width(block_format fmt) + { + switch (fmt) + { + case block_format::cFXT1_RGB: + return 8; + case block_format::cASTC_HDR_6x6: + return 6; + default: + break; + } + return 4; + } + + inline uint32_t get_block_height(block_format fmt) + { + switch (fmt) + { + case block_format::cASTC_HDR_6x6: + return 6; + default: + break; + } + return 4; + } + const int COLOR5_PAL0_PREV_HI = 9, COLOR5_PAL0_DELTA_LO = -9, COLOR5_PAL0_DELTA_HI = 31; const int COLOR5_PAL1_PREV_HI = 21, COLOR5_PAL1_DELTA_LO = -21, COLOR5_PAL1_DELTA_HI = 21; const int COLOR5_PAL2_PREV_HI = 31, COLOR5_PAL2_DELTA_LO = -31, COLOR5_PAL2_DELTA_HI = 9; @@ -559,6 +587,12 @@ namespace basist return ct.init(total_used_syms, &code_sizes[0]); } + size_t get_bits_remaining() const + { + size_t total_bytes_remaining = m_pBuf_end - m_pBuf; + return total_bytes_remaining * 8 + m_bit_buf_size; + } + private: uint32_t m_buf_size; const uint8_t *m_pBuf; @@ -804,6 +838,7 @@ namespace basist const double MIN_DENORM_HALF_FLOAT = 0.000000059604645; // smallest positive subnormal number 
const double MIN_HALF_FLOAT = 0.00006103515625; // smallest positive normal number const double MAX_HALF_FLOAT = 65504.0; // largest normal number + const uint32_t MAX_HALF_FLOAT_AS_INT_BITS = 0x7BFF; // the half float rep for 65504.0 inline uint32_t get_bits(uint32_t val, int low, int high) { @@ -975,6 +1010,13 @@ namespace basist return (h * 64 + 30) / 31; } + // Suboptimal, but very close. + inline uint32_t bc6h_half_to_blog(half_float h, uint32_t num_bits) + { + assert(h <= MAX_BC6H_HALF_FLOAT_AS_UINT); + return (h * 64 + 30) / (31 * (1 << (16 - num_bits))); + } + struct bc6h_block { uint8_t m_bytes[16]; @@ -987,6 +1029,26 @@ namespace basist void bc6h_enc_block_2subset_mode9_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index] void bc6h_enc_block_2subset_3bit_weights(bc6h_block* pPacked_block, uint32_t common_part_index, const half_float pEndpoints[2][3][2], const uint8_t* pWeights); // pEndpoints[subset][comp][lh_index] bool bc6h_enc_block_solid_color(bc6h_block* pPacked_block, const half_float pColor[3]); + + struct bc6h_logical_block + { + uint32_t m_mode; + uint32_t m_partition_pattern; // must be 0 if 1 subset + uint32_t m_endpoints[3][4]; // [comp][subset*2+lh_index] - must be already properly packed + uint8_t m_weights[16]; // weights must be of the proper size, taking into account skipped MSB's which must be 0 + + void clear() + { + basisu::clear_obj(*this); + } + }; + + void pack_bc6h_block(bc6h_block& dst_blk, bc6h_logical_block& log_blk); + + namespace bc7_mode_5_encoder + { + void encode_bc7_mode_5_block(void* pDst_block, color32* pPixels, bool hq_mode); + } } // namespace basist diff --git a/thirdparty/libktx/lib/basis_transcode.cpp b/thirdparty/libktx/lib/basis_transcode.cpp index d7ecb7a0fdc7..43ad05915007 100644 --- a/thirdparty/libktx/lib/basis_transcode.cpp +++ b/thirdparty/libktx/lib/basis_transcode.cpp @@ -658,7 +658,7 @@ 
ktxTexture2_transcodeUastc(ktxTexture2* This, ktxLevelIndexEntry* protoLevelIndex = protoPriv._levelIndex; ktx_size_t levelOffsetWrite = 0; - basisu_lowlevel_uastc_transcoder uit; + basisu_lowlevel_uastc_ldr_4x4_transcoder uit; // See comment on same declaration in transcodeEtc1s. std::vector xcoderStates; xcoderStates.resize(This->isVideo ? This->numFaces : 1); diff --git a/thirdparty/libktx/patches/0003-basisu-1.60.patch b/thirdparty/libktx/patches/0003-basisu-1.60.patch new file mode 100644 index 000000000000..e772a0d74b15 --- /dev/null +++ b/thirdparty/libktx/patches/0003-basisu-1.60.patch @@ -0,0 +1,13 @@ +diff --git a/thirdparty/libktx/lib/basis_transcode.cpp b/thirdparty/libktx/lib/basis_transcode.cpp +index d7ecb7a0fd..43ad059150 100644 +--- a/thirdparty/libktx/lib/basis_transcode.cpp ++++ b/thirdparty/libktx/lib/basis_transcode.cpp +@@ -658,7 +658,7 @@ ktxTexture2_transcodeUastc(ktxTexture2* This, + ktxLevelIndexEntry* protoLevelIndex = protoPriv._levelIndex; + ktx_size_t levelOffsetWrite = 0; + +- basisu_lowlevel_uastc_transcoder uit; ++ basisu_lowlevel_uastc_ldr_4x4_transcoder uit; + // See comment on same declaration in transcodeEtc1s. + std::vector xcoderStates; + xcoderStates.resize(This->isVideo ? This->numFaces : 1);