// File: basisu_astc_hdr_6x6_enc.cpp // Copyright (C) 2019-2026 Binomial LLC. All Rights Reserved. // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #include "basisu_astc_hdr_6x6_enc.h" #include "basisu_enc.h" #include "basisu_astc_hdr_common.h" #include "basisu_math.h" #include "basisu_resampler.h" #include "basisu_resampler_filters.h" #define MINIZ_HEADER_FILE_ONLY #define MINIZ_NO_ZLIB_COMPATIBLE_NAMES #include "basisu_miniz.h" #include "3rdparty/android_astc_decomp.h" #include #include using namespace basisu; using namespace buminiz; using namespace basist::astc_6x6_hdr; namespace astc_6x6_hdr { static void atomic_max(std::atomic& atomic_var, uint32_t new_value) { uint32_t current = atomic_var.load(std::memory_order_relaxed); for ( ; ; ) { uint32_t new_max = std::max(current, new_value); if (atomic_var.compare_exchange_weak(current, new_max, std::memory_order_relaxed, std::memory_order_relaxed)) break; } } void astc_hdr_6x6_global_config::set_user_level(int level) { level = basisu::clamp(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL); m_master_comp_level = 0; m_highest_comp_level = 0; m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS; m_extra_patterns_flag = false; m_brute_force_partition_matching = false; switch (level) { case 0: { // Both reduce compression a lot when lambda>0 m_favor_higher_compression = false; m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2; break; } case 1: { m_master_comp_level = 0; m_highest_comp_level = 0; break; } case 2: { m_master_comp_level = 0; 
m_highest_comp_level = 1; break; } case 3: { m_master_comp_level = 1; m_highest_comp_level = 1; break; } case 4: { m_master_comp_level = 1; m_highest_comp_level = 2; break; } case 5: { m_master_comp_level = 1; m_highest_comp_level = 3; break; } case 6: { m_master_comp_level = 1; m_highest_comp_level = 4; break; } case 7: { m_master_comp_level = 2; m_highest_comp_level = 2; break; } case 8: { m_master_comp_level = 2; m_highest_comp_level = 3; break; } case 9: { m_master_comp_level = 2; m_highest_comp_level = 4; break; } case 10: { m_master_comp_level = 3; m_highest_comp_level = 3; break; } case 11: { m_master_comp_level = 3; m_highest_comp_level = 4; break; } case 12: default: { m_master_comp_level = 4; m_highest_comp_level = 4; m_extra_patterns_flag = true; m_brute_force_partition_matching = true; break; } } } const float m1 = 0.1593017578125f; // (2610 / 2^14) * (1/100) const float m2 = 78.84375f; // (2523 / 32) * (1/100) const float c1 = 0.8359375f; // 3424 / (2^12) const float c2 = 18.8515625f; // (2413 / 128) const float c3 = 18.6875f; // (2392 / 128) static float forwardPQ(float Y) { // 10,000 here is an absolute scale - it's in nits (cd per square meter) float L = Y * (1.0f / 10000.0f); float num = powf(L, m1); float N = powf((c1 + c2 * num) / (1 + c3 * num), m2); return N; } #if 0 static float inversePQ(float E) { float N = powf(E, 1.0f / m2); float num = basisu::maximum((N - c1), 0.0f) / (c2 - c3 * N); float L = powf(num, 1.0f / m1); return L * 10000.0f; } #endif // PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries. // max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86 // Highest error is for values less than SMALLEST_PQ_VAL_IN. 
// // Approximation is round trip lossless for 10-12 bits at [0,10000] nits: // for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096): // round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x // // bfloat16 has enough precision to handle 8-bit sRGB to linear conversions: // round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless const int PQ_APPROX_MIN_EXP = -16, PQ_APPROX_MAX_EXP = 16; const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1); const float SMALLEST_PQ_VAL_IN = 0.000015258829080f; const float SMALLEST_PQ_VAL = 0.000551903737f; // forwardPQ(SMALLEST_PQ_VAL_IN) const float LARGEST_PQ_VAL = 1.251312f; float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128]; static void init_pq_tables() { for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++) { for (int mant = 0; mant < 128; mant++) { bfloat16 b = bfloat16_init(1, exp, mant); float bf = bfloat16_to_float(b); float pq = forwardPQ(bf); g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq; } } //fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0])); //fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN)); } static inline float forwardPQTab(float v) { assert(g_pq_approx_tabs[0][0]); assert(v >= 0.0f); if (v == 0.0f) return 0.0f; bfloat16 bf = float_to_bfloat16(v, false); assert(v >= bfloat16_to_float(bf)); int exp = bfloat16_get_exp(bf); if (exp < PQ_APPROX_MIN_EXP) { // not accurate but should be good enough for our uses return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN)); } else if (exp > PQ_APPROX_MAX_EXP) return LARGEST_PQ_VAL; int mant = bfloat16_get_mantissa(bf); float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant]; float bf_f32 = bfloat16_to_float(bf); int next_mant = mant + 1; int next_exp = exp; if (next_mant == 128) { next_mant = 0; next_exp++; if (next_exp > PQ_APPROX_MAX_EXP) return a; } float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant]; 
bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant); float next_bf_f32 = bfloat16_to_float(next_bf); assert(v <= next_bf_f32); float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32); assert((lerp_factor >= 0) && (lerp_factor <= 1.0f)); return lerp(a, b, lerp_factor); } // 100 nits = ~.5 i // This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2. // To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true). // Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true. // // ITP info: // https://www.portrait.com/resource-center/ictcp-color-difference-metric/ // https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's) // This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP. // // Linear REC709 to REC2020/BT.2100 gamut conversion: // rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f; // rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f; // rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f; // const float S = 1.0f / 4096.0f; // l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2]; // m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2]; // s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2]; static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false) { vec3F rgb_2100(rgb_in); float l, m, s; if (!rec2020_bt2100_color_gamut) { // Assume REC 709 input color gamut // (REC2020_to_LMS * REC709_to_2020) * input_color l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f; m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 
0.7272980f + rgb_2100[2] * 0.1164508f; s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f; } else { // Assumes REC2020/BT.2100 input color gamut (this is from the spec) l = 0.412109375f * rgb_2100[0] + 0.52392578125f * rgb_2100[1] + 0.06396484375f * rgb_2100[2]; m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2]; s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f * rgb_2100[2]; } float ld = forwardPQTab(l); float md = forwardPQTab(m); float sd = forwardPQTab(s); ictcp[0] = .5f * ld + .5f * md; // if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear) if (itp_flag) ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd; else ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd; ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd; } static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg) { linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut); } #if 0 // Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut). 
static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false) { float ct = ictcp[1]; if (itp_flag) ct *= 2.0f; float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f; float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f; float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f; float l = inversePQ(ld); float m = inversePQ(md); float s = inversePQ(sd); rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f; rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f; rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f; } #endif struct half_vec3 { basist::half_float m_vals[3]; inline half_vec3() { } inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z) { m_vals[0] = x; m_vals[1] = y; m_vals[2] = z; } inline half_vec3(const half_vec3& other) { *this = other; } inline half_vec3& operator= (const half_vec3& rhs) { m_vals[0] = rhs.m_vals[0]; m_vals[1] = rhs.m_vals[1]; m_vals[2] = rhs.m_vals[2]; return *this; } inline void clear() { clear_obj(m_vals); } inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z) { m_vals[0] = x; m_vals[1] = y; m_vals[2] = z; return *this; } inline half_vec3& set(float x, float y, float z) { m_vals[0] = basist::float_to_half(x); m_vals[1] = basist::float_to_half(y); m_vals[2] = basist::float_to_half(z); return *this; } template inline half_vec3& set_vec(const T& vec) { m_vals[0] = basist::float_to_half(vec[0]); m_vals[1] = basist::float_to_half(vec[1]); m_vals[2] = basist::float_to_half(vec[2]); return *this; } template inline T get_vec() const { return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2])); } inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; } inline basist::half_float& 
operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; } float get_float_comp(uint32_t c) const { assert(c < 3); return basist::half_to_float(m_vals[c]); } half_vec3& set_float_comp(uint32_t c, float v) { assert(c < 3); m_vals[c] = basist::float_to_half(v); return *this; } }; struct half_vec4 { basist::half_float m_vals[4]; inline half_vec4() { } inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w) { m_vals[0] = x; m_vals[1] = y; m_vals[2] = z; m_vals[3] = w; } inline half_vec4(const half_vec4& other) { *this = other; } inline half_vec4& operator= (const half_vec4& rhs) { m_vals[0] = rhs.m_vals[0]; m_vals[1] = rhs.m_vals[1]; m_vals[2] = rhs.m_vals[2]; m_vals[3] = rhs.m_vals[3]; return *this; } inline void clear() { clear_obj(m_vals); } inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w) { m_vals[0] = x; m_vals[1] = y; m_vals[2] = z; m_vals[3] = w; return *this; } inline half_vec4& set(float x, float y, float z, float w) { m_vals[0] = basist::float_to_half(x); m_vals[1] = basist::float_to_half(y); m_vals[2] = basist::float_to_half(z); m_vals[3] = basist::float_to_half(w); return *this; } template inline half_vec4& set_vec(const T& vec) { m_vals[0] = basist::float_to_half(vec[0]); m_vals[1] = basist::float_to_half(vec[1]); m_vals[2] = basist::float_to_half(vec[2]); m_vals[3] = basist::float_to_half(vec[3]); return *this; } template inline T get_vec() const { return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3])); } inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; } inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; } float get_float_comp(uint32_t c) const { assert(c < 4); return basist::half_to_float(m_vals[c]); } half_vec4& set_float_comp(uint32_t c, float v) { assert(c < 4); m_vals[c] = 
basist::float_to_half(v); return *this; } }; const uint32_t MAX_BLOCK_W = 6, MAX_BLOCK_H = 6; struct trial_result { astc_helpers::log_astc_block m_log_blk; double m_err; bool m_valid; }; //---------------------------------------------------------- const uint32_t NUM_PART3_MAPPINGS = 6; static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] = { { 0, 1, 2 }, { 1, 2, 0 }, { 2, 0, 1 }, { 0, 2, 1 }, { 1, 0, 2 }, { 2, 1, 0 } }; struct partition_pattern_vec { uint8_t m_parts[6 * 6]; partition_pattern_vec() { clear(); } partition_pattern_vec(const partition_pattern_vec& other) { *this = other; } void clear() { memset(m_parts, 0, sizeof(m_parts)); } partition_pattern_vec& operator= (const partition_pattern_vec& rhs) { if (this == &rhs) return *this; memcpy(m_parts, rhs.m_parts, 36); return *this; } uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; } uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; } uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; } uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; } int get_squared_distance(const partition_pattern_vec& other) const { int total_dist = 0; for (uint32_t i = 0; i < 36; i++) total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]); return total_dist; } float get_distance(const partition_pattern_vec& other) const { return sqrtf((float)get_squared_distance(other)); } partition_pattern_vec get_permuted2(uint32_t permute_index) const { assert(permute_index <= 1); partition_pattern_vec res; for (uint32_t i = 0; i < 36; i++) { assert(m_parts[i] <= 1); res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index); } return res; } partition_pattern_vec get_permuted3(uint32_t permute_index) const { assert(permute_index <= 5); partition_pattern_vec res; for (uint32_t i = 0; i < 36; i++) { assert(m_parts[i] <= 2); res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]]; } return res; } 
partition_pattern_vec get_canonicalized() const { partition_pattern_vec res; int new_labels[3] = { -1, -1, -1 }; uint32_t next_index = 0; for (uint32_t i = 0; i < 36; i++) { uint32_t p = m_parts[i]; if (new_labels[p] == -1) new_labels[p] = next_index++; res.m_parts[i] = (uint8_t)new_labels[p]; } return res; } bool operator== (const partition_pattern_vec& rhs) const { return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0; } operator size_t() const { return basist::hash_hsieh(m_parts, sizeof(m_parts)); } }; struct vp_tree_node { partition_pattern_vec m_vantage_point; uint32_t m_point_index; float m_dist; int m_inner_node, m_outer_node; }; #define BRUTE_FORCE_PART_SEARCH (0) class vp_tree { public: vp_tree() { } void clear() { m_nodes.clear(); } // This requires no redundant patterns, i.e. all must be unique. bool init(uint32_t n, const partition_pattern_vec* pUnique_pats) { clear(); uint_vec pat_indices(n); for (uint32_t i = 0; i < n; i++) pat_indices[i] = i; std::pair root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices); if (root_idx.first == -1) return false; m_nodes.resize(1); m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first]; m_nodes[0].m_point_index = root_idx.first; m_nodes[0].m_dist = root_idx.second; m_nodes[0].m_inner_node = -1; m_nodes[0].m_outer_node = -1; uint_vec inner_list, outer_list; inner_list.reserve(n / 2); outer_list.reserve(n / 2); for (uint32_t pat_index = 0; pat_index < n; pat_index++) { if ((int)pat_index == root_idx.first) continue; const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]); if (dist <= root_idx.second) inner_list.push_back(pat_index); else outer_list.push_back(pat_index); } if (inner_list.size()) { m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list); if (m_nodes[0].m_inner_node < 0) return false; } if (outer_list.size()) { m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list); if (m_nodes[0].m_outer_node < 0) return false; } return true; } struct 
result { uint32_t m_pat_index; uint32_t m_mapping_index; float m_dist; bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; } bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; } }; class result_queue { enum { MaxSupportedSize = 256 + 1 }; public: result_queue() : m_cur_size(0) { } size_t get_size() const { return m_cur_size; } bool empty() const { return !m_cur_size; } typedef std::array result_array_type; const result_array_type& get_elements() const { return m_elements; } result_array_type& get_elements() { return m_elements; } void clear() { m_cur_size = 0; } void reserve(uint32_t n) { BASISU_NOTE_UNUSED(n); } const result& top() const { assert(m_cur_size); return m_elements[1]; } bool insert(const result& val, uint32_t max_size) { assert(max_size < MaxSupportedSize); if (m_cur_size >= MaxSupportedSize) return false; m_elements[++m_cur_size] = val; up_heap(m_cur_size); if (m_cur_size > max_size) pop(); return true; } bool pop() { if (m_cur_size == 0) return false; m_elements[1] = m_elements[m_cur_size--]; down_heap(1); return true; } float get_highest_dist() const { if (!m_cur_size) return 0.0f; return top().m_dist; } private: result_array_type m_elements; size_t m_cur_size; void up_heap(size_t index) { while ((index > 1) && (m_elements[index] > m_elements[index >> 1])) { std::swap(m_elements[index], m_elements[index >> 1]); index >>= 1; } } void down_heap(size_t index) { for ( ; ; ) { size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1; if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest])) largest = left_child; if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest])) largest = right_child; if (largest == index) break; std::swap(m_elements[index], m_elements[largest]); index = largest; } } }; void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results) { assert((num_subsets >= 2) 
&& (num_subsets <= 3)); results.clear(); if (!m_nodes.size()) return; uint32_t num_desired_pats; partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS]; if (num_subsets == 2) { num_desired_pats = 2; for (uint32_t i = 0; i < 2; i++) desired_pats[i] = desired_pat.get_permuted2(i); } else { num_desired_pats = NUM_PART3_MAPPINGS; for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++) desired_pats[i] = desired_pat.get_permuted3(i); } #if 0 find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results); #else find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results); #endif } private: basisu::vector m_nodes; void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) { float best_dist_to_vantage = BIG_FLOAT_VAL; uint32_t best_mapping = 0; for (uint32_t i = 0; i < num_desired_pats; i++) { float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point); if (dist < best_dist_to_vantage) { best_dist_to_vantage = dist; best_mapping = i; } } result r; r.m_dist = best_dist_to_vantage; r.m_mapping_index = best_mapping; r.m_pat_index = m_nodes[node_index].m_point_index; results.insert(r, max_results); if (best_dist_to_vantage <= m_nodes[node_index].m_dist) { // inner first if (m_nodes[node_index].m_inner_node >= 0) find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results); if (m_nodes[node_index].m_outer_node >= 0) { if ( (results.get_size() < max_results) || ((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) ) { find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results); } } } else { // outer first if (m_nodes[node_index].m_outer_node >= 0) find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results); if (m_nodes[node_index].m_inner_node >= 
0) { if ( (results.get_size() < max_results) || ((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) ) { find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results); } } } } void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results) { uint_vec node_stack; node_stack.reserve(16); node_stack.push_back(init_node_index); do { const uint32_t node_index = node_stack.back(); node_stack.pop_back(); float best_dist_to_vantage = BIG_FLOAT_VAL; uint32_t best_mapping = 0; for (uint32_t i = 0; i < num_desired_pats; i++) { float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point); if (dist < best_dist_to_vantage) { best_dist_to_vantage = dist; best_mapping = i; } } result r; r.m_dist = best_dist_to_vantage; r.m_mapping_index = best_mapping; r.m_pat_index = m_nodes[node_index].m_point_index; results.insert(r, max_results); if (best_dist_to_vantage <= m_nodes[node_index].m_dist) { if (m_nodes[node_index].m_outer_node >= 0) { if ((results.get_size() < max_results) || ((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist()) ) { node_stack.push_back(m_nodes[node_index].m_outer_node); } } // inner first if (m_nodes[node_index].m_inner_node >= 0) { node_stack.push_back(m_nodes[node_index].m_inner_node); } } else { if (m_nodes[node_index].m_inner_node >= 0) { if ((results.get_size() < max_results) || ((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist()) ) { node_stack.push_back(m_nodes[node_index].m_inner_node); } } // outer first if (m_nodes[node_index].m_outer_node >= 0) { node_stack.push_back(m_nodes[node_index].m_outer_node); } } } while (!node_stack.empty()); } // returns the index of the new node, or -1 on error int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& 
pat_indices) { std::pair root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices); if (root_idx.first < 0) return -1; m_nodes.resize(m_nodes.size() + 1); const uint32_t new_node_index = m_nodes.size_u32() - 1; m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first]; m_nodes[new_node_index].m_point_index = root_idx.first; m_nodes[new_node_index].m_dist = root_idx.second; m_nodes[new_node_index].m_inner_node = -1; m_nodes[new_node_index].m_outer_node = -1; uint_vec inner_list, outer_list; inner_list.reserve(pat_indices.size_u32() / 2); outer_list.reserve(pat_indices.size_u32() / 2); for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++) { const uint32_t pat_index = pat_indices[pat_indices_iter]; if ((int)pat_index == root_idx.first) continue; const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]); if (dist <= root_idx.second) inner_list.push_back(pat_index); else outer_list.push_back(pat_index); } if (inner_list.size()) m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list); if (outer_list.size()) m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list); return new_node_index; } // returns the pattern index of the vantage point (-1 on error), and the optimal split distance std::pair find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices) { BASISU_NOTE_UNUSED(num_unique_pats); const uint32_t n = pat_indices.size_u32(); assert(n); if (n == 1) return std::pair(pat_indices[0], 0.0f); float best_split_metric = -1.0f; int best_split_pat = -1; float best_split_dist = 0.0f; float best_split_var = 0.0f; basisu::vector< std::pair > dists; dists.reserve(n); float_vec float_dists; float_dists.reserve(n); for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++) { const uint32_t split_pat_index = pat_indices[pat_indices_iter]; 
assert(split_pat_index < num_unique_pats); const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index]; dists.resize(0); float_dists.resize(0); for (uint32_t j = 0; j < n; j++) { const uint32_t pat_index = pat_indices[j]; assert(pat_index < num_unique_pats); if (pat_index == split_pat_index) continue; float dist = trial_vantage.get_distance(pUnique_pats[pat_index]); dists.emplace_back(std::pair(dist, pat_index)); float_dists.push_back(dist); } stats s; s.calc(float_dists.size_u32(), float_dists.data()); std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) { return a.first < b.first; }); const uint32_t num_dists = dists.size_u32(); float split_dist = dists[num_dists / 2].first; if ((num_dists & 1) == 0) split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f; uint32_t total_inner = 0, total_outer = 0; for (uint32_t j = 0; j < n; j++) { const uint32_t pat_index = pat_indices[j]; if (pat_index == split_pat_index) continue; float dist = trial_vantage.get_distance(pUnique_pats[pat_index]); if (dist <= split_dist) total_inner++; else total_outer++; } float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer); if ( (split_metric > best_split_metric) || ((split_metric == best_split_metric) && (s.m_var > best_split_var)) ) { best_split_metric = split_metric; best_split_dist = split_dist; best_split_pat = split_pat_index; best_split_var = (float)s.m_var; } } return std::pair(best_split_pat, best_split_dist); } }; struct partition { uint64_t m_p; inline partition() : m_p(0) { } inline partition(uint64_t p) : m_p(p) { assert(p < (1ULL << 36)); } inline partition& operator=(uint64_t p) { assert(p < (1ULL << 36)); m_p = p; return *this; } inline bool operator< (const partition& p) const { return m_p < p.m_p; } inline bool operator== (const partition& p) const { return m_p == p.m_p; } inline operator size_t() const { return basist::hash_hsieh((const uint8_t *)&m_p, sizeof(m_p)); } }; 
partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2]; int g_part2_seed_to_unique_index[1024]; vp_tree g_part2_vp_tree; static inline vec3F vec3F_norm_approx(vec3F axis) { float l = axis.norm(); axis = (fabs(l) >= SMALL_FLOAT_VAL) ? (axis * bu_math::inv_sqrt(l)) : vec3F(0.577350269f); return axis; } static void init_partitions2_6x6() { #if 0 // makes pattern bits to the 10-bit ASTC seed index typedef basisu::hash_map partition2_hash_map; partition2_hash_map phash; phash.reserve(1024); for (uint32_t i = 0; i < 1024; i++) { uint64_t p_bits = 0; uint64_t p_bits_inv = 0; for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false); assert(p < 2); p_bits |= (p << (x + y * 6)); p_bits_inv |= ((1 - p) << (x + y * 6)); } } if (!p_bits) continue; if (p_bits == ((1ULL << 36) - 1)) continue; assert(p_bits < (1ULL << 36)); assert(p_bits_inv < (1ULL << 36)); if (phash.contains(p_bits)) { } else if (phash.contains(p_bits_inv)) { } else { auto res = phash.insert(p_bits, i); assert(res.second); BASISU_NOTE_UNUSED(res); } } uint32_t num_unique_partitions2 = 0; for (const auto& r : phash) { assert(r.second < 1024); const uint32_t unique_index = num_unique_partitions2; assert(unique_index < NUM_UNIQUE_PARTITIONS2); partition_pattern_vec pat_vec; for (uint32_t i = 0; i < 36; i++) pat_vec[i] = (uint8_t)((r.first >> i) & 1); g_partitions2[unique_index] = pat_vec; assert(g_part2_unique_index_to_seed[unique_index] == r.second); g_part2_seed_to_unique_index[r.second] = unique_index; num_unique_partitions2++; } assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2); #else for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++) { const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index]; assert(seed_index < 1024); assert(g_part2_seed_to_unique_index[seed_index] == 0); g_part2_seed_to_unique_index[seed_index] = unique_index; partition_pattern_vec& pat_vec = 
g_partitions2[unique_index]; for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false); assert(p < 2); pat_vec[x + y * 6] = p; } } } #endif g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2); } static bool estimate_partition2_6x6( const basist::half_float pBlock_pixels_half[][3], int* pBest_parts, uint32_t num_best_parts) { const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H; vec3F training_vecs[BLOCK_T], mean(0.0f); for (uint32_t i = 0; i < BLOCK_T; i++) { vec3F& v = training_vecs[i]; v[0] = (float)pBlock_pixels_half[i][0]; v[1] = (float)pBlock_pixels_half[i][1]; v[2] = (float)pBlock_pixels_half[i][2]; mean += v; } mean *= (1.0f / (float)BLOCK_T); vec3F max_vals(-BIG_FLOAT_VAL); for (uint32_t i = 0; i < BLOCK_T; i++) { vec3F& v = training_vecs[i]; max_vals = vec3F::component_max(max_vals, v); } // Initialize principle axis approximation vec3F axis(max_vals - mean); // Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x). 
for (uint32_t i = 0; i < BLOCK_T; i++) { axis = vec3F_norm_approx(axis); vec3F color(training_vecs[i] - mean); float d = color.dot(axis); axis += color * d; } if (axis.norm() < SMALL_FLOAT_VAL) axis.set(0.57735027f); else axis.normalize_in_place(); #if BRUTE_FORCE_PART_SEARCH int desired_parts[BLOCK_H][BLOCK_W]; // [y][x] for (uint32_t i = 0; i < BLOCK_T; i++) { float proj = (training_vecs[i] - mean).dot(axis); desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f; } #else partition_pattern_vec desired_part; for (uint32_t i = 0; i < BLOCK_T; i++) { float proj = (training_vecs[i] - mean).dot(axis); desired_part.m_parts[i] = proj < 0.0f; } #endif //interval_timer tm; //tm.start(); #if BRUTE_FORCE_PART_SEARCH uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2]; for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++) { const partition_pattern_vec &pat_vec = g_partitions2[part_index]; int total_sim_non_inv = 0; int total_sim_inv = 0; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { int part = pat_vec[x + y * 6]; if (part == desired_parts[y][x]) total_sim_non_inv++; if ((part ^ 1) == desired_parts[y][x]) total_sim_inv++; } } int total_sim = maximum(total_sim_non_inv, total_sim_inv); part_similarity[part_index] = (total_sim << 16) | part_index; } // part_index; std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2); for (uint32_t i = 0; i < num_best_parts; i++) pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF; #else vp_tree::result_queue results; results.reserve(num_best_parts); g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts); assert(results.get_size() == num_best_parts); const auto& elements = results.get_elements(); for (uint32_t i = 0; i < results.get_size(); i++) pBest_parts[i] = elements[1 + i].m_pat_index; #endif //fmt_printf("{} ", tm.get_elapsed_ms()); return true; } const uint32_t MIN_REFINE_LEVEL = 0; static bool encode_block_2_subsets( 
trial_result res[2], uint32_t grid_w, uint32_t grid_h, uint32_t cem, uint32_t weights_ise_range, uint32_t endpoints_ise_range, const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16, astc_hdr_codec_base_options& coptions, bool uber_mode_flag, int unique_pat_index, uint32_t comp_level, opt_mode_t mode11_opt_mode, bool refine_endpoints_flag) { const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS; res[0].m_valid = false; res[1].m_valid = false; const uint32_t BLOCK_W = 6, BLOCK_H = 6; astc_helpers::log_astc_block best_log_blk; clear_obj(best_log_blk); best_log_blk.m_num_partitions = 2; best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem; best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem; best_log_blk.m_grid_width = (uint8_t)grid_w; best_log_blk.m_grid_height = (uint8_t)grid_h; best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range; best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range; partition_pattern_vec* pPat = &g_partitions2[unique_pat_index]; const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index]; vec4F part_pixels_q16[2][64]; half_vec3 part_half_pixels[2][64]; uint8_t part_pixel_index[2][64]; uint32_t part_total_pixels[2] = { 0 }; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { uint32_t part_index = (*pPat)[x + y * BLOCK_W]; uint32_t l = part_total_pixels[part_index]; part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W]; part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W]; part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W); part_total_pixels[part_index] = l + 1; } // x } // y uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS]; uint8_t blk_weights[2][BLOCK_W * BLOCK_H]; uint32_t best_submode[2]; for (uint32_t part_iter = 0; part_iter < 2; part_iter++) { assert(part_total_pixels[part_iter]); double e; if (cem == 7) { e = encode_astc_hdr_block_mode_7( 
part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], best_log_blk.m_weight_ise_range, best_submode[part_iter], BIG_FLOAT_VAL, blk_endpoints[part_iter], blk_weights[part_iter], coptions, best_log_blk.m_endpoint_ise_range); } else { assert(cem == 11); e = encode_astc_hdr_block_mode_11( part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], best_log_blk.m_weight_ise_range, best_submode[part_iter], BIG_FLOAT_VAL, blk_endpoints[part_iter], blk_weights[part_iter], coptions, false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false, mode11_opt_mode); } if (e == BIG_FLOAT_VAL) return false; } // part_iter uint8_t ise_weights[BLOCK_W * BLOCK_H]; uint32_t src_pixel_index[2] = { 0, 0 }; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { uint32_t part_index = (*pPat)[x + y * BLOCK_W]; ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; src_pixel_index[part_index]++; } // x } // y if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H)) { best_log_blk.m_partition_id = (uint16_t)p_seed; memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals); memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals); memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H); res[0].m_valid = true; res[0].m_log_blk = best_log_blk; } else { uint8_t desired_weights[BLOCK_H * BLOCK_W]; const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val; for (uint32_t by = 0; by < BLOCK_H; by++) for (uint32_t bx = 0; bx < BLOCK_W; bx++) desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]]; uint8_t downsampled_weights[BLOCK_H * BLOCK_W]; const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h); if (!pDownsample_matrix) { assert(0); return false; } 
downsample_weight_grid( pDownsample_matrix, BLOCK_W, BLOCK_H, // source/from dimension (block size) grid_w, grid_h, // dest/to dimension (grid size) desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] downsampled_weights); // [wy][wx] best_log_blk.m_partition_id = (uint16_t)p_seed; memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals); memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals); const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise; for (uint32_t gy = 0; gy < grid_h; gy++) for (uint32_t gx = 0; gx < grid_w; gx++) best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]]; res[0].m_valid = true; res[0].m_log_blk = best_log_blk; if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6))) { bool any_refined = false; for (uint32_t part_iter = 0; part_iter < 2; part_iter++) { bool refine_status = refine_endpoints( cem, endpoints_ise_range, best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize BLOCK_W, BLOCK_H, // block dimensions grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], &part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets coptions, mode11_opt_mode); if (refine_status) any_refined = true; } if (any_refined) { res[1].m_valid = true; res[1].m_log_blk = best_log_blk; } } } return true; } typedef basisu::hash_map > partition3_hash_map; partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3]; int g_part3_seed_to_unique_index[1024]; vp_tree g_part3_vp_tree; static void init_partitions3_6x6() { uint32_t t = 0; for (uint32_t i = 0; i < 1024; i++) g_part3_seed_to_unique_index[i] = -1; partition3_hash_map part3_hash; part3_hash.reserve(512); for (uint32_t 
seed_index = 0; seed_index < 1024; seed_index++) { partition_pattern_vec p3; uint32_t part_hist[3] = { 0 }; for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false); assert(p < 3); p3.m_parts[x + y * 6] = (uint8_t)p; part_hist[p]++; } } if (!part_hist[0] || !part_hist[1] || !part_hist[2]) continue; uint32_t j; for (j = 0; j < NUM_PART3_MAPPINGS; j++) { partition_pattern_vec temp_part3(p3.get_permuted3(j)); if (part3_hash.contains(temp_part3)) break; } if (j < NUM_PART3_MAPPINGS) continue; part3_hash.insert(p3, std::make_pair(seed_index, t) ); assert(g_part3_unique_index_to_seed[t] == seed_index); g_part3_seed_to_unique_index[seed_index] = t; g_partitions3[t] = p3; t++; } g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3); } static bool estimate_partition3_6x6( const basist::half_float pBlock_pixels_half[][3], int* pBest_parts, uint32_t num_best_parts) { const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3; assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3)); vec3F training_vecs[BLOCK_T], mean(0.0f); float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL; vec3F cluster_centroids[NUM_SUBSETS]; clear_obj(cluster_centroids); for (uint32_t i = 0; i < BLOCK_T; i++) { vec3F& v = training_vecs[i]; v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]); float inten = v.dot(vec3F(1.0f)); if (inten < darkest_inten) { darkest_inten = inten; cluster_centroids[0] = v; } if (inten > brightest_inten) { brightest_inten = inten; cluster_centroids[1] = v; } } if (cluster_centroids[0] == cluster_centroids[1]) return false; float furthest_dist2 = 0.0f; for (uint32_t i = 0; i < BLOCK_T; i++) { vec3F& v = training_vecs[i]; float dist_a = v.squared_distance(cluster_centroids[0]); if (dist_a == 0.0f) continue; float dist_b = v.squared_distance(cluster_centroids[1]); if (dist_b == 
0.0f) continue; float dist2 = dist_a + dist_b; if (dist2 > furthest_dist2) { furthest_dist2 = dist2; cluster_centroids[2] = v; } } if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2])) return false; uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T]; uint32_t num_cluster_pixels[NUM_SUBSETS]; vec3F new_cluster_means[NUM_SUBSETS]; const uint32_t NUM_ITERS = 4; for (uint32_t s = 0; s < NUM_ITERS; s++) { memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels)); memset((void *)new_cluster_means, 0, sizeof(new_cluster_means)); for (uint32_t i = 0; i < BLOCK_T; i++) { float d[NUM_SUBSETS] = { training_vecs[i].squared_distance(cluster_centroids[0]), training_vecs[i].squared_distance(cluster_centroids[1]), training_vecs[i].squared_distance(cluster_centroids[2]) }; float min_d = d[0]; uint32_t min_idx = 0; for (uint32_t j = 1; j < NUM_SUBSETS; j++) { if (d[j] < min_d) { min_d = d[j]; min_idx = j; } } cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i; new_cluster_means[min_idx] += training_vecs[i]; num_cluster_pixels[min_idx]++; } // i for (uint32_t j = 0; j < NUM_SUBSETS; j++) { if (!num_cluster_pixels[j]) return false; cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j]; } } // s partition_pattern_vec desired_part; for (uint32_t p = 0; p < NUM_SUBSETS; p++) { for (uint32_t i = 0; i < num_cluster_pixels[p]; i++) { const uint32_t pix_index = cluster_pixels[p][i]; desired_part[pix_index] = (uint8_t)p; } } #if BRUTE_FORCE_PART_SEARCH partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS]; for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++) desired_parts[j] = desired_part.get_permuted3(j); uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3]; for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++) { const partition_pattern_vec& pat = g_partitions3[part_index]; uint32_t lowest_pat_dist = UINT32_MAX; for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++) { uint32_t dist = 
pat.get_squared_distance(desired_parts[p]); if (dist < lowest_pat_dist) lowest_pat_dist = dist; } part_similarity[part_index] = (lowest_pat_dist << 16) | part_index; } // part_index; std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3); for (uint32_t i = 0; i < num_best_parts; i++) pBest_parts[i] = part_similarity[i] & 0xFFFF; #else vp_tree::result_queue results; results.reserve(num_best_parts); g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts); assert(results.get_size() == num_best_parts); const auto& elements = results.get_elements(); for (uint32_t i = 0; i < results.get_size(); i++) pBest_parts[i] = elements[1 + i].m_pat_index; #endif return true; } static bool encode_block_3_subsets( trial_result& res, uint32_t cem, uint32_t grid_w, uint32_t grid_h, uint32_t weights_ise_range, uint32_t endpoints_ise_range, const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16, astc_hdr_codec_base_options& coptions, bool uber_mode_flag, const int* pEst_patterns, int num_est_patterns, uint32_t comp_level, opt_mode_t mode11_opt_mode) { BASISU_NOTE_UNUSED(uber_mode_flag); const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3; const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem); res.m_valid = false; double best_e = BIG_FLOAT_VAL; astc_helpers::log_astc_block best_log_blk; clear_obj(best_log_blk); best_log_blk.m_num_partitions = NUM_SUBSETS; best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem; best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem; best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem; best_log_blk.m_grid_width = (uint8_t)grid_w; best_log_blk.m_grid_height = (uint8_t)grid_h; best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range; best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range; const uint32_t n = num_est_patterns ? 
num_est_patterns : NUM_UNIQUE_PARTITIONS3; for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++) { const uint32_t unique_part_index = num_est_patterns ? pEst_patterns[unique_p_iter] : unique_p_iter; assert(unique_part_index < NUM_UNIQUE_PARTITIONS3); const partition_pattern_vec*pPart = &g_partitions3[unique_part_index]; vec4F part_pixels_q16[NUM_SUBSETS][64]; half_vec3 part_half_pixels[NUM_SUBSETS][64]; uint8_t part_pixel_index[NUM_SUBSETS][64]; uint32_t part_total_pixels[NUM_SUBSETS] = { 0 }; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W]; uint32_t l = part_total_pixels[part_index]; part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W]; part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W]; part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W); part_total_pixels[part_index] = l + 1; } // x } // y uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS]; uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H]; uint32_t best_submode[NUM_SUBSETS]; bool failed_flag = false; double e = 0.0f; for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++) { assert(part_total_pixels[part_iter]); double part_e; if (cem == 7) { part_e = encode_astc_hdr_block_mode_7( part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], best_log_blk.m_weight_ise_range, best_submode[part_iter], BIG_FLOAT_VAL, blk_endpoints[part_iter], blk_weights[part_iter], coptions, best_log_blk.m_endpoint_ise_range); } else { assert(cem == 11); part_e = encode_astc_hdr_block_mode_11( part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], best_log_blk.m_weight_ise_range, best_submode[part_iter], BIG_FLOAT_VAL, blk_endpoints[part_iter], blk_weights[part_iter], coptions, false, best_log_blk.m_endpoint_ise_range, 
uber_mode_flag, false, FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode); } if (part_e == BIG_FLOAT_VAL) { failed_flag = true; break; } e += part_e; } // part_iter if (failed_flag) continue; uint8_t ise_weights[BLOCK_W * BLOCK_H]; uint32_t src_pixel_index[NUM_SUBSETS] = { 0 }; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W]; ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; src_pixel_index[part_index]++; } // x } // y if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H)) { if (e < best_e) { best_e = e; best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index]; for (uint32_t p = 0; p < NUM_SUBSETS; p++) memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals); memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H); } } else { uint8_t desired_weights[BLOCK_H * BLOCK_W]; const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val; for (uint32_t by = 0; by < BLOCK_H; by++) for (uint32_t bx = 0; bx < BLOCK_W; bx++) desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]]; uint8_t downsampled_weights[BLOCK_H * BLOCK_W]; const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h); if (!pDownsample_matrix) { assert(0); return false; } downsample_weight_grid( pDownsample_matrix, BLOCK_W, BLOCK_H, // source/from dimension (block size) grid_w, grid_h, // dest/to dimension (grid size) desired_weights, // these are dequantized weights, NOT ISE symbols, [by][bx] downsampled_weights); // [wy][wx] astc_helpers::log_astc_block trial_blk(best_log_blk); trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index]; for (uint32_t p = 0; p < NUM_SUBSETS; p++) memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals); const 
auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise; for (uint32_t gy = 0; gy < grid_h; gy++) for (uint32_t gx = 0; gx < grid_w; gx++) trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]]; if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6))) { for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++) { bool refine_status = refine_endpoints( cem, endpoints_ise_range, trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize BLOCK_W, BLOCK_H, // block dimensions grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter], &part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets coptions, mode11_opt_mode); BASISU_NOTE_UNUSED(refine_status); } } half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x] bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16); assert(status); if (!status) return false; half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W]; for (uint32_t y = 0; y < BLOCK_H; y++) for (uint32_t x = 0; x < BLOCK_W; x++) decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]); double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions); if (trial_err < best_e) { best_e = trial_err; best_log_blk = trial_blk; } } } // unique_p_iter if (best_e < BIG_FLOAT_VAL) { res.m_log_blk = best_log_blk; res.m_valid = true; res.m_err = best_e; } else { res.m_valid = false; } return res.m_valid; } static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range) { const uint32_t MAX_VALS = 64; 
uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3]; uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1; assert((total_values) && (total_values <= MAX_VALS)); const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0]; const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1]; const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2]; for (uint32_t i = 0; i < total_values; i++) { uint32_t val = pVals[i]; uint32_t bits = val & ((1 << ep_bits) - 1); uint32_t tq = val >> ep_bits; bit_values[i] = bits; if (ep_trits) { assert(tq < 3); tq_accum += tq * tq_mul; tq_mul *= 3; if (tq_mul == 243) { assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values)); tq_values[total_tq_values++] = tq_accum; tq_accum = 0; tq_mul = 1; } } else if (ep_quints) { assert(tq < 5); tq_accum += tq * tq_mul; tq_mul *= 5; if (tq_mul == 125) { assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values)); tq_values[total_tq_values++] = tq_accum; tq_accum = 0; tq_mul = 1; } } } uint32_t total_bits_output = 0; for (uint32_t i = 0; i < total_tq_values; i++) { const uint32_t num_bits = ep_trits ? 8 : 7; coder.put_bits(tq_values[i], num_bits); total_bits_output += num_bits; } if (tq_mul > 1) { uint32_t num_bits; if (ep_trits) { if (tq_mul == 3) num_bits = 2; else if (tq_mul == 9) num_bits = 4; else if (tq_mul == 27) num_bits = 5; else //if (tq_mul == 81) num_bits = 7; } else { if (tq_mul == 5) num_bits = 3; else //if (tq_mul == 25) num_bits = 5; } coder.put_bits(tq_accum, num_bits); total_bits_output += num_bits; } for (uint32_t i = 0; i < total_values; i++) { coder.put_bits(bit_values[i], ep_bits); total_bits_output += ep_bits; } return total_bits_output; } static inline uint32_t get_num_endpoint_vals(uint32_t cem) { assert((cem == 7) || (cem == 11)); return (cem == 11) ? 
basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS; } static void code_block(bitwise_coder& coder, const astc_helpers::log_astc_block& log_blk, block_mode block_mode_index, endpoint_mode em, const uint8_t *pEP_deltas) { coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes); coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal); const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]); if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta)) { assert(log_blk.m_num_partitions == 1); for (uint32_t i = 0; i < num_endpoint_vals; i++) coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS); } else if (em == endpoint_mode::cRaw) { if (log_blk.m_num_partitions == 2) { const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id]; assert(unique_partition_index != -1); coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2); } else if (log_blk.m_num_partitions == 3) { const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id]; assert(unique_partition_index != -1); coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3); } encode_values(coder, num_endpoint_vals * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range); } encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 
2 : 1), log_blk.m_weights, log_blk.m_weight_ise_range); } struct smooth_map_params { bool m_no_mse_scaling; float m_max_smooth_std_dev; float m_smooth_max_mse_scale; float m_max_med_smooth_std_dev; float m_med_smooth_max_mse_scale; float m_max_ultra_smooth_std_dev; float m_ultra_smooth_max_mse_scale; bool m_debug_images; smooth_map_params() { clear(); } void clear() { m_no_mse_scaling = false; // 3x3 region m_max_smooth_std_dev = 100.0f; m_smooth_max_mse_scale = 13000.0f; // 7x7 region m_max_med_smooth_std_dev = 9.0f; m_med_smooth_max_mse_scale = 15000.0f; // 11x11 region m_max_ultra_smooth_std_dev = 4.0f; //m_ultra_smooth_max_mse_scale = 4500.0f; //m_ultra_smooth_max_mse_scale = 10000.0f; //m_ultra_smooth_max_mse_scale = 50000.0f; //m_ultra_smooth_max_mse_scale = 100000.0f; //m_ultra_smooth_max_mse_scale = 400000.0f; //m_ultra_smooth_max_mse_scale = 800000.0f; m_ultra_smooth_max_mse_scale = 2000000.0f; m_debug_images = true; } }; Resampler::Contrib_List* g_contrib_lists[7]; // 1-6 static void init_contrib_lists() { for (uint32_t dst_width = 1; dst_width <= 6; dst_width++) //g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f); g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f); } #if 0 static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16) { vec3F temp_block[6][6]; // [y][x] // first filter rows to temp_block if (grid_x == 6) { memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6); } else { Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x]; for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { vec3F p(0.0f); for (uint32_t i = 0; i < pRow_lists[x].n; i++) p += pSrc_block[y * 6 + pRow_lists[x].p[i].pixel] * 
pRow_lists[x].p[i].weight; p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL); temp_block[y][x] = p; } // x } // y } // filter columns if (grid_y == 6) { for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { for (uint32_t c = 0; c < 3; c++) { const basist::half_float h = basist::float_to_half(temp_block[y][x][c]); pDst_block_half3[x + y * 6][c] = h; pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h); } pDst_block_q16[x + y * 6][3] = 0.0f; } // x } // y } else { Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y]; for (uint32_t x = 0; x < 6; x++) { for (uint32_t y = 0; y < 6; y++) { vec3F p(0.0f); for (uint32_t i = 0; i < pCol_lists[y].n; i++) p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight; p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL); for (uint32_t c = 0; c < 3; c++) { const basist::half_float h = basist::float_to_half(p[c]); pDst_block_half3[x + y * 6][c] = h; pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h); } pDst_block_q16[x + y * 6][3] = 0.0f; } // x } // y } } #endif static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block) { vec4F temp_block[6][6]; // [y][x] // first filter rows to temp_block if (grid_x == 6) { memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6); } else { Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x]; for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { vec3F p(0.0f); for (uint32_t i = 0; i < pRow_lists[x].n; i++) p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight; p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL); temp_block[y][x] = p; } // x } // y } // filter columns if (grid_y == 6) { for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { for (uint32_t c = 0; c < 3; c++) pDst_block[x + y * 6][c] = temp_block[y][x][c]; } // x } // y } else { Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y]; for (uint32_t x = 0; x < 6; x++) { for (uint32_t y = 0; y < 6; y++) { 
vec3F p(0.0f); for (uint32_t i = 0; i < pCol_lists[y].n; i++) p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight; p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL); pDst_block[x + y * 6] = p; } // x } // y } } static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block) { vec3F temp_block[6][6]; // [y][x] // first filter rows to temp_block if (grid_x == 6) { memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6); } else { Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x]; for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { vec3F p(0.0f); for (uint32_t i = 0; i < pRow_lists[x].n; i++) p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight; temp_block[y][x] = p; } // x } // y } // filter columns if (grid_y == 6) { memcpy((void *)pDst_block, temp_block, sizeof(vec3F) * 6 * 6); } else { Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y]; for (uint32_t x = 0; x < 6; x++) { for (uint32_t y = 0; y < 6; y++) { vec3F& p = pDst_block[x + y * 6]; p.set(0.0f); for (uint32_t i = 0; i < pCol_lists[y].n; i++) p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight; } // x } // y } } static float diff_blocks(const vec4F* pA, const vec4F* pB) { const uint32_t BLOCK_T = 36; float diff = 0.0f; for (uint32_t i = 0; i < BLOCK_T; i++) diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]); return diff * (1.0f / (float)BLOCK_T); } static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB) { const uint32_t BLOCK_T = 36; vec3F mean(0.0f); for (uint32_t i = 0; i < BLOCK_T; i++) { vec3F diff(pA[i] - pB[i]); mean += diff; } mean *= (1.0f / (float)BLOCK_T); vec3F diff_sum(0.0f); for (uint32_t i = 0; i < BLOCK_T; i++) { vec3F diff(pA[i] - pB[i]); diff -= mean; diff_sum += vec3F::component_mul(diff, diff); } vec3F var(diff_sum * (1.0f / (float)BLOCK_T)); vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), 
sqrtf(var[2])); return maximum(std_dev[0], std_dev[1], std_dev[2]); } static void create_smooth_maps2( vector2D& smooth_block_mse_scales, const image& orig_img, smooth_map_params& params, image* pUltra_smooth_img = nullptr) { const uint32_t width = orig_img.get_width(); const uint32_t height = orig_img.get_height(); //const uint32_t total_pixels = orig_img.get_total_pixels(); const uint32_t num_comps = 3; if (params.m_no_mse_scaling) { smooth_block_mse_scales.set_all(1.0f); return; } // TODO: - move up before the no mse scaling check (harmless as that is only a debug aid) smooth_block_mse_scales.resize(width, height); image smooth_vis, med_smooth_vis, ultra_smooth_vis; if (params.m_debug_images) { smooth_vis.resize(width, height); med_smooth_vis.resize(width, height); ultra_smooth_vis.resize(width, height); } for (uint32_t y = 0; y < height; y++) { for (uint32_t x = 0; x < width; x++) { { tracked_stat_dbl comp_stats[4]; for (int yd = -1; yd <= 1; yd++) { for (int xd = -1; xd <= 1; xd++) { const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd); comp_stats[0].update((float)p[0]); comp_stats[1].update((float)p[1]); comp_stats[2].update((float)p[2]); } } float max_std_dev = 0.0f; for (uint32_t i = 0; i < num_comps; i++) max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev()); float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f); //yl = powf(yl, 2.0f); yl = powf(yl, 1.0f / 2.0f); // substantially less bits smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl); if (params.m_debug_images) { //smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255)); // white=high local activity (edges/detail) // black=low local activity (smooth - error is amplified) smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255)); } } { tracked_stat_dbl comp_stats[4]; const int S = 3; for (int yd = -S; yd < S; yd++) { for (int xd 
= -S; xd < S; xd++) { const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd); comp_stats[0].update((float)p[0]); comp_stats[1].update((float)p[1]); comp_stats[2].update((float)p[2]); } } float max_std_dev = 0.0f; for (uint32_t i = 0; i < num_comps; i++) max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev()); float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f); //yl = powf(yl, 2.0f); smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl); if (params.m_debug_images) med_smooth_vis(x, y).set((int)std::round(yl * 255.0f)); } { tracked_stat_dbl comp_stats[4]; const int S = 5; for (int yd = -S; yd < S; yd++) { for (int xd = -S; xd < S; xd++) { const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd); comp_stats[0].update((float)p[0]); comp_stats[1].update((float)p[1]); comp_stats[2].update((float)p[2]); } } float max_std_dev = 0.0f; for (uint32_t i = 0; i < num_comps; i++) max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev()); float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f); yl = powf(yl, 2.0f); smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl); if (params.m_debug_images) ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f)); } } } if (params.m_debug_images) { save_png("dbg_smooth_vis.png", smooth_vis); save_png("dbg_med_smooth_vis.png", med_smooth_vis); save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis); image vis_img(width, height); float max_scale = 0.0f; for (uint32_t y = 0; y < height; y++) for (uint32_t x = 0; x < width; x++) max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y)); for (uint32_t y = 0; y < height; y++) for (uint32_t x = 0; x < width; x++) vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale)); save_png("scale_vis.png", vis_img); } if (pUltra_smooth_img) 
*pUltra_smooth_img = ultra_smooth_vis; } const float REALLY_DARK_I_THRESHOLD = 0.0625f; const float REALLY_DARK_MSE_ERR_SCALE = 128.0f; const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f; static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment) { float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0]; float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1]; float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2]; float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p); if (delta_itp_dark_adjustment) { // We have to process a large range of inputs, including extremely dark inputs. // Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas. // This is to better handle very dark signals which could be explictly overexposed. float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]); s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s); err *= s; } return err; } static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment) { float total_mse = 0.0f; for (uint32_t y = 0; y < block_h; y++) { for (uint32_t x = 0; x < block_w; x++) { total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment); } // x } // y return total_mse * (1.0f / (float)(block_w * block_h)); } static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp) { const uint32_t n = block_w * block_h; assert(n <= 36); stats x_stats[3], y_stats[3]; comparative_stats xy_cov[3]; for (uint32_t c = 0; c < 3; c++) { x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3); y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3); } for (uint32_t c = 0; c < 3; c++) xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], 
&pComp_pixels_itp[0][c], 3, 3, &x_stats[c], &y_stats[c]); float ssim[3]; const double d = 1.0f, k1 = .01f, k2 = .03f; // weight mean error more highly to reduce blocking float ap = 1.5f, bp = 1.0f, cp = 1.0f; const double s_c1 = square(k1 * d), s_c2 = square(k2 * d); const double s_c3(s_c2 * .5f); for (uint32_t c = 0; c < 3; c++) { float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1)); lum = saturate(lum); float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2)); con = saturate(con); float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3)); str = saturate(str); ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp); } #if 0 float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f); #elif 1 float final_ssim = ssim[0] * ssim[1] * ssim[2]; #else const float LP = .75f; float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP); #endif return final_ssim; } // delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment) { float delta_i = a[0] - b[0]; float delta_t = a[1] - b[1]; float delta_p = a[2] - b[2]; float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p)); float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]); if (delta_itp_dark_adjustment) { // This is to better handle very dark signals which could be explictly overexposed. 
s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s); err *= s; } return err; } struct candidate_encoding { encoding_type m_encoding_type; basist::half_float m_solid_color[3]; uint32_t m_run_len; vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x] vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x] endpoint_mode m_endpoint_mode; block_mode m_block_mode; bitwise_coder m_coder; // The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC. // Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type. astc_helpers::log_astc_block m_coded_log_blk; // The block the decoder outputs. astc_helpers::log_astc_block m_decomp_log_blk; int m_reuse_delta_index; // m_t can get VERY large double m_t, m_d; float m_bits; candidate_encoding() { clear(); } candidate_encoding(const candidate_encoding &other) { *this = other; } candidate_encoding(candidate_encoding&& other) { *this = std::move(other); } candidate_encoding& operator=(const candidate_encoding& rhs) { if (this == &rhs) return *this; m_encoding_type = rhs.m_encoding_type; memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color)); m_run_len = rhs.m_run_len; memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels)); m_endpoint_mode = rhs.m_endpoint_mode; m_block_mode = rhs.m_block_mode; m_coder = rhs.m_coder; m_coded_log_blk = rhs.m_coded_log_blk; m_decomp_log_blk = rhs.m_decomp_log_blk; m_reuse_delta_index = rhs.m_reuse_delta_index; return *this; } candidate_encoding& operator=(candidate_encoding&& rhs) { if (this == &rhs) return *this; m_encoding_type = rhs.m_encoding_type; memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color)); m_run_len = rhs.m_run_len; memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels)); m_endpoint_mode = rhs.m_endpoint_mode; m_block_mode = rhs.m_block_mode; m_coder = std::move(rhs.m_coder); m_coded_log_blk = rhs.m_coded_log_blk; 
m_decomp_log_blk = rhs.m_decomp_log_blk; m_reuse_delta_index = rhs.m_reuse_delta_index; return *this; } void clear() { m_encoding_type = encoding_type::cInvalid; clear_obj(m_solid_color); m_run_len = 0; clear_obj(m_comp_pixels); m_endpoint_mode = endpoint_mode::cInvalid; m_block_mode = block_mode::cInvalid; m_coder.restart(); m_coded_log_blk.clear(); m_decomp_log_blk.clear(); m_t = 0; m_d = 0; m_bits = 0; m_reuse_delta_index = 0; } }; bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels) { assert((block_w <= 6) && (block_h <= 6)); half_vec4 decoded_pixels_half4[6 * 6]; // [y][x] bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16); assert(status); if (!status) return false; for (uint32_t y = 0; y < block_h; y++) { for (uint32_t x = 0; x < block_w; x++) { pPixels[x + y * block_w].set( basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]), basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]), basist::half_to_float(decoded_pixels_half4[x + y * block_w][2])); } // x } //y return true; } static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk) { astc_helpers::astc_block phys_blk; return astc_helpers::pack_astc_block(phys_blk, decomp_blk); } #define SYNC_MARKERS (0) static bool decode_file(const uint8_vec& comp_data, vector2D& decoded_blocks, uint32_t &width, uint32_t &height) { interval_timer tm; tm.start(); const uint32_t BLOCK_W = 6, BLOCK_H = 6; width = 0; height = 0; if (comp_data.size() <= 2*3) return false; basist::bitwise_decoder decoder; if (!decoder.init(comp_data.data(), comp_data.size_u32())) return false; // Read initial LE marker const uint32_t marker = decoder.get_bits(16); // Check for v1.60 and v2.0 markers - if it's not either, it's not valid data. 
if ((marker != UASTC_6x6_HDR_SIG0) && (marker != UASTC_6x6_HDR_SIG1)) return false; // Use original v1.60 behavior for tiny weight grid upsampling if it's the original marker, otherwise v2.0. const bool use_orig_behavior = (marker == UASTC_6x6_HDR_SIG0); width = decoder.get_bits(16); height = decoder.get_bits(16); if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM)) return false; const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W; const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H; const uint32_t total_blocks = num_blocks_x * num_blocks_y; decoded_blocks.resize(num_blocks_x, num_blocks_y); //memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes()); vector2D decoded_log_blocks(num_blocks_x, num_blocks_y); //memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes()); uint32_t cur_bx = 0, cur_by = 0; uint32_t step_counter = 0; BASISU_NOTE_UNUSED(step_counter); while (cur_by < num_blocks_y) { step_counter++; //if ((cur_bx == 9) && (cur_by == 13)) // printf("!"); #if SYNC_MARKERS uint32_t mk = decoder.get_bits(16); if (mk != 0xDEAD) { printf("!"); assert(0); return false; } #endif if (decoder.get_bits_remaining() < 1) return false; encoding_type et = encoding_type::cBlock; uint32_t b0 = decoder.get_bits(1); if (!b0) { uint32_t b1 = decoder.get_bits(1); if (b1) et = encoding_type::cReuse; else { uint32_t b2 = decoder.get_bits(1); if (b2) et = encoding_type::cSolid; else et = encoding_type::cRun; } } switch (et) { case encoding_type::cRun: { if (!cur_bx && !cur_by) return false; const uint32_t run_len = decoder.decode_vlc(5) + 1; uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x); if (run_len > num_blocks_remaining) return false; uint32_t prev_bx = cur_bx, prev_by = cur_by; if (cur_bx) prev_bx--; else { prev_bx = num_blocks_x - 1; prev_by--; } const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by); const 
astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by); for (uint32_t i = 0; i < run_len; i++) { decoded_log_blocks(cur_bx, cur_by) = prev_log_blk; decoded_blocks(cur_bx, cur_by) = prev_phys_blk; cur_bx++; if (cur_bx == num_blocks_x) { cur_bx = 0; cur_by++; } } break; } case encoding_type::cSolid: { const basist::half_float rh = (basist::half_float)decoder.get_bits(15); const basist::half_float gh = (basist::half_float)decoder.get_bits(15); const basist::half_float bh = (basist::half_float)decoder.get_bits(15); astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); log_blk.clear(); log_blk.m_solid_color_flag_hdr = true; log_blk.m_solid_color[0] = rh; log_blk.m_solid_color[1] = gh; log_blk.m_solid_color[2] = bh; log_blk.m_solid_color[3] = basist::float_to_half(1.0f); bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk); if (!status) return false; cur_bx++; if (cur_bx == num_blocks_x) { cur_bx = 0; cur_by++; } break; } case encoding_type::cReuse: { if (!cur_bx && !cur_by) return false; const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS); const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x; const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y; const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y; if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x)) return false; if (prev_by < 0) return false; const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by); const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by); if (prev_log_blk.m_solid_color_flag_hdr) return false; astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); log_blk = prev_log_blk; const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 
2 : 1); bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights); if (!status) return false; astc_helpers::log_astc_block decomp_blk; status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H); if (!status) return false; uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range); copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk, use_orig_behavior); status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); if (!status) return false; cur_bx++; if (cur_bx == num_blocks_x) { cur_bx = 0; cur_by++; } break; } case encoding_type::cBlock: { const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes); const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal); switch (em) { case endpoint_mode::cUseLeft: case endpoint_mode::cUseUpper: { int neighbor_bx = cur_bx, neighbor_by = cur_by; if (em == endpoint_mode::cUseLeft) neighbor_bx--; else neighbor_by--; if ((neighbor_bx < 0) || (neighbor_by < 0)) return false; const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by); if (!neighbor_blk.m_color_endpoint_modes[0]) return false; const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0]) return false; astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); log_blk.clear(); log_blk.m_num_partitions = 1; log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range; 
log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; log_blk.m_dual_plane = (uint8_t)bmd.m_dp; log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values); const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1); bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); if (!status) return false; astc_helpers::log_astc_block decomp_blk; decomp_blk.clear(); decomp_blk.m_num_partitions = 1; decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; decomp_blk.m_dual_plane = bmd.m_dp; decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, use_orig_behavior); status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); if (!status) return false; cur_bx++; if (cur_bx == num_blocks_x) { cur_bx = 0; cur_by++; } break; } case endpoint_mode::cUseLeftDelta: case endpoint_mode::cUseUpperDelta: { int neighbor_bx = cur_bx, neighbor_by = cur_by; if (em == endpoint_mode::cUseLeftDelta) neighbor_bx--; else neighbor_by--; if ((neighbor_bx < 0) || (neighbor_by < 0)) return false; const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by); if 
(!neighbor_blk.m_color_endpoint_modes[0]) return false; const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0]) return false; astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); log_blk.clear(); log_blk.m_num_partitions = 1; log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; log_blk.m_dual_plane = bmd.m_dp; log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range; basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints); const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS; const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1; const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank; const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE; const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range); for (uint32_t i = 0; i < num_endpoint_values; i++) { int cur_val = ise_to_rank[log_blk.m_endpoints[i]]; int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit; cur_val += delta; if ((cur_val < 0) || (cur_val >= total_endpoint_levels)) return false; log_blk.m_endpoints[i] = rank_to_ise[cur_val]; } log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 
2 : 1); bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); if (!status) return false; astc_helpers::log_astc_block decomp_blk; decomp_blk.clear(); decomp_blk.m_num_partitions = 1; decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem; decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp; decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, use_orig_behavior); status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); if (!status) return false; cur_bx++; if (cur_bx == num_blocks_x) { cur_bx = 0; cur_by++; } break; } case endpoint_mode::cRaw: { const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm]; const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem); astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by); astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by); log_blk.clear(); log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; for (uint32_t p = 0; p < bmd.m_num_partitions; p++) log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range; log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range; log_blk.m_grid_width = (uint8_t)bmd.m_grid_x; log_blk.m_grid_height = (uint8_t)bmd.m_grid_y; log_blk.m_dual_plane = (uint8_t)bmd.m_dp; 
log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; if (bmd.m_num_partitions == 2) { const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2); log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index]; } else if (bmd.m_num_partitions == 3) { const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3); log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index]; } bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints); if (!status) return false; const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1); status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights); if (!status) return false; astc_helpers::log_astc_block decomp_blk; decomp_blk.clear(); decomp_blk.m_dual_plane = bmd.m_dp; decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel; decomp_blk.m_partition_id = log_blk.m_partition_id; decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions; for (uint32_t p = 0; p < bmd.m_num_partitions; p++) decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem; decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range; decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range; for (uint32_t p = 0; p < bmd.m_num_partitions; p++) basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p); uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range); copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, 
use_orig_behavior); status = astc_helpers::pack_astc_block(phys_blk, decomp_blk); if (!status) return false; cur_bx++; if (cur_bx == num_blocks_x) { cur_bx = 0; cur_by++; } break; } default: { assert(0); return false; } } break; } default: { assert(0); return false; } } } if (decoder.get_bits(16) != 0xA742) { fmt_error_printf("End marker not found!\n"); return false; } //fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs()); return true; } static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels) { astc_helpers::log_astc_block log_blk; if (!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height)) return false; basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4]; if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16)) return false; const uint32_t total_block_pixels = block_width * block_height; for (uint32_t p = 0; p < total_block_pixels; p++) { pPixels[p][0] = basist::half_to_float(half_block[p][0]); pPixels[p][1] = basist::half_to_float(half_block[p][1]); pPixels[p][2] = basist::half_to_float(half_block[p][2]); pPixels[p][3] = basist::half_to_float(half_block[p][3]); } return true; } static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels) { return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height); } static bool pack_bc6h_image(const imagef &src_img, vector2D &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params) { const uint32_t width = src_img.get_width(); const uint32_t height = src_img.get_height(); if (pPacked_bc6h_img) pPacked_bc6h_img->resize(width, height); interval_timer tm; double total_enc_time = 0.0f; BASISU_NOTE_UNUSED(total_enc_time); const uint32_t num_blocks_x = src_img.get_block_width(4); const uint32_t num_blocks_y = src_img.get_block_height(4); 
bc6h_blocks.resize(num_blocks_x, num_blocks_y); for (uint32_t by = 0; by < num_blocks_y; by++) { for (uint32_t bx = 0; bx < num_blocks_x; bx++) { // Extract source image block vec4F block_pixels[4][4]; // [y][x] src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4); basist::half_float half_pixels[16 * 3]; // [y][x] for (uint32_t y = 0; y < 4; y++) { for (uint32_t x = 0; x < 4; x++) { for (uint32_t c = 0; c < 3; c++) { float v = block_pixels[y][x][c]; basist::half_float h = basist::float_to_half(v); half_pixels[(x + y * 4) * 3 + c] = h; } // c } // x } // y basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by); tm.start(); basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params); total_enc_time += tm.get_elapsed_secs(); if (pPacked_bc6h_img) { basist::half_float unpacked_blk[16 * 3]; bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false); assert(status); if (!status) { fmt_error_printf("unpack_bc6h() failed\n"); return false; } for (uint32_t y = 0; y < 4; y++) { for (uint32_t x = 0; x < 4; x++) { vec4F p; for (uint32_t c = 0; c < 3; c++) { float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]); p[c] = v; } // c p[3] = 1.0f; pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p); } // x } // y } } // bx } // by //fmt_printf("Total BC6H encode time: {}\n", total_enc_time); return true; } static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir) { vec3F q(p - line_org); vec3F v(q - q.dot(line_dir) * line_dir); return v.dot(v); } static void estimate_partitions_mode7_and_11( uint32_t num_parts, // 2 or 3 partitions uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats const astc_hdr_codec_base_options& coptions, // options uint32_t 
num_desired_pats, int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices { BASISU_NOTE_UNUSED(coptions); BASISU_NOTE_UNUSED(num_unique_pats); const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 6 assert(num_parts <= MAX_PARTS); struct candidate_res { float m_total_sq_dist; uint32_t m_index; bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; } }; const uint32_t MAX_CANDIDATES = 1024; assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES)); candidate_res mode11_candidates[MAX_CANDIDATES]; candidate_res mode7_candidates[MAX_CANDIDATES]; const vec3F grayscale_axis(0.5773502691f); for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++) { const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter]; assert(unique_part_index < num_unique_pats); const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index]; vec3F part_means[MAX_PARTS]; uint32_t part_total_texels[MAX_PARTS] = { 0 }; for (uint32_t i = 0; i < num_parts; i++) part_means[i].clear(); for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = (*pPat)(x, y); assert(part_index < num_parts); part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W]; part_total_texels[part_index]++; } // x } // y for (uint32_t i = 0; i < num_parts; i++) { assert(part_total_texels[i]); part_means[i] /= (float)part_total_texels[i]; } float part_cov[MAX_PARTS][6]; memset(part_cov, 0, sizeof(part_cov)); for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = (*pPat)(x, y); assert(part_index < num_parts); const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]); const float r = p[0], g = p[1], b = p[2]; part_cov[part_index][0] += r * r; part_cov[part_index][1] += r * g; part_cov[part_index][2] += r * b; part_cov[part_index][3] += g * g; 
part_cov[part_index][4] += g * b; part_cov[part_index][5] += b * b; } // x } // y // For each partition compute the total variance of all channels. float total_variance[MAX_PARTS]; for (uint32_t part_index = 0; part_index < num_parts; part_index++) total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5]; //vec3F part_axis[MAX_PARTS]; float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis for (uint32_t part_index = 0; part_index < num_parts; part_index++) { float* pCov = &part_cov[part_index][0]; float xr = .9f, xg = 1.0f, xb = .7f; const uint32_t NUM_POWER_ITERS = 4; for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++) { float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2]; float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4]; float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5]; float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b)); if (m >= 1e-10f) { m = 1.0f / m; r *= m; g *= m; b *= m; } xr = r; xg = g; xb = b; } float len_sq = xr * xr + xg * xg + xb * xb; if (len_sq < 1e-10f) { xr = grayscale_axis[0]; xg = grayscale_axis[0]; xb = grayscale_axis[0]; } else { len_sq = 1.0f / sqrtf(len_sq); xr *= len_sq; xg *= len_sq; xb *= len_sq; } { // Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis). float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2]; float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4]; float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5]; // Estimate the principle eigenvalue by computing the magnitude of the transformed vector. // The result is the variance along the principle axis. 
//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis //float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb; } { const float yrgb = grayscale_axis[0]; // Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector). float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2]; float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4]; float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5]; mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb; } } // part_index // Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis. // TODO: Could also compute the ratio of the principle axis's variance vs. the total variance. float mode11_total_sq_dist_to_line_alt = 0.0f; for (uint32_t part_index = 0; part_index < num_parts; part_index++) { float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]); mode11_total_sq_dist_to_line_alt += d; } { #if 0 // TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix), // then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances. 
float total_sq_dist_to_line = 0.0f; for (uint32_t i = 0; i < BLOCK_T; i++) { const uint32_t part_index = (*pPat)[i]; assert(part_index < num_parts); total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]); } mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line; #else mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt; #endif mode11_candidates[examine_iter].m_index = unique_part_index; } { float mode7_total_sq_dist_to_line_alt = 0.0f; for (uint32_t part_index = 0; part_index < num_parts; part_index++) { float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]); mode7_total_sq_dist_to_line_alt += d; } mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt; mode7_candidates[examine_iter].m_index = unique_part_index; } } // examine_iter std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]); std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]); for (uint32_t i = 0; i < num_desired_pats; i++) pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index; for (uint32_t i = 0; i < num_desired_pats; i++) pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index; } static void estimate_partitions_mode7( uint32_t num_parts, // 2 or 3 partitions uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats const astc_hdr_codec_base_options& coptions, // options uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices { BASISU_NOTE_UNUSED(coptions); BASISU_NOTE_UNUSED(num_unique_pats); const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3; assert(num_parts <= MAX_PARTS); struct candidate_res { 
float m_total_sq_dist; uint32_t m_index; bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; } }; const uint32_t MAX_CANDIDATES = 1024; assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES)); candidate_res candidates[MAX_CANDIDATES]; for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++) { const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter]; assert(unique_part_index < num_unique_pats); const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index]; vec3F part_means[MAX_PARTS]; uint32_t part_total_texels[MAX_PARTS] = { 0 }; for (uint32_t i = 0; i < num_parts; i++) part_means[i].clear(); for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = (*pPat)(x, y); assert(part_index < num_parts); part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W]; part_total_texels[part_index]++; } // x } // y for (uint32_t i = 0; i < num_parts; i++) { assert(part_total_texels[i]); part_means[i] /= (float)part_total_texels[i]; } vec3F part_axis(0.5773502691f); // TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix), // then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances. 
float total_sq_dist_to_line = 0.0f; for (uint32_t i = 0; i < BLOCK_T; i++) { const uint32_t part_index = (*pPat)[i]; assert(part_index < num_parts); total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis); } candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line; candidates[examine_iter].m_index = unique_part_index; } // examine_iter std::sort(&candidates[0], &candidates[num_pats_to_examine]); for (uint32_t i = 0; i < num_desired_pats; i++) pDesired_pat_indices[i] = candidates[i].m_index; } static float calc_deblocking_penalty_itp( uint32_t bx, uint32_t by, uint32_t width, uint32_t height, const imagef& pass_src_img_itp, const candidate_encoding& candidate) { float total_deblock_penalty = 0.0f; float total_orig_mse = 0.0f, total_comp_mse = 0.0f; uint32_t total_c = 0; for (uint32_t b = 0; b < 4; b++) { for (uint32_t i = 0; i < 6; i++) { int ox = 0, oy = 0, qx = 0, qy = 0; switch (b) { case 0: ox = bx * 6 + i; oy = (by - 1) * 6 + 5; qx = bx * 6 + i; qy = by * 6; break; case 1: ox = bx * 6 + i; oy = (by + 1) * 6; qx = bx * 6 + i; qy = by * 6 + 5; break; case 2: ox = (bx - 1) * 6 + 5; oy = by * 6 + i; qx = bx * 6; qy = by * 6 + i; break; case 3: ox = (bx + 1) * 6; oy = by * 6 + i; qx = bx * 6 + 5; qy = by * 6 + i; break; } if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height)) continue; const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy); const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy); const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block vec3F orig_delta_v(o_pixel_itp - q_pixel_itp); total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + square(orig_delta_v[2]); vec3F d_delta_v(o_pixel_itp - d_pixel_itp); total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]); total_c++; } } if (total_c) { total_orig_mse /= (float)total_c; total_comp_mse /= (float)total_c; if (total_orig_mse) { 
total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse); } } return total_deblock_penalty; } static bool calc_strip_size( float lambda, uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip, uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg) { uint32_t total_strips = 1; if (lambda == 0.0f) { if (!force_one_strip) { total_strips = total_threads; } } else { const uint32_t MIN_DESIRED_STRIPS = 8; const uint32_t MAX_TARGET_STRIPS = 32; const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12; if (!force_one_strip) { total_strips = maximum(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP); if (num_blocks_y >= MIN_DESIRED_STRIPS * 2) total_strips = maximum(total_strips, MIN_DESIRED_STRIPS); } total_strips = minimum(total_strips, MAX_TARGET_STRIPS); } uint32_t rows_per_strip = 0; if (total_strips <= 1) { rows_per_strip = num_blocks_y; } else { rows_per_strip = (num_blocks_y / total_strips) & ~1; if (rows_per_strip < 2) rows_per_strip = 2;// num_blocks_y; } assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0)); total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip; if (global_cfg.m_debug_output) { fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips); fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip); fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip); } uint32_t total_rows = 0; for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++) { uint32_t strip_first_by = strip_index * rows_per_strip; uint32_t strip_last_by = minimum(strip_first_by + rows_per_strip - 1, num_blocks_y); if (strip_index == (total_strips - 1)) strip_last_by = num_blocks_y - 1; uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1; total_rows += num_strip_block_rows; if (global_cfg.m_debug_output) fmt_printf("Strip row: {}, total 
block rows: {}\n", strip_index, num_strip_block_rows); } if (total_rows != num_blocks_y) { fmt_error_printf("Strip calc failed\n"); return false; } res_total_strips = total_strips; res_rows_per_strip = rows_per_strip; return true; } static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg) { const uint32_t width = src_img.get_width(), height = src_img.get_height(); dst_img.resize(width, height); for (uint32_t y = 0; y < height; y++) { for (uint32_t x = 0; x < width; x++) { vec3F src_rgb(src_img(x, y)); vec3F src_itp; linear_rgb_to_itp(src_rgb, src_itp, cfg); dst_img(x, y) = src_itp; } } } const uint32_t BLOCK_W = 6, BLOCK_H = 6; const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H; const float SOLID_PENALTY = 4.0f; const float REUSE_PENALTY = 1.0f; const float RUN_PENALTY = 10.0f; const float MSE_WEIGHT = 300000.0f; const float SSIM_WEIGHT = 200.0f; const float TWO_LEVEL_PENALTY = 1.425f; const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = .04f; const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = .04f; const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f; const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f; const float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f; struct uastc_hdr_6x6_debug_state { uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 }; uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 }; uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 }; uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 }; basisu::vector< basisu::stats > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3]; basisu::vector< basisu::comparative_stats > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3]; std::atomic m_total_gaussian1_blocks; std::atomic m_total_gaussian2_blocks; std::atomic m_total_filter_horizontal; std::atomic m_detail_stats[5]; std::atomic m_total_mode7_skips; 
std::atomic m_total_blocks_compressed;
	std::atomic m_total_candidates_considered;
	std::atomic m_max_candidates_considered;
	std::atomic m_total_part2_stats[4];
	// [0]: all channel pairs strongly correlated (no dual plane), [1]: no dual plane channel chosen,
	// [2 + c]: dual plane channel c chosen (see the dual plane selection in compress_strip_task).
	std::atomic m_dp_stats[5];
	std::atomic m_reuse_num_parts[4];
	std::atomic m_reuse_total_dp;

	// Debug visualization images, each resized to the image dimensions by init(), with the mutexes declared alongside them.
	imagef m_stat_vis;
	std::mutex m_stat_vis_mutex;

	image m_part_vis;
	image m_mode_vis;
	image m_mode_vis2;
	image m_grid_vis;
	image m_enc_vis;
	std::mutex m_vis_image_mutex;

	// Number of blocks encoded at each dynamically chosen comp_level.
	std::atomic m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];

	std::atomic m_total_jnd_replacements;

	std::mutex m_stats_mutex;

	// Pre-reserves room for 512 samples in every per-block-mode sample list.
	uastc_hdr_6x6_debug_state()
	{
		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
		{
			for (uint32_t j = 0; j < 3; j++)
			{
				m_block_mode_comp_stats[i][j].reserve(512);
				m_block_mode_comparative_stats[i][j].reserve(512);
			}
		}
	}

	// Resets all state for a new image: resizes the visualization images to width x height,
	// zeroes the histograms, empties the per-mode sample lists and stores 0 into every atomic counter.
	void init(uint32_t width, uint32_t height)
	{
		m_stat_vis.resize(width, height);
		m_part_vis.resize(width, height);
		m_mode_vis.resize(width, height);
		m_mode_vis2.resize(width, height);
		m_grid_vis.resize(width, height);
		m_enc_vis.resize(width, height);

		basisu::clear_obj(m_encoding_type_hist);
		basisu::clear_obj(m_endpoint_mode_hist);
		basisu::clear_obj(m_block_mode_hist);
		basisu::clear_obj(m_block_mode_total_bits);

		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
		{
			for (uint32_t j = 0; j < 3; j++)
			{
				m_block_mode_comp_stats[i][j].clear();
				m_block_mode_comparative_stats[i][j].clear();
			}
		}

		m_total_gaussian1_blocks.store(0);
		m_total_gaussian2_blocks.store(0);
		m_total_filter_horizontal.store(0);
		for (uint32_t i = 0; i < std::size(m_detail_stats); i++)
			m_detail_stats[i].store(0);
		m_total_mode7_skips.store(0);

		for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++)
			m_comp_level_hist[i].store(0);

		m_total_blocks_compressed.store(0);
		m_total_candidates_considered.store(0);
		m_max_candidates_considered.store(0);

		for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++)
			m_total_part2_stats[i].store(0);

		for (uint32_t i = 0; i < std::size(m_dp_stats); i++)
			m_dp_stats[i].store(0);

		for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++)
			m_reuse_num_parts[i] .store(0);

		m_reuse_total_dp.store(0);

		m_total_jnd_replacements.store(0);
	}

	// Prints all collected statistics with fmt_printf. total_blocks is the divisor for the percentage and
	// per-block average figures (NOTE(review): not guarded against total_blocks == 0).
	void print(uint32_t total_blocks) const
	{
		fmt_printf("Total blocks: {}\n", total_blocks);
		fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks);
		// Prints exactly 5 histogram entries (levels 0..4).
		fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);
		fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks);
		fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks);
		fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks);
		fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);
		fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);

		fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);
		fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);

		fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);
		fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);
		fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);
		// Index 0 of m_reuse_num_parts is not printed.
		fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);

		fmt_printf("\nEncoding type histogram:\n");
		for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++)
			fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]);

		fmt_printf("\nEndpoint mode histogram:\n");
		for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++)
			fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]);

		fmt_printf("\nBlock mode histogram:\n");

		// Totals accumulated while walking the block mode table.
		uint32_t total_dp = 0, total_sp = 0;
		uint32_t total_mode11 = 0, total_mode7 = 0;
		uint32_t part_hist[3] = { 0 };
		uint32_t part2_mode7_total = 0, part2_mode11_total = 0;
		uint32_t total_used_modes = 0;

		for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)
		{
			const auto& bm_desc = g_block_mode_descs[i];

			const uint32_t total_uses = m_block_mode_hist[i];

			if (bm_desc.m_dp)
				total_dp += total_uses;
			else
				total_sp += total_uses;

			// Any CEM other than 7 is counted as mode 11.
			if (bm_desc.m_cem == 7)
				total_mode7 += total_uses;
			else
				total_mode11 += total_uses;

			part_hist[bm_desc.m_num_partitions - 1] += total_uses;

			if (bm_desc.m_num_partitions == 2)
			{
				if (bm_desc.m_cem == 7)
					part2_mode7_total += total_uses;
				else
				{
					assert(bm_desc.m_cem == 11);
					part2_mode11_total += total_uses;
				}
			}

			float avg_std_dev = 0.0f;
			float avg_cross_correlations[3] = { 0 };

			if (m_block_mode_comp_stats[i][0].size())
			{
				// The sample count of list [0] is used for all 3 lists, which are indexed in parallel.
				const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();

				// Average over samples of the largest of the 3 per-list std devs.
				for (uint32_t j = 0; j < num_uses; j++)
					avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);
				avg_std_dev /= (float)num_uses;

				// Average absolute Pearson correlation for each of the 3 comparative lists.
				for (uint32_t j = 0; j < num_uses; j++)
				{
					avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson);
					avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson);
					avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson);
				}

				avg_cross_correlations[0] /= (float)num_uses;
				avg_cross_correlations[1] /= (float)num_uses;
				avg_cross_correlations[2] /= (float)num_uses;
			}

			fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n",
				i, total_uses,
				bm_desc.m_cem,
				bm_desc.m_dp, bm_desc.m_dp_channel,
				bm_desc.m_num_partitions,
				bm_desc.m_grid_x, bm_desc.m_grid_y,
				astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),
				astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),
				total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f,
				avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);

			if (total_uses)
				total_used_modes++;
		}

		fmt_printf("Total used modes: {}\n", total_used_modes);

		fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);
		fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);
		fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);
		fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);
	}
};

// Inputs and outputs handed to the strip compression tasks.
struct uastc_hdr_6x6_encode_state
{
	astc_hdr_codec_base_options master_coptions; // copied into each block's coptions before per-block tuning

	// Source image plus two filtered versions; compress_strip_task picks one per outer pass (0, 1, 2).
	imagef src_img;
	imagef src_img_filtered1;
	imagef src_img_filtered2;

	// The same three images in ITP space.
	imagef src_img_itp;
	imagef src_img_filtered1_itp;
	imagef src_img_filtered2_itp;

	vector2D smooth_block_mse_scales;

	imagef packed_img;

	basisu::vector strip_bits; // indexed by strip_index

	basisu::vector2D final_astc_blocks;

	vector2D coded_blocks; // accessed as coded_blocks(bx, by) when building reuse candidates
};

// Compresses one strip: block rows strip_first_by..strip_last_by (inclusive), each num_blocks_x blocks wide.
// Takes a reference to enc_state.strip_bits[strip_index] for the strip's coded bits and updates debug_state's
// counters. Returns false when a block fails validation or decoding.
// num_blocks_y and total_strips are unused.
static bool compress_strip_task(
	uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by,
	uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height,
	astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state)
{
	BASISU_NOTE_UNUSED(num_blocks_y);
	BASISU_NOTE_UNUSED(total_strips);

	vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x]
	basisu::clear_obj(prev_comp_pixels);

	uint32_t prev_run_len = 0;
	bitwise_coder prev_encoding;
	candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension
candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index]; const uint32_t CANDIDATES_TO_RESERVE = 1536; basisu::vector candidates; candidates.reserve(CANDIDATES_TO_RESERVE); const bool use_orig_behavior = global_cfg.m_write_basisu_1_6_compatible_files; for (uint32_t by = strip_first_by; by <= strip_last_by; by++) { const bool has_upper_neighbor = by > strip_first_by; for (uint32_t bx = 0; bx < num_blocks_x; bx++) { //if ((bx == 1) && (by == 2)) // basisu::fmt_printf("!"); for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++) { const bool has_left_neighbor = bx > 0; //const bool has_prev = has_left_neighbor || has_upper_neighbor; // Select either the original source image, or the Gaussian filtered version. // From here the encoder *must* use these 2 sources. const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 : ((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img); const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp : ((outer_pass == 1) ? 
enc_state.src_img_filtered1_itp : enc_state.src_img_itp); // Extract source image block vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x] pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x] pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations bool is_grayscale = true; candidates.resize(0); float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { vec3F rgb_input; for (uint32_t c = 0; c < 3; c++) { float v = block_pixels[y][x][c]; rgb_input[c] = v; const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v); assert(h == basist::float_to_half(v)); half_pixels[y][x][c] = h; block_pixels_q16[y][x][c] = (float)half_to_qlog16(h); half_pixels_as_floats[y][x][c] = (float)h; } // c float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B)); if (py < block_ly) block_ly = py; if (py > block_hy) block_hy = py; block_avg_y += py; //linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]); block_pixels_as_itp[y][x] = block_pixels_itp[y][x]; block_pixels_q16[y][x][3] = 0.0f; if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2])) is_grayscale = false; } // x } // y block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS); encode_astc_block_stats enc_block_stats; enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]); vec4F x_filtered[6][6], y_filtered[6][6]; filter_block(3, 6, 
(vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal) filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically) const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered); const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered); const bool filter_horizontally = filtered_x_err < filtered_y_err; //const float block_mag_gradient_mag = block_max_gradient_mag(bx, by); if (filter_horizontally) debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed); vec3F lowpass_filtered[6][6]; filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]); float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]); const bool very_detailed_block = lowpass_std_dev > 350.0f; const bool very_blurry_block = lowpass_std_dev < 30.0f; const bool super_blurry_block = lowpass_std_dev < 15.0f; basisu::stats half_comp_stats[3]; for (uint32_t c = 0; c < 3; c++) half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3); const float SINGLE_PART_HALF_THRESH = 256.0f; const float COMPLEX_HALF_THRESH = 1024.0f; // HACK HACK const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f; const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev); const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH); const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH); const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH); // Dynamically choose a comp_level for this block. 
astc_hdr_codec_base_options coptions(enc_state.master_coptions); uint32_t comp_level = global_cfg.m_master_comp_level; if (very_complex_block) comp_level = global_cfg.m_highest_comp_level; else if (complex_block) comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2; debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed); bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false; BASISU_NOTE_UNUSED(any_2subset_mode11_enabled); for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++) { if (comp_level == 0) { if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0) continue; } else if (comp_level == 1) { if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0) continue; } else if (comp_level == 2) { if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0) continue; } if (g_block_mode_descs[i].m_num_partitions == 2) { any_2subset_enabled = true; if (g_block_mode_descs[i].m_cem == 7) { any_2subset_mode7_enabled = true; } else { assert(g_block_mode_descs[i].m_cem == 11); any_2subset_mode11_enabled = true; } } else if (g_block_mode_descs[i].m_num_partitions == 3) any_3subset_enabled = true; } coptions.m_mode7_full_s_optimization = (comp_level >= 2); const bool uber_mode_flag = (comp_level >= 3); coptions.m_allow_uber_mode = uber_mode_flag; coptions.m_ultra_quant = (comp_level >= 4); coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2); coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2); coptions.m_disable_weight_plane_optimization = (comp_level >= 2); // ------------------- uint32_t total_used_block_chans = 0; for (uint32_t i = 0; i < 3; i++) total_used_block_chans += (half_comp_stats[i].m_range > 0.0f); const bool is_solid_block = (total_used_block_chans == 0); basisu::comparative_stats half_cross_chan_stats[3]; // R vs. 
G half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1], 3, 3, &half_comp_stats[0], &half_comp_stats[1]); // R vs. B half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2], 3, 3, &half_comp_stats[0], &half_comp_stats[2]); // G vs. B half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2], 3, 3, &half_comp_stats[1], &half_comp_stats[2]); const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson); const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson); const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson); float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL; for (uint32_t i = 0; i < 3; i++) { #if 0 // 9/5/2025, wrong metric, we're iterating channels pairs here, not individual channels. // On 3 active channel blocks this causes no difference. if (half_comp_stats[i].m_range > 0.0f) #else static const uint8_t s_chan_pairs[3][2] = { {0, 1}, {0, 2}, {1, 2} }; const uint32_t chanA = s_chan_pairs[i][0]; const uint32_t chanB = s_chan_pairs[i][1]; if ((half_comp_stats[chanA].m_range > 0.0f) && (half_comp_stats[chanB].m_range > 0.0f)) #endif { const float c = fabsf((float)half_cross_chan_stats[i].m_pearson); min_corr = minimum(min_corr, c); max_corr = maximum(max_corr, c); } } bool use_single_subset_mode7 = true; if (comp_level <= 1) { // TODO: could also compute angle between principle axis and the grayscale axis. // TODO: Transform grayscale axis by covar matrix, compute variance vs. 
total variance const float MODE7_MIN_CHAN_CORR = .5f; const float MODE7_PCA_ANGLE_THRESH = .9f; use_single_subset_mode7 = is_grayscale || is_solid_block || ((total_used_block_chans == 1) || (min_corr >= MODE7_MIN_CHAN_CORR)); if (use_single_subset_mode7) { float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f))); if (cos_ang < MODE7_PCA_ANGLE_THRESH) use_single_subset_mode7 = false; } } const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f); int desired_dp_chan = -1; if (total_used_block_chans <= 1) { // no need for dual plane (except possibly 2x2 weight grids for RDO) } else { if (min_corr >= STRONG_CORR_THRESH) { // all channel pairs strongly correlated, no need for dual plane debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed); } else { if (total_used_block_chans == 2) { if (half_comp_stats[0].m_range == 0.0f) { // r unused, check for strong gb correlation if (gb_corr < STRONG_CORR_THRESH) desired_dp_chan = 1; } else if (half_comp_stats[1].m_range == 0.0f) { // g unused, check for strong rb correlation if (rb_corr < STRONG_CORR_THRESH) desired_dp_chan = 0; } else { // b unused, check for strong rg correlation if (rg_corr < STRONG_CORR_THRESH) desired_dp_chan = 0; } } else { assert(total_used_block_chans == 3); // see if rg/rb is weakly correlated vs. gb if ((rg_corr < gb_corr) && (rb_corr < gb_corr)) desired_dp_chan = 0; // see if gr/gb is weakly correlated vs. rb else if ((rg_corr < rb_corr) && (gb_corr < rb_corr)) desired_dp_chan = 1; // assume b is weakest else desired_dp_chan = 2; } if (desired_dp_chan == -1) debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed); else debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed); } } // 2x2 is special for RDO at higher lambdas - always pick a preferred channel. 
int desired_dp_chan_2x2 = 0; if (total_used_block_chans == 2) { if (half_comp_stats[0].m_range == 0.0f) desired_dp_chan_2x2 = 1; } else if (total_used_block_chans == 3) { // see if rg/rb is weakly correlated vs. gb if ((rg_corr < gb_corr) && (rb_corr < gb_corr)) desired_dp_chan_2x2 = 0; // see if gr/gb is weakly correlated vs. rb else if ((rg_corr < rb_corr) && (gb_corr < rb_corr)) desired_dp_chan_2x2 = 1; // assume b is weakest else desired_dp_chan_2x2 = 2; } // Gather all candidate encodings bool status = false; // ---- Run candidate if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor)) { candidate_encoding candidate; candidate.m_coder.reserve(24); candidate.m_encoding_type = encoding_type::cRun; candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk; candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk; memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels)); if (!prev_run_len) { candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN); candidate.m_coder.put_vlc(0, 5); } else { // extend current run - compute the # of new bits needed for the extension. uint32_t prev_run_bits = prev_encoding.get_total_bits_u32(); assert(prev_run_bits > 0); // We're not actually going to code this, because the previously emitted run code will be extended. 
bitwise_coder temp_coder; temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN); temp_coder.put_vlc((prev_run_len + 1) - 1, 5); uint32_t cur_run_bits = temp_coder.get_total_bits_u32(); assert(cur_run_bits >= prev_run_bits); uint32_t total_new_bits = cur_run_bits - prev_run_bits; if (total_new_bits > 0) candidate.m_coder.put_bits(0, total_new_bits); // dummy bits } candidate.m_run_len = prev_run_len + 1; candidates.emplace_back(std::move(candidate)); } // ---- Reuse candidate if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f)) { for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++) { const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x; const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y; const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y; if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x)) continue; if (reuse_by < (int)strip_first_by) break; const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by); // TODO - support this. 
if (prev_candidate.m_encoding_type == encoding_type::cSolid) continue; assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse)); candidate_encoding candidate; candidate.m_coder.reserve(24); astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk; const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk; const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height; const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane; const uint32_t num_grid_samples = grid_x * grid_y; const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]); coded_log_blk = prev_candidate.m_coded_log_blk; decomp_log_blk = prev_candidate.m_decomp_log_blk; if (prev_coded_log_blk.m_num_partitions == 1) { // Now encode the block using the transcoded endpoints basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7) { status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr, astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); } else { status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr, astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); } assert(status); uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H]; uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; if (dual_plane) { eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], 
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); downsample_ise_weights_dual_plane( coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, trial_weights0, trial_weights1, coded_log_blk.m_weights); basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); } else { eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); downsample_ise_weights( coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, trial_weights0, coded_log_blk.m_weights); basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); } // Create the block the decoder would transcode into. 
copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior); } else if (prev_coded_log_blk.m_num_partitions == 2) { assert(!dual_plane); const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id]; assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2)); const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index]; vec4F part_pixels_q16[2][64]; half_vec3 part_half_pixels[2][64]; uint32_t part_total_pixels[2] = { 0 }; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = pat_vec[x + y * 6]; uint32_t l = part_total_pixels[part_index]; part_pixels_q16[part_index][l] = block_pixels_q16[y][x]; part_half_pixels[part_index][l] = half_pixels[y][x]; part_total_pixels[part_index] = l + 1; } // x } // y uint8_t blk_weights[2][BLOCK_W * BLOCK_H]; for (uint32_t part_index = 0; part_index < 2; part_index++) { basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7) { status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); } else { status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); } assert(status); eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range, (basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); } // part_index uint8_t ise_weights[BLOCK_W * BLOCK_H]; uint32_t 
src_pixel_index[2] = { 0, 0 }; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = pat_vec[x + y * 6]; ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; src_pixel_index[part_index]++; } // x } // y downsample_ise_weights( coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, ise_weights, coded_log_blk.m_weights); // Transcode these codable weights to ASTC weights. uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H]; basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); // Create the block the decoder would transcode into. copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior); } else if (prev_coded_log_blk.m_num_partitions == 3) { assert(!dual_plane); const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id]; assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3)); const partition_pattern_vec& pat = g_partitions3[unique_pat_index]; vec4F part_pixels_q16[3][64]; half_vec3 part_half_pixels[3][64]; uint32_t part_total_pixels[3] = { 0 }; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = pat.m_parts[x + y * BLOCK_W]; uint32_t l = part_total_pixels[part_index]; part_pixels_q16[part_index][l] = block_pixels_q16[y][x]; part_half_pixels[part_index][l] = half_pixels[y][x]; part_total_pixels[part_index] = l + 1; } // x } // y uint8_t blk_weights[3][BLOCK_W * BLOCK_H]; for (uint32_t part_index = 0; part_index < 3; part_index++) { basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr, 
astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range); assert(status); eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range, (basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); } // part_index uint8_t ise_weights[BLOCK_W * BLOCK_H]; uint32_t src_pixel_index[3] = { 0 }; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { const uint32_t part_index = pat.m_parts[x + y * BLOCK_W]; ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]]; src_pixel_index[part_index]++; } // x } // y downsample_ise_weights( coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, ise_weights, coded_log_blk.m_weights); // Transcode these codable weights to ASTC weights. uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H]; basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range); // Create the block the decoder would transcode into. copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior); } if (!validate_log_blk(decomp_log_blk)) { fmt_error_printf("pack_astc_block() failed\n"); return false; } status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]); if (!status) { fmt_error_printf("decode_astc_block() failed\n"); return false; } candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN); candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS); encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 
2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range); candidate.m_encoding_type = encoding_type::cReuse; candidate.m_block_mode = prev_candidate.m_block_mode; candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode; candidate.m_reuse_delta_index = reuse_delta_index; candidates.emplace_back(std::move(candidate)); } // reuse_delta_index } // ---- Solid candidate if (global_cfg.m_use_solid_blocks) { candidate_encoding candidate; candidate.m_coder.reserve(24); // solid candidate.m_encoding_type = encoding_type::cSolid; float r = 0.0f, g = 0.0f, b = 0.0f; const float LOG_BIAS = .125f; bool solid_block = true; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { if ((block_pixels[0][0][0] != block_pixels[y][x][0]) || (block_pixels[0][0][1] != block_pixels[y][x][1]) || (block_pixels[0][0][2] != block_pixels[y][x][2])) { solid_block = false; } r += log2f(block_pixels[y][x][0] + LOG_BIAS); g += log2f(block_pixels[y][x][1] + LOG_BIAS); b += log2f(block_pixels[y][x][2] + LOG_BIAS); } } if (solid_block) { r = block_pixels[0][0][0]; g = block_pixels[0][0][1]; b = block_pixels[0][0][2]; } else { r = maximum(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); g = maximum(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); b = maximum(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS); r = minimum(r, basist::MAX_HALF_FLOAT); g = minimum(g, basist::MAX_HALF_FLOAT); b = minimum(b, basist::MAX_HALF_FLOAT); } basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b); candidate.m_solid_color[0] = rh; candidate.m_solid_color[1] = gh; candidate.m_solid_color[2] = bh; candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN); candidate.m_coder.put_bits(rh, 15); candidate.m_coder.put_bits(gh, 15); candidate.m_coder.put_bits(bh, 15); vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), 
basist::half_to_float(bh)); for (uint32_t y = 0; y < BLOCK_H; y++) for (uint32_t x = 0; x < BLOCK_W; x++) candidate.m_comp_pixels[y][x] = cp; astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk; log_blk.clear(); log_blk.m_solid_color_flag_hdr = true; log_blk.m_solid_color[0] = rh; log_blk.m_solid_color[1] = gh; log_blk.m_solid_color[2] = bh; log_blk.m_solid_color[3] = basist::float_to_half(1.0f); candidate.m_decomp_log_blk = log_blk; candidates.emplace_back(std::move(candidate)); } if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks)) { static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 }; static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 }; static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 }; static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 }; static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 }; static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 }; uint32_t total_parts2 = 0, total_parts3 = 0; assert(comp_level < 5); if ((very_simple_block) && (comp_level <= 3)) { // Block's std dev is so low that 2-3 subsets are unlikely to help much total_parts2 = 0; total_parts3 = 0; debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed); } else if (very_complex_block) { total_parts2 = s_parts2_very_complex[comp_level]; total_parts3 = s_parts3_very_complex[comp_level]; if (global_cfg.m_extra_patterns_flag) { total_parts2 += (comp_level == 4) ? 30 : 20; total_parts3 += (comp_level == 4) ? 30 : 20; } debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed); } else if (complex_block) { total_parts2 = s_parts2_complex[comp_level]; total_parts3 = s_parts3_complex[comp_level]; if (global_cfg.m_extra_patterns_flag) { total_parts2 += (comp_level == 4) ? 15 : 10; total_parts3 += (comp_level == 4) ? 
15 : 10; } debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed); } else { // moderate complexity - use defaults total_parts2 = s_parts2_normal[comp_level]; total_parts3 = s_parts3_normal[comp_level]; if (global_cfg.m_extra_patterns_flag) { total_parts2 += 5; total_parts3 += 5; } debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed); } if (!any_2subset_enabled) total_parts2 = 0; if (!any_3subset_enabled) total_parts3 = 0; int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2]; bool has_estimated_parts2 = false; if (total_parts2) { if (global_cfg.m_brute_force_partition_matching) { int candidate_pats2[NUM_UNIQUE_PARTITIONS2]; for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++) candidate_pats2[i] = i; if (any_2subset_enabled) { estimate_partitions_mode7_and_11( 2, NUM_UNIQUE_PARTITIONS2, g_partitions2, NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2, &half_pixels_as_floats[0][0], coptions, total_parts2, best_parts2_mode11, best_parts2_mode7); } has_estimated_parts2 = true; } else { if (comp_level >= 1) { const uint32_t MAX_CANDIDATES2 = 48; int candidate_pats2[MAX_CANDIDATES2 * 2]; uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? 
MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2)); num_candidate_pats2 = minimum(num_candidate_pats2, (uint32_t)std::size(candidate_pats2)); has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2); if (has_estimated_parts2) { estimate_partitions_mode7_and_11( 2, NUM_UNIQUE_PARTITIONS2, g_partitions2, num_candidate_pats2, (uint32_t*)candidate_pats2, &half_pixels_as_floats[0][0], coptions, total_parts2, best_parts2_mode11, best_parts2_mode7); } } else { has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2); if ((has_estimated_parts2) && (any_2subset_mode7_enabled)) memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0])); } } } int best_parts3[NUM_UNIQUE_PARTITIONS3]; bool has_estimated_parts3 = false; if (total_parts3) { #if 0 has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3); #elif 1 if (global_cfg.m_brute_force_partition_matching) { int candidate_pats3[NUM_UNIQUE_PARTITIONS3]; for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++) candidate_pats3[i] = i; estimate_partitions_mode7( 3, NUM_UNIQUE_PARTITIONS3, g_partitions3, NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3, &half_pixels_as_floats[0][0], coptions, total_parts3, (uint32_t*)best_parts3); has_estimated_parts3 = true; } else { const uint32_t MAX_CANDIDATES3 = 48; int candidate_pats3[MAX_CANDIDATES3 * 2]; uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? 
MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2)); num_candidate_pats3 = minimum(num_candidate_pats3, (uint32_t)std::size(candidate_pats3)); has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3); if (has_estimated_parts3) { estimate_partitions_mode7( 3, NUM_UNIQUE_PARTITIONS3, g_partitions3, num_candidate_pats3, (uint32_t*)candidate_pats3, &half_pixels_as_floats[0][0], coptions, total_parts3, (uint32_t*)best_parts3); } } #endif } const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares; // ---- Encoded block candidate for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++) { const block_mode bm = (block_mode)block_mode_iter; if (comp_level == 0) { if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0) continue; } else if (comp_level == 1) { if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0) continue; } else if (comp_level == 2) { if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0) continue; } if (global_cfg.m_block_stat_optimizations_flag) { if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp)) { if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2)) { if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2) continue; } else { if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan) continue; } } if (comp_level <= 3) { const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x; const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y; if (!g_block_mode_descs[block_mode_iter].m_dp) { // Minor gain (.5-1% less canidates) if (very_detailed_block) { if (grid_x * grid_y <= 12) { debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed); continue; } 
} // Major gains (10-25% less candidates) if (very_blurry_block) { if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1)) { debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed); continue; } } if (super_blurry_block) { if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1)) { debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed); continue; } } } if (grid_x != grid_y) { if (grid_x < grid_y) { if (!filter_horizontally) { debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed); continue; } } else { if (filter_horizontally) { debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed); continue; } } } } if (global_cfg.m_lambda == 0.0f) { // Rarely useful if lambda=0 if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2)) continue; } } // block_stat_optimizations_flag if ((!use_single_subset_mode7) && (g_block_mode_descs[block_mode_iter].m_cem == 7) && (g_block_mode_descs[block_mode_iter].m_num_partitions == 1)) { debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed); continue; } for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++) { if (global_cfg.m_lambda == 0.0f) { // No use trying anything else if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw) continue; } if (global_cfg.m_disable_delta_endpoint_usage) { if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta)) continue; } if (!global_cfg.m_favor_higher_compression) { if (comp_level == 0) { if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) continue; } if (comp_level <= 1) { if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper)) continue; } } const endpoint_mode em = 
(endpoint_mode)endpoint_mode_iter; switch (em) { case endpoint_mode::cUseLeft: case endpoint_mode::cUseUpper: { const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter]; const uint32_t cem = local_md.m_cem; if (local_md.m_num_partitions > 1) break; if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor)) break; else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor)) break; candidate_encoding candidate; candidate.m_coder.reserve(24); astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; int nx = bx, ny = by; if (em == endpoint_mode::cUseLeft) nx--; else ny--; const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny); if (neighbor_blk.m_encoding_type == encoding_type::cSolid) break; assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse)); const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode]; if (neighbor_md.m_cem != cem) break; assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem); const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y; const bool dual_plane = local_md.m_dp; const uint32_t num_grid_samples = grid_x * grid_y; const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem); coded_log_blk.m_grid_width = (uint8_t)grid_x; coded_log_blk.m_grid_height = (uint8_t)grid_y; coded_log_blk.m_dual_plane = (uint8_t)dual_plane; coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; coded_log_blk.m_num_partitions = 1; coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem; coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range; // We're not explictly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss). 
coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range; memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals); uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS]; // Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding. basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints); // Now encode the block using the transcoded endpoints basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3]; if (cem == 7) { status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr, astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range); } else { status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr, astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range); } if (!status) break; uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H]; if (dual_plane) { eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); downsample_ise_weights_dual_plane( local_md.m_weight_ise_range, local_md.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, trial_weights0, trial_weights1, coded_log_blk.m_weights); } else { eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX); downsample_ise_weights( 
local_md.m_weight_ise_range, local_md.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, trial_weights0, coded_log_blk.m_weights); } // Transcode these codable weights to ASTC weights. uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range); // Create the block the decoder would transcode into. astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; decomp_blk.clear(); decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; decomp_blk.m_dual_plane = local_md.m_dp; decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; decomp_blk.m_num_partitions = 1; decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range; decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range; memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals); copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior); if (!validate_log_blk(decomp_blk)) { fmt_error_printf("pack_astc_block() failed\n"); return false; } status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); if (!status) { fmt_error_printf("decode_astc_block() failed\n"); return false; } candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr); candidate.m_encoding_type = encoding_type::cBlock; candidate.m_endpoint_mode = em; candidate.m_block_mode = bm; candidates.emplace_back(std::move(candidate)); break; } case endpoint_mode::cUseLeftDelta: case endpoint_mode::cUseUpperDelta: { const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter]; const uint32_t cem = local_md.m_cem; if (local_md.m_num_partitions > 1) break; if ((em == endpoint_mode::cUseLeftDelta) && 
(!has_left_neighbor)) break; else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor)) break; candidate_encoding candidate; candidate.m_coder.reserve(24); astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; int nx = bx, ny = by; if (em == endpoint_mode::cUseLeftDelta) nx--; else ny--; const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny); if (neighbor_blk.m_encoding_type == encoding_type::cSolid) break; assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse)); const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode]; if (neighbor_md.m_cem != cem) break; assert(neighbor_md.m_cem == local_md.m_cem); const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y; const bool dual_plane = local_md.m_dp; const uint32_t num_grid_samples = grid_x * grid_y; const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem); // Dequantize neighbor's endpoints to ISE 20 uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS]; basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20); // Requantize neighbor's endpoints to our local desired coding ISE range uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS]; basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local); uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS]; uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS]; // Now try to encode the current block using the neighbor's endpoints submode. 
double err = 0.0f; uint32_t best_submode = 0; if (cem == 7) { int maj_index, submode_index; decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index); int first_submode = submode_index, last_submode = submode_index; err = encode_astc_hdr_block_mode_7( NUM_BLOCK_PIXELS, (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, local_md.m_weight_ise_range, best_submode, BIG_FLOAT_VAL, blk_endpoints, blk_weights0, coptions, local_md.m_endpoint_ise_range, first_submode, last_submode, &enc_block_stats); } else { int maj_index, submode_index; decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index); int first_submode = -1, last_submode = -1; if (maj_index == 3) { // direct } else { first_submode = submode_index; last_submode = submode_index; } if (dual_plane) { err = encode_astc_hdr_block_mode_11_dual_plane( NUM_BLOCK_PIXELS, (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, local_md.m_dp_channel, local_md.m_weight_ise_range, best_submode, BIG_FLOAT_VAL, blk_endpoints, blk_weights0, blk_weights1, coptions, false, local_md.m_endpoint_ise_range, false, //uber_mode_flag, false, first_submode, last_submode, true); } else { err = encode_astc_hdr_block_mode_11( NUM_BLOCK_PIXELS, (basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16, local_md.m_weight_ise_range, best_submode, BIG_FLOAT_VAL, blk_endpoints, blk_weights0, coptions, false, local_md.m_endpoint_ise_range, false, //uber_mode_flag, false, first_submode, last_submode, true, mode11_opt_mode, &enc_block_stats); } } if (err == BIG_FLOAT_VAL) break; uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS]; // TODO: For now, just try 5 bits for each endpoint. Can tune later. // This isn't right, it's computing the deltas in ISE space. 
//const uint32_t NUM_ENDPOINT_DELTA_BITS = 5; const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS; const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1; const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank; bool all_deltas_in_limits = true; for (uint32_t i = 0; i < num_endpoint_vals; i++) { int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]]; if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit)) all_deltas_in_limits = false; endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit); } if (all_deltas_in_limits) { coded_log_blk.m_grid_width = (uint8_t)grid_x; coded_log_blk.m_grid_height = (uint8_t)grid_y; coded_log_blk.m_dual_plane = (uint8_t)dual_plane; coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; coded_log_blk.m_num_partitions = 1; coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range; coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range; memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals); uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS]; uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints); if (dual_plane) { downsample_ise_weights_dual_plane( local_md.m_weight_ise_range, local_md.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, blk_weights0, blk_weights1, coded_log_blk.m_weights); } else { downsample_ise_weights( local_md.m_weight_ise_range, local_md.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, blk_weights0, coded_log_blk.m_weights); } 
basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range); // Create the block the decoder would transcode into. astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; decomp_blk.clear(); decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem; decomp_blk.m_dual_plane = local_md.m_dp; decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel; decomp_blk.m_num_partitions = 1; decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range; decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range; memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals); copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior); if (!validate_log_blk(decomp_blk)) { fmt_error_printf("pack_astc_block() failed\n"); return false; } status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); if (!status) { fmt_error_printf("decode_astc_block() failed\n"); return false; } candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas); candidate.m_encoding_type = encoding_type::cBlock; candidate.m_endpoint_mode = em; candidate.m_block_mode = bm; candidates.emplace_back(std::move(candidate)); } break; } case endpoint_mode::cRaw: { //if (candidates.size() == 339) // fmt_printf("!"); const auto& mode_desc = g_block_mode_descs[(uint32_t)bm]; const uint32_t cem = mode_desc.m_cem; //const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem); const bool dual_plane = mode_desc.m_dp; if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2)) break; if (mode_desc.m_num_partitions == 3) { assert(!dual_plane); if (!has_estimated_parts3) break; assert(mode_desc.m_weight_ise_range == 
mode_desc.m_transcode_weight_ise_range); assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range); trial_result res; status = encode_block_3_subsets( res, cem, mode_desc.m_grid_x, mode_desc.m_grid_y, mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, &half_pixels[0][0], (vec4F*)block_pixels_q16, coptions, uber_mode_flag, best_parts3, total_parts3, comp_level, mode11_opt_mode); if (!status) break; assert(res.m_valid); candidate_encoding candidate; candidate.m_coder.reserve(24); astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; coded_log_blk = res.m_log_blk; astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; decomp_blk = res.m_log_blk; if (!validate_log_blk(decomp_blk)) { fmt_error_printf("pack_astc_block() failed\n"); return false; } status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); if (!status) { fmt_error_printf("decode_astc_block() failed\n"); return false; } candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); candidate.m_encoding_type = encoding_type::cBlock; candidate.m_endpoint_mode = em; candidate.m_block_mode = bm; candidates.emplace_back(std::move(candidate)); } else if (mode_desc.m_num_partitions == 2) { assert(!dual_plane); if (!has_estimated_parts2) break; assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range); assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range); for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++) { trial_result results[2]; assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled)); status = encode_block_2_subsets( results, mode_desc.m_grid_x, mode_desc.m_grid_y, mode_desc.m_cem, mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, &half_pixels[0][0], (vec4F*)block_pixels_q16, coptions, uber_mode_flag, (cem == 11) 
? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter], comp_level, mode11_opt_mode, true); if (!status) continue; for (uint32_t r_iter = 0; r_iter < 2; r_iter++) { const trial_result& res = results[r_iter]; if (!res.m_valid) continue; candidate_encoding candidate; candidate.m_coder.reserve(24); astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; coded_log_blk = res.m_log_blk; astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; decomp_blk = res.m_log_blk; if (!validate_log_blk(decomp_blk)) { fmt_error_printf("pack_astc_block() failed\n"); return false; } status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); if (!status) { fmt_error_printf("decode_astc_block() failed\n"); return false; } candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); candidate.m_encoding_type = encoding_type::cBlock; candidate.m_endpoint_mode = em; candidate.m_block_mode = bm; candidates.emplace_back(std::move(candidate)); } // r_iter } } else { // 1 subset uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H]; uint32_t best_submode = 0; candidate_encoding candidate; candidate.m_coder.reserve(24); astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk; const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y; const uint32_t num_grid_samples = grid_x * grid_y; const half_vec3* pBlock_pixels_half = &half_pixels[0][0]; const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0]; const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 
2 : 1); uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2]; coded_log_blk.m_grid_width = (uint8_t)grid_x; coded_log_blk.m_grid_height = (uint8_t)grid_y; coded_log_blk.m_dual_plane = (uint8_t)dual_plane; coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel; coded_log_blk.m_num_partitions = 1; coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem; coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range; coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range; if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H))) { double e = encode_astc_hdr_block_downsampled_mode_11( BLOCK_W, BLOCK_H, grid_x, grid_y, mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range, NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, BIG_FLOAT_VAL, FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode, coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode, coptions, &enc_block_stats); if (e == BIG_FLOAT_VAL) break; } else { if (cem == 7) { assert(!dual_plane); double e = encode_astc_hdr_block_mode_7( NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, mode_desc.m_weight_ise_range, best_submode, BIG_FLOAT_VAL, coded_log_blk.m_endpoints, blk_weights0, coptions, mode_desc.m_endpoint_ise_range, 0, MAX_MODE7_SUBMODE_INDEX, &enc_block_stats); BASISU_NOTE_UNUSED(e); } else { double e; if (dual_plane) { e = encode_astc_hdr_block_mode_11_dual_plane( NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, mode_desc.m_dp_channel, mode_desc.m_weight_ise_range, best_submode, BIG_FLOAT_VAL, coded_log_blk.m_endpoints, blk_weights0, blk_weights1, coptions, false, mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false); } else { e = encode_astc_hdr_block_mode_11( NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16, 
mode_desc.m_weight_ise_range, best_submode, BIG_FLOAT_VAL, coded_log_blk.m_endpoints, blk_weights0, coptions, false, mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false, mode11_opt_mode, &enc_block_stats); } if (e == BIG_FLOAT_VAL) break; } if (dual_plane) { downsample_ise_weights_dual_plane( mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, blk_weights0, blk_weights1, coded_log_blk.m_weights); } else { downsample_ise_weights( mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range, BLOCK_W, BLOCK_H, grid_x, grid_y, blk_weights0, coded_log_blk.m_weights); if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H))) { bool refine_status = refine_endpoints(cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, 6, 6, mode_desc.m_grid_x, mode_desc.m_grid_y, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, BLOCK_W * BLOCK_H, (basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16, nullptr, coptions, mode11_opt_mode); BASISU_NOTE_UNUSED(refine_status); } } } basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range); // Create the block the decoder would transcode into. 
astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk; decomp_blk.clear(); decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem; decomp_blk.m_dual_plane = mode_desc.m_dp; decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel; decomp_blk.m_num_partitions = 1; decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range; decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range; basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints); copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior); if (!validate_log_blk(decomp_blk)) { fmt_error_printf("pack_astc_block() failed\n"); return false; } status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]); if (!status) { fmt_error_printf("decode_astc_block() failed\n"); return false; } candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN); code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr); candidate.m_encoding_type = encoding_type::cBlock; candidate.m_endpoint_mode = em; candidate.m_block_mode = bm; candidates.emplace_back(std::move(candidate)); } break; } default: assert(0); fmt_debug_printf("Invalid endpoint mode\n"); return false; } // switch (em) } // endpoint_mode_iter } // block_mode_iter } // is_solid_block //------------------------------------------------ debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed); atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32()); for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) { auto& candidate = candidates[candidate_iter]; for (uint32_t y = 0; y < BLOCK_H; y++) for (uint32_t x = 0; x < BLOCK_W; x++) linear_rgb_to_itp(candidate.m_comp_pixels[y][x], 
candidate.m_comp_pixels_itp[y][x], global_cfg); } // Find best overall candidate double best_t = DBL_MAX; int best_candidate_index = -1; float best_d_ssim = BIG_FLOAT_VAL; if (global_cfg.m_lambda == 0.0f) { for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) { const auto& candidate = candidates[candidate_iter]; float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]); if (candidate_d_ssim < best_d_ssim) best_d_ssim = candidate_d_ssim; candidate_d_ssim *= SSIM_WEIGHT; float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment); candidate_mse += candidate_d_ssim; float total_deblock_penalty = 0.0f; if (global_cfg.m_deblocking_flag) { total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight; } candidate_mse += total_deblock_penalty * SSIM_WEIGHT; if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse)) { // Bias the encoder away from 2 level blocks on complex blocks // TODO: Perhaps only do this on large or non-interpolated grids if (complex_block) { if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS) { candidate_mse *= TWO_LEVEL_PENALTY; } } // Bias the encoder away from smaller weight grids if the block is very complex // TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling. 
if (complex_block) { if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2)) candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY; else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3) candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY; else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4) candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY; } } float candidate_t = candidate_mse; if (candidate_t < best_t) { best_t = candidate_t; best_candidate_index = candidate_iter; } } // candidate_iter if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM)) { debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed); continue; } const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f); if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) && (block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) && (block_avg_y >= 1.5f)) { debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed); continue; } } else { assert(enc_state.smooth_block_mse_scales.get_width() > 0); // Compute block's perceptual weighting float perceptual_scale = 0.0f; for (uint32_t y = 0; y < BLOCK_H; y++) for (uint32_t x = 0; x < BLOCK_W; x++) perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y)); // Very roughly normalize the computed distortion vs. bits. 
perceptual_scale *= 10.0f; for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) { auto& candidate = candidates[candidate_iter]; float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]); if (d_ssim < best_d_ssim) best_d_ssim = (float)d_ssim; d_ssim *= SSIM_WEIGHT; float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment); candidate_mse += d_ssim; float total_deblock_penalty = 0.0f; if (global_cfg.m_deblocking_flag) { total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight; } candidate_mse += total_deblock_penalty * SSIM_WEIGHT; if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse)) { // Bias the encoder away from 2 level blocks on complex blocks if (complex_block) { if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS) { candidate_mse *= TWO_LEVEL_PENALTY; } } // Bias the encoder away from smaller weight grids if the block is very complex if (complex_block) { if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2)) candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY; else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3) candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY; else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4) candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY; } } float mode_penalty = 1.0f; if (candidate.m_encoding_type == encoding_type::cSolid) mode_penalty *= SOLID_PENALTY; else if (candidate.m_encoding_type == encoding_type::cReuse) mode_penalty *= REUSE_PENALTY; else if 
(candidate.m_encoding_type == encoding_type::cRun) mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY); float candidate_bits = (float)candidate.m_coder.get_total_bits(); double candidate_d = (double)candidate_mse * mode_penalty; const float D_POWER = 2.0f; // this value can get VERY large after squaring on random (fuzzed) HDR inputs double candidate_t = perceptual_scale * pow(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f); candidate.m_t = candidate_t; candidate.m_d = candidate_d; candidate.m_bits = candidate_bits; if (candidate_t < best_t) { best_t = candidate_t; best_candidate_index = candidate_iter; } } // candidate_iter if (best_candidate_index < 0) { assert(0); // Should never happen best_candidate_index = 0; } if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM)) { debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed); continue; } const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f); if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) && (block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) && (block_avg_y >= 1.5f)) { debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed); continue; } if (global_cfg.m_rdo_candidate_diversity_boost) { // candidate diversity boosting - consider candidates along/near the Pareto front const candidate_encoding& comp_candidate = candidates[best_candidate_index]; double best_d = DBL_MAX; for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) { const auto& candidate = candidates[candidate_iter]; if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight) { if (candidate.m_d < best_d) { best_d = candidate.m_d; best_candidate_index = 
candidate_iter; } } } } // candidate JND optimization - if there's a cheaper to code candidate that is nearly equivalent visually to the best candidate chose, choose that if (global_cfg.m_jnd_optimization) { const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index]; float new_best_candidate_bits = BIG_FLOAT_VAL; int new_best_candidate_index = -1; for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++) { if ((int)candidate_iter == best_candidate_index) continue; const auto& candidate = candidates[candidate_iter]; if (candidate.m_bits >= cur_comp_candidate.m_bits) continue; float max_delta_itp = 0.0f; for (uint32_t y = 0; y < BLOCK_H; y++) { for (uint32_t x = 0; x < BLOCK_W; x++) { float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment); max_delta_itp = maximum(max_delta_itp, delta_itp); if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh) goto skip; } } skip: if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh) continue; if (candidate.m_bits < new_best_candidate_bits) { new_best_candidate_bits = candidate.m_bits; new_best_candidate_index = candidate_iter; } } if (new_best_candidate_index != -1) { best_candidate_index = new_best_candidate_index; debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed); } } } // if (lambda == 0.0f) if (global_cfg.m_debug_images) { std::lock_guard lck(debug_state.m_stat_vis_mutex); debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f)); } if (best_candidate_index < 0) { assert(best_candidate_index >= 0); fmt_error_printf("No candidates!\n"); return false; } const auto& best_candidate = candidates[best_candidate_index]; assert(best_candidate.m_encoding_type != encoding_type::cInvalid); if (best_candidate.m_encoding_type == encoding_type::cRun) { if (!prev_run_len) { if 
(prev_encoding.get_total_bits()) { #if SYNC_MARKERS strip_coded_bits.put_bits(0xDEAD, 16); #endif strip_coded_bits.append(prev_encoding); } assert(best_candidate.m_coder.get_total_bits()); prev_encoding = best_candidate.m_coder; prev_run_len = 1; } else { prev_run_len++; const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32(); assert(prev_run_bits); BASISU_NOTE_UNUSED(prev_run_bits); const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32(); BASISU_NOTE_UNUSED(num_dummy_bits); // Rewrite the previous encoding to extend the run length. prev_encoding.restart(); prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN); prev_encoding.put_vlc(prev_run_len - 1, 5); assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits); } } else { if (prev_encoding.get_total_bits()) { #if SYNC_MARKERS strip_coded_bits.put_bits(0xDEAD, 16); #endif strip_coded_bits.append(prev_encoding); } prev_encoding = best_candidate.m_coder; prev_run_len = 0; } memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H); prev_candidate_encoding = best_candidate; if (best_candidate.m_encoding_type != encoding_type::cRun) prev_non_run_candidate_encoding = best_candidate; { std::lock_guard lck(debug_state.m_stats_mutex); debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++; if (best_candidate.m_encoding_type == encoding_type::cBlock) { debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++; } if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock)) { const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode; assert(bm_index < (uint32_t)block_mode::cBMTotalModes); debug_state.m_block_mode_hist[bm_index]++; debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits(); for (uint32_t i = 0; i < 3; i++) { debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]); 
debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]); } } if (best_candidate.m_encoding_type == encoding_type::cReuse) { debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed); if (best_candidate.m_coded_log_blk.m_dual_plane) debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed); } } enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding; // Update decoded image vec4F decoded_float_pixels[BLOCK_H][BLOCK_W]; for (uint32_t y = 0; y < BLOCK_H; y++) for (uint32_t x = 0; x < BLOCK_W; x++) decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x]; enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H); status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr); if (!status) { fmt_error_printf("Failed packing block\n"); return false; } const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed); if ((r & 2047) == 2047) { if (global_cfg.m_status_output) { basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks); } } if ((global_cfg.m_debug_images) && ((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid))) { std::lock_guard lck(debug_state.m_vis_image_mutex); if (best_candidate.m_decomp_log_blk.m_num_partitions == 2) { const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id]; assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2)); const partition_pattern_vec& pat = g_partitions2[part2_unique_index]; for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { const uint32_t p = pat[x + y * 6]; debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 
100 : 0, 128, p ? 100 : 0, 255)); } // x } // y } else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3) { //part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255)); const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id]; assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3)); const partition_pattern_vec& pat = g_partitions3[part3_unique_index]; for (uint32_t y = 0; y < 6; y++) { for (uint32_t x = 0; x < 6; x++) { const uint32_t p = pat[x + y * 6]; color_rgba c(0, 0, 150, 255); if (p == 1) c.set(100, 0, 150, 255); else if (p == 2) c.set(0, 100, 150, 255); debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c); } // x } // y } else if (best_candidate.m_decomp_log_blk.m_dual_plane) { debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255)); } else { debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255)); } color_rgba c; c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36); debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c); c.set(0, 0, 0, 255); if (complex_block) c[0] = 255; if (very_complex_block) c[1] = 255; if (outer_pass == 2) c[2] = 255; else if (outer_pass == 1) c[2] = 128; debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c); c.set(0, 255, 0, 255); if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7) c.set(255, 0, 0, 255); debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c); switch (best_candidate.m_encoding_type) { case encoding_type::cRun: c.set(0, 0, 0, 255); break; case encoding_type::cSolid: c.set(128, 128, 128, 255); // dark grey break; case encoding_type::cReuse: c.set(255, 255, 0, 255); // yellow break; case encoding_type::cBlock: { switch (best_candidate.m_endpoint_mode) { case endpoint_mode::cRaw: c.set(255, 0, 0, 255); // red break; case endpoint_mode::cUseLeft: c.set(0, 0, 255, 255); // blue 
break; case endpoint_mode::cUseUpper: c.set(0, 0, 192, 255); // darker blue break; case endpoint_mode::cUseLeftDelta: c.set(0, 255, 0, 255); // green break; case endpoint_mode::cUseUpperDelta: c.set(0, 192, 0, 255); // darker green break; default: break; } break; } default: break; } if (filtered_x_err < filtered_y_err) c[3] = 0; else c[3] = 255; debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c); } break; } // outer_pass } // bx } // by if (prev_encoding.get_total_bits()) { #if SYNC_MARKERS strip_coded_bits.put_bits(0xDEAD, 16); #endif strip_coded_bits.append(prev_encoding); } return true; } bool g_initialized = false; void global_init() { if (g_initialized) return; interval_timer tm; tm.start(); init_pq_tables(); init_partitions2_6x6(); init_partitions3_6x6(); init_contrib_lists(); g_initialized = true; //fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs()); } bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool, basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics) { assert(g_initialized); if (!g_initialized) return false; assert(pJob_pool); if (orig_global_cfg.m_debug_output) { fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n"); fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height()); fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads()); orig_global_cfg.print(); } if (!orig_src_img.get_width() || !orig_src_img.get_height()) { assert(false); fmt_error_printf("compress_photo: Invalid source image\n"); return false; } astc_hdr_6x6_global_config global_cfg(orig_global_cfg); uastc_hdr_6x6_encode_state enc_state; enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6; enc_state.src_img = orig_src_img; //src_img.crop(256, 256); const uint32_t width = enc_state.src_img.get_width(); const uint32_t height = 
enc_state.src_img.get_height(); const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W); const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H); const uint32_t total_blocks = num_blocks_x * num_blocks_y; for (uint32_t y = 0; y < height; y++) { for (uint32_t x = 0; x < width; x++) { for (uint32_t c = 0; c < 3; c++) { float f = enc_state.src_img(x, y)[c]; if (std::isinf(f) || std::isnan(f) || (f < 0.0f)) f = 0; else if (f > basist::ASTC_HDR_MAX_VAL) f = basist::ASTC_HDR_MAX_VAL; enc_state.src_img(x, y)[c] = f; } // c } // x } // y if (global_cfg.m_debug_images) { write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0); } image src_img_compressed; tonemap_image_compressive2(src_img_compressed, enc_state.src_img); if (global_cfg.m_debug_images) { save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed); } smooth_map_params rp; rp.m_debug_images = global_cfg.m_debug_images; if (global_cfg.m_lambda != 0.0f) { if (global_cfg.m_status_output) fmt_printf("Creating RDO perceptual weighting maps\n"); create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp); } if (global_cfg.m_status_output) fmt_printf("Blurring image\n"); enc_state.src_img_filtered1.resize(width, height); image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f); enc_state.src_img_filtered2.resize(width, height); image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f); if (global_cfg.m_debug_images) { write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0); write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0); } if (global_cfg.m_status_output) fmt_printf("Transforming to ITP\n"); enc_state.src_img_itp.resize(width, height); convet_rgb_image_to_itp(enc_state.src_img, 
enc_state.src_img_itp, global_cfg); enc_state.src_img_filtered1_itp.resize(width, height); convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg); enc_state.src_img_filtered2_itp.resize(width, height); convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg); if (global_cfg.m_lambda == 0.0f) global_cfg.m_favor_higher_compression = false; uint32_t total_strips = 0, rows_per_strip = 0; if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg)) { fmt_error_printf("compress_photo: Failed computing strip sizes\n"); return false; } if (global_cfg.m_debug_output) fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag); enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y); bitwise_coder coded_bits; // For Basis v1.60 files write the original marker, otherwise write the new marker. coded_bits.put_bits(global_cfg.m_write_basisu_1_6_compatible_files ? 
UASTC_6x6_HDR_SIG0 : UASTC_6x6_HDR_SIG1, 16); coded_bits.put_bits(width, 16); coded_bits.put_bits(height, 16); enc_state.packed_img.resize(width, height); enc_state.strip_bits.resize(total_strips); enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y); uastc_hdr_6x6_debug_state debug_state; if (global_cfg.m_debug_images) debug_state.init(width, height); else debug_state.init(0, 0); interval_timer tm; tm.start(); std::atomic_bool any_failed_flag; any_failed_flag.store(false); for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++) { const uint32_t strip_first_by = strip_index * rows_per_strip; uint32_t strip_last_by = minimum(strip_first_by + rows_per_strip - 1, num_blocks_y); if (strip_index == (total_strips - 1)) strip_last_by = num_blocks_y - 1; pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state, strip_index, total_strips, strip_first_by, strip_last_by, num_blocks_x, num_blocks_y, total_blocks, width, height] { if (!any_failed_flag) { bool status = compress_strip_task( strip_index, total_strips, strip_first_by, strip_last_by, num_blocks_x, num_blocks_y, total_blocks, width, height, global_cfg, debug_state, enc_state); if (!status) { fmt_error_printf("compress_photo: compress_strip_task() failed\n"); any_failed_flag.store(true, std::memory_order_relaxed); } } } ); if (any_failed_flag) break; } // strip_index pJob_pool->wait_for_all(); if (any_failed_flag) { fmt_error_printf("One or more strips failed during compression\n"); return false; } if (global_cfg.m_debug_output) fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs()); if (global_cfg.m_debug_output) debug_state.print(total_blocks); if (global_cfg.m_debug_images) { save_png(global_cfg.m_debug_image_prefix + "part_vis.png", debug_state.m_part_vis); save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis); save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis); 
save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2); save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis); write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0); } for (uint32_t i = 0; i < total_strips; i++) coded_bits.append(enc_state.strip_bits[i]); coded_bits.put_bits(0xA742, 16); coded_bits.flush(); if (global_cfg.m_output_images) { write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0); } if (global_cfg.m_debug_output) fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height)); vector2D decoded_blocks1; vector2D decoded_blocks2; if (global_cfg.m_debug_output) fmt_printf("decode_file\n"); uint32_t unpacked_width = 0, unpacked_height = 0; bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height); if (!status) { fmt_error_printf("decode_file() failed\n"); return false; } if (global_cfg.m_debug_output) fmt_printf("decode_6x6_hdr\n"); status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height); if (!status) { fmt_error_printf("decode_6x6_hdr_file() failed\n"); return false; } if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) || (enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height())) { fmt_error_printf("Decode size mismatch with decode_file\n"); return false; } if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) || (enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height())) { fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n"); return false; } if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0) { fmt_error_printf("Decoded ASTC blocks verification failed\n"); 
return false; } if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0) { fmt_error_printf("Decoded ASTC blocks verification failed\n"); return false; } if (global_cfg.m_debug_output) basisu::fmt_printf("Decoded ASTC verification checks succeeded\n"); if (global_cfg.m_output_images) { if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height)) { basisu::platform_sleep(20); uint8_vec astc_file_data; if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data)) { if (astc_file_data.size() > 16) { astc_file_data.erase(0, 16); size_t comp_size = 0; void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK); mz_free(pComp_data); if (global_cfg.m_debug_output) { fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n", (uint64_t)astc_file_data.size(), (float)astc_file_data.size() * 8.0f / (float)(width * height), (float)comp_size * 8.0f / (float)(width * height)); } } } } } // Must decode all the blocks (even padded rows/cols) to match what the transcoder does. 
imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6); imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6); for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++) { for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++) { const auto& phys_blk = decoded_blocks1(x, y); vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H]; status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels); if (!status) { fmt_error_printf("unpack_physical_astc_block() failed\n"); return false; } unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H); vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H]; status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google); if (!status) { fmt_error_printf("unpack_physical_astc_block_google() failed\n"); return false; } unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H); for (uint32_t i = 0; i < 36; i++) { if (pixels[i] != pixels_google[i]) { fmt_error_printf("pixel unpack mismatch\n"); return false; } } } } if (global_cfg.m_debug_output) fmt_printf("\nUnpack succeeded\n"); imagef unpacked_bc6h_img; { vector2D bc6h_blocks; fast_bc6h_params enc_params; bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params); if (!pack_status) { fmt_error_printf("pack_bc6h_image() failed!"); return false; } unpacked_bc6h_img.crop(width, height); if (global_cfg.m_output_images) { write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0); } } unpacked_astc_img.crop(width, height); unpacked_astc_google_img.crop(width, height); if (global_cfg.m_output_images) { write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0); write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0); } // ASTC metrics if (global_cfg.m_image_stats) { image_metrics im; if 
(global_cfg.m_debug_output) printf("\nASTC log2 float error metrics:\n"); for (uint32_t i = 0; i < 3; i++) { im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true); if (global_cfg.m_debug_output) { printf("%c: ", "RGBA"[i]); im.print_hp(); } } metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true); if (global_cfg.m_debug_output) { printf("RGB: "); metrics.m_im_astc_log2.print_hp(); printf("\n"); } } if (global_cfg.m_image_stats) { image_metrics im; if (global_cfg.m_debug_output) printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n"); for (uint32_t i = 0; i < 3; i++) { im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true); if (global_cfg.m_debug_output) { printf("%c: ", "RGBA"[i]); im.print_hp(); } } metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true); if (global_cfg.m_debug_output) { printf("RGB: "); metrics.m_im_astc_half.print_hp(); } } // BC6H metrics if (global_cfg.m_image_stats) { image_metrics im; if (global_cfg.m_debug_output) printf("\nBC6H log2 float error metrics:\n"); for (uint32_t i = 0; i < 3; i++) { im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true); if (global_cfg.m_debug_output) { printf("%c: ", "RGBA"[i]); im.print_hp(); } } metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true); if (global_cfg.m_debug_output) { printf("RGB: "); metrics.m_im_bc6h_log2.print_hp(); printf("\n"); } } if (global_cfg.m_image_stats) { image_metrics im; if (global_cfg.m_debug_output) printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n"); for (uint32_t i = 0; i < 3; i++) { im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true); if (global_cfg.m_debug_output) { printf("%c: ", "RGBA"[i]); im.print_hp(); } } metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true); if (global_cfg.m_debug_output) { printf("RGB: "); 
metrics.m_im_bc6h_half.print_hp(); printf("\n"); } } intermediate_tex_data.swap(coded_bits.get_bytes()); astc_tex_data.resize(decoded_blocks1.size_in_bytes()); memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes()); return true; } } // namespace astc_6x6_hdr