// File: basisu_astc_hdr_6x6_enc.cpp
// Copyright (C) 2019-2026 Binomial LLC. All Rights Reserved.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "basisu_astc_hdr_6x6_enc.h"
#include "basisu_enc.h"
#include "basisu_astc_hdr_common.h"
#include "basisu_math.h"
#include "basisu_resampler.h"
#include "basisu_resampler_filters.h"

#define MINIZ_HEADER_FILE_ONLY
#define MINIZ_NO_ZLIB_COMPATIBLE_NAMES
#include "basisu_miniz.h"

#include "3rdparty/android_astc_decomp.h"

#include <array>
#include <cfloat>

using namespace basisu;
using namespace buminiz;
using namespace basist::astc_6x6_hdr;

namespace astc_6x6_hdr
{

static void atomic_max(std::atomic<uint32_t>& atomic_var, uint32_t new_value) 
{
	uint32_t current = atomic_var.load(std::memory_order_relaxed);
	for ( ; ; )
	{
		uint32_t new_max = std::max(current, new_value);
		if (atomic_var.compare_exchange_weak(current, new_max, std::memory_order_relaxed, std::memory_order_relaxed)) 
			break;
	}
}

void astc_hdr_6x6_global_config::set_user_level(int level)
{
	level = basisu::clamp<int>(level, 0, ASTC_HDR_6X6_MAX_USER_COMP_LEVEL);

	m_master_comp_level = 0;
	m_highest_comp_level = 0;
	m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS;
	m_extra_patterns_flag = false;
	m_brute_force_partition_matching = false;

	switch (level)
	{
	case 0:
	{
		// Both reduce compression a lot when lambda>0
		m_favor_higher_compression = false;
		m_num_reuse_xy_deltas = NUM_REUSE_XY_DELTAS / 2;
		break;
	}
	case 1:
	{
		m_master_comp_level = 0;
		m_highest_comp_level = 0;
		break;
	}
	case 2:
	{
		m_master_comp_level = 0;
		m_highest_comp_level = 1;
		break;
	}
	case 3:
	{
		m_master_comp_level = 1;
		m_highest_comp_level = 1;
		break;
	}
	case 4:
	{
		m_master_comp_level = 1;
		m_highest_comp_level = 2;
		break;
	}
	case 5:
	{
		m_master_comp_level = 1;
		m_highest_comp_level = 3;
		break;
	}
	case 6:
	{
		m_master_comp_level = 1;
		m_highest_comp_level = 4;
		break;
	}
	case 7:
	{
		m_master_comp_level = 2;
		m_highest_comp_level = 2;
		break;
	}
	case 8:
	{
		m_master_comp_level = 2;
		m_highest_comp_level = 3;
		break;
	}
	case 9:
	{
		m_master_comp_level = 2;
		m_highest_comp_level = 4;
		break;
	}
	case 10:
	{
		m_master_comp_level = 3;
		m_highest_comp_level = 3;
		break;
	}
	case 11:
	{
		m_master_comp_level = 3;
		m_highest_comp_level = 4;
		break;
	}
	case 12:
	default:
	{
		m_master_comp_level = 4;
		m_highest_comp_level = 4;
		m_extra_patterns_flag = true;
		m_brute_force_partition_matching = true;
		break;
	}
	}
}

const float m1 = 0.1593017578125f;    // (2610 / 2^14) * (1/100)
const float m2 = 78.84375f;           // (2523 / 32) * (1/100)
const float c1 = 0.8359375f;          // 3424 / (2^12)
const float c2 = 18.8515625f;         // (2413 / 128)
const float c3 = 18.6875f;            // (2392 / 128)

static float forwardPQ(float Y)
{
	// 10,000 here is an absolute scale - it's in nits (cd per square meter)
	float L = Y * (1.0f / 10000.0f);

	float num = powf(L, m1);
	float N = powf((c1 + c2 * num) / (1 + c3 * num), m2);

	return N;
}

#if 0
static float inversePQ(float E)
{
	float N = powf(E, 1.0f / m2);

	float num = basisu::maximum<float>((N - c1), 0.0f) / (c2 - c3 * N);
	float L = powf(num, 1.0f / m1);

	return L * 10000.0f;
}
#endif

// PQ function approximation: convert input to bfloat16, look up in tables, bilinear interpolation between table entries.
// max_er: 0.000023007392883, max_rel_er: 0.000023472490284, avg_er: 0.000004330495689, 6-7x faster on x86
// Highest error is for values less than SMALLEST_PQ_VAL_IN.
//
// Approximation is round trip lossless for 10-12 bits at [0,10000] nits:
// for x [0,1024] (SCALE=1023) or for x [0,4095] (SCALE=4096): 
// round(forwardPQTab(inversePQ(x / SCALE)) * SCALE) == x
//
// bfloat16 has enough precision to handle 8-bit sRGB to linear conversions:
// round(linear_to_srgb(bfloat16_to_float(float_to_bfloat16(srgb_to_linear(isRGB/255.0f))))*255.0) is lossless

const int PQ_APPROX_MIN_EXP = -16, PQ_APPROX_MAX_EXP = 16;
const int PQ_APPROX_EXP_RANGE = (PQ_APPROX_MAX_EXP - PQ_APPROX_MIN_EXP + 1);

const float SMALLEST_PQ_VAL_IN = 0.000015258829080f;
const float SMALLEST_PQ_VAL = 0.000551903737f;		// forwardPQ(SMALLEST_PQ_VAL_IN)

const float LARGEST_PQ_VAL = 1.251312f; 

float g_pq_approx_tabs[PQ_APPROX_EXP_RANGE][128];

static void init_pq_tables()
{
	for (int exp = PQ_APPROX_MIN_EXP; exp <= PQ_APPROX_MAX_EXP; exp++)
	{
		for (int mant = 0; mant < 128; mant++)
		{
			bfloat16 b = bfloat16_init(1, exp, mant);
			float bf = bfloat16_to_float(b);

			float pq = forwardPQ(bf);

			g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant] = pq;
		}
	}

	//fmt_printf("{.15} {.15}\n", g_pq_approx_tabs[0][0], inversePQ(g_pq_approx_tabs[0][0]));
	//fmt_printf("{.15}\n", forwardPQ(SMALLEST_PQ_VAL_IN));
}

static inline float forwardPQTab(float v)
{
	assert(g_pq_approx_tabs[0][0]);

	assert(v >= 0.0f);
	if (v == 0.0f)
		return 0.0f;

	bfloat16 bf = float_to_bfloat16(v, false);
	assert(v >= bfloat16_to_float(bf));

	int exp = bfloat16_get_exp(bf);

	if (exp < PQ_APPROX_MIN_EXP)
	{
		// not accurate but should be good enough for our uses
		return lerp(0.0f, SMALLEST_PQ_VAL, minimum(1.0f, v / SMALLEST_PQ_VAL_IN));
	}
	else if (exp > PQ_APPROX_MAX_EXP)
		return LARGEST_PQ_VAL;

	int mant = bfloat16_get_mantissa(bf);

	float a = g_pq_approx_tabs[exp - PQ_APPROX_MIN_EXP][mant];
	float bf_f32 = bfloat16_to_float(bf);

	int next_mant = mant + 1;
	int next_exp = exp;
	if (next_mant == 128)
	{
		next_mant = 0;
		next_exp++;
		if (next_exp > PQ_APPROX_MAX_EXP)
			return a;
	}

	float b = g_pq_approx_tabs[next_exp - PQ_APPROX_MIN_EXP][next_mant];

	bfloat16 next_bf = bfloat16_init(1, next_exp, next_mant);
	float next_bf_f32 = bfloat16_to_float(next_bf);
	assert(v <= next_bf_f32);

	float lerp_factor = (v - bf_f32) / (next_bf_f32 - bf_f32);
	assert((lerp_factor >= 0) && (lerp_factor <= 1.0f));

	return lerp(a, b, lerp_factor);
}

// 100 nits = ~.5 i
// This converts absolute linear RGB light in either REC 709 or REC2020/BT2100 color gamut to ICtCp, a coding space where Ct is scaled by 2. 
// To convert to perceptual ITP for error/distance calculations, multiply the result Ct by .5 (or set itp_flag to true).
// Assumes REC 709 input, or REC 2020/BT.2100 RGB input if rec2020_bt2100_color_gamut is true.
//
// ITP info:
// https://www.portrait.com/resource-center/ictcp-color-difference-metric/
// https://professional.dolby.com/siteassets/pdfs/measuringperceptualcolorvolume_v07.253.pdf (see scale to JND's)
// This also converts from a ICtCp coding space to threshold or perceptually uniform space ITP.
//
// Linear REC709 to REC2020/BT.2100 gamut conversion:
// rgb_2100[0] = rgb_in[0] * 0.6274f + rgb_in[1] * 0.3293f + rgb_in[2] * 0.0433f;
// rgb_2100[1] = rgb_in[0] * 0.0691f + rgb_in[1] * 0.9195f + rgb_in[2] * 0.0114f;
// rgb_2100[2] = rgb_in[0] * 0.0164f + rgb_in[1] * 0.0880f + rgb_in[2] * 0.8956f;
// const float S = 1.0f / 4096.0f;
// l = (1688.0f * S) * rgb_2100[0] + (2146.0f * S) * rgb_2100[1] + (262.0f * S) * rgb_2100[2];
// m = (683.0f * S) * rgb_2100[0] + (2951.0f * S) * rgb_2100[1] + (462.0f * S) * rgb_2100[2];
// s = (99.0f * S) * rgb_2100[0] + (309.0f * S) * rgb_2100[1] + (3688.0f * S) * rgb_2100[2];
static void linear_rgb_to_ictcp(const vec3F& rgb_in, vec3F& ictcp, bool itp_flag = false, bool rec2020_bt2100_color_gamut = false)
{
	vec3F rgb_2100(rgb_in);
	
	float l, m, s;
	if (!rec2020_bt2100_color_gamut)
	{
		// Assume REC 709 input color gamut
		// (REC2020_to_LMS * REC709_to_2020) * input_color
		l = rgb_2100[0] * 0.2958097f + rgb_2100[1] * 0.6230863f + rgb_2100[2] * 0.0811040f;
		m = rgb_2100[0] * 0.1562512f + rgb_2100[1] * 0.7272980f + rgb_2100[2] * 0.1164508f;
		s = rgb_2100[0] * 0.0351435f + rgb_2100[1] * 0.1565601f + rgb_2100[2] * 0.8082964f;
	}
	else
	{
		// Assumes REC2020/BT.2100 input color gamut (this is from the spec)
		l = 0.412109375f    * rgb_2100[0] + 0.52392578125f  * rgb_2100[1] + 0.06396484375f * rgb_2100[2];
		m = 0.166748046875f * rgb_2100[0] + 0.720458984375f * rgb_2100[1] + 0.11279296875f * rgb_2100[2];
		s = 0.024169921875f * rgb_2100[0] + 0.075439453125f * rgb_2100[1] + 0.900390625f   * rgb_2100[2];
	}

	float ld = forwardPQTab(l);
	float md = forwardPQTab(m);
	float sd = forwardPQTab(s);

	ictcp[0] = .5f * ld + .5f * md;

	// if ITP scale Ct by .5 (the ICtCp spec scaled Ct to better exploit the full scaled output, which is not perceptually linear)
	if (itp_flag)
		ictcp[1] = 0.806884765625f * ld + -1.6617431640625f * md + 0.8548583984375f * sd;
	else
		ictcp[1] = 1.61376953125f * ld + -3.323486328125f * md + 1.709716796875f * sd;

	ictcp[2] = 4.378173828125f * ld + -4.24560546875f * md + -0.132568359375f * sd;
}

static inline void linear_rgb_to_itp(const vec3F& rgb_in, vec3F& itp, const astc_hdr_6x6_global_config &cfg)
{
	linear_rgb_to_ictcp(rgb_in, itp, true, cfg.m_rec2020_bt2100_color_gamut);
}

#if 0
// Outputs rec2020/bt2100 color gamut (i.e. this doesn't convert back to REC709 gamut).
static void ictcp_to_linear_rgb(const vec3F& ictcp, vec3F& rgb, bool itp_flag = false)
{
	float ct = ictcp[1];

	if (itp_flag)
		ct *= 2.0f;

	float ld = ictcp[0] + ct * 0.008609037037932726f + ictcp[2] * 0.11102962500302596f;
	float md = ictcp[0] + ct * -0.008609037037932726f + ictcp[2] * -0.11102962500302596f;
	float sd = ictcp[0] + ct * 0.5600313357106792f + ictcp[2] * -0.32062717498731885f;

	float l = inversePQ(ld);
	float m = inversePQ(md);
	float s = inversePQ(sd);

	rgb[0] = l * 3.436606694333079f + m * -2.5064521186562705f + s * 0.06984542432319149f;
	rgb[1] = l * -0.7913295555989289f + m * 1.983600451792291f + s * -0.192270896193362f;
	rgb[2] = l * -0.025949899690592672f + m * -0.09891371471172646f + s * 1.1248636144023192f;
}
#endif

struct half_vec3
{
	basist::half_float m_vals[3];

	inline half_vec3() { }

	inline half_vec3(basist::half_float x, basist::half_float y, basist::half_float z)
	{
		m_vals[0] = x;
		m_vals[1] = y;
		m_vals[2] = z;
	}

	inline half_vec3(const half_vec3& other)
	{
		*this = other;
	}

	inline half_vec3& operator= (const half_vec3& rhs)
	{
		m_vals[0] = rhs.m_vals[0];
		m_vals[1] = rhs.m_vals[1];
		m_vals[2] = rhs.m_vals[2];
		return *this;
	}

	inline void clear()
	{
		clear_obj(m_vals);
	}

	inline half_vec3 &set(basist::half_float x, basist::half_float y, basist::half_float z)
	{
		m_vals[0] = x;
		m_vals[1] = y;
		m_vals[2] = z;
		return *this;
	}

	inline half_vec3& set(float x, float y, float z)
	{
		m_vals[0] = basist::float_to_half(x);
		m_vals[1] = basist::float_to_half(y);
		m_vals[2] = basist::float_to_half(z);
		return *this;
	}

	template<typename T>
	inline half_vec3& set_vec(const T& vec)
	{
		m_vals[0] = basist::float_to_half(vec[0]);
		m_vals[1] = basist::float_to_half(vec[1]);
		m_vals[2] = basist::float_to_half(vec[2]);
		return *this;
	}

	template<typename T>
	inline T get_vec() const
	{
		return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]));
	}

	inline basist::half_float operator[] (uint32_t c) const { assert(c < 3); return m_vals[c]; }
	inline basist::half_float& operator[] (uint32_t c) { assert(c < 3); return m_vals[c]; }

	float get_float_comp(uint32_t c) const
	{
		assert(c < 3);
		return basist::half_to_float(m_vals[c]);
	}

	half_vec3& set_float_comp(uint32_t c, float v)
	{
		assert(c < 3);
		m_vals[c] = basist::float_to_half(v);
		return *this;
	}
};

struct half_vec4
{
	basist::half_float m_vals[4];

	inline half_vec4() { }

	inline half_vec4(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
	{
		m_vals[0] = x;
		m_vals[1] = y;
		m_vals[2] = z;
		m_vals[3] = w;
	}

	inline half_vec4(const half_vec4& other)
	{
		*this = other;
	}

	inline half_vec4& operator= (const half_vec4& rhs)
	{
		m_vals[0] = rhs.m_vals[0];
		m_vals[1] = rhs.m_vals[1];
		m_vals[2] = rhs.m_vals[2];
		m_vals[3] = rhs.m_vals[3];
		return *this;
	}

	inline void clear()
	{
		clear_obj(m_vals);
	}

	inline half_vec4& set(basist::half_float x, basist::half_float y, basist::half_float z, basist::half_float w)
	{
		m_vals[0] = x;
		m_vals[1] = y;
		m_vals[2] = z;
		m_vals[3] = w;
		return *this;
	}

	inline half_vec4& set(float x, float y, float z, float w)
	{
		m_vals[0] = basist::float_to_half(x);
		m_vals[1] = basist::float_to_half(y);
		m_vals[2] = basist::float_to_half(z);
		m_vals[3] = basist::float_to_half(w);
		return *this;
	}

	template<typename T>
	inline half_vec4& set_vec(const T& vec)
	{
		m_vals[0] = basist::float_to_half(vec[0]);
		m_vals[1] = basist::float_to_half(vec[1]);
		m_vals[2] = basist::float_to_half(vec[2]);
		m_vals[3] = basist::float_to_half(vec[3]);
		return *this;
	}

	template<typename T>
	inline T get_vec() const
	{
		return T(basist::half_to_float(m_vals[0]), basist::half_to_float(m_vals[1]), basist::half_to_float(m_vals[2]), basist::half_to_float(m_vals[3]));
	}

	inline basist::half_float operator[] (uint32_t c) const { assert(c < 4); return m_vals[c]; }
	inline basist::half_float &operator[] (uint32_t c) { assert(c < 4); return m_vals[c]; }

	float get_float_comp(uint32_t c) const
	{
		assert(c < 4);
		return basist::half_to_float(m_vals[c]);
	}

	half_vec4& set_float_comp(uint32_t c, float v)
	{
		assert(c < 4);
		m_vals[c] = basist::float_to_half(v);
		return *this;
	}
};

const uint32_t MAX_BLOCK_W = 6, MAX_BLOCK_H = 6;

struct trial_result
{
	astc_helpers::log_astc_block m_log_blk;
	double m_err;
	bool m_valid;
};

//----------------------------------------------------------

const uint32_t NUM_PART3_MAPPINGS = 6;
static uint8_t g_part3_mapping[NUM_PART3_MAPPINGS][3] =
{
	{ 0, 1, 2 },
	{ 1, 2, 0 },
	{ 2, 0, 1 },
	{ 0, 2, 1 },
	{ 1, 0, 2 },
	{ 2, 1, 0 }
};

struct partition_pattern_vec
{
	uint8_t m_parts[6 * 6];

	partition_pattern_vec()
	{
		clear();
	}

	partition_pattern_vec(const partition_pattern_vec& other)
	{
		*this = other;
	}

	void clear()
	{
		memset(m_parts, 0, sizeof(m_parts));
	}

	partition_pattern_vec& operator= (const partition_pattern_vec& rhs)
	{
		if (this == &rhs)
			return *this;
		memcpy(m_parts, rhs.m_parts, 36);
		return *this;
	}

	uint8_t operator[] (uint32_t i) const { assert(i < 36); return m_parts[i]; }
	uint8_t& operator[] (uint32_t i) { assert(i < 36); return m_parts[i]; }

	uint8_t operator() (uint32_t x, uint32_t y) const { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }
	uint8_t& operator() (uint32_t x, uint32_t y) { assert((x < 6) && (y < 6)); return m_parts[x + y * 6]; }

	int get_squared_distance(const partition_pattern_vec& other) const
	{
		int total_dist = 0;
		for (uint32_t i = 0; i < 36; i++)
			total_dist += iabs((int)m_parts[i] - (int)other.m_parts[i]);
		return total_dist;
	}

	float get_distance(const partition_pattern_vec& other) const
	{
		return sqrtf((float)get_squared_distance(other));
	}

	partition_pattern_vec get_permuted2(uint32_t permute_index) const
	{
		assert(permute_index <= 1);

		partition_pattern_vec res;
		for (uint32_t i = 0; i < 36; i++)
		{
			assert(m_parts[i] <= 1);
			res.m_parts[i] = (uint8_t)(m_parts[i] ^ permute_index);
		}

		return res;
	}

	partition_pattern_vec get_permuted3(uint32_t permute_index) const
	{
		assert(permute_index <= 5);

		partition_pattern_vec res;
		for (uint32_t i = 0; i < 36; i++)
		{
			assert(m_parts[i] <= 2);
			res.m_parts[i] = g_part3_mapping[permute_index][m_parts[i]];
		}

		return res;
	}

	partition_pattern_vec get_canonicalized() const
	{
		partition_pattern_vec res;

		int new_labels[3] = { -1, -1, -1 };
		uint32_t next_index = 0;
		for (uint32_t i = 0; i < 36; i++)
		{
			uint32_t p = m_parts[i];
			if (new_labels[p] == -1)
				new_labels[p] = next_index++;

			res.m_parts[i] = (uint8_t)new_labels[p];
		}

		return res;
	}

	bool operator== (const partition_pattern_vec& rhs) const
	{
		return memcmp(m_parts, rhs.m_parts, sizeof(m_parts)) == 0;
	}

	operator size_t() const
	{
		return basist::hash_hsieh(m_parts, sizeof(m_parts));
	}
};

struct vp_tree_node
{
	partition_pattern_vec m_vantage_point;
	uint32_t m_point_index;
	float m_dist;

	int m_inner_node, m_outer_node;
};

#define BRUTE_FORCE_PART_SEARCH (0)

class vp_tree
{
public:
	vp_tree()
	{
	}

	void clear()
	{
		m_nodes.clear();
	}

	// This requires no redundant patterns, i.e. all must be unique.
	bool init(uint32_t n, const partition_pattern_vec* pUnique_pats)
	{
		clear();

		uint_vec pat_indices(n);
		for (uint32_t i = 0; i < n; i++)
			pat_indices[i] = i;

		std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);

		if (root_idx.first == -1)
			return false;

		m_nodes.resize(1);
		m_nodes[0].m_vantage_point = pUnique_pats[root_idx.first];
		m_nodes[0].m_point_index = root_idx.first;
		m_nodes[0].m_dist = root_idx.second;
		m_nodes[0].m_inner_node = -1;
		m_nodes[0].m_outer_node = -1;

		uint_vec inner_list, outer_list;
		
		inner_list.reserve(n / 2);
		outer_list.reserve(n / 2);

		for (uint32_t pat_index = 0; pat_index < n; pat_index++)
		{
			if ((int)pat_index == root_idx.first)
				continue;

			const float dist = m_nodes[0].m_vantage_point.get_distance(pUnique_pats[pat_index]);

			if (dist <= root_idx.second)
				inner_list.push_back(pat_index);
			else
				outer_list.push_back(pat_index);
		}

		if (inner_list.size())
		{
			m_nodes[0].m_inner_node = create_node(n, pUnique_pats, inner_list);
			if (m_nodes[0].m_inner_node < 0)
				return false;
		}

		if (outer_list.size())
		{
			m_nodes[0].m_outer_node = create_node(n, pUnique_pats, outer_list);
			if (m_nodes[0].m_outer_node < 0)
				return false;
		}

		return true;
	}

	struct result
	{
		uint32_t m_pat_index;
		uint32_t m_mapping_index;
		float m_dist;

		bool operator< (const result& rhs) const { return m_dist < rhs.m_dist; }
		bool operator> (const result& rhs) const { return m_dist > rhs.m_dist; }
	};

	class result_queue
	{
		enum { MaxSupportedSize = 256 + 1 };

	public:
		result_queue() : 
			m_cur_size(0) 
		{
		}

		size_t get_size() const
		{
			return m_cur_size;
		}

		bool empty() const
		{
			return !m_cur_size;
		}

		typedef std::array<result, MaxSupportedSize + 1> result_array_type;

		const result_array_type& get_elements() const { return m_elements; }
		result_array_type& get_elements() { return m_elements; }

		void clear()
		{
			m_cur_size = 0;
		}

		void reserve(uint32_t n)
		{
			BASISU_NOTE_UNUSED(n);
		}

		const result& top() const
		{
			assert(m_cur_size);
			return m_elements[1];
		}

		bool insert(const result& val, uint32_t max_size)
		{
			assert(max_size < MaxSupportedSize);

			if (m_cur_size >= MaxSupportedSize)
				return false;

			m_elements[++m_cur_size] = val;
			up_heap(m_cur_size);

			if (m_cur_size > max_size)
				pop();

			return true;
		}

		bool pop()
		{
			if (m_cur_size == 0) 
				return false;

			m_elements[1] = m_elements[m_cur_size--];
			down_heap(1);
			return true;
		}
								
		float get_highest_dist() const
		{
			if (!m_cur_size)
				return 0.0f;

			return top().m_dist;
		}
	
	private:
		result_array_type m_elements;
		size_t m_cur_size;

		void up_heap(size_t index)
		{
			while ((index > 1) && (m_elements[index] > m_elements[index >> 1]))
			{
				std::swap(m_elements[index], m_elements[index >> 1]);
				index >>= 1;
			}
		}

		void down_heap(size_t index)
		{
			for ( ; ; )
			{
				size_t largest = index, left_child = 2 * index, right_child = 2 * index + 1;

				if ((left_child <= m_cur_size) && (m_elements[left_child] > m_elements[largest]))
					largest = left_child;

				if ((right_child <= m_cur_size) && (m_elements[right_child] > m_elements[largest]))
					largest = right_child;

				if (largest == index)
					break;

				std::swap(m_elements[index], m_elements[largest]);
				index = largest;
			}
		}
	};
		
	void find_nearest(uint32_t num_subsets, const partition_pattern_vec& desired_pat, result_queue& results, uint32_t max_results)
	{
		assert((num_subsets >= 2) && (num_subsets <= 3));

		results.clear();

		if (!m_nodes.size())
			return;

		uint32_t num_desired_pats;
		partition_pattern_vec desired_pats[NUM_PART3_MAPPINGS];

		if (num_subsets == 2)
		{
			num_desired_pats = 2;
			for (uint32_t i = 0; i < 2; i++)
				desired_pats[i] = desired_pat.get_permuted2(i);
		}
		else
		{
			num_desired_pats = NUM_PART3_MAPPINGS;
			for (uint32_t i = 0; i < NUM_PART3_MAPPINGS; i++)
				desired_pats[i] = desired_pat.get_permuted3(i);
		}

#if 0
		find_nearest_at_node(0, num_desired_pats, desired_pats, results, max_results);
#else
		find_nearest_at_node_non_recursive(0, num_desired_pats, desired_pats, results, max_results);
#endif
	}

private:
	basisu::vector<vp_tree_node> m_nodes;

	void find_nearest_at_node(int node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
	{
		float best_dist_to_vantage = BIG_FLOAT_VAL;
		uint32_t best_mapping = 0;
		for (uint32_t i = 0; i < num_desired_pats; i++)
		{
			float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
			if (dist < best_dist_to_vantage)
			{
				best_dist_to_vantage = dist;
				best_mapping = i;
			}
		}

		result r;
		r.m_dist = best_dist_to_vantage;
		r.m_mapping_index = best_mapping;
		r.m_pat_index = m_nodes[node_index].m_point_index;

		results.insert(r, max_results);

		if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
		{
			// inner first
			if (m_nodes[node_index].m_inner_node >= 0)
				find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);

			if (m_nodes[node_index].m_outer_node >= 0)
			{
				if ( (results.get_size() < max_results) || 
					((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
					)
				{
					find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);
				}
			}
		}
		else
		{
			// outer first
			if (m_nodes[node_index].m_outer_node >= 0)
				find_nearest_at_node(m_nodes[node_index].m_outer_node, num_desired_pats, pDesired_pats, results, max_results);

			if (m_nodes[node_index].m_inner_node >= 0)
			{
				if ( (results.get_size() < max_results) || 
					((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
					)
				{
					find_nearest_at_node(m_nodes[node_index].m_inner_node, num_desired_pats, pDesired_pats, results, max_results);
				}
			}
		}
	}
		
	void find_nearest_at_node_non_recursive(int init_node_index, uint32_t num_desired_pats, const partition_pattern_vec* pDesired_pats, result_queue& results, uint32_t max_results)
	{
		uint_vec node_stack;
		node_stack.reserve(16);
		node_stack.push_back(init_node_index);
		
		do
		{
			const uint32_t node_index = node_stack.back();
			node_stack.pop_back();

			float best_dist_to_vantage = BIG_FLOAT_VAL;
			uint32_t best_mapping = 0;
			for (uint32_t i = 0; i < num_desired_pats; i++)
			{
				float dist = pDesired_pats[i].get_distance(m_nodes[node_index].m_vantage_point);
				if (dist < best_dist_to_vantage)
				{
					best_dist_to_vantage = dist;
					best_mapping = i;
				}
			}

			result r;
			r.m_dist = best_dist_to_vantage;
			r.m_mapping_index = best_mapping;
			r.m_pat_index = m_nodes[node_index].m_point_index;

			results.insert(r, max_results);

			if (best_dist_to_vantage <= m_nodes[node_index].m_dist)
			{
				if (m_nodes[node_index].m_outer_node >= 0)
				{
					if ((results.get_size() < max_results) ||
						((m_nodes[node_index].m_dist - best_dist_to_vantage) <= results.get_highest_dist())
						)
					{
						node_stack.push_back(m_nodes[node_index].m_outer_node);
					}
				}

				// inner first
				if (m_nodes[node_index].m_inner_node >= 0)
				{
					node_stack.push_back(m_nodes[node_index].m_inner_node);
				}
			}
			else
			{
				if (m_nodes[node_index].m_inner_node >= 0)
				{
					if ((results.get_size() < max_results) ||
						((best_dist_to_vantage - m_nodes[node_index].m_dist) <= results.get_highest_dist())
						)
					{
						node_stack.push_back(m_nodes[node_index].m_inner_node);
					}
				}

				// outer first
				if (m_nodes[node_index].m_outer_node >= 0)
				{
					node_stack.push_back(m_nodes[node_index].m_outer_node);
				}
			}

		} while (!node_stack.empty());
	}

	// returns the index of the new node, or -1 on error
	int create_node(uint32_t n, const partition_pattern_vec* pUnique_pats, const uint_vec& pat_indices)
	{
		std::pair<int, float> root_idx = find_best_vantage_point(n, pUnique_pats, pat_indices);

		if (root_idx.first < 0)
			return -1;

		m_nodes.resize(m_nodes.size() + 1);
		const uint32_t new_node_index = m_nodes.size_u32() - 1;
				
		m_nodes[new_node_index].m_vantage_point = pUnique_pats[root_idx.first];
		m_nodes[new_node_index].m_point_index = root_idx.first;
		m_nodes[new_node_index].m_dist = root_idx.second;
		m_nodes[new_node_index].m_inner_node = -1;
		m_nodes[new_node_index].m_outer_node = -1;

		uint_vec inner_list, outer_list;

		inner_list.reserve(pat_indices.size_u32() / 2);
		outer_list.reserve(pat_indices.size_u32() / 2);

		for (uint32_t pat_indices_iter = 0; pat_indices_iter < pat_indices.size(); pat_indices_iter++)
		{
			const uint32_t pat_index = pat_indices[pat_indices_iter];

			if ((int)pat_index == root_idx.first)
				continue;

			const float dist = m_nodes[new_node_index].m_vantage_point.get_distance(pUnique_pats[pat_index]);

			if (dist <= root_idx.second)
				inner_list.push_back(pat_index);
			else
				outer_list.push_back(pat_index);
		}

		if (inner_list.size())
			m_nodes[new_node_index].m_inner_node = create_node(n, pUnique_pats, inner_list);

		if (outer_list.size())
			m_nodes[new_node_index].m_outer_node = create_node(n, pUnique_pats, outer_list);

		return new_node_index;
	}

	// returns the pattern index of the vantage point (-1 on error), and the optimal split distance
	std::pair<int, float> find_best_vantage_point(uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, const uint_vec &pat_indices)
	{
		BASISU_NOTE_UNUSED(num_unique_pats);

		const uint32_t n = pat_indices.size_u32();

		assert(n);
		if (n == 1)
			return std::pair(pat_indices[0], 0.0f);

		float best_split_metric = -1.0f;
		int best_split_pat = -1;
		float best_split_dist = 0.0f;
		float best_split_var = 0.0f;

		basisu::vector< std::pair<float, uint32_t> > dists;
		dists.reserve(n);
		
		float_vec float_dists;
		float_dists.reserve(n);
				
		for (uint32_t pat_indices_iter = 0; pat_indices_iter < n; pat_indices_iter++)
		{
			const uint32_t split_pat_index = pat_indices[pat_indices_iter];
			assert(split_pat_index < num_unique_pats);

			const partition_pattern_vec& trial_vantage = pUnique_pats[split_pat_index];
		
			dists.resize(0);
			float_dists.resize(0);

			for (uint32_t j = 0; j < n; j++)
			{
				const uint32_t pat_index = pat_indices[j];
				assert(pat_index < num_unique_pats);

				if (pat_index == split_pat_index)
					continue;
				
				float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);
				dists.emplace_back(std::pair(dist, pat_index));

				float_dists.push_back(dist);
			}

			stats<double> s;
			s.calc(float_dists.size_u32(), float_dists.data());

			std::sort(dists.begin(), dists.end(), [](const auto &a, const auto &b) {
				return a.first < b.first;
				});

			const uint32_t num_dists = dists.size_u32();
			float split_dist = dists[num_dists / 2].first;
			if ((num_dists & 1) == 0)
				split_dist = (split_dist + dists[(num_dists / 2) - 1].first) * .5f;

			uint32_t total_inner = 0, total_outer = 0;
			
			for (uint32_t j = 0; j < n; j++)
			{
				const uint32_t pat_index = pat_indices[j];
				if (pat_index == split_pat_index)
					continue;
				
				float dist = trial_vantage.get_distance(pUnique_pats[pat_index]);

				if (dist <= split_dist)
					total_inner++;
				else
					total_outer++;
			}

			float split_metric = (float)minimum(total_inner, total_outer) / (float)maximum(total_inner, total_outer);
			
			if ( (split_metric > best_split_metric) ||
				 ((split_metric == best_split_metric) && (s.m_var > best_split_var)) )
			{
				best_split_metric = split_metric;
				best_split_dist = split_dist;
				best_split_pat = split_pat_index;
				best_split_var = (float)s.m_var;
			}
		}

		return std::pair(best_split_pat, best_split_dist);
	}
};

struct partition
{
	uint64_t m_p;

	inline partition() : 
		m_p(0)
	{
	}

	inline partition(uint64_t p) :
		m_p(p)
	{
		assert(p < (1ULL << 36));
	}

	inline partition& operator=(uint64_t p)
	{
		assert(p < (1ULL << 36));
		m_p = p;
		return *this;
	}

	inline bool operator< (const partition& p) const
	{
		return m_p < p.m_p;
	}

	inline bool operator== (const partition& p) const
	{
		return m_p == p.m_p;
	}

	inline operator size_t() const
	{
		return basist::hash_hsieh((const uint8_t *)&m_p, sizeof(m_p));
	}
};

partition_pattern_vec g_partitions2[NUM_UNIQUE_PARTITIONS2];
int g_part2_seed_to_unique_index[1024];
vp_tree g_part2_vp_tree;

static inline vec3F vec3F_norm_approx(vec3F axis)
{
	float l = axis.norm();
	axis = (fabs(l) >= SMALL_FLOAT_VAL) ? (axis * bu_math::inv_sqrt(l)) : vec3F(0.577350269f);
	return axis;
}

static void init_partitions2_6x6()
{
#if 0
	// makes pattern bits to the 10-bit ASTC seed index
	typedef basisu::hash_map<uint64_t, uint32_t> partition2_hash_map;
	partition2_hash_map phash;
	phash.reserve(1024);

	for (uint32_t i = 0; i < 1024; i++)
	{
		uint64_t p_bits = 0;
		uint64_t p_bits_inv = 0;
				
		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				uint64_t p = astc_helpers::compute_texel_partition(i, x, y, 0, 2, false);
				assert(p < 2);
								
				p_bits |= (p << (x + y * 6));
				p_bits_inv |= ((1 - p) << (x + y * 6));
			}
		}
				
		if (!p_bits)
			continue;
		if (p_bits == ((1ULL << 36) - 1))
			continue;

		assert(p_bits < (1ULL << 36));
		assert(p_bits_inv < (1ULL << 36));

		if (phash.contains(p_bits))
		{
		}
		else if (phash.contains(p_bits_inv))
		{
		}
		else
		{
			auto res = phash.insert(p_bits, i);
			assert(res.second);
			BASISU_NOTE_UNUSED(res);
		}
	}
		
	uint32_t num_unique_partitions2 = 0;
		
	for (const auto& r : phash)
	{
		assert(r.second < 1024);
		
		const uint32_t unique_index = num_unique_partitions2;
		assert(unique_index < NUM_UNIQUE_PARTITIONS2);

		partition_pattern_vec pat_vec;
		for (uint32_t i = 0; i < 36; i++)
			pat_vec[i] = (uint8_t)((r.first >> i) & 1);

		g_partitions2[unique_index] = pat_vec;
		
		assert(g_part2_unique_index_to_seed[unique_index] == r.second);
		g_part2_seed_to_unique_index[r.second] = unique_index;

		num_unique_partitions2++;
	}
	assert(num_unique_partitions2 == NUM_UNIQUE_PARTITIONS2);
#else
	for (uint32_t unique_index = 0; unique_index < NUM_UNIQUE_PARTITIONS2; unique_index++)
	{
		const uint32_t seed_index = g_part2_unique_index_to_seed[unique_index];
		assert(seed_index < 1024);

		assert(g_part2_seed_to_unique_index[seed_index] == 0);
		g_part2_seed_to_unique_index[seed_index] = unique_index;

		partition_pattern_vec& pat_vec = g_partitions2[unique_index];

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				uint8_t p = (uint8_t)astc_helpers::compute_texel_partition(seed_index, x, y, 0, 2, false);
				assert(p < 2);

				pat_vec[x + y * 6] = p;
			}
		}
	}
#endif

	g_part2_vp_tree.init(NUM_UNIQUE_PARTITIONS2, g_partitions2);
}

static bool estimate_partition2_6x6(
	const basist::half_float pBlock_pixels_half[][3],
	int* pBest_parts, uint32_t num_best_parts)
{
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H;
		
	vec3F training_vecs[BLOCK_T], mean(0.0f);

	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F& v = training_vecs[i];

		v[0] = (float)pBlock_pixels_half[i][0];
		v[1] = (float)pBlock_pixels_half[i][1];
		v[2] = (float)pBlock_pixels_half[i][2];

		mean += v;
	}
	mean *= (1.0f / (float)BLOCK_T);

	vec3F max_vals(-BIG_FLOAT_VAL);

	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F& v = training_vecs[i];
		max_vals = vec3F::component_max(max_vals, v);
	}

	// Initialize principle axis approximation
	vec3F axis(max_vals - mean);

	// Incremental approx. PCA - only viable if we have a reasonably fast approximation for 1.0/sqrt(x).
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		axis = vec3F_norm_approx(axis);

		vec3F color(training_vecs[i] - mean);

		float d = color.dot(axis);

		axis += color * d;
	}

	if (axis.norm() < SMALL_FLOAT_VAL)
		axis.set(0.57735027f);
	else
		axis.normalize_in_place();

#if BRUTE_FORCE_PART_SEARCH
	int desired_parts[BLOCK_H][BLOCK_W]; // [y][x]
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		float proj = (training_vecs[i] - mean).dot(axis);

		desired_parts[i / BLOCK_W][i % BLOCK_W] = proj < 0.0f;
	}
#else
	partition_pattern_vec desired_part;

	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		float proj = (training_vecs[i] - mean).dot(axis);

		desired_part.m_parts[i] = proj < 0.0f;
	}
#endif
	
	//interval_timer tm;
	//tm.start();
	
#if BRUTE_FORCE_PART_SEARCH
	uint32_t part_similarity[NUM_UNIQUE_PARTITIONS2];

	for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS2; part_index++)
	{
		const partition_pattern_vec &pat_vec = g_partitions2[part_index];

		int total_sim_non_inv = 0;
		int total_sim_inv = 0;

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				int part = pat_vec[x + y * 6];

				if (part == desired_parts[y][x])
					total_sim_non_inv++;

				if ((part ^ 1) == desired_parts[y][x])
					total_sim_inv++;
			}
		}

		int total_sim = maximum(total_sim_non_inv, total_sim_inv);

		part_similarity[part_index] = (total_sim << 16) | part_index;

	} // part_index;

	std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS2);

	for (uint32_t i = 0; i < num_best_parts; i++)
		pBest_parts[i] = part_similarity[(NUM_UNIQUE_PARTITIONS2 - 1) - i] & 0xFFFF;
#else
	vp_tree::result_queue results;
	results.reserve(num_best_parts);
	g_part2_vp_tree.find_nearest(2, desired_part, results, num_best_parts);

	assert(results.get_size() == num_best_parts);

	const auto& elements = results.get_elements();

	for (uint32_t i = 0; i < results.get_size(); i++)
		pBest_parts[i] = elements[1 + i].m_pat_index;
#endif

	//fmt_printf("{} ", tm.get_elapsed_ms());

	return true;
}

const uint32_t MIN_REFINE_LEVEL = 0;

static bool encode_block_2_subsets(
	trial_result res[2],
	uint32_t grid_w, uint32_t grid_h,
	uint32_t cem,
	uint32_t weights_ise_range, uint32_t endpoints_ise_range,
	const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
	astc_hdr_codec_base_options& coptions,
	bool uber_mode_flag,
	int unique_pat_index,
	uint32_t comp_level,
	opt_mode_t mode11_opt_mode,
	bool refine_endpoints_flag)
{
	const uint32_t num_endpoint_vals = (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;

	res[0].m_valid = false;
	res[1].m_valid = false;

	const uint32_t BLOCK_W = 6, BLOCK_H = 6;

	astc_helpers::log_astc_block best_log_blk;
	clear_obj(best_log_blk);

	best_log_blk.m_num_partitions = 2;
	best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
	best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
	best_log_blk.m_grid_width = (uint8_t)grid_w;
	best_log_blk.m_grid_height = (uint8_t)grid_h;

	best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
	best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;

	partition_pattern_vec* pPat = &g_partitions2[unique_pat_index];
	const uint32_t p_seed = g_part2_unique_index_to_seed[unique_pat_index];

	vec4F part_pixels_q16[2][64];
	half_vec3 part_half_pixels[2][64];
	uint8_t part_pixel_index[2][64];
	uint32_t part_total_pixels[2] = { 0 };

	for (uint32_t y = 0; y < BLOCK_H; y++)
	{
		for (uint32_t x = 0; x < BLOCK_W; x++)
		{
			uint32_t part_index = (*pPat)[x + y * BLOCK_W];

			uint32_t l = part_total_pixels[part_index];

			part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
			part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
			part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);

			part_total_pixels[part_index] = l + 1;
		} // x 
	} // y

	uint8_t blk_endpoints[2][basist::NUM_MODE11_ENDPOINTS];
	uint8_t blk_weights[2][BLOCK_W * BLOCK_H];
	uint32_t best_submode[2];

	for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
	{
		assert(part_total_pixels[part_iter]);

		double e;
		if (cem == 7)
		{
			e = encode_astc_hdr_block_mode_7(
				part_total_pixels[part_iter],
				(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
				best_log_blk.m_weight_ise_range,
				best_submode[part_iter],
				BIG_FLOAT_VAL,
				blk_endpoints[part_iter],
				blk_weights[part_iter],
				coptions,
				best_log_blk.m_endpoint_ise_range);
		}
		else
		{
			assert(cem == 11);

			e = encode_astc_hdr_block_mode_11(
				part_total_pixels[part_iter],
				(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
				best_log_blk.m_weight_ise_range,
				best_submode[part_iter],
				BIG_FLOAT_VAL,
				blk_endpoints[part_iter],
				blk_weights[part_iter],
				coptions,
				false,
				best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
				mode11_opt_mode);
		}

		if (e == BIG_FLOAT_VAL)
			return false;

	} // part_iter

	uint8_t ise_weights[BLOCK_W * BLOCK_H];

	uint32_t src_pixel_index[2] = { 0, 0 };
	for (uint32_t y = 0; y < BLOCK_H; y++)
	{
		for (uint32_t x = 0; x < BLOCK_W; x++)
		{
			uint32_t part_index = (*pPat)[x + y * BLOCK_W];
			ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
			src_pixel_index[part_index]++;
		} // x
	} // y

	if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
	{
		best_log_blk.m_partition_id = (uint16_t)p_seed;

		memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
		memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);
		memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);

		res[0].m_valid = true;
		res[0].m_log_blk = best_log_blk;
	}
	else
	{
		uint8_t desired_weights[BLOCK_H * BLOCK_W];

		const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;

		for (uint32_t by = 0; by < BLOCK_H; by++)
			for (uint32_t bx = 0; bx < BLOCK_W; bx++)
				desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];

		uint8_t downsampled_weights[BLOCK_H * BLOCK_W];

		const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
		if (!pDownsample_matrix)
		{
			assert(0);
			return false;
		}

		downsample_weight_grid(
			pDownsample_matrix,
			BLOCK_W, BLOCK_H,		// source/from dimension (block size)
			grid_w, grid_h,			// dest/to dimension (grid size)
			desired_weights,		// these are dequantized weights, NOT ISE symbols, [by][bx]
			downsampled_weights);	// [wy][wx]
				
		best_log_blk.m_partition_id = (uint16_t)p_seed;
		memcpy(best_log_blk.m_endpoints, blk_endpoints[0], num_endpoint_vals);
		memcpy(best_log_blk.m_endpoints + num_endpoint_vals, blk_endpoints[1], num_endpoint_vals);

		const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;

		for (uint32_t gy = 0; gy < grid_h; gy++)
			for (uint32_t gx = 0; gx < grid_w; gx++)
				best_log_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];

		res[0].m_valid = true;
		res[0].m_log_blk = best_log_blk;

		if ((refine_endpoints_flag) && (comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
		{
			bool any_refined = false;

			for (uint32_t part_iter = 0; part_iter < 2; part_iter++)
			{
				bool refine_status = refine_endpoints(
					cem,
					endpoints_ise_range,
					best_log_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
					BLOCK_W, BLOCK_H, // block dimensions
					grid_w, grid_h, best_log_blk.m_weights, weights_ise_range, // weight grid
					part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
					&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
					coptions, mode11_opt_mode);

				if (refine_status)
					any_refined = true;
			}

			if (any_refined)
			{
				res[1].m_valid = true;
				res[1].m_log_blk = best_log_blk;
			}
		}
	}

	return true;
}

typedef basisu::hash_map<partition_pattern_vec, std::pair<uint32_t, uint32_t > > partition3_hash_map;

partition_pattern_vec g_partitions3[NUM_UNIQUE_PARTITIONS3];
int g_part3_seed_to_unique_index[1024];
vp_tree g_part3_vp_tree;

static void init_partitions3_6x6()
{
	uint32_t t = 0;

	for (uint32_t i = 0; i < 1024; i++)
		g_part3_seed_to_unique_index[i] = -1;

	partition3_hash_map part3_hash;
	part3_hash.reserve(512);
		
	for (uint32_t seed_index = 0; seed_index < 1024; seed_index++)
	{
		partition_pattern_vec p3;
		uint32_t part_hist[3] = { 0 };

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				uint64_t p = astc_helpers::compute_texel_partition(seed_index, x, y, 0, 3, false);
				assert(p < 3);

				p3.m_parts[x + y * 6] = (uint8_t)p;
				part_hist[p]++;
			}
		}

		if (!part_hist[0] || !part_hist[1] || !part_hist[2])
			continue;

		uint32_t j;
		for (j = 0; j < NUM_PART3_MAPPINGS; j++)
		{
			partition_pattern_vec temp_part3(p3.get_permuted3(j));

			if (part3_hash.contains(temp_part3))
				break;
		}
		if (j < NUM_PART3_MAPPINGS)
			continue;

		part3_hash.insert(p3, std::make_pair(seed_index, t) );

		assert(g_part3_unique_index_to_seed[t] == seed_index);
		g_part3_seed_to_unique_index[seed_index] = t;
		g_partitions3[t] = p3;

		t++;
	}

	g_part3_vp_tree.init(NUM_UNIQUE_PARTITIONS3, g_partitions3);
}

static bool estimate_partition3_6x6(
	const basist::half_float pBlock_pixels_half[][3],
	int* pBest_parts, uint32_t num_best_parts)
{
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = BLOCK_W * BLOCK_H, NUM_SUBSETS = 3;

	assert(num_best_parts && (num_best_parts <= NUM_UNIQUE_PARTITIONS3));

	vec3F training_vecs[BLOCK_T], mean(0.0f);

	float brightest_inten = 0.0f, darkest_inten = BIG_FLOAT_VAL;
	vec3F cluster_centroids[NUM_SUBSETS];
	clear_obj(cluster_centroids);

	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F& v = training_vecs[i];

		v.set((float)pBlock_pixels_half[i][0], (float)pBlock_pixels_half[i][1], (float)pBlock_pixels_half[i][2]);

		float inten = v.dot(vec3F(1.0f));
		if (inten < darkest_inten)
		{
			darkest_inten = inten;
			cluster_centroids[0] = v;
		}

		if (inten > brightest_inten)
		{
			brightest_inten = inten;
			cluster_centroids[1] = v;
		}
	}

	if (cluster_centroids[0] == cluster_centroids[1])
		return false;

	float furthest_dist2 = 0.0f;
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F& v = training_vecs[i];

		float dist_a = v.squared_distance(cluster_centroids[0]);
		if (dist_a == 0.0f)
			continue;

		float dist_b = v.squared_distance(cluster_centroids[1]);
		if (dist_b == 0.0f)
			continue;

		float dist2 = dist_a + dist_b;
		if (dist2 > furthest_dist2)
		{
			furthest_dist2 = dist2;
			cluster_centroids[2] = v;
		}
	}

	if ((cluster_centroids[0] == cluster_centroids[2]) || (cluster_centroids[1] == cluster_centroids[2]))
		return false;
		
	uint32_t cluster_pixels[NUM_SUBSETS][BLOCK_T];
	uint32_t num_cluster_pixels[NUM_SUBSETS];
	vec3F new_cluster_means[NUM_SUBSETS];

	const uint32_t NUM_ITERS = 4;
	
	for (uint32_t s = 0; s < NUM_ITERS; s++)
	{
		memset(num_cluster_pixels, 0, sizeof(num_cluster_pixels));
		memset((void *)new_cluster_means, 0, sizeof(new_cluster_means));

		for (uint32_t i = 0; i < BLOCK_T; i++)
		{
			float d[NUM_SUBSETS] = { 
				training_vecs[i].squared_distance(cluster_centroids[0]), 
				training_vecs[i].squared_distance(cluster_centroids[1]), 
				training_vecs[i].squared_distance(cluster_centroids[2]) };

			float min_d = d[0];
			uint32_t min_idx = 0;
			for (uint32_t j = 1; j < NUM_SUBSETS; j++)
			{
				if (d[j] < min_d)
				{
					min_d = d[j];
					min_idx = j;
				}
			}

			cluster_pixels[min_idx][num_cluster_pixels[min_idx]] = i;
			new_cluster_means[min_idx] += training_vecs[i];
			num_cluster_pixels[min_idx]++;
		} // i

		for (uint32_t j = 0; j < NUM_SUBSETS; j++)
		{
			if (!num_cluster_pixels[j])
				return false;

			cluster_centroids[j] = new_cluster_means[j] / (float)num_cluster_pixels[j];
		}
	} // s
		
	partition_pattern_vec desired_part;
	for (uint32_t p = 0; p < NUM_SUBSETS; p++)
	{
		for (uint32_t i = 0; i < num_cluster_pixels[p]; i++)
		{
			const uint32_t pix_index = cluster_pixels[p][i];
			desired_part[pix_index] = (uint8_t)p;
		}
	}

#if BRUTE_FORCE_PART_SEARCH
	partition_pattern_vec desired_parts[NUM_PART3_MAPPINGS];
	for (uint32_t j = 0; j < NUM_PART3_MAPPINGS; j++)
		desired_parts[j] = desired_part.get_permuted3(j);

	uint32_t part_similarity[NUM_UNIQUE_PARTITIONS3];

	for (uint32_t part_index = 0; part_index < NUM_UNIQUE_PARTITIONS3; part_index++)
	{
		const partition_pattern_vec& pat = g_partitions3[part_index];

		uint32_t lowest_pat_dist = UINT32_MAX;
		for (uint32_t p = 0; p < NUM_PART3_MAPPINGS; p++)
		{
			uint32_t dist = pat.get_squared_distance(desired_parts[p]);
			if (dist < lowest_pat_dist)
				lowest_pat_dist = dist;
		}

		part_similarity[part_index] = (lowest_pat_dist << 16) | part_index;

	} // part_index;

	std::sort(part_similarity, part_similarity + NUM_UNIQUE_PARTITIONS3);
		
	for (uint32_t i = 0; i < num_best_parts; i++)
		pBest_parts[i] = part_similarity[i] & 0xFFFF;
#else
	vp_tree::result_queue results;
	results.reserve(num_best_parts);
	g_part3_vp_tree.find_nearest(3, desired_part, results, num_best_parts);

	assert(results.get_size() == num_best_parts);

	const auto& elements = results.get_elements();

	for (uint32_t i = 0; i < results.get_size(); i++)
		pBest_parts[i] = elements[1 + i].m_pat_index;
#endif

	return true;
}

static bool encode_block_3_subsets(
	trial_result& res,
	uint32_t cem,
	uint32_t grid_w, uint32_t grid_h,
	uint32_t weights_ise_range, uint32_t endpoints_ise_range,
	const half_vec3* pBlock_pixels_half, const vec4F* pBlock_pixels_q16,
	astc_hdr_codec_base_options& coptions,
	bool uber_mode_flag,
	const int* pEst_patterns, int num_est_patterns,
	uint32_t comp_level, 
	opt_mode_t mode11_opt_mode)
{
	BASISU_NOTE_UNUSED(uber_mode_flag);
	const uint32_t BLOCK_W = 6, BLOCK_H = 6, NUM_SUBSETS = 3;
	const uint32_t num_endpoint_vals = astc_helpers::get_num_cem_values(cem);
		
	res.m_valid = false;
		
	double best_e = BIG_FLOAT_VAL;

	astc_helpers::log_astc_block best_log_blk;
	clear_obj(best_log_blk);

	best_log_blk.m_num_partitions = NUM_SUBSETS;
	best_log_blk.m_color_endpoint_modes[0] = (uint8_t)cem;
	best_log_blk.m_color_endpoint_modes[1] = (uint8_t)cem;
	best_log_blk.m_color_endpoint_modes[2] = (uint8_t)cem;
	best_log_blk.m_grid_width = (uint8_t)grid_w;
	best_log_blk.m_grid_height = (uint8_t)grid_h;

	best_log_blk.m_weight_ise_range = (uint8_t)weights_ise_range;
	best_log_blk.m_endpoint_ise_range = (uint8_t)endpoints_ise_range;

	const uint32_t n = num_est_patterns ? num_est_patterns : NUM_UNIQUE_PARTITIONS3;

	for (uint32_t unique_p_iter = 0; unique_p_iter < n; unique_p_iter++)
	{
		const uint32_t unique_part_index = num_est_patterns ? pEst_patterns[unique_p_iter] : unique_p_iter;
		assert(unique_part_index < NUM_UNIQUE_PARTITIONS3);
		const partition_pattern_vec*pPart = &g_partitions3[unique_part_index];

		vec4F part_pixels_q16[NUM_SUBSETS][64];
		half_vec3 part_half_pixels[NUM_SUBSETS][64];
		uint8_t part_pixel_index[NUM_SUBSETS][64];
		uint32_t part_total_pixels[NUM_SUBSETS] = { 0 };

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];

				uint32_t l = part_total_pixels[part_index];

				part_pixels_q16[part_index][l] = pBlock_pixels_q16[x + y * BLOCK_W];
				part_half_pixels[part_index][l] = pBlock_pixels_half[x + y * BLOCK_W];
				part_pixel_index[part_index][l] = (uint8_t)(x + y * BLOCK_W);

				part_total_pixels[part_index] = l + 1;
			} // x 
		} // y

		uint8_t blk_endpoints[NUM_SUBSETS][basist::NUM_MODE11_ENDPOINTS];
		uint8_t blk_weights[NUM_SUBSETS][BLOCK_W * BLOCK_H];
		uint32_t best_submode[NUM_SUBSETS];

		bool failed_flag = false;
		double e = 0.0f;
		for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
		{
			assert(part_total_pixels[part_iter]);

			double part_e;
			if (cem == 7)
			{
				part_e = encode_astc_hdr_block_mode_7(
					part_total_pixels[part_iter],
					(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
					best_log_blk.m_weight_ise_range,
					best_submode[part_iter],
					BIG_FLOAT_VAL,
					blk_endpoints[part_iter],
					blk_weights[part_iter],
					coptions,
					best_log_blk.m_endpoint_ise_range);
			}
			else
			{
				assert(cem == 11);

				part_e = encode_astc_hdr_block_mode_11(
					part_total_pixels[part_iter],
					(basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
					best_log_blk.m_weight_ise_range,
					best_submode[part_iter],
					BIG_FLOAT_VAL,
					blk_endpoints[part_iter],
					blk_weights[part_iter],
					coptions,
					false, best_log_blk.m_endpoint_ise_range, uber_mode_flag, false, 
					FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode);
			}

			if (part_e == BIG_FLOAT_VAL)
			{
				failed_flag = true;
				break;
			}
			e += part_e;
		} // part_iter

		if (failed_flag)
			continue;

		uint8_t ise_weights[BLOCK_W * BLOCK_H];

		uint32_t src_pixel_index[NUM_SUBSETS] = { 0 };
		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = pPart->m_parts[x + y * BLOCK_W];

				ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
				src_pixel_index[part_index]++;
			} // x
		} // y

		if ((grid_w == BLOCK_W) && (grid_h == BLOCK_H))
		{
			if (e < best_e)
			{
				best_e = e;
				best_log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];

				for (uint32_t p = 0; p < NUM_SUBSETS; p++)
					memcpy(best_log_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);
				
				memcpy(best_log_blk.m_weights, ise_weights, BLOCK_W * BLOCK_H);
			}
		}
		else
		{
			uint8_t desired_weights[BLOCK_H * BLOCK_W];

			const auto& dequant_tab = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_ISE_to_val;

			for (uint32_t by = 0; by < BLOCK_H; by++)
				for (uint32_t bx = 0; bx < BLOCK_W; bx++)
					desired_weights[bx + by * BLOCK_W] = dequant_tab[ise_weights[bx + by * BLOCK_W]];

			uint8_t downsampled_weights[BLOCK_H * BLOCK_W];

			const float* pDownsample_matrix = get_6x6_downsample_matrix(grid_w, grid_h);
			if (!pDownsample_matrix)
			{
				assert(0);
				return false;
			}

			downsample_weight_grid(
				pDownsample_matrix,
				BLOCK_W, BLOCK_H,		// source/from dimension (block size)
				grid_w, grid_h,			// dest/to dimension (grid size)
				desired_weights,		// these are dequantized weights, NOT ISE symbols, [by][bx]
				downsampled_weights);	// [wy][wx]

			astc_helpers::log_astc_block trial_blk(best_log_blk);

			trial_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_part_index];
			
			for (uint32_t p = 0; p < NUM_SUBSETS; p++)
				memcpy(trial_blk.m_endpoints + num_endpoint_vals * p, blk_endpoints[p], num_endpoint_vals);

			const auto& weight_to_ise = astc_helpers::g_dequant_tables.get_weight_tab(weights_ise_range).m_val_to_ise;

			for (uint32_t gy = 0; gy < grid_h; gy++)
				for (uint32_t gx = 0; gx < grid_w; gx++)
					trial_blk.m_weights[gx + gy * grid_w] = weight_to_ise[downsampled_weights[gx + gy * grid_w]];

			if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_w < 6) || (grid_h < 6)))
			{
				for (uint32_t part_iter = 0; part_iter < NUM_SUBSETS; part_iter++)
				{
					bool refine_status = refine_endpoints(
						cem,
						endpoints_ise_range,
						trial_blk.m_endpoints + part_iter * num_endpoint_vals, // the endpoints to optimize
						BLOCK_W, BLOCK_H, // block dimensions
						grid_w, grid_h, trial_blk.m_weights, weights_ise_range, // weight grid
						part_total_pixels[part_iter], (basist::half_float(*)[3])part_half_pixels[part_iter], (vec4F*)part_pixels_q16[part_iter],
						&part_pixel_index[part_iter][0], // maps this subset's pixels to block offsets
						coptions, mode11_opt_mode);

					BASISU_NOTE_UNUSED(refine_status);
				}
			}

			half_vec4 decoded_pixels_half4[BLOCK_H][BLOCK_W]; // [y][x]
			bool status = astc_helpers::decode_block(trial_blk, decoded_pixels_half4, BLOCK_W, BLOCK_H, astc_helpers::cDecodeModeHDR16);
			assert(status);
			if (!status)
				return false;

			half_vec3 decoded_pixels_half3[BLOCK_H][BLOCK_W];
			for (uint32_t y = 0; y < BLOCK_H; y++)
				for (uint32_t x = 0; x < BLOCK_W; x++)
					decoded_pixels_half3[y][x].set(decoded_pixels_half4[y][x][0], decoded_pixels_half4[y][x][1], decoded_pixels_half4[y][x][2]);

			double trial_err = compute_block_error(BLOCK_W * BLOCK_H, (const basist::half_float*)pBlock_pixels_half, (const basist::half_float*)decoded_pixels_half3, coptions);
			if (trial_err < best_e)
			{
				best_e = trial_err;
				best_log_blk = trial_blk;
			}
		}

	} // unique_p_iter

	if (best_e < BIG_FLOAT_VAL)
	{
		res.m_log_blk = best_log_blk;
		res.m_valid = true;
		res.m_err = best_e;
	}
	else
	{
		res.m_valid = false;
	}

	return res.m_valid;
}

static uint32_t encode_values(bitwise_coder &coder, uint32_t total_values, const uint8_t *pVals, uint32_t endpoint_range)
{
	const uint32_t MAX_VALS = 64;
	uint32_t bit_values[MAX_VALS], tq_values[(MAX_VALS + 2) / 3];
	uint32_t total_tq_values = 0, tq_accum = 0, tq_mul = 1;

	assert((total_values) && (total_values <= MAX_VALS));
	
	const uint32_t ep_bits = astc_helpers::g_ise_range_table[endpoint_range][0];
	const uint32_t ep_trits = astc_helpers::g_ise_range_table[endpoint_range][1];
	const uint32_t ep_quints = astc_helpers::g_ise_range_table[endpoint_range][2];

	for (uint32_t i = 0; i < total_values; i++)
	{
		uint32_t val = pVals[i];

		uint32_t bits = val & ((1 << ep_bits) - 1);
		uint32_t tq = val >> ep_bits;

		bit_values[i] = bits;

		if (ep_trits)
		{
			assert(tq < 3);
			tq_accum += tq * tq_mul;
			tq_mul *= 3;
			if (tq_mul == 243)
			{
				assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));
				tq_values[total_tq_values++] = tq_accum;
				tq_accum = 0;
				tq_mul = 1;
			}
		}
		else if (ep_quints)
		{
			assert(tq < 5);
			tq_accum += tq * tq_mul;
			tq_mul *= 5;
			if (tq_mul == 125)
			{
				assert(total_tq_values < BASISU_ARRAY_SIZE(tq_values));
				tq_values[total_tq_values++] = tq_accum;
				tq_accum = 0;
				tq_mul = 1;
			}
		}
	}

	uint32_t total_bits_output = 0;
	
	for (uint32_t i = 0; i < total_tq_values; i++)
	{
		const uint32_t num_bits = ep_trits ? 8 : 7;
		coder.put_bits(tq_values[i], num_bits);
		total_bits_output += num_bits;
	}

	if (tq_mul > 1)
	{
		uint32_t num_bits;
		if (ep_trits)
		{
			if (tq_mul == 3)
				num_bits = 2;
			else if (tq_mul == 9)
				num_bits = 4;
			else if (tq_mul == 27)
				num_bits = 5;
			else //if (tq_mul == 81)
				num_bits = 7;
		}
		else
		{
			if (tq_mul == 5)
				num_bits = 3;
			else //if (tq_mul == 25)
				num_bits = 5;
		}
		coder.put_bits(tq_accum, num_bits);
		total_bits_output += num_bits;
	}

	for (uint32_t i = 0; i < total_values; i++)
	{
		coder.put_bits(bit_values[i], ep_bits);
		total_bits_output += ep_bits;
	}

	return total_bits_output;
}

static inline uint32_t get_num_endpoint_vals(uint32_t cem)
{
	assert((cem == 7) || (cem == 11));
	return (cem == 11) ? basist::NUM_MODE11_ENDPOINTS : basist::NUM_MODE7_ENDPOINTS;
}

static void code_block(bitwise_coder& coder,
	const astc_helpers::log_astc_block& log_blk,
	block_mode block_mode_index,
	endpoint_mode em, const uint8_t *pEP_deltas)
{
	coder.put_truncated_binary((uint32_t)block_mode_index, (uint32_t)block_mode::cBMTotalModes);
	coder.put_truncated_binary((uint32_t)em, (uint32_t)endpoint_mode::cTotal);

	const uint32_t num_endpoint_vals = get_num_endpoint_vals(log_blk.m_color_endpoint_modes[0]);

	if ((em == endpoint_mode::cUseLeftDelta) || (em == endpoint_mode::cUseUpperDelta))
	{
		assert(log_blk.m_num_partitions == 1);

		for (uint32_t i = 0; i < num_endpoint_vals; i++)
			coder.put_bits(pEP_deltas[i], NUM_ENDPOINT_DELTA_BITS);
	}
	else if (em == endpoint_mode::cRaw)
	{
		if (log_blk.m_num_partitions == 2)
		{
			const int unique_partition_index = g_part2_seed_to_unique_index[log_blk.m_partition_id];
			assert(unique_partition_index != -1);
			
			coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS2);
		}
		else if (log_blk.m_num_partitions == 3)
		{
			const int unique_partition_index = g_part3_seed_to_unique_index[log_blk.m_partition_id];
			assert(unique_partition_index != -1);

			coder.put_truncated_binary(unique_partition_index, NUM_UNIQUE_PARTITIONS3);
		}
		
		encode_values(coder, num_endpoint_vals * log_blk.m_num_partitions, log_blk.m_endpoints, log_blk.m_endpoint_ise_range);
	}

	encode_values(coder, log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1), log_blk.m_weights, log_blk.m_weight_ise_range);
}

struct smooth_map_params
{
	bool m_no_mse_scaling;

	float m_max_smooth_std_dev;
	float m_smooth_max_mse_scale;

	float m_max_med_smooth_std_dev;
	float m_med_smooth_max_mse_scale;

	float m_max_ultra_smooth_std_dev;
	float m_ultra_smooth_max_mse_scale;

	bool m_debug_images;

	smooth_map_params()
	{
		clear();
	}

	void clear()
	{
		m_no_mse_scaling = false;

		// 3x3 region
		m_max_smooth_std_dev = 100.0f;
		m_smooth_max_mse_scale = 13000.0f;
				
		// 7x7 region
		m_max_med_smooth_std_dev = 9.0f;
		m_med_smooth_max_mse_scale = 15000.0f;

		// 11x11 region
		m_max_ultra_smooth_std_dev = 4.0f;
		//m_ultra_smooth_max_mse_scale = 4500.0f;
		//m_ultra_smooth_max_mse_scale = 10000.0f;
		//m_ultra_smooth_max_mse_scale = 50000.0f;
		//m_ultra_smooth_max_mse_scale = 100000.0f;
		//m_ultra_smooth_max_mse_scale = 400000.0f;
		//m_ultra_smooth_max_mse_scale = 800000.0f;
		m_ultra_smooth_max_mse_scale = 2000000.0f;

		m_debug_images = true;
	}
};

Resampler::Contrib_List* g_contrib_lists[7]; // 1-6

static void init_contrib_lists()
{
	for (uint32_t dst_width = 1; dst_width <= 6; dst_width++)
		//g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_GAUSSIAN_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
		g_contrib_lists[dst_width] = Resampler::make_clist(6, 6, basisu::Resampler::BOUNDARY_CLAMP, gaussian_filter, BASISU_BELL_FILTER_SUPPORT, 6.0f / (float)dst_width, 0.0f);
}

#if 0
static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, half_vec3 *pDst_block_half3, vec4F *pDst_block_q16)
{
	vec3F temp_block[6][6]; // [y][x]

	// first filter rows to temp_block
	if (grid_x == 6)
	{
		memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);
	}
	else
	{
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				vec3F p(0.0f);

				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
					p += pSrc_block[y * 6 + pRow_lists[x].p[i].pixel] * pRow_lists[x].p[i].weight;

				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				temp_block[y][x] = p;
			} // x
		} // y
	}

	// filter columns
	if (grid_y == 6)
	{
		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				for (uint32_t c = 0; c < 3; c++)
				{
					const basist::half_float h = basist::float_to_half(temp_block[y][x][c]);
					
					pDst_block_half3[x + y * 6][c] = h;
					pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);
				}

				pDst_block_q16[x + y * 6][3] = 0.0f;
			} // x
		} // y
	}
	else
	{
		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];

		for (uint32_t x = 0; x < 6; x++)
		{
			for (uint32_t y = 0; y < 6; y++)
			{
				vec3F p(0.0f);

				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
				
				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);
				
				for (uint32_t c = 0; c < 3; c++)
				{
					const basist::half_float h = basist::float_to_half(p[c]);

					pDst_block_half3[x + y * 6][c] = h;
					pDst_block_q16[x + y * 6][c] = (float)half_to_qlog16(h);
				}

				pDst_block_q16[x + y * 6][3] = 0.0f;
				
			} // x
		} // y
	}
}
#endif

static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec4F* pSrc_block, vec4F* pDst_block)
{
	vec4F temp_block[6][6]; // [y][x]

	// first filter rows to temp_block
	if (grid_x == 6)
	{
		memcpy(temp_block, pSrc_block, sizeof(vec4F) * 6 * 6);
	}
	else
	{
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				vec3F p(0.0f);

				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
					p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;

				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				temp_block[y][x] = p;
			} // x
		} // y
	}

	// filter columns
	if (grid_y == 6)
	{
		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				for (uint32_t c = 0; c < 3; c++)
					pDst_block[x + y * 6][c] = temp_block[y][x][c];
			} // x
		} // y
	}
	else
	{
		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];

		for (uint32_t x = 0; x < 6; x++)
		{
			for (uint32_t y = 0; y < 6; y++)
			{
				vec3F p(0.0f);

				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;

				p.clamp(0.0f, basist::ASTC_HDR_MAX_VAL);

				pDst_block[x + y * 6] = p;

			} // x
		} // y
	}
}

static void filter_block(uint32_t grid_x, uint32_t grid_y, const vec3F* pSrc_block, vec3F* pDst_block)
{
	vec3F temp_block[6][6]; // [y][x]

	// first filter rows to temp_block
	if (grid_x == 6)
	{
		memcpy(temp_block, pSrc_block, sizeof(vec3F) * 6 * 6);
	}
	else
	{
		Resampler::Contrib_List* pRow_lists = g_contrib_lists[grid_x];

		for (uint32_t y = 0; y < 6; y++)
		{
			for (uint32_t x = 0; x < 6; x++)
			{
				vec3F p(0.0f);

				for (uint32_t i = 0; i < pRow_lists[x].n; i++)
					p += vec3F(pSrc_block[y * 6 + pRow_lists[x].p[i].pixel]) * pRow_lists[x].p[i].weight;
								
				temp_block[y][x] = p;
			} // x
		} // y
	}

	// filter columns
	if (grid_y == 6)
	{
		memcpy((void *)pDst_block, temp_block, sizeof(vec3F) * 6 * 6);
	}
	else
	{
		Resampler::Contrib_List* pCol_lists = g_contrib_lists[grid_y];

		for (uint32_t x = 0; x < 6; x++)
		{
			for (uint32_t y = 0; y < 6; y++)
			{
				vec3F& p = pDst_block[x + y * 6];
				p.set(0.0f);

				for (uint32_t i = 0; i < pCol_lists[y].n; i++)
					p += temp_block[pCol_lists[y].p[i].pixel][x] * pCol_lists[y].p[i].weight;
			} // x
		} // y
	}
}

static float diff_blocks(const vec4F* pA, const vec4F* pB)
{
	const uint32_t BLOCK_T = 36;

	float diff = 0.0f;
	for (uint32_t i = 0; i < BLOCK_T; i++)
		diff += square(pA[i][0] - pB[i][0]) + square(pA[i][1] - pB[i][1]) + square(pA[i][2] - pB[i][2]);
	
	return diff * (1.0f / (float)BLOCK_T);
}

static float sub_and_compute_std_dev(const vec3F* pA, const vec3F* pB)
{
	const uint32_t BLOCK_T = 36;

	vec3F mean(0.0f);

	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F diff(pA[i] - pB[i]);
		mean += diff;
	}

	mean *= (1.0f / (float)BLOCK_T);

	vec3F diff_sum(0.0f);
	for (uint32_t i = 0; i < BLOCK_T; i++)
	{
		vec3F diff(pA[i] - pB[i]);
		diff -= mean;
		diff_sum += vec3F::component_mul(diff, diff);
	}

	vec3F var(diff_sum * (1.0f / (float)BLOCK_T));

	vec3F std_dev(sqrtf(var[0]), sqrtf(var[1]), sqrtf(var[2]));

	return maximum(std_dev[0], std_dev[1], std_dev[2]);
}

static void create_smooth_maps2(
	vector2D<float>& smooth_block_mse_scales,
	const image& orig_img,
	smooth_map_params& params, image* pUltra_smooth_img = nullptr)
{
	const uint32_t width = orig_img.get_width();
	const uint32_t height = orig_img.get_height();
	//const uint32_t total_pixels = orig_img.get_total_pixels();
	const uint32_t num_comps = 3;
		
	if (params.m_no_mse_scaling)
	{
		smooth_block_mse_scales.set_all(1.0f);
		return;
	}
	
	// TODO: - move up before the no mse scaling check (harmless as that is only a debug aid)
	smooth_block_mse_scales.resize(width, height);

	image smooth_vis, med_smooth_vis, ultra_smooth_vis;

	if (params.m_debug_images)
	{
		smooth_vis.resize(width, height);
		med_smooth_vis.resize(width, height);
		ultra_smooth_vis.resize(width, height);
	}

	for (uint32_t y = 0; y < height; y++)
	{
		for (uint32_t x = 0; x < width; x++)
		{
			{
				tracked_stat_dbl comp_stats[4];
				for (int yd = -1; yd <= 1; yd++)
				{
					for (int xd = -1; xd <= 1; xd++)
					{
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);

						comp_stats[0].update((float)p[0]);
						comp_stats[1].update((float)p[1]);
						comp_stats[2].update((float)p[2]);
					}
				}

				float max_std_dev = 0.0f;
				for (uint32_t i = 0; i < num_comps; i++)
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());

				float yl = clampf(max_std_dev / params.m_max_smooth_std_dev, 0.0f, 1.0f);
				//yl = powf(yl, 2.0f);
				yl = powf(yl, 1.0f / 2.0f); // substantially less bits

				smooth_block_mse_scales(x, y) = lerp(params.m_smooth_max_mse_scale, 1.0f, yl);

				if (params.m_debug_images)
				{
					//smooth_vis(x, y).set(clamp((int)((smooth_block_mse_scales(x, y) - 1.0f) / (params.m_smooth_max_mse_scale - 1.0f) * 255.0f + .5f), 0, 255));
					// white=high local activity (edges/detail)
					// black=low local activity (smooth - error is amplified)
					smooth_vis(x, y).set(clamp((int)((yl * 255.0f) + .5f), 0, 255));
				}
			}

			{
				tracked_stat_dbl comp_stats[4];

				const int S = 3;
				for (int yd = -S; yd < S; yd++)
				{
					for (int xd = -S; xd < S; xd++)
					{
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);

						comp_stats[0].update((float)p[0]);
						comp_stats[1].update((float)p[1]);
						comp_stats[2].update((float)p[2]);
					}
				}

				float max_std_dev = 0.0f;
				for (uint32_t i = 0; i < num_comps; i++)
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());

				float yl = clampf(max_std_dev / params.m_max_med_smooth_std_dev, 0.0f, 1.0f);
				//yl = powf(yl, 2.0f);

				smooth_block_mse_scales(x, y) = lerp(params.m_med_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);

				if (params.m_debug_images)
					med_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
			}

			{
				tracked_stat_dbl comp_stats[4];

				const int S = 5;
				for (int yd = -S; yd < S; yd++)
				{
					for (int xd = -S; xd < S; xd++)
					{
						const color_rgba& p = orig_img.get_clamped((int)x + xd, (int)y + yd);

						comp_stats[0].update((float)p[0]);
						comp_stats[1].update((float)p[1]);
						comp_stats[2].update((float)p[2]);
					}
				}

				float max_std_dev = 0.0f;
				for (uint32_t i = 0; i < num_comps; i++)
					max_std_dev = basisu::maximum(max_std_dev, (float)comp_stats[i].get_std_dev());

				float yl = clampf(max_std_dev / params.m_max_ultra_smooth_std_dev, 0.0f, 1.0f);
				yl = powf(yl, 2.0f);
				
				smooth_block_mse_scales(x, y) = lerp(params.m_ultra_smooth_max_mse_scale, smooth_block_mse_scales(x, y), yl);

				if (params.m_debug_images)
					ultra_smooth_vis(x, y).set((int)std::round(yl * 255.0f));
			}

		}
	}

	if (params.m_debug_images)
	{
		save_png("dbg_smooth_vis.png", smooth_vis);
		save_png("dbg_med_smooth_vis.png", med_smooth_vis);
		save_png("dbg_ultra_smooth_vis.png", ultra_smooth_vis);

		image vis_img(width, height);

		float max_scale = 0.0f;
		for (uint32_t y = 0; y < height; y++)
			for (uint32_t x = 0; x < width; x++)
				max_scale = basisu::maximumf(max_scale, smooth_block_mse_scales(x, y));

		for (uint32_t y = 0; y < height; y++)
			for (uint32_t x = 0; x < width; x++)
				vis_img(x, y).set((int)std::round(smooth_block_mse_scales(x, y) * 255.0f / max_scale));

		save_png("scale_vis.png", vis_img);
	}

	if (pUltra_smooth_img)
		*pUltra_smooth_img = ultra_smooth_vis;
}

const float REALLY_DARK_I_THRESHOLD = 0.0625f;
const float REALLY_DARK_MSE_ERR_SCALE = 128.0f;
const float REALLY_DARK_DELTA_ITP_JND_SCALE = 5.0f;

static float compute_pixel_mse_itp(const vec3F& orig_pixel_itp, const vec3F& comp_pixel_itp, bool delta_itp_dark_adjustment)
{
	float delta_i = orig_pixel_itp[0] - comp_pixel_itp[0];
	float delta_t = orig_pixel_itp[1] - comp_pixel_itp[1];
	float delta_p = orig_pixel_itp[2] - comp_pixel_itp[2];
		
	float err = (delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p);

	if (delta_itp_dark_adjustment)
	{
		// We have to process a large range of inputs, including extremely dark inputs. 
		// Artifically amplify MSE on very dark pixels - otherwise they'll be overly compressed at higher lambdas.
		// This is to better handle very dark signals which could be explictly overexposed.
		float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig_pixel_itp[0]);
		s = lerp(REALLY_DARK_MSE_ERR_SCALE, 1.0f, s);
		err *= s;
	}

	return err;
}

static float compute_block_mse_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp, bool delta_itp_dark_adjustment)
{
	float total_mse = 0.0f;

	for (uint32_t y = 0; y < block_h; y++)
	{
		for (uint32_t x = 0; x < block_w; x++)
		{
			total_mse += compute_pixel_mse_itp(pOrig_pixels_itp[x + y * block_w], pComp_pixels_itp[x + y * block_w], delta_itp_dark_adjustment);
		} // x
	} // y

	return total_mse * (1.0f / (float)(block_w * block_h));
}

static float compute_block_ssim_itp(uint32_t block_w, uint32_t block_h, const vec3F* pOrig_pixels_itp, const vec3F* pComp_pixels_itp)
{
	const uint32_t n = block_w * block_h;
	assert(n <= 36);

	stats<float> x_stats[3], y_stats[3];
	comparative_stats<float> xy_cov[3];

	for (uint32_t c = 0; c < 3; c++)
	{
		x_stats[c].calc_simplified(n, &pOrig_pixels_itp[0][c], 3);
		y_stats[c].calc_simplified(n, &pComp_pixels_itp[0][c], 3);
	}

	for (uint32_t c = 0; c < 3; c++)
		xy_cov[c].calc_cov(n, &pOrig_pixels_itp[0][c], &pComp_pixels_itp[0][c], 3, 3, &x_stats[c], &y_stats[c]);

	float ssim[3];
	const double d = 1.0f, k1 = .01f, k2 = .03f;

	// weight mean error more highly to reduce blocking
	float ap = 1.5f, bp = 1.0f, cp = 1.0f;

	const double s_c1 = square(k1 * d), s_c2 = square(k2 * d);
	const double s_c3(s_c2 * .5f);

	for (uint32_t c = 0; c < 3; c++)
	{
		float lum = (float)((2.0f * x_stats[c].m_avg * y_stats[c].m_avg + s_c1) / (square(x_stats[c].m_avg) + square(y_stats[c].m_avg) + s_c1));
		lum = saturate(lum);

		float con = (float)((2.0f * x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c2) / (x_stats[c].m_var + y_stats[c].m_var + s_c2));
		con = saturate(con);

		float str = (float)((xy_cov[c].m_cov + s_c3) / (x_stats[c].m_std_dev * y_stats[c].m_std_dev + s_c3));
		str = saturate(str);

		ssim[c] = powf(lum, ap) * powf(con, bp) * powf(str, cp);
	}

#if 0
	float final_ssim = (ssim[0] * .4f + ssim[1] * .3f + ssim[2] * .3f);
#elif 1
	float final_ssim = ssim[0] * ssim[1] * ssim[2];
#else
	const float LP = .75f;
	float final_ssim = ssim[0] * powf((ssim[1] + ssim[2]) * .5f, LP);
#endif

	return final_ssim;
}

// delta ITP, 1.0 is JND (Rec. ITU-R BT.2124), modified for higher error at low light
static float compute_pixel_delta_itp(const vec3F& a, const vec3F& b, const vec3F& orig, bool delta_itp_dark_adjustment)
{
	float delta_i = a[0] - b[0];
	float delta_t = a[1] - b[1];
	float delta_p = a[2] - b[2];

	float err = 720.0f * sqrtf((delta_i * delta_i) + (delta_t * delta_t) + (delta_p * delta_p));

	float s = bu_math::smoothstep(0.0f, REALLY_DARK_I_THRESHOLD, orig[0]);
	
	if (delta_itp_dark_adjustment)
	{
		// This is to better handle very dark signals which could be explictly overexposed.
		s = lerp(REALLY_DARK_DELTA_ITP_JND_SCALE, 1.0f, s);
		err *= s;
	}

	return err;
}

struct candidate_encoding
{
	encoding_type m_encoding_type;
		
	basist::half_float m_solid_color[3];

	uint32_t m_run_len;

	vec3F m_comp_pixels[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
	vec3F m_comp_pixels_itp[MAX_BLOCK_H][MAX_BLOCK_W]; // [y][x]
		
	endpoint_mode m_endpoint_mode;
	block_mode m_block_mode;

	bitwise_coder m_coder;
		
	// The block to code, which may not be valid ASTC. This may have to be transcoded (by requantizing the weights/endpoints) before it's valid ASTC.
	// Note the endpoints may be coded endpoints OR transcoded endpoints, depending on the encoding type.
	astc_helpers::log_astc_block m_coded_log_blk; 

	// The block the decoder outputs.
	astc_helpers::log_astc_block m_decomp_log_blk;

	int m_reuse_delta_index;

	// m_t can get VERY large
	double m_t, m_d; 
	float m_bits;
					
	candidate_encoding()
	{
		clear();
	}

	candidate_encoding(const candidate_encoding &other)
	{
		*this = other;
	}

	candidate_encoding(candidate_encoding&& other)
	{
		*this = std::move(other);
	}

	candidate_encoding& operator=(const candidate_encoding& rhs)
	{
		if (this == &rhs)
			return *this;

		m_encoding_type = rhs.m_encoding_type;
		memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
		m_run_len = rhs.m_run_len;
		memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
		m_endpoint_mode = rhs.m_endpoint_mode;
		m_block_mode = rhs.m_block_mode;
		m_coder = rhs.m_coder;
		m_coded_log_blk = rhs.m_coded_log_blk;
		m_decomp_log_blk = rhs.m_decomp_log_blk;
		m_reuse_delta_index = rhs.m_reuse_delta_index;
		
		return *this;
	}

	candidate_encoding& operator=(candidate_encoding&& rhs)
	{
		if (this == &rhs)
			return *this;

		m_encoding_type = rhs.m_encoding_type;
		memcpy(m_solid_color, rhs.m_solid_color, sizeof(m_solid_color));
		m_run_len = rhs.m_run_len;
		memcpy(m_comp_pixels, rhs.m_comp_pixels, sizeof(m_comp_pixels));
		m_endpoint_mode = rhs.m_endpoint_mode;
		m_block_mode = rhs.m_block_mode;
		m_coder = std::move(rhs.m_coder);
		m_coded_log_blk = rhs.m_coded_log_blk;
		m_decomp_log_blk = rhs.m_decomp_log_blk;
		m_reuse_delta_index = rhs.m_reuse_delta_index;

		return *this;
	}

	void clear()
	{
		m_encoding_type = encoding_type::cInvalid;

		clear_obj(m_solid_color);

		m_run_len = 0;

		clear_obj(m_comp_pixels);
						
		m_endpoint_mode = endpoint_mode::cInvalid;
		m_block_mode = block_mode::cInvalid;

		m_coder.restart();
		
		m_coded_log_blk.clear();
		m_decomp_log_blk.clear();

		m_t = 0;
		m_d = 0;
		m_bits = 0;
		
		m_reuse_delta_index = 0;
	}
};

bool decode_astc_block(uint32_t block_w, uint32_t block_h, astc_helpers::log_astc_block &log_blk, vec3F *pPixels)
{
	assert((block_w <= 6) && (block_h <= 6));

	half_vec4 decoded_pixels_half4[6 * 6]; // [y][x]
	bool status = astc_helpers::decode_block(log_blk, decoded_pixels_half4, block_w, block_h, astc_helpers::cDecodeModeHDR16);
	assert(status);

	if (!status)
		return false;

	for (uint32_t y = 0; y < block_h; y++)
	{
		for (uint32_t x = 0; x < block_w; x++)
		{
			pPixels[x + y * block_w].set(
				basist::half_to_float(decoded_pixels_half4[x + y * block_w][0]),
				basist::half_to_float(decoded_pixels_half4[x + y * block_w][1]),
				basist::half_to_float(decoded_pixels_half4[x + y * block_w][2]));
		} // x 
	} //y

	return true;
}

static inline bool validate_log_blk(const astc_helpers::log_astc_block &decomp_blk)
{
	astc_helpers::astc_block phys_blk;
	return astc_helpers::pack_astc_block(phys_blk, decomp_blk);
}

#define SYNC_MARKERS (0)

static bool decode_file(const uint8_vec& comp_data, vector2D<astc_helpers::astc_block>& decoded_blocks, uint32_t &width, uint32_t &height)
{
	interval_timer tm;
	tm.start();

	const uint32_t BLOCK_W = 6, BLOCK_H = 6;

	width = 0;
	height = 0;

	if (comp_data.size() <= 2*3)
		return false;

	basist::bitwise_decoder decoder;
	if (!decoder.init(comp_data.data(), comp_data.size_u32()))
		return false;

	// Read initial LE marker
	const uint32_t marker = decoder.get_bits(16);
	
	// Check for v1.60 and v2.0 markers - if it's not either, it's not valid data.
	if ((marker != UASTC_6x6_HDR_SIG0) && (marker != UASTC_6x6_HDR_SIG1))
		return false;
	
	// Use original v1.60 behavior for tiny weight grid upsampling if it's the original marker, otherwise v2.0.
	const bool use_orig_behavior = (marker == UASTC_6x6_HDR_SIG0);

	width = decoder.get_bits(16);
	height = decoder.get_bits(16);
		
	if (!width || !height || (width > MAX_ASTC_HDR_6X6_DIM) || (height > MAX_ASTC_HDR_6X6_DIM))
		return false;

	const uint32_t num_blocks_x = (width + BLOCK_W - 1) / BLOCK_W;
	const uint32_t num_blocks_y = (height + BLOCK_H - 1) / BLOCK_H;
	const uint32_t total_blocks = num_blocks_x * num_blocks_y;

	decoded_blocks.resize(num_blocks_x, num_blocks_y);
	//memset(decoded_blocks.get_ptr(), 0, decoded_blocks.size_in_bytes());

	vector2D<astc_helpers::log_astc_block> decoded_log_blocks(num_blocks_x, num_blocks_y);
	//memset(decoded_log_blocks.get_ptr(), 0, decoded_log_blocks.size_in_bytes());

	uint32_t cur_bx = 0, cur_by = 0;
	uint32_t step_counter = 0;
	BASISU_NOTE_UNUSED(step_counter);
		
	while (cur_by < num_blocks_y)
	{
		step_counter++;
		
		//if ((cur_bx == 9) && (cur_by == 13))
		//	printf("!");

#if SYNC_MARKERS
		uint32_t mk = decoder.get_bits(16);
		if (mk != 0xDEAD)
		{
			printf("!");
			assert(0);
			return false;
		}
#endif
		if (decoder.get_bits_remaining() < 1)
			return false;

		encoding_type et = encoding_type::cBlock;

		uint32_t b0 = decoder.get_bits(1);
		if (!b0)
		{
			uint32_t b1 = decoder.get_bits(1);
			if (b1)
				et = encoding_type::cReuse;
			else
			{
				uint32_t b2 = decoder.get_bits(1);
				if (b2)
					et = encoding_type::cSolid;
				else
					et = encoding_type::cRun;
			}
		}

		switch (et)
		{
		case encoding_type::cRun:
		{
			if (!cur_bx && !cur_by)
				return false;

			const uint32_t run_len = decoder.decode_vlc(5) + 1;
			
			uint32_t num_blocks_remaining = total_blocks - (cur_bx + cur_by * num_blocks_x);
			if (run_len > num_blocks_remaining)
				return false;
						
			uint32_t prev_bx = cur_bx, prev_by = cur_by;

			if (cur_bx)
				prev_bx--;
			else
			{
				prev_bx = num_blocks_x - 1;
				prev_by--;
			}

			const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
			const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);

			for (uint32_t i = 0; i < run_len; i++)
			{
				decoded_log_blocks(cur_bx, cur_by) = prev_log_blk;
				decoded_blocks(cur_bx, cur_by) = prev_phys_blk;

				cur_bx++;
				if (cur_bx == num_blocks_x)
				{
					cur_bx = 0;
					cur_by++;
				}
			}

			break;
		}
		case encoding_type::cSolid:
		{
			const basist::half_float rh = (basist::half_float)decoder.get_bits(15);
			const basist::half_float gh = (basist::half_float)decoder.get_bits(15);
			const basist::half_float bh = (basist::half_float)decoder.get_bits(15);

			astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);

			log_blk.clear();
			log_blk.m_solid_color_flag_hdr = true;
			log_blk.m_solid_color[0] = rh;
			log_blk.m_solid_color[1] = gh;
			log_blk.m_solid_color[2] = bh;
			log_blk.m_solid_color[3] = basist::float_to_half(1.0f);

			bool status = astc_helpers::pack_astc_block(decoded_blocks(cur_bx, cur_by), log_blk);
			if (!status)
				return false;

			cur_bx++;
			if (cur_bx == num_blocks_x)
			{
				cur_bx = 0;
				cur_by++;
			}
			
			break;
		}
		case encoding_type::cReuse:
		{
			if (!cur_bx && !cur_by)
				return false;

			const uint32_t reuse_delta_index = decoder.get_bits(REUSE_XY_DELTA_BITS);

			const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
			const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;

			const int prev_bx = cur_bx + reuse_delta_x, prev_by = cur_by + reuse_delta_y;
			if ((prev_bx < 0) || (prev_bx >= (int)num_blocks_x))
				return false;
			if (prev_by < 0)
				return false;
			
			const astc_helpers::log_astc_block& prev_log_blk = decoded_log_blocks(prev_bx, prev_by);
			const astc_helpers::astc_block& prev_phys_blk = decoded_blocks(prev_bx, prev_by);

			if (prev_log_blk.m_solid_color_flag_hdr)
				return false;

			astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
			astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);
			
			log_blk = prev_log_blk;

			const uint32_t total_grid_weights = log_blk.m_grid_width * log_blk.m_grid_height * (log_blk.m_dual_plane ? 2 : 1);

			bool status = basist::astc_6x6_hdr::decode_values(decoder, total_grid_weights, log_blk.m_weight_ise_range, log_blk.m_weights);
			if (!status)
				return false;

			astc_helpers::log_astc_block decomp_blk;
			status = astc_helpers::unpack_block(&prev_phys_blk, decomp_blk, BLOCK_W, BLOCK_H);
			if (!status)
				return false;
			
			uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
			basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, log_blk.m_weight_ise_range, transcode_weights, decomp_blk.m_weight_ise_range);

			copy_weight_grid(log_blk.m_dual_plane, log_blk.m_grid_width, log_blk.m_grid_height, transcode_weights, decomp_blk, use_orig_behavior);

			status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
			if (!status)
				return false;

			cur_bx++;
			if (cur_bx == num_blocks_x)
			{
				cur_bx = 0;
				cur_by++;
			}

			break;
		}
		case encoding_type::cBlock:
		{
			const block_mode bm = (block_mode)decoder.decode_truncated_binary((uint32_t)block_mode::cBMTotalModes);
			const endpoint_mode em = (endpoint_mode)decoder.decode_truncated_binary((uint32_t)endpoint_mode::cTotal);

			switch (em)
			{
			case endpoint_mode::cUseLeft:
			case endpoint_mode::cUseUpper:
			{
				int neighbor_bx = cur_bx, neighbor_by = cur_by;
				
				if (em == endpoint_mode::cUseLeft)
					neighbor_bx--;
				else
					neighbor_by--;

				if ((neighbor_bx < 0) || (neighbor_by < 0))
					return false;

				const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
				if (!neighbor_blk.m_color_endpoint_modes[0])
					return false;

				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);

				if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
					return false;

				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);

				log_blk.clear();
				log_blk.m_num_partitions = 1;
				log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
				log_blk.m_endpoint_ise_range = neighbor_blk.m_endpoint_ise_range;
				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
				log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;

				memcpy(log_blk.m_endpoints, neighbor_blk.m_endpoints, num_endpoint_values);

				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);

				bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
				if (!status)
					return false;

				astc_helpers::log_astc_block decomp_blk;
				decomp_blk.clear();

				decomp_blk.m_num_partitions = 1;
				decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
				decomp_blk.m_dual_plane = bmd.m_dp;
				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;

				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);

				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);

				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, use_orig_behavior);

				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
				if (!status)
					return false;

				cur_bx++;
				if (cur_bx == num_blocks_x)
				{
					cur_bx = 0;
					cur_by++;
				}

				break;
			}
			case endpoint_mode::cUseLeftDelta:
			case endpoint_mode::cUseUpperDelta:
			{
				int neighbor_bx = cur_bx, neighbor_by = cur_by;

				if (em == endpoint_mode::cUseLeftDelta)
					neighbor_bx--;
				else
					neighbor_by--;

				if ((neighbor_bx < 0) || (neighbor_by < 0))
					return false;

				const astc_helpers::log_astc_block& neighbor_blk = decoded_log_blocks(neighbor_bx, neighbor_by);
				if (!neighbor_blk.m_color_endpoint_modes[0])
					return false;

				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];
				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);

				if (bmd.m_cem != neighbor_blk.m_color_endpoint_modes[0])
					return false;

				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);

				log_blk.clear();
				log_blk.m_num_partitions = 1;
				log_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
				log_blk.m_dual_plane = bmd.m_dp;
				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
				
				log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, neighbor_blk.m_endpoint_ise_range, neighbor_blk.m_endpoints, bmd.m_endpoint_ise_range, log_blk.m_endpoints);

				const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
				const int low_delta_limit = -(total_endpoint_delta_vals / 2); // high_delta_limit = (total_endpoint_delta_vals / 2) - 1;

				const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_ISE_to_rank;
				const auto& rank_to_ise = astc_helpers::g_dequant_tables.get_endpoint_tab(log_blk.m_endpoint_ise_range).m_rank_to_ISE;
				const int total_endpoint_levels = astc_helpers::get_ise_levels(log_blk.m_endpoint_ise_range);

				for (uint32_t i = 0; i < num_endpoint_values; i++)
				{
					int cur_val = ise_to_rank[log_blk.m_endpoints[i]];
					
					int delta = (int)decoder.get_bits(NUM_ENDPOINT_DELTA_BITS) + low_delta_limit;

					cur_val += delta;
					if ((cur_val < 0) || (cur_val >= total_endpoint_levels))
						return false;

					log_blk.m_endpoints[i] = rank_to_ise[cur_val];
				}

				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;
				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;

				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);

				bool status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
				if (!status)
					return false;

				astc_helpers::log_astc_block decomp_blk;
				decomp_blk.clear();

				decomp_blk.m_num_partitions = 1;
				decomp_blk.m_color_endpoint_modes[0] = (uint8_t)bmd.m_cem;
				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;
				decomp_blk.m_dual_plane = (uint8_t)bmd.m_dp;
				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;

				basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, log_blk.m_endpoint_ise_range, log_blk.m_endpoints, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);

				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);

				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, use_orig_behavior);

				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
				if (!status)
					return false;

				cur_bx++;
				if (cur_bx == num_blocks_x)
				{
					cur_bx = 0;
					cur_by++;
				}

				break;
			}
			case endpoint_mode::cRaw:
			{
				const block_mode_desc& bmd = g_block_mode_descs[(uint32_t)bm];

				const uint32_t num_endpoint_values = get_num_endpoint_vals(bmd.m_cem);

				astc_helpers::log_astc_block& log_blk = decoded_log_blocks(cur_bx, cur_by);
				astc_helpers::astc_block& phys_blk = decoded_blocks(cur_bx, cur_by);

				log_blk.clear();
				log_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
				
				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
					log_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;

				log_blk.m_endpoint_ise_range = (uint8_t)bmd.m_endpoint_ise_range;
				log_blk.m_weight_ise_range = (uint8_t)bmd.m_weight_ise_range;

				log_blk.m_grid_width = (uint8_t)bmd.m_grid_x;
				log_blk.m_grid_height = (uint8_t)bmd.m_grid_y;
				log_blk.m_dual_plane = (uint8_t)bmd.m_dp;
				log_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;

				if (bmd.m_num_partitions == 2)
				{
					const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS2);
					log_blk.m_partition_id = (uint16_t)g_part2_unique_index_to_seed[unique_partition_index];
				}
				else if (bmd.m_num_partitions == 3)
				{
					const uint32_t unique_partition_index = decoder.decode_truncated_binary(NUM_UNIQUE_PARTITIONS3);
					log_blk.m_partition_id = (uint16_t)g_part3_unique_index_to_seed[unique_partition_index];
				}
				
				bool status = decode_values(decoder, num_endpoint_values * bmd.m_num_partitions, bmd.m_endpoint_ise_range, log_blk.m_endpoints);
				if (!status)
					return false;

				const uint32_t total_grid_weights = bmd.m_grid_x * bmd.m_grid_y * (bmd.m_dp ? 2 : 1);

				status = decode_values(decoder, total_grid_weights, bmd.m_weight_ise_range, log_blk.m_weights);
				if (!status)
					return false;

				astc_helpers::log_astc_block decomp_blk;
				decomp_blk.clear();
				
				decomp_blk.m_dual_plane = bmd.m_dp;
				decomp_blk.m_color_component_selector = (uint8_t)bmd.m_dp_channel;
				decomp_blk.m_partition_id = log_blk.m_partition_id;
								
				decomp_blk.m_num_partitions = (uint8_t)bmd.m_num_partitions;
				
				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
					decomp_blk.m_color_endpoint_modes[p] = (uint8_t)bmd.m_cem;

				decomp_blk.m_endpoint_ise_range = (uint8_t)bmd.m_transcode_endpoint_ise_range;
				decomp_blk.m_weight_ise_range = (uint8_t)bmd.m_transcode_weight_ise_range;

				for (uint32_t p = 0; p < bmd.m_num_partitions; p++)
					basist::astc_6x6_hdr::requantize_ise_endpoints(bmd.m_cem, bmd.m_endpoint_ise_range, log_blk.m_endpoints + num_endpoint_values * p, bmd.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints + num_endpoint_values * p);

				uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
				basist::astc_6x6_hdr::requantize_astc_weights(total_grid_weights, log_blk.m_weights, bmd.m_weight_ise_range, transcode_weights, bmd.m_transcode_weight_ise_range);

				copy_weight_grid(bmd.m_dp, bmd.m_grid_x, bmd.m_grid_y, transcode_weights, decomp_blk, use_orig_behavior);

				status = astc_helpers::pack_astc_block(phys_blk, decomp_blk);
				if (!status)
					return false;

				cur_bx++;
				if (cur_bx == num_blocks_x)
				{
					cur_bx = 0;
					cur_by++;
				}

				break;
			}
			default:
			{
				assert(0);
				return false;
			}
			}

			break;
		}
		default:
		{
			assert(0);
			return false;
		}
		}
	}

	if (decoder.get_bits(16) != 0xA742)
	{
		fmt_error_printf("End marker not found!\n");
		return false;
	}

	//fmt_printf("Total decode_file() time: {} secs\n", tm.get_elapsed_secs());

	return true;
}

static bool unpack_physical_astc_block(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
{
	astc_helpers::log_astc_block log_blk;
	if (!astc_helpers::unpack_block(pBlock, log_blk, block_width, block_height))
		return false;
	
	basist::half_float half_block[MAX_BLOCK_W * MAX_BLOCK_H][4];
	if (!astc_helpers::decode_block(log_blk, half_block, block_width, block_height, astc_helpers::cDecodeModeHDR16))
		return false;

	const uint32_t total_block_pixels = block_width * block_height;
	for (uint32_t p = 0; p < total_block_pixels; p++)
	{
		pPixels[p][0] = basist::half_to_float(half_block[p][0]);
		pPixels[p][1] = basist::half_to_float(half_block[p][1]);
		pPixels[p][2] = basist::half_to_float(half_block[p][2]);
		pPixels[p][3] = basist::half_to_float(half_block[p][3]);
	}

	return true;
}

static bool unpack_physical_astc_block_google(const void* pBlock, uint32_t block_width, uint32_t block_height, vec4F* pPixels)
{
	return basisu_astc::astc::decompress_hdr((float *)pPixels, (uint8_t*)pBlock, block_width, block_height);
}

static bool pack_bc6h_image(const imagef &src_img, vector2D<basist::bc6h_block> &bc6h_blocks, imagef *pPacked_bc6h_img, const fast_bc6h_params &enc_params)
{
	const uint32_t width = src_img.get_width();
	const uint32_t height = src_img.get_height();
	
	if (pPacked_bc6h_img)
		pPacked_bc6h_img->resize(width, height);

	interval_timer tm;
	double total_enc_time = 0.0f;
	BASISU_NOTE_UNUSED(total_enc_time);

	const uint32_t num_blocks_x = src_img.get_block_width(4);
	const uint32_t num_blocks_y = src_img.get_block_height(4);

	bc6h_blocks.resize(num_blocks_x, num_blocks_y);
				
	for (uint32_t by = 0; by < num_blocks_y; by++)
	{
		for (uint32_t bx = 0; bx < num_blocks_x; bx++)
		{
			// Extract source image block
			vec4F block_pixels[4][4]; // [y][x]
			src_img.extract_block_clamped(&block_pixels[0][0], bx * 4, by * 4, 4, 4);

			basist::half_float half_pixels[16 * 3]; // [y][x]

			for (uint32_t y = 0; y < 4; y++)
			{
				for (uint32_t x = 0; x < 4; x++)
				{
					for (uint32_t c = 0; c < 3; c++)
					{
						float v = block_pixels[y][x][c];

						basist::half_float h = basist::float_to_half(v);

						half_pixels[(x + y * 4) * 3 + c] = h;

					} // c

				} // x
			} // y

			basist::bc6h_block& bc6h_blk = bc6h_blocks(bx, by);

			tm.start();

			basist::astc_6x6_hdr::fast_encode_bc6h(half_pixels, &bc6h_blk, enc_params);

			total_enc_time += tm.get_elapsed_secs();

			if (pPacked_bc6h_img)
			{
				basist::half_float unpacked_blk[16 * 3];
				bool status = unpack_bc6h(&bc6h_blk, unpacked_blk, false);
				assert(status);
				if (!status)
				{
					fmt_error_printf("unpack_bc6h() failed\n");
					return false;
				}
							
				for (uint32_t y = 0; y < 4; y++)
				{
					for (uint32_t x = 0; x < 4; x++)
					{
						vec4F p;

						for (uint32_t c = 0; c < 3; c++)
						{
							float v = basist::half_to_float(unpacked_blk[(x + y * 4) * 3 + c]);
							p[c] = v;

						} // c

						p[3] = 1.0f;

						pPacked_bc6h_img->set_clipped(bx * 4 + x, by * 4 + y, p);
					} // x
				} // y
			}

		} // bx
	} // by

	//fmt_printf("Total BC6H encode time: {}\n", total_enc_time);

	return true;
}

static float dist_to_line_squared(const vec3F& p, const vec3F &line_org, const vec3F &line_dir)
{
	vec3F q(p - line_org);
	vec3F v(q - q.dot(line_dir) * line_dir);
	return v.dot(v);
}

static void estimate_partitions_mode7_and_11(
	uint32_t num_parts, // 2 or 3 partitions
	uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
	uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
	const vec3F *pHalf_pixels_as_floats, // block's half pixel values casted to floats
	const astc_hdr_codec_base_options& coptions, // options
	uint32_t num_desired_pats, 
	int *pDesired_pat_indices_mode11, int *pDesired_pat_indices_mode7) // output indices
{
	BASISU_NOTE_UNUSED(coptions);
	BASISU_NOTE_UNUSED(num_unique_pats);

	const uint32_t BLOCK_W = 6, BLOCK_H = 6, MAX_PARTS = 3; // BLOCK_T = 6 * 6
	assert(num_parts <= MAX_PARTS);

	struct candidate_res
	{
		float m_total_sq_dist;
		uint32_t m_index;
		bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
	};

	const uint32_t MAX_CANDIDATES = 1024;
	assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));

	candidate_res mode11_candidates[MAX_CANDIDATES];
	candidate_res mode7_candidates[MAX_CANDIDATES];

	const vec3F grayscale_axis(0.5773502691f);
	
	for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
	{
		const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
		assert(unique_part_index < num_unique_pats);

		const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];

		vec3F part_means[MAX_PARTS];
		uint32_t part_total_texels[MAX_PARTS] = { 0 };

		for (uint32_t i = 0; i < num_parts; i++)
			part_means[i].clear();

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = (*pPat)(x, y);
				assert(part_index < num_parts);

				part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
				part_total_texels[part_index]++;

 			} // x
		} // y
		
		for (uint32_t i = 0; i < num_parts; i++)
		{
			assert(part_total_texels[i]);
			part_means[i] /= (float)part_total_texels[i];
		}

		float part_cov[MAX_PARTS][6];
		memset(part_cov, 0, sizeof(part_cov));

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = (*pPat)(x, y);
				assert(part_index < num_parts);

				const vec3F p(pHalf_pixels_as_floats[x + y * BLOCK_W] - part_means[part_index]);

				const float r = p[0], g = p[1], b = p[2];

				part_cov[part_index][0] += r * r;
				part_cov[part_index][1] += r * g;
				part_cov[part_index][2] += r * b;
				part_cov[part_index][3] += g * g;
				part_cov[part_index][4] += g * b;
				part_cov[part_index][5] += b * b;

			} // x
		} // y

		// For each partition compute the total variance of all channels.
		float total_variance[MAX_PARTS];
		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
			total_variance[part_index] = part_cov[part_index][0] + part_cov[part_index][3] + part_cov[part_index][5];

		//vec3F part_axis[MAX_PARTS];
		float mode11_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis
		float mode7_eigenvalue_est[MAX_PARTS]; // For each partition, compute the variance along the principle axis

		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
		{
			float* pCov = &part_cov[part_index][0];

			float xr = .9f, xg = 1.0f, xb = .7f;
			
			const uint32_t NUM_POWER_ITERS = 4;
			for (uint32_t iter = 0; iter < NUM_POWER_ITERS; iter++)
			{
				float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
				float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
				float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];

				float m = maximumf(maximumf(fabsf(r), fabsf(g)), fabsf(b));

				if (m >= 1e-10f)
				{
					m = 1.0f / m;
					
					r *= m;
					g *= m;
					b *= m;
				}

				xr = r;
				xg = g;
				xb = b;
			}

			float len_sq = xr * xr + xg * xg + xb * xb;
						
			if (len_sq < 1e-10f)
			{
				xr = grayscale_axis[0];
				xg = grayscale_axis[0];
				xb = grayscale_axis[0];
			}
			else
			{
				len_sq = 1.0f / sqrtf(len_sq);

				xr *= len_sq;
				xg *= len_sq;
				xb *= len_sq;
			}
			
			{
				// Transform the principle axis by the covariance matrix, which will scale the vector by its eigenvalue (the variance of the dataset projected onto the principle axis).
				float r = xr * pCov[0] + xg * pCov[1] + xb * pCov[2];
				float g = xr * pCov[1] + xg * pCov[3] + xb * pCov[4];
				float b = xr * pCov[2] + xg * pCov[4] + xb * pCov[5];

				// Estimate the principle eigenvalue by computing the magnitude of the transformed vector.
				// The result is the variance along the principle axis.
				//float z1 = sqrtf(r * r + g * g + b * b); // this works with the principle axis
				//float z2 = r * xr + g * xg + b * xb; // compute length projected along xr,xg,xb
				
				mode11_eigenvalue_est[part_index] = r * xr + g * xg + b * xb;
			}

			{
				const float yrgb = grayscale_axis[0];
				
				// Transform the grayscale axis by the covariance matrix, which will scale the vector by the eigenvalue (which is the variance of the dataset projected onto this vector).
				float r = yrgb * pCov[0] + yrgb * pCov[1] + yrgb * pCov[2];
				float g = yrgb * pCov[1] + yrgb * pCov[3] + yrgb * pCov[4];
				float b = yrgb * pCov[2] + yrgb * pCov[4] + yrgb * pCov[5];

				mode7_eigenvalue_est[part_index] = r * yrgb + g * yrgb + b * yrgb;
			}

		} // part_index
				
		// Compute the total variance (squared error) of the other 2 axes by subtracting the total variance of all channels by the variance of the principle axis.
		// TODO: Could also compute the ratio of the principle axis's variance vs. the total variance.
		float mode11_total_sq_dist_to_line_alt = 0.0f;
		for (uint32_t part_index = 0; part_index < num_parts; part_index++)
		{
			float d = maximum(0.0f, total_variance[part_index] - mode11_eigenvalue_est[part_index]);
			mode11_total_sq_dist_to_line_alt += d;
		}

		{
#if 0
			// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
			// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
			float total_sq_dist_to_line = 0.0f;
			for (uint32_t i = 0; i < BLOCK_T; i++)
			{
				const uint32_t part_index = (*pPat)[i];
				assert(part_index < num_parts);

				total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis[part_index]);
			}

			mode11_candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;
#else
			mode11_candidates[examine_iter].m_total_sq_dist = mode11_total_sq_dist_to_line_alt;
#endif
			mode11_candidates[examine_iter].m_index = unique_part_index;
		}

		{
			float mode7_total_sq_dist_to_line_alt = 0.0f;
			for (uint32_t part_index = 0; part_index < num_parts; part_index++)
			{
				float d = maximum(0.0f, total_variance[part_index] - mode7_eigenvalue_est[part_index]);
				mode7_total_sq_dist_to_line_alt += d;
			}

			mode7_candidates[examine_iter].m_total_sq_dist = mode7_total_sq_dist_to_line_alt;
			mode7_candidates[examine_iter].m_index = unique_part_index;
		}

	} // examine_iter

	std::sort(&mode11_candidates[0], &mode11_candidates[num_pats_to_examine]);
	std::sort(&mode7_candidates[0], &mode7_candidates[num_pats_to_examine]);

	for (uint32_t i = 0; i < num_desired_pats; i++)
		pDesired_pat_indices_mode11[i] = mode11_candidates[i].m_index;

	for (uint32_t i = 0; i < num_desired_pats; i++)
		pDesired_pat_indices_mode7[i] = mode7_candidates[i].m_index;
}

static void estimate_partitions_mode7(
	uint32_t num_parts, // 2 or 3 partitions
	uint32_t num_unique_pats, const partition_pattern_vec* pUnique_pats, // list of all unique, canonicalized patterns
	uint32_t num_pats_to_examine, const uint32_t* pUnique_pat_indices_to_examine, // indices of pats to examine
	const vec3F* pHalf_pixels_as_floats, // block's half pixel values casted to floats
	const astc_hdr_codec_base_options& coptions, // options
	uint32_t num_desired_pats, uint32_t* pDesired_pat_indices) // output indices
{
	BASISU_NOTE_UNUSED(coptions);
	BASISU_NOTE_UNUSED(num_unique_pats);

	const uint32_t BLOCK_W = 6, BLOCK_H = 6, BLOCK_T = 6 * 6, MAX_PARTS = 3;
	assert(num_parts <= MAX_PARTS);

	struct candidate_res
	{
		float m_total_sq_dist;
		uint32_t m_index;
		bool operator< (const candidate_res& rhs) const { return m_total_sq_dist < rhs.m_total_sq_dist; }
	};

	const uint32_t MAX_CANDIDATES = 1024;
	assert(num_desired_pats && (num_desired_pats <= MAX_CANDIDATES));

	candidate_res candidates[MAX_CANDIDATES];

	for (uint32_t examine_iter = 0; examine_iter < num_pats_to_examine; examine_iter++)
	{
		const uint32_t unique_part_index = pUnique_pat_indices_to_examine[examine_iter];
		assert(unique_part_index < num_unique_pats);

		const partition_pattern_vec* pPat = &pUnique_pats[unique_part_index];

		vec3F part_means[MAX_PARTS];
		uint32_t part_total_texels[MAX_PARTS] = { 0 };

		for (uint32_t i = 0; i < num_parts; i++)
			part_means[i].clear();

		for (uint32_t y = 0; y < BLOCK_H; y++)
		{
			for (uint32_t x = 0; x < BLOCK_W; x++)
			{
				const uint32_t part_index = (*pPat)(x, y);
				assert(part_index < num_parts);

				part_means[part_index] += pHalf_pixels_as_floats[x + y * BLOCK_W];
				part_total_texels[part_index]++;

			} // x
		} // y

		for (uint32_t i = 0; i < num_parts; i++)
		{
			assert(part_total_texels[i]);
			part_means[i] /= (float)part_total_texels[i];
		}

		vec3F part_axis(0.5773502691f);
		
		// TODO: This total distance can be computed rapidly. First compute the total variance of each channel (sum the diag entries of the covar matrix),
		// then compute the principle eigenvalue, and subtract. The result is the variance of the projection distances.
		float total_sq_dist_to_line = 0.0f;
		for (uint32_t i = 0; i < BLOCK_T; i++)
		{
			const uint32_t part_index = (*pPat)[i];
			assert(part_index < num_parts);

			total_sq_dist_to_line += dist_to_line_squared(pHalf_pixels_as_floats[i], part_means[part_index], part_axis);
		}

		candidates[examine_iter].m_total_sq_dist = total_sq_dist_to_line;

		candidates[examine_iter].m_index = unique_part_index;

	} // examine_iter

	std::sort(&candidates[0], &candidates[num_pats_to_examine]);

	for (uint32_t i = 0; i < num_desired_pats; i++)
		pDesired_pat_indices[i] = candidates[i].m_index;
}

static float calc_deblocking_penalty_itp(
	uint32_t bx, uint32_t by, uint32_t width, uint32_t height,
	const imagef& pass_src_img_itp, const candidate_encoding& candidate)
{
	float total_deblock_penalty = 0.0f;

	float total_orig_mse = 0.0f, total_comp_mse = 0.0f;
	uint32_t total_c = 0;

	for (uint32_t b = 0; b < 4; b++)
	{
		for (uint32_t i = 0; i < 6; i++)
		{
			int ox = 0, oy = 0, qx = 0, qy = 0;

			switch (b)
			{
			case 0:
				ox = bx * 6 + i; oy = (by - 1) * 6 + 5;
				qx = bx * 6 + i; qy = by * 6;
				break;
			case 1:
				ox = bx * 6 + i; oy = (by + 1) * 6;
				qx = bx * 6 + i; qy = by * 6 + 5;
				break;
			case 2:
				ox = (bx - 1) * 6 + 5; oy = by * 6 + i;
				qx = bx * 6; qy = by * 6 + i;
				break;
			case 3:
				ox = (bx + 1) * 6; oy = by * 6 + i;
				qx = bx * 6 + 5; qy = by * 6 + i;
				break;
			}

			if ((ox < 0) || (oy < 0) || (ox >= (int)width) || (oy >= (int)height))
				continue;

			const vec3F& o_pixel_itp = pass_src_img_itp(ox, oy);
			const vec3F& q_pixel_itp = pass_src_img_itp(qx, qy);

			const vec3F &d_pixel_itp = candidate.m_comp_pixels_itp[qy - by * 6][qx - bx * 6]; // compressed block
			
			vec3F orig_delta_v(o_pixel_itp - q_pixel_itp);
			total_orig_mse += square(orig_delta_v[0]) + square(orig_delta_v[1]) + square(orig_delta_v[2]);

			vec3F d_delta_v(o_pixel_itp - d_pixel_itp);
			total_comp_mse += square(d_delta_v[0]) + square(d_delta_v[1]) + square(d_delta_v[2]);

			total_c++;
		}
	}

	if (total_c)
	{
		total_orig_mse /= (float)total_c;
		total_comp_mse /= (float)total_c;

		if (total_orig_mse)
		{
			total_deblock_penalty = fabsf((total_comp_mse - total_orig_mse) / total_orig_mse);
		}
	}

	return total_deblock_penalty;
}

static bool calc_strip_size(
	float lambda,
	uint32_t num_blocks_y, uint32_t total_threads, bool force_one_strip,
	uint32_t& res_total_strips, uint32_t& res_rows_per_strip, astc_hdr_6x6_global_config &global_cfg)
{
	uint32_t total_strips = 1;

	if (lambda == 0.0f)
	{
		if (!force_one_strip)
		{
			total_strips = total_threads;
		}
	}
	else
	{
		const uint32_t MIN_DESIRED_STRIPS = 8;
		const uint32_t MAX_TARGET_STRIPS = 32;
		const uint32_t TARGET_ASTC_6X6_ROWS_PER_STRIP = 12;

		if (!force_one_strip)
		{
			total_strips = maximum<uint32_t>(1, num_blocks_y / TARGET_ASTC_6X6_ROWS_PER_STRIP);

			if (num_blocks_y >= MIN_DESIRED_STRIPS * 2)
				total_strips = maximum(total_strips, MIN_DESIRED_STRIPS);
		}

		total_strips = minimum(total_strips, MAX_TARGET_STRIPS);
	}

	uint32_t rows_per_strip = 0;
	if (total_strips <= 1)
	{
		rows_per_strip = num_blocks_y;
	}
	else
	{
		rows_per_strip = (num_blocks_y / total_strips) & ~1;
		
		if (rows_per_strip < 2)
			rows_per_strip = 2;// num_blocks_y;
	}
		
	assert((rows_per_strip == num_blocks_y) || ((rows_per_strip & 1) == 0));

	total_strips = (num_blocks_y + rows_per_strip - 1) / rows_per_strip;
	
	if (global_cfg.m_debug_output)
	{
		fmt_printf("num_blocks_y: {}, total_threads : {}, Total strips : {}\n", num_blocks_y, total_threads, total_strips);
		fmt_printf("ASTC 6x6 block rows per strip: {}\n", rows_per_strip);
		fmt_printf("ASTC 6x6 block rows on final strip: {}\n", num_blocks_y - (total_strips - 1) * rows_per_strip);
	}

	uint32_t total_rows = 0;
	for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
	{
		uint32_t strip_first_by = strip_index * rows_per_strip;
		uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);

		if (strip_index == (total_strips - 1))
			strip_last_by = num_blocks_y - 1;

		uint32_t num_strip_block_rows = (strip_last_by - strip_first_by) + 1;
		total_rows += num_strip_block_rows;

		if (global_cfg.m_debug_output)
			fmt_printf("Strip row: {}, total block rows: {}\n", strip_index, num_strip_block_rows);
	}

	if (total_rows != num_blocks_y)
	{
		fmt_error_printf("Strip calc failed\n");
		return false;
	}

	res_total_strips = total_strips;
	res_rows_per_strip = rows_per_strip;

	return true;
}

static void convet_rgb_image_to_itp(const imagef &src_img, imagef &dst_img, const astc_hdr_6x6_global_config& cfg)
{
	const uint32_t width = src_img.get_width(), height = src_img.get_height();

	dst_img.resize(width, height);

	for (uint32_t y = 0; y < height; y++)
	{
		for (uint32_t x = 0; x < width; x++)
		{
			vec3F src_rgb(src_img(x, y));

			vec3F src_itp;
			linear_rgb_to_itp(src_rgb, src_itp, cfg);

			dst_img(x, y) = src_itp;
		}
	}
}

const uint32_t BLOCK_W = 6, BLOCK_H = 6;
const uint32_t NUM_BLOCK_PIXELS = BLOCK_W * BLOCK_H;

const float SOLID_PENALTY = 4.0f;
const float REUSE_PENALTY = 1.0f;
const float RUN_PENALTY = 10.0f;

const float MSE_WEIGHT = 300000.0f;
const float SSIM_WEIGHT = 200.0f;
const float TWO_LEVEL_PENALTY = 1.425f;
const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM = .04f;
const float SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM = .04f;
const float COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY = 1.5f;
const float COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY = 1.25f;
const float COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY = 1.15f;

struct uastc_hdr_6x6_debug_state
{
	uint32_t m_encoding_type_hist[(uint32_t)encoding_type::cTotal] = { 0 };
	uint32_t m_endpoint_mode_hist[(uint32_t)endpoint_mode::cTotal] = { 0 };
	uint32_t m_block_mode_hist[(uint32_t)block_mode::cBMTotalModes] = { 0 };
	uint64_t m_block_mode_total_bits[(uint32_t)block_mode::cBMTotalModes] = { 0 };

	basisu::vector< basisu::stats<float> > m_block_mode_comp_stats[(uint32_t)block_mode::cBMTotalModes][3];
	basisu::vector< basisu::comparative_stats<float> > m_block_mode_comparative_stats[(uint32_t)block_mode::cBMTotalModes][3];

	std::atomic<uint32_t> m_total_gaussian1_blocks;
	std::atomic<uint32_t> m_total_gaussian2_blocks;
	std::atomic<uint32_t> m_total_filter_horizontal;
	std::atomic<uint32_t> m_detail_stats[5];
	std::atomic<uint32_t> m_total_mode7_skips;

	std::atomic<uint32_t> m_total_blocks_compressed;

	std::atomic<uint32_t> m_total_candidates_considered;
	std::atomic<uint32_t> m_max_candidates_considered;

	std::atomic<uint32_t> m_total_part2_stats[4];
	std::atomic<uint32_t> m_dp_stats[5];

	std::atomic<uint32_t> m_reuse_num_parts[4];
	std::atomic<uint32_t> m_reuse_total_dp;

	imagef m_stat_vis;
	std::mutex m_stat_vis_mutex;

	image m_part_vis;
	image m_mode_vis;
	image m_mode_vis2;
	image m_grid_vis;
	image m_enc_vis;
	std::mutex m_vis_image_mutex;

	std::atomic<uint32_t> m_comp_level_hist[ASTC_HDR_6X6_MAX_COMP_LEVEL + 1];
		
	std::atomic<uint32_t> m_total_jnd_replacements;

	std::mutex m_stats_mutex;

	uastc_hdr_6x6_debug_state()
	{
		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
		{
			for (uint32_t j = 0; j < 3; j++)
			{
				m_block_mode_comp_stats[i][j].reserve(512);
				m_block_mode_comparative_stats[i][j].reserve(512);
			}
		}
	}
	
	void init(uint32_t width, uint32_t height)
	{
		m_stat_vis.resize(width, height);
		m_part_vis.resize(width, height);
		m_mode_vis.resize(width, height);
		m_mode_vis2.resize(width, height);
		m_grid_vis.resize(width, height);
		m_enc_vis.resize(width, height);

		basisu::clear_obj(m_encoding_type_hist);
		basisu::clear_obj(m_endpoint_mode_hist);
		basisu::clear_obj(m_block_mode_hist);
		basisu::clear_obj(m_block_mode_total_bits);
		
		for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
		{
			for (uint32_t j = 0; j < 3; j++)
			{
				m_block_mode_comp_stats[i][j].clear();
				m_block_mode_comparative_stats[i][j].clear();
			}
		}

		m_total_gaussian1_blocks.store(0);
		m_total_gaussian2_blocks.store(0);
		m_total_filter_horizontal.store(0);
		for (uint32_t i = 0; i < std::size(m_detail_stats); i++)
			m_detail_stats[i].store(0);
		m_total_mode7_skips.store(0);

		for (uint32_t i = 0; i < std::size(m_comp_level_hist); i++)
			m_comp_level_hist[i].store(0);

		m_total_blocks_compressed.store(0);

		m_total_candidates_considered.store(0);
		m_max_candidates_considered.store(0);

		for (uint32_t i = 0; i < std::size(m_total_part2_stats); i++)
			m_total_part2_stats[i].store(0);
		
		for (uint32_t i = 0; i < std::size(m_dp_stats); i++)
			m_dp_stats[i].store(0);

		for (uint32_t i = 0; i < std::size(m_reuse_num_parts); i++)
			m_reuse_num_parts[i] .store(0);

		m_reuse_total_dp.store(0);

		m_total_jnd_replacements.store(0);
	}

	void print(uint32_t total_blocks) const
	{
		fmt_printf("Total blocks: {}\n", total_blocks);
		fmt_printf("Total JND replacements: {} {3.2}%\n", m_total_jnd_replacements, (float)m_total_jnd_replacements * 100.0f / (float)total_blocks);
		fmt_printf("Comp level histogram: {} {} {} {} {}\n", m_comp_level_hist[0], m_comp_level_hist[1], m_comp_level_hist[2], m_comp_level_hist[3], m_comp_level_hist[4]);
		fmt_printf("Total gaussian 1 blocks: {} {3.2}%\n", m_total_gaussian1_blocks, (float)m_total_gaussian1_blocks * 100.0f / (float)total_blocks);
		fmt_printf("Total gaussian 2 blocks: {} {3.2}%\n", m_total_gaussian2_blocks, (float)m_total_gaussian2_blocks * 100.0f / (float)total_blocks);
		fmt_printf("Total filter horizontal: {} {3.2}%\n", m_total_filter_horizontal, (float)m_total_filter_horizontal * 100.0f / (float)total_blocks);
		fmt_printf("Detail stats: Detailed block low grid skip: {}, Blurry block skip: {}, Very blurry block skip: {}, NH:{} H:{}\n", m_detail_stats[0], m_detail_stats[1], m_detail_stats[2], m_detail_stats[3], m_detail_stats[4]);
		fmt_printf("Total mode7 skips: {}\n", m_total_mode7_skips);

		fmt_printf("Total candidates: {}, {} avg per block\n", m_total_candidates_considered, (float)m_total_candidates_considered / (float)total_blocks);
		fmt_printf("Max ever candidates: {}\n", m_max_candidates_considered);

		fmt_printf("Part2/3 stats: {} {} {} {}\n", m_total_part2_stats[0], m_total_part2_stats[1], m_total_part2_stats[2], m_total_part2_stats[3]);
		fmt_printf("Dual plane stats: {} {} {} {} {}\n", m_dp_stats[0], m_dp_stats[1], m_dp_stats[2], m_dp_stats[3], m_dp_stats[4]);
		fmt_printf("Reuse total dual plane: {}\n", m_reuse_total_dp);
		fmt_printf("Reuse part stats: {} {} {}\n", m_reuse_num_parts[1], m_reuse_num_parts[2], m_reuse_num_parts[3]);

		fmt_printf("\nEncoding type histogram:\n");
		for (uint32_t i = 0; i < std::size(m_encoding_type_hist); i++)
			fmt_printf("{}: {}\n", i, m_encoding_type_hist[i]);

		fmt_printf("\nEndpoint mode histogram:\n");
		for (uint32_t i = 0; i < std::size(m_endpoint_mode_hist); i++)
			fmt_printf("{}: {}\n", i, m_endpoint_mode_hist[i]);

		fmt_printf("\nBlock mode histogram:\n");

		uint32_t total_dp = 0, total_sp = 0;
		uint32_t total_mode11 = 0, total_mode7 = 0;
		uint32_t part_hist[3] = { 0 };
		uint32_t part2_mode7_total = 0, part2_mode11_total = 0;
		uint32_t total_used_modes = 0;
		for (uint32_t i = 0; i < std::size(m_block_mode_hist); i++)
		{
			const auto& bm_desc = g_block_mode_descs[i];

			const uint32_t total_uses = m_block_mode_hist[i];

			if (bm_desc.m_dp)
				total_dp += total_uses;
			else
				total_sp += total_uses;

			if (bm_desc.m_cem == 7)
				total_mode7 += total_uses;
			else
				total_mode11 += total_uses;

			part_hist[bm_desc.m_num_partitions - 1] += total_uses;

			if (bm_desc.m_num_partitions == 2)
			{
				if (bm_desc.m_cem == 7)
					part2_mode7_total += total_uses;
				else
				{
					assert(bm_desc.m_cem == 11);
					part2_mode11_total += total_uses;
				}
			}

			float avg_std_dev = 0.0f;
			float avg_cross_correlations[3] = { 0 };

			if (m_block_mode_comp_stats[i][0].size())
			{
				const uint32_t num_uses = m_block_mode_comp_stats[i][0].size_u32();

				for (uint32_t j = 0; j < num_uses; j++)
					avg_std_dev += (float)maximum(m_block_mode_comp_stats[i][0][j].m_std_dev, m_block_mode_comp_stats[i][1][j].m_std_dev, m_block_mode_comp_stats[i][2][j].m_std_dev);
				avg_std_dev /= (float)num_uses;

				for (uint32_t j = 0; j < num_uses; j++)
				{
					avg_cross_correlations[0] += fabsf((float)m_block_mode_comparative_stats[i][0][j].m_pearson);
					avg_cross_correlations[1] += fabsf((float)m_block_mode_comparative_stats[i][1][j].m_pearson);
					avg_cross_correlations[2] += fabsf((float)m_block_mode_comparative_stats[i][2][j].m_pearson);
				}

				avg_cross_correlations[0] /= (float)num_uses;
				avg_cross_correlations[1] /= (float)num_uses;
				avg_cross_correlations[2] /= (float)num_uses;
			}

			fmt_printf("{ 2}: uses: { 6}, cem: {}, dp: {} chan: {}, parts: {}, grid: {}x{}, endpoint levels: {}, weight levels: {}, Avg bits: {}, Avg Max Std Dev: {}, RG: {} RB: {} GB: {}\n", i, total_uses,
				bm_desc.m_cem,
				bm_desc.m_dp, bm_desc.m_dp_channel,
				bm_desc.m_num_partitions,
				bm_desc.m_grid_x, bm_desc.m_grid_y,
				astc_helpers::get_ise_levels(bm_desc.m_endpoint_ise_range),
				astc_helpers::get_ise_levels(bm_desc.m_weight_ise_range),
				total_uses ? ((double)m_block_mode_total_bits[i] / total_uses) : 0.0f,
				avg_std_dev, avg_cross_correlations[0], avg_cross_correlations[1], avg_cross_correlations[2]);

			if (total_uses)
				total_used_modes++;
		}

		fmt_printf("Total used modes: {}\n", total_used_modes);

		fmt_printf("Total single plane: {}, total dual plane: {}\n", total_sp, total_dp);
		fmt_printf("Total mode 11: {}, mode 7: {}\n", total_mode11, total_mode7);
		fmt_printf("Partition histogram: {} {} {}\n", part_hist[0], part_hist[1], part_hist[2]);
		fmt_printf("2 subset mode 7 uses: {}, mode 11 uses: {}\n", part2_mode7_total, part2_mode11_total);
	}
};

struct uastc_hdr_6x6_encode_state
{
	astc_hdr_codec_base_options master_coptions;
		
	imagef src_img;
		
	imagef src_img_filtered1;
	imagef src_img_filtered2;

	imagef src_img_itp;
	imagef src_img_filtered1_itp;
	imagef src_img_filtered2_itp;

	vector2D<float> smooth_block_mse_scales;

	imagef packed_img;

	basisu::vector<bitwise_coder> strip_bits;

	basisu::vector2D<astc_helpers::astc_block> final_astc_blocks;

	vector2D<candidate_encoding> coded_blocks;
};

static bool compress_strip_task(
	uint32_t strip_index, uint32_t total_strips, uint32_t strip_first_by, uint32_t strip_last_by,
	uint32_t num_blocks_x, uint32_t num_blocks_y, uint32_t total_blocks, uint32_t width, uint32_t height,
	astc_hdr_6x6_global_config &global_cfg, uastc_hdr_6x6_debug_state &debug_state, uastc_hdr_6x6_encode_state &enc_state)
{
	BASISU_NOTE_UNUSED(num_blocks_y);
	BASISU_NOTE_UNUSED(total_strips);
	
	vec3F prev_comp_pixels[BLOCK_H][BLOCK_W]; // [y][x]
	basisu::clear_obj(prev_comp_pixels);

	uint32_t prev_run_len = 0;

	bitwise_coder prev_encoding;
	candidate_encoding prev_candidate_encoding; // the previous candidate written, which may have been a run extension
	candidate_encoding prev_non_run_candidate_encoding; // the previous *non-run* candidate written

	bitwise_coder& strip_coded_bits = enc_state.strip_bits[strip_index];

	const uint32_t CANDIDATES_TO_RESERVE = 1536;

	basisu::vector<candidate_encoding> candidates;
	candidates.reserve(CANDIDATES_TO_RESERVE);

	const bool use_orig_behavior = global_cfg.m_write_basisu_1_6_compatible_files;

	for (uint32_t by = strip_first_by; by <= strip_last_by; by++)
	{
		const bool has_upper_neighbor = by > strip_first_by;

		for (uint32_t bx = 0; bx < num_blocks_x; bx++)
		{
			//if ((bx == 1) && (by == 2))
			//	basisu::fmt_printf("!");

			for (uint32_t outer_pass = 0; outer_pass < 3; outer_pass++)
			{
				const bool has_left_neighbor = bx > 0;
				//const bool has_prev = has_left_neighbor || has_upper_neighbor;

				// Select either the original source image, or the Gaussian filtered version.
				// From here the encoder *must* use these 2 sources.
				const imagef& pass_src_img = (outer_pass == 2) ? enc_state.src_img_filtered2 :
					((outer_pass == 1) ? enc_state.src_img_filtered1 : enc_state.src_img);

				const imagef& pass_src_img_itp = (outer_pass == 2) ? enc_state.src_img_filtered2_itp :
					((outer_pass == 1) ? enc_state.src_img_filtered1_itp : enc_state.src_img_itp);

				// Extract source image block
				vec4F block_pixels[BLOCK_H][BLOCK_W]; // [y][x]
				pass_src_img.extract_block_clamped(&block_pixels[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);

				vec4F block_pixels_itp[BLOCK_H][BLOCK_W]; // [y][x]
				pass_src_img_itp.extract_block_clamped(&block_pixels_itp[0][0], bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);

				half_vec3 half_pixels[BLOCK_H][BLOCK_W]; // [y][x] half-float values
				vec3F half_pixels_as_floats[BLOCK_H][BLOCK_W]; // [y][x] half float values, integer bits as floats
				vec4F block_pixels_q16[BLOCK_H][BLOCK_W]; // [y][x], q16 space for low-level ASTC encoding
				vec3F block_pixels_as_itp[BLOCK_H][BLOCK_W]; // [y][x] input converted to itp space, for faster error calculations

				bool is_grayscale = true;

				candidates.resize(0);

				float block_ly = BIG_FLOAT_VAL, block_hy = 0.0f, block_avg_y = 0.0f;

				for (uint32_t y = 0; y < BLOCK_H; y++)
				{
					for (uint32_t x = 0; x < BLOCK_W; x++)
					{
						vec3F rgb_input;

						for (uint32_t c = 0; c < 3; c++)
						{
							float v = block_pixels[y][x][c];

							rgb_input[c] = v;

							const basist::half_float h = basisu::fast_float_to_half_no_clamp_neg_nan_or_inf(v);
							assert(h == basist::float_to_half(v));

							half_pixels[y][x][c] = h;

							block_pixels_q16[y][x][c] = (float)half_to_qlog16(h);

							half_pixels_as_floats[y][x][c] = (float)h;

						} // c

						float py = rgb_input.dot(vec3F(REC_709_R, REC_709_G, REC_709_B));
						if (py < block_ly)
							block_ly = py;
						if (py > block_hy)
							block_hy = py;
						block_avg_y += py;

						//linear_rgb_to_itp(rgb_input, block_pixels_as_itp[y][x]);

						block_pixels_as_itp[y][x] = block_pixels_itp[y][x];

						block_pixels_q16[y][x][3] = 0.0f;

						if ((half_pixels[y][x][0] != half_pixels[y][x][1]) || (half_pixels[y][x][0] != half_pixels[y][x][2]))
							is_grayscale = false;

					} // x
				} // y

				block_avg_y *= (1.0f / (float)NUM_BLOCK_PIXELS);

				encode_astc_block_stats enc_block_stats;
				enc_block_stats.init(NUM_BLOCK_PIXELS, &block_pixels_q16[0][0]);

				vec4F x_filtered[6][6], y_filtered[6][6];

				filter_block(3, 6, (vec4F*)block_pixels, (vec4F*)x_filtered); // filter rows (horizontal)
				filter_block(6, 3, (vec4F*)block_pixels, (vec4F*)y_filtered); // filter cols (vertically)

				const float filtered_x_err = diff_blocks((vec4F*)block_pixels, (vec4F*)x_filtered);
				const float filtered_y_err = diff_blocks((vec4F*)block_pixels, (vec4F*)y_filtered);
				const bool filter_horizontally = filtered_x_err < filtered_y_err;

				//const float block_mag_gradient_mag = block_max_gradient_mag(bx, by);

				if (filter_horizontally)
					debug_state.m_total_filter_horizontal.fetch_add(1, std::memory_order_relaxed);

				vec3F lowpass_filtered[6][6];
				filter_block(3, 3, &half_pixels_as_floats[0][0], &lowpass_filtered[0][0]);
				float lowpass_std_dev = sub_and_compute_std_dev(&lowpass_filtered[0][0], &half_pixels_as_floats[0][0]);

				const bool very_detailed_block = lowpass_std_dev > 350.0f;
				const bool very_blurry_block = lowpass_std_dev < 30.0f;
				const bool super_blurry_block = lowpass_std_dev < 15.0f;

				basisu::stats<float> half_comp_stats[3];
				for (uint32_t c = 0; c < 3; c++)
					half_comp_stats[c].calc(NUM_BLOCK_PIXELS, &half_pixels_as_floats[0][0][c], 3);

				const float SINGLE_PART_HALF_THRESH = 256.0f;
				const float COMPLEX_HALF_THRESH = 1024.0f;
				// HACK HACK
				const float VERY_COMPLEX_HALF_THRESH = 1400.0f; // 1536.0f;

				const float max_std_dev = (float)maximum(half_comp_stats[0].m_std_dev, half_comp_stats[1].m_std_dev, half_comp_stats[2].m_std_dev);

				const bool very_simple_block = (max_std_dev < SINGLE_PART_HALF_THRESH);
				const bool complex_block = (max_std_dev > COMPLEX_HALF_THRESH);
				const bool very_complex_block = (max_std_dev > VERY_COMPLEX_HALF_THRESH);

				// Dynamically choose a comp_level for this block.
				astc_hdr_codec_base_options coptions(enc_state.master_coptions);
				uint32_t comp_level = global_cfg.m_master_comp_level;

				if (very_complex_block)
					comp_level = global_cfg.m_highest_comp_level;
				else if (complex_block)
					comp_level = (global_cfg.m_master_comp_level + global_cfg.m_highest_comp_level + 1) / 2;

				debug_state.m_comp_level_hist[comp_level].fetch_add(1, std::memory_order_relaxed);

				bool any_2subset_enabled = false, any_2subset_mode11_enabled = false, any_2subset_mode7_enabled = false, any_3subset_enabled = false;
				BASISU_NOTE_UNUSED(any_2subset_mode11_enabled);

				for (uint32_t i = 0; i < (uint32_t)block_mode::cBMTotalModes; i++)
				{
					if (comp_level == 0)
					{
						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
							continue;
					}
					else if (comp_level == 1)
					{
						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
							continue;
					}
					else if (comp_level == 2)
					{
						if ((g_block_mode_descs[i].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
							continue;
					}

					if (g_block_mode_descs[i].m_num_partitions == 2)
					{
						any_2subset_enabled = true;

						if (g_block_mode_descs[i].m_cem == 7)
						{
							any_2subset_mode7_enabled = true;
						}
						else
						{
							assert(g_block_mode_descs[i].m_cem == 11);
							any_2subset_mode11_enabled = true;
						}
					}
					else if (g_block_mode_descs[i].m_num_partitions == 3)
						any_3subset_enabled = true;
				}

				coptions.m_mode7_full_s_optimization = (comp_level >= 2);

				const bool uber_mode_flag = (comp_level >= 3);
				coptions.m_allow_uber_mode = uber_mode_flag;

				coptions.m_ultra_quant = (comp_level >= 4);

				coptions.m_take_first_non_clamping_mode11_submode = (comp_level <= 2);
				coptions.m_take_first_non_clamping_mode7_submode = (comp_level <= 2);

				coptions.m_disable_weight_plane_optimization = (comp_level >= 2);

				// -------------------

				uint32_t total_used_block_chans = 0;
				for (uint32_t i = 0; i < 3; i++)
					total_used_block_chans += (half_comp_stats[i].m_range > 0.0f);

				const bool is_solid_block = (total_used_block_chans == 0);

				basisu::comparative_stats<float> half_cross_chan_stats[3];

				// R vs. G
				half_cross_chan_stats[0].calc_pearson(NUM_BLOCK_PIXELS,
					&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][1],
					3, 3,
					&half_comp_stats[0], &half_comp_stats[1]);

				// R vs. B
				half_cross_chan_stats[1].calc_pearson(NUM_BLOCK_PIXELS,
					&half_pixels_as_floats[0][0][0], &half_pixels_as_floats[0][0][2],
					3, 3,
					&half_comp_stats[0], &half_comp_stats[2]);

				// G vs. B
				half_cross_chan_stats[2].calc_pearson(NUM_BLOCK_PIXELS,
					&half_pixels_as_floats[0][0][1], &half_pixels_as_floats[0][0][2],
					3, 3,
					&half_comp_stats[1], &half_comp_stats[2]);

				const float rg_corr = fabsf((float)half_cross_chan_stats[0].m_pearson);
				const float rb_corr = fabsf((float)half_cross_chan_stats[1].m_pearson);
				const float gb_corr = fabsf((float)half_cross_chan_stats[2].m_pearson);

				float min_corr = BIG_FLOAT_VAL, max_corr = -BIG_FLOAT_VAL;
				for (uint32_t i = 0; i < 3; i++)
				{
#if 0
					// 9/5/2025, wrong metric, we're iterating channels pairs here, not individual channels. 
					// On 3 active channel blocks this causes no difference.
					if (half_comp_stats[i].m_range > 0.0f) 
#else
					static const uint8_t s_chan_pairs[3][2] = { {0, 1}, {0, 2}, {1, 2} };
					
					const uint32_t chanA = s_chan_pairs[i][0];
					const uint32_t chanB = s_chan_pairs[i][1];
					
					if ((half_comp_stats[chanA].m_range > 0.0f) && (half_comp_stats[chanB].m_range > 0.0f))
#endif
					{
						const float c = fabsf((float)half_cross_chan_stats[i].m_pearson);
						min_corr = minimum(min_corr, c);
						max_corr = maximum(max_corr, c);
					}
				}

				bool use_single_subset_mode7 = true;
				if (comp_level <= 1)
				{
					// TODO: could also compute angle between principle axis and the grayscale axis.
					// TODO: Transform grayscale axis by covar matrix, compute variance vs. total variance
					const float MODE7_MIN_CHAN_CORR = .5f;
					const float MODE7_PCA_ANGLE_THRESH = .9f;
					use_single_subset_mode7 = is_grayscale || is_solid_block || ((total_used_block_chans == 1) || (min_corr >= MODE7_MIN_CHAN_CORR));

					if (use_single_subset_mode7)
					{
						float cos_ang = fabsf(enc_block_stats.m_axis_q16.dot(vec3F(0.5773502691f)));
						if (cos_ang < MODE7_PCA_ANGLE_THRESH)
							use_single_subset_mode7 = false;
					}
				}

				const float STRONG_CORR_THRESH = (comp_level <= 1) ? .5f : ((comp_level <= 3) ? .75f : .9f);

				int desired_dp_chan = -1;
				if (total_used_block_chans <= 1)
				{
					// no need for dual plane (except possibly 2x2 weight grids for RDO)
				}
				else
				{
					if (min_corr >= STRONG_CORR_THRESH)
					{
						// all channel pairs strongly correlated, no need for dual plane
						debug_state.m_dp_stats[0].fetch_add(1, std::memory_order_relaxed);
					}
					else
					{
						if (total_used_block_chans == 2)
						{
							if (half_comp_stats[0].m_range == 0.0f)
							{
								// r unused, check for strong gb correlation
								if (gb_corr < STRONG_CORR_THRESH)
									desired_dp_chan = 1;
							}
							else if (half_comp_stats[1].m_range == 0.0f)
							{
								// g unused, check for strong rb correlation
								if (rb_corr < STRONG_CORR_THRESH)
									desired_dp_chan = 0;
							}
							else
							{
								// b unused, check for strong rg correlation
								if (rg_corr < STRONG_CORR_THRESH)
									desired_dp_chan = 0;
							}
						}
						else
						{
							assert(total_used_block_chans == 3);

							// see if rg/rb is weakly correlated vs. gb
							if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
								desired_dp_chan = 0;
							// see if gr/gb is weakly correlated vs. rb
							else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
								desired_dp_chan = 1;
							// assume b is weakest
							else
								desired_dp_chan = 2;
						}

						if (desired_dp_chan == -1)
							debug_state.m_dp_stats[1].fetch_add(1, std::memory_order_relaxed);
						else
							debug_state.m_dp_stats[2 + desired_dp_chan].fetch_add(1, std::memory_order_relaxed);
					}
				}

				// 2x2 is special for RDO at higher lambdas - always pick a preferred channel.
				int desired_dp_chan_2x2 = 0;
				if (total_used_block_chans == 2)
				{
					if (half_comp_stats[0].m_range == 0.0f)
						desired_dp_chan_2x2 = 1;
				}
				else if (total_used_block_chans == 3)
				{
					// see if rg/rb is weakly correlated vs. gb
					if ((rg_corr < gb_corr) && (rb_corr < gb_corr))
						desired_dp_chan_2x2 = 0;
					// see if gr/gb is weakly correlated vs. rb
					else if ((rg_corr < rb_corr) && (gb_corr < rb_corr))
						desired_dp_chan_2x2 = 1;
					// assume b is weakest
					else
						desired_dp_chan_2x2 = 2;
				}

				// Gather all candidate encodings
				bool status = false;

				// ---- Run candidate
				if ((global_cfg.m_use_runs) && (has_left_neighbor || has_upper_neighbor))
				{
					candidate_encoding candidate;
					candidate.m_coder.reserve(24);

					candidate.m_encoding_type = encoding_type::cRun;

					candidate.m_decomp_log_blk = prev_non_run_candidate_encoding.m_decomp_log_blk;
					candidate.m_coded_log_blk = prev_non_run_candidate_encoding.m_coded_log_blk;

					memcpy(candidate.m_comp_pixels, prev_comp_pixels, sizeof(prev_comp_pixels));

					if (!prev_run_len)
					{
						candidate.m_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
						candidate.m_coder.put_vlc(0, 5);
					}
					else
					{
						// extend current run - compute the # of new bits needed for the extension.

						uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
						assert(prev_run_bits > 0);

						// We're not actually going to code this, because the previously emitted run code will be extended.
						bitwise_coder temp_coder;
						temp_coder.put_bits(RUN_CODE, RUN_CODE_LEN);
						temp_coder.put_vlc((prev_run_len + 1) - 1, 5);

						uint32_t cur_run_bits = temp_coder.get_total_bits_u32();
						assert(cur_run_bits >= prev_run_bits);

						uint32_t total_new_bits = cur_run_bits - prev_run_bits;
						if (total_new_bits > 0)
							candidate.m_coder.put_bits(0, total_new_bits); // dummy bits
					}

					candidate.m_run_len = prev_run_len + 1;

					candidates.emplace_back(std::move(candidate));
				}

				// ---- Reuse candidate
				if ((!is_solid_block) && (global_cfg.m_lambda > 0.0f))
				{
					for (uint32_t reuse_delta_index = 0; reuse_delta_index < global_cfg.m_num_reuse_xy_deltas; reuse_delta_index++)
					{
						const int reuse_delta_x = g_reuse_xy_deltas[reuse_delta_index].m_x;
						const int reuse_delta_y = g_reuse_xy_deltas[reuse_delta_index].m_y;

						const int reuse_bx = bx + reuse_delta_x, reuse_by = by + reuse_delta_y;
						if ((reuse_bx < 0) || (reuse_bx >= (int)num_blocks_x))
							continue;
						if (reuse_by < (int)strip_first_by)
							break;

						const candidate_encoding& prev_candidate = enc_state.coded_blocks(reuse_bx, reuse_by);

						// TODO - support this.
						if (prev_candidate.m_encoding_type == encoding_type::cSolid)
							continue;
						assert((prev_candidate.m_encoding_type == encoding_type::cBlock) || (prev_candidate.m_encoding_type == encoding_type::cReuse));

						candidate_encoding candidate;
						candidate.m_coder.reserve(24);
						astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;
						astc_helpers::log_astc_block& decomp_log_blk = candidate.m_decomp_log_blk;

						const astc_helpers::log_astc_block& prev_coded_log_blk = prev_candidate.m_coded_log_blk;

						const uint32_t grid_x = prev_coded_log_blk.m_grid_width, grid_y = prev_coded_log_blk.m_grid_height;
						const bool dual_plane = prev_candidate.m_coded_log_blk.m_dual_plane;
						const uint32_t num_grid_samples = grid_x * grid_y;
						const uint32_t num_endpoint_vals = get_num_endpoint_vals(prev_coded_log_blk.m_color_endpoint_modes[0]);

						coded_log_blk = prev_candidate.m_coded_log_blk;
						decomp_log_blk = prev_candidate.m_decomp_log_blk;

						if (prev_coded_log_blk.m_num_partitions == 1)
						{
							// Now encode the block using the transcoded endpoints
							basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];

							if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
							{
								status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
							}
							else
							{
								status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints, &decoded_half[0][0], nullptr,
									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
							}
							assert(status);

							uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];

							if (dual_plane)
							{
								eval_selectors_dual_plane(prev_candidate.m_coded_log_blk.m_color_component_selector,
									BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);

								downsample_ise_weights_dual_plane(
									coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
									BLOCK_W, BLOCK_H,
									grid_x, grid_y,
									trial_weights0, trial_weights1, coded_log_blk.m_weights);

								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * 2, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
							}
							else
							{
								eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, coded_log_blk.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);

								downsample_ise_weights(
									coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
									BLOCK_W, BLOCK_H,
									grid_x, grid_y,
									trial_weights0, coded_log_blk.m_weights);

								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);
							}

							// Create the block the decoder would transcode into.
							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior);
						}
						else if (prev_coded_log_blk.m_num_partitions == 2)
						{
							assert(!dual_plane);

							const int unique_pat_index = g_part2_seed_to_unique_index[coded_log_blk.m_partition_id];
							assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS2));

							const partition_pattern_vec& pat_vec = g_partitions2[unique_pat_index];

							vec4F part_pixels_q16[2][64];
							half_vec3 part_half_pixels[2][64];
							uint32_t part_total_pixels[2] = { 0 };

							for (uint32_t y = 0; y < BLOCK_H; y++)
							{
								for (uint32_t x = 0; x < BLOCK_W; x++)
								{
									const uint32_t part_index = pat_vec[x + y * 6];

									uint32_t l = part_total_pixels[part_index];

									part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
									part_half_pixels[part_index][l] = half_pixels[y][x];

									part_total_pixels[part_index] = l + 1;
								} // x 
							} // y

							uint8_t blk_weights[2][BLOCK_W * BLOCK_H];

							for (uint32_t part_index = 0; part_index < 2; part_index++)
							{
								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];

								if (prev_coded_log_blk.m_color_endpoint_modes[0] == 7)
								{
									status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
										astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
								}
								else
								{
									status = get_astc_hdr_mode_11_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
										astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
								}
								assert(status);

								eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
									(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);

							} // part_index

							uint8_t ise_weights[BLOCK_W * BLOCK_H];

							uint32_t src_pixel_index[2] = { 0, 0 };
							for (uint32_t y = 0; y < BLOCK_H; y++)
							{
								for (uint32_t x = 0; x < BLOCK_W; x++)
								{
									const uint32_t part_index = pat_vec[x + y * 6];

									ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
									src_pixel_index[part_index]++;
								} // x
							} // y

							downsample_ise_weights(
								coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
								BLOCK_W, BLOCK_H,
								grid_x, grid_y,
								ise_weights, coded_log_blk.m_weights);

							// Transcode these codable weights to ASTC weights.
							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
							basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);

							// Create the block the decoder would transcode into.
							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior);
						}
						else if (prev_coded_log_blk.m_num_partitions == 3)
						{
							assert(!dual_plane);

							const int unique_pat_index = g_part3_seed_to_unique_index[coded_log_blk.m_partition_id];
							assert((unique_pat_index >= 0) && (unique_pat_index < (int)NUM_UNIQUE_PARTITIONS3));

							const partition_pattern_vec& pat = g_partitions3[unique_pat_index];

							vec4F part_pixels_q16[3][64];
							half_vec3 part_half_pixels[3][64];
							uint32_t part_total_pixels[3] = { 0 };

							for (uint32_t y = 0; y < BLOCK_H; y++)
							{
								for (uint32_t x = 0; x < BLOCK_W; x++)
								{
									const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];

									uint32_t l = part_total_pixels[part_index];

									part_pixels_q16[part_index][l] = block_pixels_q16[y][x];
									part_half_pixels[part_index][l] = half_pixels[y][x];

									part_total_pixels[part_index] = l + 1;
								} // x 
							} // y

							uint8_t blk_weights[3][BLOCK_W * BLOCK_H];

							for (uint32_t part_index = 0; part_index < 3; part_index++)
							{
								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];

								status = get_astc_hdr_mode_7_block_colors(coded_log_blk.m_endpoints + num_endpoint_vals * part_index, &decoded_half[0][0], nullptr,
									astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), coded_log_blk.m_weight_ise_range, coded_log_blk.m_endpoint_ise_range);
								assert(status);

								eval_selectors(part_total_pixels[part_index], blk_weights[part_index], coded_log_blk.m_weight_ise_range,
									(basist::half_float*)&part_half_pixels[part_index][0][0], astc_helpers::get_ise_levels(coded_log_blk.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);

							} // part_index

							uint8_t ise_weights[BLOCK_W * BLOCK_H];

							uint32_t src_pixel_index[3] = { 0 };
							for (uint32_t y = 0; y < BLOCK_H; y++)
							{
								for (uint32_t x = 0; x < BLOCK_W; x++)
								{
									const uint32_t part_index = pat.m_parts[x + y * BLOCK_W];

									ise_weights[x + y * BLOCK_W] = blk_weights[part_index][src_pixel_index[part_index]];
									src_pixel_index[part_index]++;
								} // x
							} // y

							downsample_ise_weights(
								coded_log_blk.m_weight_ise_range, coded_log_blk.m_weight_ise_range,
								BLOCK_W, BLOCK_H,
								grid_x, grid_y,
								ise_weights, coded_log_blk.m_weights);

							// Transcode these codable weights to ASTC weights.
							uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H];
							basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples, coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range, transcode_weights, decomp_log_blk.m_weight_ise_range);

							// Create the block the decoder would transcode into.
							copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_log_blk, use_orig_behavior);
						}

						if (!validate_log_blk(decomp_log_blk))
						{
							fmt_error_printf("pack_astc_block() failed\n");
							return false;
						}

						status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_log_blk, &candidate.m_comp_pixels[0][0]);
						if (!status)
						{
							fmt_error_printf("decode_astc_block() failed\n");
							return false;
						}

						candidate.m_coder.put_bits(REUSE_CODE, REUSE_CODE_LEN);
						candidate.m_coder.put_bits(reuse_delta_index, REUSE_XY_DELTA_BITS);
						encode_values(candidate.m_coder, num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, coded_log_blk.m_weight_ise_range);

						candidate.m_encoding_type = encoding_type::cReuse;
						candidate.m_block_mode = prev_candidate.m_block_mode;
						candidate.m_endpoint_mode = prev_candidate.m_endpoint_mode;
						candidate.m_reuse_delta_index = reuse_delta_index;

						candidates.emplace_back(std::move(candidate));

					} // reuse_delta_index
				}

				// ---- Solid candidate
				if (global_cfg.m_use_solid_blocks)
				{
					candidate_encoding candidate;
					candidate.m_coder.reserve(24);

					// solid
					candidate.m_encoding_type = encoding_type::cSolid;

					float r = 0.0f, g = 0.0f, b = 0.0f;
					const float LOG_BIAS = .125f;
					bool solid_block = true;
					for (uint32_t y = 0; y < BLOCK_H; y++)
					{
						for (uint32_t x = 0; x < BLOCK_W; x++)
						{
							if ((block_pixels[0][0][0] != block_pixels[y][x][0]) ||
								(block_pixels[0][0][1] != block_pixels[y][x][1]) ||
								(block_pixels[0][0][2] != block_pixels[y][x][2]))
							{
								solid_block = false;
							}

							r += log2f(block_pixels[y][x][0] + LOG_BIAS);
							g += log2f(block_pixels[y][x][1] + LOG_BIAS);
							b += log2f(block_pixels[y][x][2] + LOG_BIAS);
						}
					}

					if (solid_block)
					{
						r = block_pixels[0][0][0];
						g = block_pixels[0][0][1];
						b = block_pixels[0][0][2];
					}
					else
					{
						r = maximum<float>(0.0f, powf(2.0f, r * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
						g = maximum<float>(0.0f, powf(2.0f, g * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);
						b = maximum<float>(0.0f, powf(2.0f, b * (1.0f / (float)NUM_BLOCK_PIXELS)) - LOG_BIAS);

						r = minimum<float>(r, basist::MAX_HALF_FLOAT);
						g = minimum<float>(g, basist::MAX_HALF_FLOAT);
						b = minimum<float>(b, basist::MAX_HALF_FLOAT);
					}

					basist::half_float rh = float_to_half_non_neg_no_nan_inf(r), gh = float_to_half_non_neg_no_nan_inf(g), bh = float_to_half_non_neg_no_nan_inf(b);

					candidate.m_solid_color[0] = rh;
					candidate.m_solid_color[1] = gh;
					candidate.m_solid_color[2] = bh;

					candidate.m_coder.put_bits(SOLID_CODE, SOLID_CODE_LEN);

					candidate.m_coder.put_bits(rh, 15);
					candidate.m_coder.put_bits(gh, 15);
					candidate.m_coder.put_bits(bh, 15);

					vec3F cp(basist::half_to_float(rh), basist::half_to_float(gh), basist::half_to_float(bh));

					for (uint32_t y = 0; y < BLOCK_H; y++)
						for (uint32_t x = 0; x < BLOCK_W; x++)
							candidate.m_comp_pixels[y][x] = cp;

					astc_helpers::log_astc_block& log_blk = candidate.m_coded_log_blk;

					log_blk.clear();
					log_blk.m_solid_color_flag_hdr = true;
					log_blk.m_solid_color[0] = rh;
					log_blk.m_solid_color[1] = gh;
					log_blk.m_solid_color[2] = bh;
					log_blk.m_solid_color[3] = basist::float_to_half(1.0f);

					candidate.m_decomp_log_blk = log_blk;

					candidates.emplace_back(std::move(candidate));
				}

				if ((!is_solid_block) || (!global_cfg.m_use_solid_blocks))
				{
					static uint8_t s_parts2_normal[5] = { 0, 2, 4, 6, 8 };
					static uint8_t s_parts3_normal[5] = { 0, 0, 4, 6, 8 };

					static uint8_t s_parts2_complex[5] = { 0, 4, 8, 10, 16 };
					static uint8_t s_parts3_complex[5] = { 0, 0, 8, 10, 16 };

					static uint8_t s_parts2_very_complex[5] = { 0, 8, 12, 14, 20 };
					static uint8_t s_parts3_very_complex[5] = { 0, 0, 12, 14, 20 };

					uint32_t total_parts2 = 0, total_parts3 = 0;

					assert(comp_level < 5);
					if ((very_simple_block) && (comp_level <= 3))
					{
						// Block's std dev is so low that 2-3 subsets are unlikely to help much
						total_parts2 = 0;
						total_parts3 = 0;

						debug_state.m_total_part2_stats[0].fetch_add(1, std::memory_order_relaxed);
					}
					else if (very_complex_block)
					{
						total_parts2 = s_parts2_very_complex[comp_level];
						total_parts3 = s_parts3_very_complex[comp_level];

						if (global_cfg.m_extra_patterns_flag)
						{
							total_parts2 += (comp_level == 4) ? 30 : 20;
							total_parts3 += (comp_level == 4) ? 30 : 20;
						}

						debug_state.m_total_part2_stats[2].fetch_add(1, std::memory_order_relaxed);
					}
					else if (complex_block)
					{
						total_parts2 = s_parts2_complex[comp_level];
						total_parts3 = s_parts3_complex[comp_level];

						if (global_cfg.m_extra_patterns_flag)
						{
							total_parts2 += (comp_level == 4) ? 15 : 10;
							total_parts3 += (comp_level == 4) ? 15 : 10;
						}

						debug_state.m_total_part2_stats[3].fetch_add(1, std::memory_order_relaxed);
					}
					else
					{
						// moderate complexity - use defaults
						total_parts2 = s_parts2_normal[comp_level];
						total_parts3 = s_parts3_normal[comp_level];

						if (global_cfg.m_extra_patterns_flag)
						{
							total_parts2 += 5;
							total_parts3 += 5;
						}

						debug_state.m_total_part2_stats[1].fetch_add(1, std::memory_order_relaxed);
					}

					if (!any_2subset_enabled)
						total_parts2 = 0;

					if (!any_3subset_enabled)
						total_parts3 = 0;

					int best_parts2_mode11[NUM_UNIQUE_PARTITIONS2], best_parts2_mode7[NUM_UNIQUE_PARTITIONS2];
					bool has_estimated_parts2 = false;

					if (total_parts2)
					{
						if (global_cfg.m_brute_force_partition_matching)
						{
							int candidate_pats2[NUM_UNIQUE_PARTITIONS2];
							for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS2; i++)
								candidate_pats2[i] = i;

							if (any_2subset_enabled)
							{
								estimate_partitions_mode7_and_11(
									2,
									NUM_UNIQUE_PARTITIONS2, g_partitions2,
									NUM_UNIQUE_PARTITIONS2, (uint32_t*)candidate_pats2,
									&half_pixels_as_floats[0][0],
									coptions,
									total_parts2, best_parts2_mode11, best_parts2_mode7);
							}

							has_estimated_parts2 = true;
						}
						else
						{
							if (comp_level >= 1)
							{
								const uint32_t MAX_CANDIDATES2 = 48;
								int candidate_pats2[MAX_CANDIDATES2 * 2];

								uint32_t num_candidate_pats2 = maximum((total_parts2 * 3) / 2, very_complex_block ? MAX_CANDIDATES2 : (MAX_CANDIDATES2 / 2));
								num_candidate_pats2 = minimum<uint32_t>(num_candidate_pats2, (uint32_t)std::size(candidate_pats2));

								has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, candidate_pats2, num_candidate_pats2);

								if (has_estimated_parts2)
								{
									estimate_partitions_mode7_and_11(
										2,
										NUM_UNIQUE_PARTITIONS2, g_partitions2,
										num_candidate_pats2, (uint32_t*)candidate_pats2,
										&half_pixels_as_floats[0][0],
										coptions,
										total_parts2, best_parts2_mode11, best_parts2_mode7);
								}
							}
							else
							{
								has_estimated_parts2 = estimate_partition2_6x6((basist::half_float(*)[3])half_pixels, best_parts2_mode11, total_parts2);

								if ((has_estimated_parts2) && (any_2subset_mode7_enabled))
									memcpy(best_parts2_mode7, best_parts2_mode11, total_parts2 * sizeof(best_parts2_mode7[0]));
							}
						}
					}

					int best_parts3[NUM_UNIQUE_PARTITIONS3];
					bool has_estimated_parts3 = false;

					if (total_parts3)
					{
#if 0
						has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, best_parts3, total_parts3);
#elif 1
						if (global_cfg.m_brute_force_partition_matching)
						{
							int candidate_pats3[NUM_UNIQUE_PARTITIONS3];
							for (uint32_t i = 0; i < NUM_UNIQUE_PARTITIONS3; i++)
								candidate_pats3[i] = i;

							estimate_partitions_mode7(
								3,
								NUM_UNIQUE_PARTITIONS3, g_partitions3,
								NUM_UNIQUE_PARTITIONS3, (uint32_t*)candidate_pats3,
								&half_pixels_as_floats[0][0],
								coptions,
								total_parts3, (uint32_t*)best_parts3);

							has_estimated_parts3 = true;
						}
						else
						{
							const uint32_t MAX_CANDIDATES3 = 48;
							int candidate_pats3[MAX_CANDIDATES3 * 2];

							uint32_t num_candidate_pats3 = maximum((total_parts3 * 3) / 2, very_complex_block ? MAX_CANDIDATES3 : (MAX_CANDIDATES3 / 2));
							num_candidate_pats3 = minimum<uint32_t>(num_candidate_pats3, (uint32_t)std::size(candidate_pats3));

							has_estimated_parts3 = estimate_partition3_6x6((basist::half_float(*)[3])half_pixels, candidate_pats3, num_candidate_pats3);

							if (has_estimated_parts3)
							{
								estimate_partitions_mode7(
									3,
									NUM_UNIQUE_PARTITIONS3, g_partitions3,
									num_candidate_pats3, (uint32_t*)candidate_pats3,
									&half_pixels_as_floats[0][0],
									coptions,
									total_parts3, (uint32_t*)best_parts3);
							}
						}
#endif
					}

					const opt_mode_t mode11_opt_mode = complex_block ? cWeightedLeastSquares : cOrdinaryLeastSquares;

					// ---- Encoded block candidate
					for (uint32_t block_mode_iter = 0; block_mode_iter < (uint32_t)block_mode::cBMTotalModes; block_mode_iter++)
					{
						const block_mode bm = (block_mode)block_mode_iter;

						if (comp_level == 0)
						{
							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL0) == 0)
								continue;
						}
						else if (comp_level == 1)
						{
							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL1) == 0)
								continue;
						}
						else if (comp_level == 2)
						{
							if ((g_block_mode_descs[block_mode_iter].m_flags & BASIST_HDR_6X6_LEVEL2) == 0)
								continue;
						}

						if (global_cfg.m_block_stat_optimizations_flag)
						{
							if ((comp_level <= 3) && (g_block_mode_descs[block_mode_iter].m_dp))
							{
								if ((global_cfg.m_lambda > 0.0f) && (!complex_block) && (g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
								{
									if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan_2x2)
										continue;
								}
								else
								{
									if (g_block_mode_descs[block_mode_iter].m_dp_channel != desired_dp_chan)
										continue;
								}
							}

							if (comp_level <= 3)
							{
								const uint32_t grid_x = g_block_mode_descs[block_mode_iter].m_grid_x;
								const uint32_t grid_y = g_block_mode_descs[block_mode_iter].m_grid_y;

								if (!g_block_mode_descs[block_mode_iter].m_dp)
								{
									// Minor gain (.5-1% less canidates)
									if (very_detailed_block)
									{
										if (grid_x * grid_y <= 12)
										{
											debug_state.m_detail_stats[0].fetch_add(1, std::memory_order_relaxed);
											continue;
										}
									}

									// Major gains (10-25% less candidates)
									if (very_blurry_block)
									{
										if ((grid_x > 4) || (grid_y > 4) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
										{
											debug_state.m_detail_stats[1].fetch_add(1, std::memory_order_relaxed);
											continue;
										}
									}
									if (super_blurry_block)
									{
										if ((grid_x > 3) || (grid_y > 3) || (g_block_mode_descs[block_mode_iter].m_num_partitions > 1))
										{
											debug_state.m_detail_stats[2].fetch_add(1, std::memory_order_relaxed);
											continue;
										}
									}
								}

								if (grid_x != grid_y)
								{
									if (grid_x < grid_y)
									{
										if (!filter_horizontally)
										{
											debug_state.m_detail_stats[3].fetch_add(1, std::memory_order_relaxed);
											continue;
										}
									}
									else
									{
										if (filter_horizontally)
										{
											debug_state.m_detail_stats[4].fetch_add(1, std::memory_order_relaxed);
											continue;
										}
									}
								}
							}

							if (global_cfg.m_lambda == 0.0f)
							{
								// Rarely useful if lambda=0
								if ((g_block_mode_descs[block_mode_iter].m_grid_x == 2) && (g_block_mode_descs[block_mode_iter].m_grid_y == 2))
									continue;
							}
						} // block_stat_optimizations_flag

						if ((!use_single_subset_mode7) &&
							(g_block_mode_descs[block_mode_iter].m_cem == 7) &&
							(g_block_mode_descs[block_mode_iter].m_num_partitions == 1))
						{
							debug_state.m_total_mode7_skips.fetch_add(1, std::memory_order_relaxed);
							continue;
						}

						for (uint32_t endpoint_mode_iter = 0; endpoint_mode_iter < (uint32_t)endpoint_mode::cTotal; endpoint_mode_iter++)
						{
							if (global_cfg.m_lambda == 0.0f)
							{
								// No use trying anything else
								if (endpoint_mode_iter != (uint32_t)endpoint_mode::cRaw)
									continue;
							}

							if (global_cfg.m_disable_delta_endpoint_usage)
							{
								if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeftDelta))
									continue;
							}

							if (!global_cfg.m_favor_higher_compression)
							{
								if (comp_level == 0)
								{
									if (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpperDelta)
										continue;
								}

								if (comp_level <= 1)
								{
									if ((endpoint_mode_iter == (uint32_t)endpoint_mode::cUseLeft) || (endpoint_mode_iter == (uint32_t)endpoint_mode::cUseUpper))
										continue;
								}
							}

							const endpoint_mode em = (endpoint_mode)endpoint_mode_iter;

							switch (em)
							{
							case endpoint_mode::cUseLeft:
							case endpoint_mode::cUseUpper:
							{
								const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
								const uint32_t cem = local_md.m_cem;

								if (local_md.m_num_partitions > 1)
									break;

								if ((em == endpoint_mode::cUseLeft) && (!has_left_neighbor))
									break;
								else if ((em == endpoint_mode::cUseUpper) && (!has_upper_neighbor))
									break;

								candidate_encoding candidate;
								candidate.m_coder.reserve(24);
								astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;

								int nx = bx, ny = by;
								if (em == endpoint_mode::cUseLeft)
									nx--;
								else
									ny--;

								const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
								if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
									break;
								assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));

								const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];

								if (neighbor_md.m_cem != cem)
									break;

								assert(neighbor_blk.m_coded_log_blk.m_color_endpoint_modes[0] == cem);

								const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
								const bool dual_plane = local_md.m_dp;
								const uint32_t num_grid_samples = grid_x * grid_y;
								const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);

								coded_log_blk.m_grid_width = (uint8_t)grid_x;
								coded_log_blk.m_grid_height = (uint8_t)grid_y;
								coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
								coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
								coded_log_blk.m_num_partitions = 1;
								coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)neighbor_md.m_cem;
								coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;

								// We're not explictly writing any endpoints, just reusing existing ones. So copy the neighbor's endpoints unchanged (so no loss).
								coded_log_blk.m_endpoint_ise_range = neighbor_blk.m_coded_log_blk.m_endpoint_ise_range;
								memcpy(coded_log_blk.m_endpoints, neighbor_blk.m_coded_log_blk.m_endpoints, num_endpoint_vals);

								uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];

								// Requantize the neighbor's endpoints to whatever we'll have to transcode into to make a valid ASTC encoding.
								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
									neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
									local_md.m_transcode_endpoint_ise_range, transcode_endpoints);

								// Now encode the block using the transcoded endpoints
								basist::half_float decoded_half[MAX_SUPPORTED_WEIGHT_LEVELS][3];

								if (cem == 7)
								{
									status = get_astc_hdr_mode_7_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
										astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
								}
								else
								{
									status = get_astc_hdr_mode_11_block_colors(transcode_endpoints, &decoded_half[0][0], nullptr,
										astc_helpers::get_ise_levels(local_md.m_weight_ise_range), local_md.m_weight_ise_range, local_md.m_transcode_endpoint_ise_range);
								}
								if (!status)
									break;

								uint8_t trial_weights0[BLOCK_W * BLOCK_H], trial_weights1[BLOCK_W * BLOCK_H];
								if (dual_plane)
								{
									eval_selectors_dual_plane(local_md.m_dp_channel, BLOCK_W * BLOCK_H, trial_weights0, trial_weights1, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);

									downsample_ise_weights_dual_plane(
										local_md.m_weight_ise_range, local_md.m_weight_ise_range,
										BLOCK_W, BLOCK_H,
										grid_x, grid_y,
										trial_weights0, trial_weights1, coded_log_blk.m_weights);
								}
								else
								{
									eval_selectors(BLOCK_W * BLOCK_H, trial_weights0, local_md.m_weight_ise_range, (basist::half_float*)&half_pixels[0][0][0], astc_helpers::get_ise_levels(local_md.m_weight_ise_range), &decoded_half[0][0], coptions, UINT32_MAX);

									downsample_ise_weights(
										local_md.m_weight_ise_range, local_md.m_weight_ise_range,
										BLOCK_W, BLOCK_H,
										grid_x, grid_y,
										trial_weights0, coded_log_blk.m_weights);
								}

								// Transcode these codable weights to ASTC weights.
								uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];
								basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);

								// Create the block the decoder would transcode into.
								astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
								decomp_blk.clear();

								decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
								decomp_blk.m_dual_plane = local_md.m_dp;
								decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
								decomp_blk.m_num_partitions = 1;
								decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
								decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;

								memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);

								copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior);

								if (!validate_log_blk(decomp_blk))
								{
									fmt_error_printf("pack_astc_block() failed\n");
									return false;
								}

								status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
								if (!status)
								{
									fmt_error_printf("decode_astc_block() failed\n");
									return false;
								}

								candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
								code_block(candidate.m_coder, candidate.m_coded_log_blk, (block_mode)block_mode_iter, em, nullptr);

								candidate.m_encoding_type = encoding_type::cBlock;
								candidate.m_endpoint_mode = em;
								candidate.m_block_mode = bm;

								candidates.emplace_back(std::move(candidate));

								break;
							}
							case endpoint_mode::cUseLeftDelta:
							case endpoint_mode::cUseUpperDelta:
							{
								const block_mode_desc& local_md = g_block_mode_descs[block_mode_iter];
								const uint32_t cem = local_md.m_cem;

								if (local_md.m_num_partitions > 1)
									break;

								if ((em == endpoint_mode::cUseLeftDelta) && (!has_left_neighbor))
									break;
								else if ((em == endpoint_mode::cUseUpperDelta) && (!has_upper_neighbor))
									break;

								candidate_encoding candidate;
								candidate.m_coder.reserve(24);
								astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;

								int nx = bx, ny = by;
								if (em == endpoint_mode::cUseLeftDelta)
									nx--;
								else
									ny--;

								const candidate_encoding& neighbor_blk = enc_state.coded_blocks(nx, ny);
								if (neighbor_blk.m_encoding_type == encoding_type::cSolid)
									break;
								assert((neighbor_blk.m_encoding_type == encoding_type::cBlock) || (neighbor_blk.m_encoding_type == encoding_type::cReuse));

								const block_mode_desc& neighbor_md = g_block_mode_descs[(uint32_t)neighbor_blk.m_block_mode];

								if (neighbor_md.m_cem != cem)
									break;

								assert(neighbor_md.m_cem == local_md.m_cem);

								const uint32_t grid_x = local_md.m_grid_x, grid_y = local_md.m_grid_y;
								const bool dual_plane = local_md.m_dp;
								const uint32_t num_grid_samples = grid_x * grid_y;
								const uint32_t num_endpoint_vals = get_num_endpoint_vals(local_md.m_cem);

								// Dequantize neighbor's endpoints to ISE 20
								uint8_t neighbor_endpoints_ise20[basist::NUM_MODE11_ENDPOINTS];
								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem,
									neighbor_blk.m_coded_log_blk.m_endpoint_ise_range, neighbor_blk.m_coded_log_blk.m_endpoints,
									astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20);

								// Requantize neighbor's endpoints to our local desired coding ISE range
								uint8_t neighbor_endpoints_coding_ise_local[basist::NUM_MODE11_ENDPOINTS];
								basist::astc_6x6_hdr::requantize_ise_endpoints(neighbor_md.m_cem, astc_helpers::BISE_256_LEVELS, neighbor_endpoints_ise20, local_md.m_endpoint_ise_range, neighbor_endpoints_coding_ise_local);

								uint8_t blk_endpoints[basist::NUM_MODE11_ENDPOINTS];
								uint8_t blk_weights0[NUM_BLOCK_PIXELS], blk_weights1[NUM_BLOCK_PIXELS];

								// Now try to encode the current block using the neighbor's endpoints submode.
								double err = 0.0f;
								uint32_t best_submode = 0;

								if (cem == 7)
								{
									int maj_index, submode_index;
									decode_cem_7_config(neighbor_endpoints_ise20, submode_index, maj_index);

									int first_submode = submode_index, last_submode = submode_index;

									err = encode_astc_hdr_block_mode_7(
										NUM_BLOCK_PIXELS,
										(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
										local_md.m_weight_ise_range,
										best_submode,
										BIG_FLOAT_VAL,
										blk_endpoints, blk_weights0,
										coptions,
										local_md.m_endpoint_ise_range,
										first_submode, last_submode,
										&enc_block_stats);
								}
								else
								{
									int maj_index, submode_index;
									decode_cem_11_config(neighbor_endpoints_ise20, submode_index, maj_index);

									int first_submode = -1, last_submode = -1;
									if (maj_index == 3)
									{
										// direct
									}
									else
									{
										first_submode = submode_index;
										last_submode = submode_index;
									}

									if (dual_plane)
									{
										err = encode_astc_hdr_block_mode_11_dual_plane(
											NUM_BLOCK_PIXELS,
											(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
											local_md.m_dp_channel,
											local_md.m_weight_ise_range,
											best_submode,
											BIG_FLOAT_VAL,
											blk_endpoints, blk_weights0, blk_weights1,
											coptions,
											false,
											local_md.m_endpoint_ise_range,
											false, //uber_mode_flag,
											false,
											first_submode, last_submode, true);
									}
									else
									{
										err = encode_astc_hdr_block_mode_11(
											NUM_BLOCK_PIXELS,
											(basist::half_float(*)[3])half_pixels, (vec4F*)block_pixels_q16,
											local_md.m_weight_ise_range,
											best_submode,
											BIG_FLOAT_VAL,
											blk_endpoints, blk_weights0,
											coptions,
											false,
											local_md.m_endpoint_ise_range,
											false, //uber_mode_flag,
											false,
											first_submode, last_submode, true,
											mode11_opt_mode,
											&enc_block_stats);
									}
								}

								if (err == BIG_FLOAT_VAL)
									break;

								uint8_t endpoint_deltas[basist::NUM_MODE11_ENDPOINTS];

								// TODO: For now, just try 5 bits for each endpoint. Can tune later.
								// This isn't right, it's computing the deltas in ISE space.
								//const uint32_t NUM_ENDPOINT_DELTA_BITS = 5;
								const int total_endpoint_delta_vals = 1 << NUM_ENDPOINT_DELTA_BITS;
								const int low_delta_limit = -(total_endpoint_delta_vals / 2), high_delta_limit = (total_endpoint_delta_vals / 2) - 1;

								const auto& ise_to_rank = astc_helpers::g_dequant_tables.get_endpoint_tab(local_md.m_endpoint_ise_range).m_ISE_to_rank;

								bool all_deltas_in_limits = true;
								for (uint32_t i = 0; i < num_endpoint_vals; i++)
								{
									int endpoint_delta = (int)ise_to_rank[blk_endpoints[i]] - (int)ise_to_rank[neighbor_endpoints_coding_ise_local[i]];

									if ((endpoint_delta < low_delta_limit) || (endpoint_delta > high_delta_limit))
										all_deltas_in_limits = false;

									endpoint_deltas[i] = (uint8_t)(endpoint_delta + -low_delta_limit);
								}

								if (all_deltas_in_limits)
								{
									coded_log_blk.m_grid_width = (uint8_t)grid_x;
									coded_log_blk.m_grid_height = (uint8_t)grid_y;
									coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
									coded_log_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
									coded_log_blk.m_num_partitions = 1;
									coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
									coded_log_blk.m_weight_ise_range = (uint8_t)local_md.m_weight_ise_range;
									coded_log_blk.m_endpoint_ise_range = (uint8_t)local_md.m_endpoint_ise_range;

									memcpy(coded_log_blk.m_endpoints, blk_endpoints, num_endpoint_vals);

									uint8_t transcode_endpoints[basist::NUM_MODE11_ENDPOINTS];
									uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];

									basist::astc_6x6_hdr::requantize_ise_endpoints(local_md.m_cem, local_md.m_endpoint_ise_range, blk_endpoints, local_md.m_transcode_endpoint_ise_range, transcode_endpoints);

									if (dual_plane)
									{
										downsample_ise_weights_dual_plane(
											local_md.m_weight_ise_range, local_md.m_weight_ise_range,
											BLOCK_W, BLOCK_H,
											grid_x, grid_y,
											blk_weights0, blk_weights1,
											coded_log_blk.m_weights);
									}
									else
									{
										downsample_ise_weights(
											local_md.m_weight_ise_range, local_md.m_weight_ise_range,
											BLOCK_W, BLOCK_H,
											grid_x, grid_y,
											blk_weights0, coded_log_blk.m_weights);
									}

									basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples * (dual_plane ? 2 : 1), coded_log_blk.m_weights, local_md.m_weight_ise_range, transcode_weights, local_md.m_transcode_weight_ise_range);

									// Create the block the decoder would transcode into.

									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
									decomp_blk.clear();

									decomp_blk.m_color_endpoint_modes[0] = (uint8_t)local_md.m_cem;
									decomp_blk.m_dual_plane = local_md.m_dp;
									decomp_blk.m_color_component_selector = (uint8_t)local_md.m_dp_channel;
									decomp_blk.m_num_partitions = 1;
									decomp_blk.m_endpoint_ise_range = (uint8_t)local_md.m_transcode_endpoint_ise_range;
									decomp_blk.m_weight_ise_range = (uint8_t)local_md.m_transcode_weight_ise_range;

									memcpy(decomp_blk.m_endpoints, transcode_endpoints, num_endpoint_vals);

									copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior);

									if (!validate_log_blk(decomp_blk))
									{
										fmt_error_printf("pack_astc_block() failed\n");
										return false;
									}

									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
									if (!status)
									{
										fmt_error_printf("decode_astc_block() failed\n");
										return false;
									}

									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, endpoint_deltas);

									candidate.m_encoding_type = encoding_type::cBlock;
									candidate.m_endpoint_mode = em;
									candidate.m_block_mode = bm;

									candidates.emplace_back(std::move(candidate));
								}

								break;
							}
							case endpoint_mode::cRaw:
							{
								//if (candidates.size() == 339)
								//	fmt_printf("!");

								const auto& mode_desc = g_block_mode_descs[(uint32_t)bm];
								const uint32_t cem = mode_desc.m_cem;
								//const uint32_t num_endpoint_vals = get_num_endpoint_vals(cem);
								const bool dual_plane = mode_desc.m_dp;

								if ((global_cfg.m_disable_twothree_subsets) && (mode_desc.m_num_partitions >= 2))
									break;

								if (mode_desc.m_num_partitions == 3)
								{
									assert(!dual_plane);

									if (!has_estimated_parts3)
										break;

									assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
									assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);

									trial_result res;

									status = encode_block_3_subsets(
										res,
										cem,
										mode_desc.m_grid_x, mode_desc.m_grid_y,
										mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
										&half_pixels[0][0], (vec4F*)block_pixels_q16,
										coptions,
										uber_mode_flag,
										best_parts3, total_parts3, comp_level, mode11_opt_mode);

									if (!status)
										break;

									assert(res.m_valid);

									candidate_encoding candidate;
									candidate.m_coder.reserve(24);
									astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;

									coded_log_blk = res.m_log_blk;

									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
									decomp_blk = res.m_log_blk;

									if (!validate_log_blk(decomp_blk))
									{
										fmt_error_printf("pack_astc_block() failed\n");
										return false;
									}

									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
									if (!status)
									{
										fmt_error_printf("decode_astc_block() failed\n");
										return false;
									}

									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);

									candidate.m_encoding_type = encoding_type::cBlock;
									candidate.m_endpoint_mode = em;
									candidate.m_block_mode = bm;

									candidates.emplace_back(std::move(candidate));
								}
								else if (mode_desc.m_num_partitions == 2)
								{
									assert(!dual_plane);

									if (!has_estimated_parts2)
										break;

									assert(mode_desc.m_weight_ise_range == mode_desc.m_transcode_weight_ise_range);
									assert(mode_desc.m_endpoint_ise_range == mode_desc.m_transcode_endpoint_ise_range);

									for (uint32_t est_part_iter = 0; est_part_iter < total_parts2; est_part_iter++)
									{
										trial_result results[2];

										assert(((cem == 11) && any_2subset_mode11_enabled) || ((cem == 7) && any_2subset_mode7_enabled));

										status = encode_block_2_subsets(
											results,
											mode_desc.m_grid_x, mode_desc.m_grid_y,
											mode_desc.m_cem,
											mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
											&half_pixels[0][0], (vec4F*)block_pixels_q16,
											coptions,
											uber_mode_flag,
											(cem == 11) ? best_parts2_mode11[est_part_iter] : best_parts2_mode7[est_part_iter],
											comp_level,
											mode11_opt_mode,
											true);

										if (!status)
											continue;

										for (uint32_t r_iter = 0; r_iter < 2; r_iter++)
										{
											const trial_result& res = results[r_iter];

											if (!res.m_valid)
												continue;

											candidate_encoding candidate;
											candidate.m_coder.reserve(24);
											astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;

											coded_log_blk = res.m_log_blk;

											astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
											decomp_blk = res.m_log_blk;

											if (!validate_log_blk(decomp_blk))
											{
												fmt_error_printf("pack_astc_block() failed\n");
												return false;
											}

											status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
											if (!status)
											{
												fmt_error_printf("decode_astc_block() failed\n");
												return false;
											}

											candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
											code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);

											candidate.m_encoding_type = encoding_type::cBlock;
											candidate.m_endpoint_mode = em;
											candidate.m_block_mode = bm;

											candidates.emplace_back(std::move(candidate));

										} // r_iter
									}
								}
								else
								{
									// 1 subset
									uint8_t blk_weights0[BLOCK_W * BLOCK_H], blk_weights1[BLOCK_W * BLOCK_H];
									uint32_t best_submode = 0;

									candidate_encoding candidate;
									candidate.m_coder.reserve(24);
									astc_helpers::log_astc_block& coded_log_blk = candidate.m_coded_log_blk;

									const uint32_t grid_x = mode_desc.m_grid_x, grid_y = mode_desc.m_grid_y;
									const uint32_t num_grid_samples = grid_x * grid_y;

									const half_vec3* pBlock_pixels_half = &half_pixels[0][0];
									const vec4F* pBlock_pixels_q16 = &block_pixels_q16[0][0];

									const uint32_t num_grid_samples_dp = num_grid_samples * (dual_plane ? 2 : 1);

									uint8_t transcode_weights[MAX_BLOCK_W * MAX_BLOCK_H * 2];

									coded_log_blk.m_grid_width = (uint8_t)grid_x;
									coded_log_blk.m_grid_height = (uint8_t)grid_y;
									coded_log_blk.m_dual_plane = (uint8_t)dual_plane;
									coded_log_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
									coded_log_blk.m_num_partitions = 1;
									coded_log_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
									coded_log_blk.m_weight_ise_range = (uint8_t)mode_desc.m_weight_ise_range;
									coded_log_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_endpoint_ise_range;

									if ((cem == 11) && (!dual_plane) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
									{
										double e = encode_astc_hdr_block_downsampled_mode_11(
											BLOCK_W, BLOCK_H, grid_x, grid_y,
											mode_desc.m_weight_ise_range, mode_desc.m_endpoint_ise_range,
											NUM_BLOCK_PIXELS, (basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
											BIG_FLOAT_VAL,
											FIRST_MODE11_SUBMODE_INDEX, MAX_MODE11_SUBMODE_INDEX, false, mode11_opt_mode,
											coded_log_blk.m_endpoints, coded_log_blk.m_weights, best_submode,
											coptions,
											&enc_block_stats);

										if (e == BIG_FLOAT_VAL)
											break;
									}
									else
									{
										if (cem == 7)
										{
											assert(!dual_plane);

											double e = encode_astc_hdr_block_mode_7(
												NUM_BLOCK_PIXELS,
												(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
												mode_desc.m_weight_ise_range,
												best_submode,
												BIG_FLOAT_VAL,
												coded_log_blk.m_endpoints,
												blk_weights0,
												coptions,
												mode_desc.m_endpoint_ise_range,
												0, MAX_MODE7_SUBMODE_INDEX,
												&enc_block_stats);
											BASISU_NOTE_UNUSED(e);
										}
										else
										{
											double e;

											if (dual_plane)
											{
												e = encode_astc_hdr_block_mode_11_dual_plane(
													NUM_BLOCK_PIXELS,
													(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
													mode_desc.m_dp_channel,
													mode_desc.m_weight_ise_range,
													best_submode,
													BIG_FLOAT_VAL,
													coded_log_blk.m_endpoints,
													blk_weights0, blk_weights1,
													coptions,
													false,
													mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false);
											}
											else
											{
												e = encode_astc_hdr_block_mode_11(
													NUM_BLOCK_PIXELS,
													(basist::half_float(*)[3])pBlock_pixels_half, pBlock_pixels_q16,
													mode_desc.m_weight_ise_range,
													best_submode,
													BIG_FLOAT_VAL,
													coded_log_blk.m_endpoints,
													blk_weights0,
													coptions,
													false,
													mode_desc.m_endpoint_ise_range, uber_mode_flag, false, -1, 7, false,
													mode11_opt_mode,
													&enc_block_stats);
											}

											if (e == BIG_FLOAT_VAL)
												break;
										}

										if (dual_plane)
										{
											downsample_ise_weights_dual_plane(
												mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
												BLOCK_W, BLOCK_H,
												grid_x, grid_y,
												blk_weights0, blk_weights1,
												coded_log_blk.m_weights);
										}
										else
										{
											downsample_ise_weights(
												mode_desc.m_weight_ise_range, mode_desc.m_weight_ise_range,
												BLOCK_W, BLOCK_H,
												grid_x, grid_y,
												blk_weights0, coded_log_blk.m_weights);

											if ((comp_level >= MIN_REFINE_LEVEL) && ((grid_x < BLOCK_W) || (grid_y < BLOCK_H)))
											{
												bool refine_status = refine_endpoints(cem,
													mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints,
													6, 6, mode_desc.m_grid_x, mode_desc.m_grid_y,
													coded_log_blk.m_weights, mode_desc.m_weight_ise_range,
													BLOCK_W * BLOCK_H,
													(basist::half_float(*)[3])pBlock_pixels_half, (vec4F*)pBlock_pixels_q16,
													nullptr,
													coptions, mode11_opt_mode);
												BASISU_NOTE_UNUSED(refine_status);
											}
										}
									}

									basist::astc_6x6_hdr::requantize_astc_weights(num_grid_samples_dp, coded_log_blk.m_weights, mode_desc.m_weight_ise_range, transcode_weights, mode_desc.m_transcode_weight_ise_range);

									// Create the block the decoder would transcode into.
									astc_helpers::log_astc_block& decomp_blk = candidate.m_decomp_log_blk;
									decomp_blk.clear();

									decomp_blk.m_color_endpoint_modes[0] = (uint8_t)mode_desc.m_cem;
									decomp_blk.m_dual_plane = mode_desc.m_dp;
									decomp_blk.m_color_component_selector = (uint8_t)mode_desc.m_dp_channel;
									decomp_blk.m_num_partitions = 1;
									decomp_blk.m_endpoint_ise_range = (uint8_t)mode_desc.m_transcode_endpoint_ise_range;
									decomp_blk.m_weight_ise_range = (uint8_t)mode_desc.m_transcode_weight_ise_range;

									basist::astc_6x6_hdr::requantize_ise_endpoints(mode_desc.m_cem, mode_desc.m_endpoint_ise_range, coded_log_blk.m_endpoints, mode_desc.m_transcode_endpoint_ise_range, decomp_blk.m_endpoints);

									copy_weight_grid(dual_plane, grid_x, grid_y, transcode_weights, decomp_blk, use_orig_behavior);

									if (!validate_log_blk(decomp_blk))
									{
										fmt_error_printf("pack_astc_block() failed\n");
										return false;
									}

									status = decode_astc_block(BLOCK_W, BLOCK_H, decomp_blk, &candidate.m_comp_pixels[0][0]);
									if (!status)
									{
										fmt_error_printf("decode_astc_block() failed\n");
										return false;
									}

									candidate.m_coder.put_bits(BLOCK_CODE, BLOCK_CODE_LEN);
									code_block(candidate.m_coder, candidate.m_coded_log_blk, bm, em, nullptr);

									candidate.m_encoding_type = encoding_type::cBlock;
									candidate.m_endpoint_mode = em;
									candidate.m_block_mode = bm;

									candidates.emplace_back(std::move(candidate));
								}

								break;
							}
							default:
								assert(0);
								fmt_debug_printf("Invalid endpoint mode\n");
								return false;

							} // switch (em)

						} // endpoint_mode_iter

					} // block_mode_iter

				} // is_solid_block

				//------------------------------------------------

				debug_state.m_total_candidates_considered.fetch_add(candidates.size_u32(), std::memory_order_relaxed);
				atomic_max(debug_state.m_max_candidates_considered, candidates.size_u32());

				for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
				{
					auto& candidate = candidates[candidate_iter];

					for (uint32_t y = 0; y < BLOCK_H; y++)
						for (uint32_t x = 0; x < BLOCK_W; x++)
							linear_rgb_to_itp(candidate.m_comp_pixels[y][x], candidate.m_comp_pixels_itp[y][x], global_cfg);
				}

				// Find best overall candidate
				double best_t = DBL_MAX;
				int best_candidate_index = -1;

				float best_d_ssim = BIG_FLOAT_VAL;

				if (global_cfg.m_lambda == 0.0f)
				{
					for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
					{
						const auto& candidate = candidates[candidate_iter];

						float candidate_d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);

						if (candidate_d_ssim < best_d_ssim)
							best_d_ssim = candidate_d_ssim;

						candidate_d_ssim *= SSIM_WEIGHT;

						float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);

						candidate_mse += candidate_d_ssim;

						float total_deblock_penalty = 0.0f;
						if (global_cfg.m_deblocking_flag)
						{
							total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
						}
						candidate_mse += total_deblock_penalty * SSIM_WEIGHT;

						if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
						{
							// Bias the encoder away from 2 level blocks on complex blocks
							// TODO: Perhaps only do this on large or non-interpolated grids
							if (complex_block)
							{
								if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
								{
									candidate_mse *= TWO_LEVEL_PENALTY;
								}
							}

							// Bias the encoder away from smaller weight grids if the block is very complex
							// TODO: Use the DCT to compute an approximation of the block energy/variance retained vs. lost by downsampling.
							if (complex_block)
							{
								if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
							}
						}

						float candidate_t = candidate_mse;

						if (candidate_t < best_t)
						{
							best_t = candidate_t;
							best_candidate_index = candidate_iter;
						}

					} // candidate_iter

					if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
					{
						debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
						continue;
					}

					const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);

					if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
						(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
						(block_avg_y >= 1.5f))
					{
						debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
						continue;
					}
				}
				else
				{
					assert(enc_state.smooth_block_mse_scales.get_width() > 0);

					// Compute block's perceptual weighting
					float perceptual_scale = 0.0f;
					for (uint32_t y = 0; y < BLOCK_H; y++)
						for (uint32_t x = 0; x < BLOCK_W; x++)
							perceptual_scale = basisu::maximumf(perceptual_scale, enc_state.smooth_block_mse_scales.at_clamped(bx * BLOCK_W + x, by * BLOCK_H + y));

					// Very roughly normalize the computed distortion vs. bits.
					perceptual_scale *= 10.0f;

					for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
					{
						auto& candidate = candidates[candidate_iter];

						float d_ssim = 1.0f - compute_block_ssim_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0]);

						if (d_ssim < best_d_ssim)
							best_d_ssim = (float)d_ssim;

						d_ssim *= SSIM_WEIGHT;

						float candidate_mse = MSE_WEIGHT * compute_block_mse_itp(BLOCK_W, BLOCK_H, &block_pixels_as_itp[0][0], &candidate.m_comp_pixels_itp[0][0], global_cfg.m_delta_itp_dark_adjustment);

						candidate_mse += d_ssim;

						float total_deblock_penalty = 0.0f;
						if (global_cfg.m_deblocking_flag)
						{
							total_deblock_penalty = calc_deblocking_penalty_itp(bx, by, width, height, pass_src_img_itp, candidate) * global_cfg.m_deblock_penalty_weight;
						}
						candidate_mse += total_deblock_penalty * SSIM_WEIGHT;

						if ((candidate.m_encoding_type == encoding_type::cBlock) || (candidate.m_encoding_type == encoding_type::cReuse))
						{
							// Bias the encoder away from 2 level blocks on complex blocks
							if (complex_block)
							{
								if (candidate.m_coded_log_blk.m_weight_ise_range == astc_helpers::BISE_2_LEVELS)
								{
									candidate_mse *= TWO_LEVEL_PENALTY;
								}
							}

							// Bias the encoder away from smaller weight grids if the block is very complex
							if (complex_block)
							{
								if ((candidate.m_coded_log_blk.m_grid_width == 2) && (candidate.m_coded_log_blk.m_grid_height == 2))
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_2X2_MSE_PENALTY;
								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 3)
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_3X3_MSE_PENALTY;
								else if (minimum(candidate.m_coded_log_blk.m_grid_width, candidate.m_coded_log_blk.m_grid_height) <= 4)
									candidate_mse *= COMPLEX_BLOCK_WEIGHT_GRID_4X4_MSE_PENALTY;
							}
						}

						float mode_penalty = 1.0f;
						if (candidate.m_encoding_type == encoding_type::cSolid)
							mode_penalty *= SOLID_PENALTY;
						else if (candidate.m_encoding_type == encoding_type::cReuse)
							mode_penalty *= REUSE_PENALTY;
						else if (candidate.m_encoding_type == encoding_type::cRun)
							mode_penalty *= (complex_block ? RUN_PENALTY * 2.0f : RUN_PENALTY);

						float candidate_bits = (float)candidate.m_coder.get_total_bits();
						
						double candidate_d = (double)candidate_mse * mode_penalty;

						const float D_POWER = 2.0f;
						
						// this value can get VERY large after squaring on random (fuzzed) HDR inputs
						double candidate_t = perceptual_scale * pow(candidate_d, D_POWER) + candidate_bits * (global_cfg.m_lambda * 1000.0f); 

						candidate.m_t = candidate_t;
						candidate.m_d = candidate_d;
						candidate.m_bits = candidate_bits;

						if (candidate_t < best_t)
						{
							best_t = candidate_t;
							best_candidate_index = candidate_iter;
						}

					} // candidate_iter

					if (best_candidate_index < 0)
					{
						assert(0);
						
						// Should never happen
						best_candidate_index = 0;
					}

					if (global_cfg.m_gaussian1_fallback && (outer_pass == 0) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH1_D_SSIM))
					{
						debug_state.m_total_gaussian1_blocks.fetch_add(1, std::memory_order_relaxed);
						continue;
					}

					const float block_y_contrast_ratio = block_hy / (block_ly + .00000125f);

					if (global_cfg.m_gaussian2_fallback && (comp_level >= 1) && (outer_pass == 1) && (very_complex_block) && (best_d_ssim > SWITCH_TO_GAUSSIAN_FILTERED_THRESH2_D_SSIM) &&
						(block_hy >= 18.0f) && (block_y_contrast_ratio > 150.0f) &&
						(block_avg_y >= 1.5f))
					{
						debug_state.m_total_gaussian2_blocks.fetch_add(1, std::memory_order_relaxed);
						continue;
					}
										
					if (global_cfg.m_rdo_candidate_diversity_boost)
					{
						// candidate diversity boosting - consider candidates along/near the Pareto front
						const candidate_encoding& comp_candidate = candidates[best_candidate_index];

						double best_d = DBL_MAX;

						for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
						{
							const auto& candidate = candidates[candidate_iter];

							if (candidate.m_bits <= comp_candidate.m_bits * global_cfg.m_rdo_candidate_diversity_boost_bit_window_weight)
							{
								if (candidate.m_d < best_d)
								{
									best_d = candidate.m_d;
									best_candidate_index = candidate_iter;
								}
							}
						}
					}

					// candidate JND optimization - if there's a cheaper to code candidate that is nearly equivalent visually to the best candidate chose, choose that
					if (global_cfg.m_jnd_optimization)
					{
						const candidate_encoding& cur_comp_candidate = candidates[best_candidate_index];

						float new_best_candidate_bits = BIG_FLOAT_VAL;
						int new_best_candidate_index = -1;

						for (uint32_t candidate_iter = 0; candidate_iter < candidates.size_u32(); candidate_iter++)
						{
							if ((int)candidate_iter == best_candidate_index)
								continue;

							const auto& candidate = candidates[candidate_iter];

							if (candidate.m_bits >= cur_comp_candidate.m_bits)
								continue;

							float max_delta_itp = 0.0f;
							for (uint32_t y = 0; y < BLOCK_H; y++)
							{
								for (uint32_t x = 0; x < BLOCK_W; x++)
								{
									float delta_itp = compute_pixel_delta_itp(cur_comp_candidate.m_comp_pixels_itp[y][x], candidate.m_comp_pixels_itp[y][x], block_pixels_as_itp[y][x], global_cfg.m_delta_itp_dark_adjustment);
									max_delta_itp = maximum(max_delta_itp, delta_itp);

									if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
										goto skip;
								}
							}

						skip:
							if (max_delta_itp >= global_cfg.m_jnd_delta_itp_thresh)
								continue;

							if (candidate.m_bits < new_best_candidate_bits)
							{
								new_best_candidate_bits = candidate.m_bits;
								new_best_candidate_index = candidate_iter;
							}
						}

						if (new_best_candidate_index != -1)
						{
							best_candidate_index = new_best_candidate_index;
							debug_state.m_total_jnd_replacements.fetch_add(1, std::memory_order_relaxed);
						}
					}

				} // if (lambda == 0.0f)

				if (global_cfg.m_debug_images)
				{
					std::lock_guard<std::mutex> lck(debug_state.m_stat_vis_mutex);
					debug_state.m_stat_vis.fill_box(bx * 6, by * 6, 6, 6, vec4F(best_d_ssim, max_std_dev, lowpass_std_dev, 1.0f));
				}

				if (best_candidate_index < 0)
				{
					assert(best_candidate_index >= 0);
					fmt_error_printf("No candidates!\n");
					return false;
				}

				const auto& best_candidate = candidates[best_candidate_index];

				assert(best_candidate.m_encoding_type != encoding_type::cInvalid);

				if (best_candidate.m_encoding_type == encoding_type::cRun)
				{
					if (!prev_run_len)
					{
						if (prev_encoding.get_total_bits())
						{
#if SYNC_MARKERS
							strip_coded_bits.put_bits(0xDEAD, 16);
#endif

							strip_coded_bits.append(prev_encoding);
						}

						assert(best_candidate.m_coder.get_total_bits());

						prev_encoding = best_candidate.m_coder;

						prev_run_len = 1;
					}
					else
					{
						prev_run_len++;

						const uint32_t prev_run_bits = prev_encoding.get_total_bits_u32();
						assert(prev_run_bits);
						BASISU_NOTE_UNUSED(prev_run_bits);

						const uint32_t num_dummy_bits = best_candidate.m_coder.get_total_bits_u32();
						BASISU_NOTE_UNUSED(num_dummy_bits);

						// Rewrite the previous encoding to extend the run length.
						prev_encoding.restart();
						prev_encoding.put_bits(RUN_CODE, RUN_CODE_LEN);
						prev_encoding.put_vlc(prev_run_len - 1, 5);

						assert(prev_encoding.get_total_bits() == prev_run_bits + num_dummy_bits);
					}
				}
				else
				{
					if (prev_encoding.get_total_bits())
					{
#if SYNC_MARKERS
						strip_coded_bits.put_bits(0xDEAD, 16);
#endif

						strip_coded_bits.append(prev_encoding);
					}

					prev_encoding = best_candidate.m_coder;
					prev_run_len = 0;
				}

				memcpy(prev_comp_pixels, best_candidate.m_comp_pixels, sizeof(vec3F) * BLOCK_W * BLOCK_H);

				prev_candidate_encoding = best_candidate;

				if (best_candidate.m_encoding_type != encoding_type::cRun)
					prev_non_run_candidate_encoding = best_candidate;

				{
					std::lock_guard<std::mutex> lck(debug_state.m_stats_mutex);

					debug_state.m_encoding_type_hist[(uint32_t)best_candidate.m_encoding_type]++;

					if (best_candidate.m_encoding_type == encoding_type::cBlock)
					{
						debug_state.m_endpoint_mode_hist[(uint32_t)best_candidate.m_endpoint_mode]++;
					}

					if ((best_candidate.m_encoding_type == encoding_type::cReuse) || (best_candidate.m_encoding_type == encoding_type::cBlock))
					{
						const uint32_t bm_index = (uint32_t)best_candidate.m_block_mode;
						assert(bm_index < (uint32_t)block_mode::cBMTotalModes);

						debug_state.m_block_mode_hist[bm_index]++;
						debug_state.m_block_mode_total_bits[bm_index] += best_candidate.m_coder.get_total_bits();

						for (uint32_t i = 0; i < 3; i++)
						{
							debug_state.m_block_mode_comp_stats[bm_index][i].push_back(half_comp_stats[i]);
							debug_state.m_block_mode_comparative_stats[bm_index][i].push_back(half_cross_chan_stats[i]);
						}
					}

					if (best_candidate.m_encoding_type == encoding_type::cReuse)
					{
						debug_state.m_reuse_num_parts[best_candidate.m_coded_log_blk.m_num_partitions].fetch_add(1, std::memory_order_relaxed);

						if (best_candidate.m_coded_log_blk.m_dual_plane)
							debug_state.m_reuse_total_dp.fetch_add(1, std::memory_order_relaxed);
					}
				}

				enc_state.coded_blocks(bx, by) = prev_non_run_candidate_encoding;

				// Update decoded image
				vec4F decoded_float_pixels[BLOCK_H][BLOCK_W];
				for (uint32_t y = 0; y < BLOCK_H; y++)
					for (uint32_t x = 0; x < BLOCK_W; x++)
						decoded_float_pixels[y][x] = best_candidate.m_comp_pixels[y][x];

				enc_state.packed_img.set_block_clipped((vec4F*)decoded_float_pixels, bx * BLOCK_W, by * BLOCK_H, BLOCK_W, BLOCK_H);

				status = astc_helpers::pack_astc_block(enc_state.final_astc_blocks(bx, by), best_candidate.m_decomp_log_blk, nullptr, nullptr);
				if (!status)
				{
					fmt_error_printf("Failed packing block\n");
					return false;
				}

				const uint32_t r = debug_state.m_total_blocks_compressed.fetch_add(1, std::memory_order_relaxed);
				if ((r & 2047) == 2047)
				{
					if (global_cfg.m_status_output)
					{
						basisu::fmt_printf("{} of {} total blocks compressed, {3.2}%\n", r, total_blocks, (r * 100.0f) / total_blocks);
					}
				}

				if ((global_cfg.m_debug_images) &&
					((best_candidate.m_encoding_type != encoding_type::cRun) && (best_candidate.m_encoding_type != encoding_type::cSolid)))
				{
					std::lock_guard<std::mutex> lck(debug_state.m_vis_image_mutex);

					if (best_candidate.m_decomp_log_blk.m_num_partitions == 2)
					{
						const int part2_unique_index = g_part2_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
						assert((part2_unique_index >= 0) && (part2_unique_index < (int)NUM_UNIQUE_PARTITIONS2));

						const partition_pattern_vec& pat = g_partitions2[part2_unique_index];

						for (uint32_t y = 0; y < 6; y++)
						{
							for (uint32_t x = 0; x < 6; x++)
							{
								const uint32_t p = pat[x + y * 6];
								debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, color_rgba(p ? 100 : 0, 128, p ? 100 : 0, 255));
							} // x
						} // y 
					}
					else if (best_candidate.m_decomp_log_blk.m_num_partitions == 3)
					{
						//part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(0, 0, 255, 255));

						const int part3_unique_index = g_part3_seed_to_unique_index[best_candidate.m_decomp_log_blk.m_partition_id];
						assert((part3_unique_index >= 0) && (part3_unique_index < (int)NUM_UNIQUE_PARTITIONS3));

						const partition_pattern_vec& pat = g_partitions3[part3_unique_index];

						for (uint32_t y = 0; y < 6; y++)
						{
							for (uint32_t x = 0; x < 6; x++)
							{
								const uint32_t p = pat[x + y * 6];
								color_rgba c(0, 0, 150, 255);
								if (p == 1)
									c.set(100, 0, 150, 255);
								else if (p == 2)
									c.set(0, 100, 150, 255);
								debug_state.m_part_vis.set_clipped(bx * 6 + x, by * 6 + y, c);
							} // x
						} // y 
					}
					else if (best_candidate.m_decomp_log_blk.m_dual_plane)
					{
						debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 255, 255));
					}
					else
					{
						debug_state.m_part_vis.fill_box(bx * 6, by * 6, 6, 6, color_rgba(255, 0, 0, 255));
					}

					color_rgba c;
					c.set((best_candidate.m_coded_log_blk.m_grid_width * best_candidate.m_coded_log_blk.m_grid_height * 255 + 18) / 36);
					debug_state.m_grid_vis.fill_box(bx * 6, by * 6, 6, 6, c);

					c.set(0, 0, 0, 255);
					if (complex_block)
						c[0] = 255;

					if (very_complex_block)
						c[1] = 255;

					if (outer_pass == 2)
						c[2] = 255;
					else if (outer_pass == 1)
						c[2] = 128;

					debug_state.m_mode_vis.fill_box(bx * 6, by * 6, 6, 6, c);

					c.set(0, 255, 0, 255);
					if (best_candidate.m_coded_log_blk.m_color_endpoint_modes[0] == 7)
						c.set(255, 0, 0, 255);
					debug_state.m_mode_vis2.fill_box(bx * 6, by * 6, 6, 6, c);

					switch (best_candidate.m_encoding_type)
					{
					case encoding_type::cRun:
						c.set(0, 0, 0, 255);
						break;
					case encoding_type::cSolid:
						c.set(128, 128, 128, 255); // dark grey
						break;
					case encoding_type::cReuse:
						c.set(255, 255, 0, 255); // yellow
						break;
					case encoding_type::cBlock:
					{
						switch (best_candidate.m_endpoint_mode)
						{
						case endpoint_mode::cRaw:
							c.set(255, 0, 0, 255); // red
							break;
						case endpoint_mode::cUseLeft:
							c.set(0, 0, 255, 255); // blue
							break;
						case endpoint_mode::cUseUpper:
							c.set(0, 0, 192, 255); // darker blue
							break;
						case endpoint_mode::cUseLeftDelta:
							c.set(0, 255, 0, 255); // green
							break;
						case endpoint_mode::cUseUpperDelta:
							c.set(0, 192, 0, 255); // darker green
							break;
						default:
							break;
						}

						break;
					}
					default:
						break;
					}

					if (filtered_x_err < filtered_y_err)
						c[3] = 0;
					else
						c[3] = 255;

					debug_state.m_enc_vis.fill_box(bx * 6, by * 6, 6, 6, c);
				}

				break;

			} // outer_pass

		} // bx

	} // by

	if (prev_encoding.get_total_bits())
	{
#if SYNC_MARKERS
		strip_coded_bits.put_bits(0xDEAD, 16);
#endif

		strip_coded_bits.append(prev_encoding);
	}

	return true;
}

bool g_initialized = false;

void global_init()
{
	if (g_initialized)
		return;

	interval_timer tm;
	tm.start();

	init_pq_tables();
		
	init_partitions2_6x6();
	init_partitions3_6x6();

	init_contrib_lists();

	g_initialized = true;

	//fmt_printf("astc_6x6_hdr::global_init() total time: {}\n", tm.get_elapsed_secs());
}

bool compress_photo(const basisu::imagef &orig_src_img, const astc_hdr_6x6_global_config &orig_global_cfg, job_pool *pJob_pool,
	basisu::uint8_vec& intermediate_tex_data, basisu::uint8_vec& astc_tex_data, result_metrics& metrics)
{
	assert(g_initialized);
	if (!g_initialized)
		return false;
	
	assert(pJob_pool);

	if (orig_global_cfg.m_debug_output)
	{
		fmt_debug_printf("------ astc_6x6_hdr::compress_photo:\n");
		fmt_debug_printf("Source image dimensions: {}x{}\n", orig_src_img.get_width(), orig_src_img.get_height());
		fmt_debug_printf("Job pool total threads: {}\n", (uint64_t)pJob_pool->get_total_threads());
		orig_global_cfg.print();
	}

	if (!orig_src_img.get_width() || !orig_src_img.get_height())
	{
		assert(false);
		fmt_error_printf("compress_photo: Invalid source image\n");
		return false;
	}

	astc_hdr_6x6_global_config global_cfg(orig_global_cfg);

	uastc_hdr_6x6_encode_state enc_state;
	enc_state.master_coptions.m_q_log_bias = Q_LOG_BIAS_6x6;
	enc_state.src_img = orig_src_img;

	//src_img.crop(256, 256);

	const uint32_t width = enc_state.src_img.get_width();
	const uint32_t height = enc_state.src_img.get_height();
	const uint32_t num_blocks_x = enc_state.src_img.get_block_width(BLOCK_W);
	const uint32_t num_blocks_y = enc_state.src_img.get_block_height(BLOCK_H);
	const uint32_t total_blocks = num_blocks_x * num_blocks_y;

	for (uint32_t y = 0; y < height; y++)
	{
		for (uint32_t x = 0; x < width; x++)
		{
			for (uint32_t c = 0; c < 3; c++)
			{
				float f = enc_state.src_img(x, y)[c];

				if (std::isinf(f) || std::isnan(f) || (f < 0.0f))
					f = 0;
				else if (f > basist::ASTC_HDR_MAX_VAL)
					f = basist::ASTC_HDR_MAX_VAL;

				enc_state.src_img(x, y)[c] = f;
								
			} // c
						
		} // x
	} // y
	
	if (global_cfg.m_debug_images)
	{
		write_exr((global_cfg.m_debug_image_prefix + "orig.exr").c_str(), enc_state.src_img, 3, 0);
	}
			
	image src_img_compressed;
	tonemap_image_compressive2(src_img_compressed, enc_state.src_img);

	if (global_cfg.m_debug_images)
	{
		save_png(global_cfg.m_debug_image_prefix + "compressive_tone_map.png", src_img_compressed);
	}

	smooth_map_params rp;
	rp.m_debug_images = global_cfg.m_debug_images;

	if (global_cfg.m_lambda != 0.0f)
	{
		if (global_cfg.m_status_output)
			fmt_printf("Creating RDO perceptual weighting maps\n");

		create_smooth_maps2(enc_state.smooth_block_mse_scales, src_img_compressed, rp);
	}

	if (global_cfg.m_status_output)
		fmt_printf("Blurring image\n");

	enc_state.src_img_filtered1.resize(width, height);
	image_resample(enc_state.src_img, enc_state.src_img_filtered1, "gaussian", global_cfg.m_gaussian1_strength); //1.45f);
	
	enc_state.src_img_filtered2.resize(width, height);
	image_resample(enc_state.src_img, enc_state.src_img_filtered2, "gaussian", global_cfg.m_gaussian2_strength); //1.83f);
		
	if (global_cfg.m_debug_images)
	{
		write_exr((global_cfg.m_debug_image_prefix + "blurred1.exr").c_str(), enc_state.src_img_filtered1, 3, 0);
		write_exr((global_cfg.m_debug_image_prefix + "blurred2.exr").c_str(), enc_state.src_img_filtered2, 3, 0);
	}

	if (global_cfg.m_status_output)
		fmt_printf("Transforming to ITP\n");

	enc_state.src_img_itp.resize(width, height);
	convet_rgb_image_to_itp(enc_state.src_img, enc_state.src_img_itp, global_cfg);
	
	enc_state.src_img_filtered1_itp.resize(width, height);
	convet_rgb_image_to_itp(enc_state.src_img_filtered1, enc_state.src_img_filtered1_itp, global_cfg);
	
	enc_state.src_img_filtered2_itp.resize(width, height);
	convet_rgb_image_to_itp(enc_state.src_img_filtered2, enc_state.src_img_filtered2_itp, global_cfg);

	if (global_cfg.m_lambda == 0.0f)
		global_cfg.m_favor_higher_compression = false;

	uint32_t total_strips = 0, rows_per_strip = 0;
	if (!calc_strip_size(global_cfg.m_lambda, num_blocks_y, (uint32_t)pJob_pool->get_total_threads(), global_cfg.m_force_one_strip, total_strips, rows_per_strip, global_cfg))
	{
		fmt_error_printf("compress_photo: Failed computing strip sizes\n");
		return false;
	}
		
	if (global_cfg.m_debug_output)
		fmt_printf("lambda: {}, comp_level: {}, highest_comp_level: {}, extra patterns: {}\n", global_cfg.m_lambda, global_cfg.m_master_comp_level, global_cfg.m_highest_comp_level, global_cfg.m_extra_patterns_flag);
					
	enc_state.coded_blocks.resize(num_blocks_x, num_blocks_y);
						
	bitwise_coder coded_bits;

	// For Basis v1.60 files write the original marker, otherwise write the new marker.
	coded_bits.put_bits(global_cfg.m_write_basisu_1_6_compatible_files ? UASTC_6x6_HDR_SIG0 : UASTC_6x6_HDR_SIG1, 16);

	coded_bits.put_bits(width, 16);
	coded_bits.put_bits(height, 16);
					
	enc_state.packed_img.resize(width, height);
		
	enc_state.strip_bits.resize(total_strips);

	enc_state.final_astc_blocks.resize(num_blocks_x, num_blocks_y);

	uastc_hdr_6x6_debug_state debug_state;

	if (global_cfg.m_debug_images)
		debug_state.init(width, height);
	else
		debug_state.init(0, 0);
		
	interval_timer tm;
	tm.start();

	std::atomic_bool any_failed_flag;
	any_failed_flag.store(false);

	for (uint32_t strip_index = 0; strip_index < total_strips; strip_index++)
	{
		const uint32_t strip_first_by = strip_index * rows_per_strip;
		
		uint32_t strip_last_by = minimum<uint32_t>(strip_first_by + rows_per_strip - 1, num_blocks_y);
		if (strip_index == (total_strips - 1))
			strip_last_by = num_blocks_y - 1;

		pJob_pool->add_job([&any_failed_flag, &global_cfg, &debug_state, &enc_state,
			strip_index, total_strips, strip_first_by, strip_last_by,
			num_blocks_x, num_blocks_y, total_blocks, width, height]
		{
			if (!any_failed_flag)
			{
				bool status = compress_strip_task(
					strip_index, total_strips, strip_first_by, strip_last_by,
					num_blocks_x, num_blocks_y, total_blocks, width, height,
					global_cfg, debug_state, enc_state);

				if (!status)
				{
					fmt_error_printf("compress_photo: compress_strip_task() failed\n");
					any_failed_flag.store(true, std::memory_order_relaxed);
				}
			}
		} );

		if (any_failed_flag)
			break;
	
	} // strip_index

	pJob_pool->wait_for_all();

	if (any_failed_flag)
	{
		fmt_error_printf("One or more strips failed during compression\n");
		return false;
	}
				
	if (global_cfg.m_debug_output)
		fmt_printf("Encoding time: {} secs\n", tm.get_elapsed_secs());

	if (global_cfg.m_debug_output)
		debug_state.print(total_blocks);

	if (global_cfg.m_debug_images)
	{
		save_png(global_cfg.m_debug_image_prefix +  "part_vis.png", debug_state.m_part_vis);
		save_png(global_cfg.m_debug_image_prefix + "grid_vis.png", debug_state.m_grid_vis);
		save_png(global_cfg.m_debug_image_prefix + "mode_vis.png", debug_state.m_mode_vis);
		save_png(global_cfg.m_debug_image_prefix + "mode_vis2.png", debug_state.m_mode_vis2);
		save_png(global_cfg.m_debug_image_prefix + "enc_vis.png", debug_state.m_enc_vis);
		write_exr((global_cfg.m_debug_image_prefix + "stat_vis.exr").c_str(), debug_state.m_stat_vis, 3, 0);
	}

	for (uint32_t i = 0; i < total_strips; i++)
		coded_bits.append(enc_state.strip_bits[i]);
		
	coded_bits.put_bits(0xA742, 16);

	coded_bits.flush();

	if (global_cfg.m_output_images)
	{
		write_exr((global_cfg.m_output_image_prefix + "comp.exr").c_str(), enc_state.packed_img, 3, 0);
	}
	
	if (global_cfg.m_debug_output)
		fmt_printf("\nTotal intermediate output bits/pixel: {3.4}\n", (float)coded_bits.get_total_bits() / (float)(width * height));

	vector2D<astc_helpers::astc_block> decoded_blocks1;
	vector2D<astc_helpers::astc_block> decoded_blocks2;
	
	if (global_cfg.m_debug_output)
		fmt_printf("decode_file\n");

	uint32_t unpacked_width = 0, unpacked_height = 0;
	bool status = decode_file(coded_bits.get_bytes(), decoded_blocks1, unpacked_width, unpacked_height);
	if (!status)
	{
		fmt_error_printf("decode_file() failed\n");
		return false;
	}

	if (global_cfg.m_debug_output)
		fmt_printf("decode_6x6_hdr\n");

	status = decode_6x6_hdr(coded_bits.get_bytes().get_ptr(), coded_bits.get_bytes().size_in_bytes_u32(), decoded_blocks2, unpacked_width, unpacked_height);
	if (!status)
	{
		fmt_error_printf("decode_6x6_hdr_file() failed\n");
		return false;
	}

	if ((enc_state.final_astc_blocks.get_width() != decoded_blocks1.get_width()) ||
		(enc_state.final_astc_blocks.get_height() != decoded_blocks1.get_height()))
	{
		fmt_error_printf("Decode size mismatch with decode_file\n");
		return false;
	}

	if ((enc_state.final_astc_blocks.get_width() != decoded_blocks2.get_width()) ||
		(enc_state.final_astc_blocks.get_height() != decoded_blocks2.get_height()))
	{
		fmt_error_printf("Decode size mismatch with decode_6x6_hdr_file\n");
		return false;
	}

	if (memcmp(decoded_blocks1.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks1.size_in_bytes()) != 0)
	{
		fmt_error_printf("Decoded ASTC blocks verification failed\n");
		return false;
	}

	if (memcmp(decoded_blocks2.get_ptr(), enc_state.final_astc_blocks.get_ptr(), decoded_blocks2.size_in_bytes()) != 0)
	{
		fmt_error_printf("Decoded ASTC blocks verification failed\n");
		return false;
	}

	if (global_cfg.m_debug_output)
		basisu::fmt_printf("Decoded ASTC verification checks succeeded\n");

	if (global_cfg.m_output_images)
	{
		if (write_astc_file((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), decoded_blocks1.get_ptr(), BLOCK_W, BLOCK_H, width, height))
		{
			basisu::platform_sleep(20);

			uint8_vec astc_file_data;
			if (read_file_to_vec((global_cfg.m_output_image_prefix + "decoded.astc").c_str(), astc_file_data))
			{
				if (astc_file_data.size() > 16)
				{
					astc_file_data.erase(0, 16);

					size_t comp_size = 0;
					void* pComp_data = tdefl_compress_mem_to_heap(&astc_file_data[0], astc_file_data.size(), &comp_size, TDEFL_MAX_PROBES_MASK);
					mz_free(pComp_data);

					if (global_cfg.m_debug_output)
					{
						fmt_printf(".ASTC file size (less header): {}, bits/pixel: {}, Deflate bits/pixel: {}\n",
							(uint64_t)astc_file_data.size(),
							(float)astc_file_data.size() * 8.0f / (float)(width * height),
							(float)comp_size * 8.0f / (float)(width * height));
					}
				}
			}
		}
	}

	// Must decode all the blocks (even padded rows/cols) to match what the transcoder does.
	imagef unpacked_astc_img(num_blocks_x * 6, num_blocks_y * 6);
	imagef unpacked_astc_google_img(num_blocks_x * 6, num_blocks_y * 6);

	for (uint32_t y = 0; y < decoded_blocks1.get_height(); y++)
	{
		for (uint32_t x = 0; x < decoded_blocks1.get_width(); x++)
		{
			const auto& phys_blk = decoded_blocks1(x, y);

			vec4F pixels[MAX_BLOCK_W * MAX_BLOCK_H];
			status = unpack_physical_astc_block(&phys_blk, BLOCK_W, BLOCK_H, pixels);
			if (!status)
			{
				fmt_error_printf("unpack_physical_astc_block() failed\n");
				return false;
			}
			
			unpacked_astc_img.set_block_clipped(pixels, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);

			vec4F pixels_google[MAX_BLOCK_W * MAX_BLOCK_H];
			status = unpack_physical_astc_block_google(&phys_blk, BLOCK_W, BLOCK_H, pixels_google);
			if (!status)
			{
				fmt_error_printf("unpack_physical_astc_block_google() failed\n");
				return false;
			}

			unpacked_astc_google_img.set_block_clipped(pixels_google, x * BLOCK_W, y * BLOCK_H, BLOCK_W, BLOCK_H);

			for (uint32_t i = 0; i < 36; i++)
			{
				if (pixels[i] != pixels_google[i])
				{
					fmt_error_printf("pixel unpack mismatch\n");
					return false;
				}
			}
		}
	}
		
	if (global_cfg.m_debug_output)
		fmt_printf("\nUnpack succeeded\n");

	imagef unpacked_bc6h_img;

	{
		vector2D<basist::bc6h_block> bc6h_blocks;
		
		fast_bc6h_params enc_params;
						
		bool pack_status = pack_bc6h_image(unpacked_astc_img, bc6h_blocks, &unpacked_bc6h_img, enc_params);
		if (!pack_status)
		{
			fmt_error_printf("pack_bc6h_image() failed!");
			return false;
		}

		unpacked_bc6h_img.crop(width, height);
		
		if (global_cfg.m_output_images)
		{
			write_exr((global_cfg.m_output_image_prefix + "unpacked_bc6h.exr").c_str(), unpacked_bc6h_img, 3, 0);
		}
	}

	unpacked_astc_img.crop(width, height);
	unpacked_astc_google_img.crop(width, height);
	
	if (global_cfg.m_output_images)
	{
		write_exr((global_cfg.m_output_image_prefix + "unpacked_astc.exr").c_str(), unpacked_astc_img, 3, 0);
		write_exr((global_cfg.m_output_image_prefix + "unpacked_google_astc.exr").c_str(), unpacked_astc_google_img, 3, 0);
	}

	// ASTC metrics
	if (global_cfg.m_image_stats)
	{
		image_metrics im;

		if (global_cfg.m_debug_output)
			printf("\nASTC log2 float error metrics:\n");

		for (uint32_t i = 0; i < 3; i++)
		{
			im.calc(enc_state.src_img, unpacked_astc_img, i, 1, true, true);

			if (global_cfg.m_debug_output)
			{
				printf("%c:   ", "RGBA"[i]);
				im.print_hp();
			}
		}
		
		metrics.m_im_astc_log2.calc(enc_state.src_img, unpacked_astc_img, 0, 3, true, true);

		if (global_cfg.m_debug_output)
		{
			printf("RGB: ");
			metrics.m_im_astc_log2.print_hp();

			printf("\n");
		}
	}

	if (global_cfg.m_image_stats)
	{
		image_metrics im;

		if (global_cfg.m_debug_output)
			printf("ASTC half float space error metrics (a piecewise linear approximation of log2 error):\n");

		for (uint32_t i = 0; i < 3; i++)
		{
			im.calc_half(enc_state.src_img, unpacked_astc_img, i, 1, true);

			if (global_cfg.m_debug_output)
			{
				printf("%c:   ", "RGBA"[i]);
				im.print_hp();
			}
		}

		metrics.m_im_astc_half.calc_half(enc_state.src_img, unpacked_astc_img, 0, 3, true);

		if (global_cfg.m_debug_output)
		{
			printf("RGB: ");
			metrics.m_im_astc_half.print_hp();
		}
	}

	// BC6H metrics
	if (global_cfg.m_image_stats)
	{
		image_metrics im;

		if (global_cfg.m_debug_output)
			printf("\nBC6H log2 float error metrics:\n");

		for (uint32_t i = 0; i < 3; i++)
		{
			im.calc(enc_state.src_img, unpacked_bc6h_img, i, 1, true, true);
			
			if (global_cfg.m_debug_output)
			{
				printf("%c:   ", "RGBA"[i]);
				im.print_hp();
			}
		}

		metrics.m_im_bc6h_log2.calc(enc_state.src_img, unpacked_bc6h_img, 0, 3, true, true);

		if (global_cfg.m_debug_output)
		{
			printf("RGB: ");
			metrics.m_im_bc6h_log2.print_hp();

			printf("\n");
		}
	}

	if (global_cfg.m_image_stats)
	{
		image_metrics im;
		
		if (global_cfg.m_debug_output)
			printf("BC6H half float space error metrics (a piecewise linear approximation of log2 error):\n");

		for (uint32_t i = 0; i < 3; i++)
		{
			im.calc_half(enc_state.src_img, unpacked_bc6h_img, i, 1, true);
			
			if (global_cfg.m_debug_output)
			{
				printf("%c:   ", "RGBA"[i]);
				im.print_hp();
			}
		}

		metrics.m_im_bc6h_half.calc_half(enc_state.src_img, unpacked_bc6h_img, 0, 3, true);
		
		if (global_cfg.m_debug_output)
		{
			printf("RGB: ");
			metrics.m_im_bc6h_half.print_hp();

			printf("\n");
		}
	}

	intermediate_tex_data.swap(coded_bits.get_bytes());

	astc_tex_data.resize(decoded_blocks1.size_in_bytes());
	memcpy(astc_tex_data.data(), decoded_blocks1.get_ptr(), decoded_blocks1.size_in_bytes());

	return true;
}

} // namespace astc_6x6_hdr
