// Edge length, in invocations, of one work group; used for both the X and Y local size below.
#define TILE 16u
// Each work group is TILE x TILE x 1 invocations (16 x 16 x 1 with the value above).
layout(local_size_x = TILE, local_size_y = TILE, local_size_z = 1) in;
// NOTE(review): not referenced by any code visible here. This looks like the
// OptiFine/Iris-style dispatch directive (work-group count relative to the render
// size) that the host shader loader reads from the source text — confirm before
// renaming or reformatting this line.
const vec2 workGroupsRender = vec2(1.0, 1.0);

/*
	// Based on https://developer.nvidia.com/blog/optimizing-compute-shaders-for-l2-locality-using-thread-group-id-swizzling/

	// Currently disabled: this whole function sits inside the enclosing block comment.
	//
	// Returns a 2D invocation coordinate built from a remapped ("swizzled") work-group
	// ID instead of gl_WorkGroupID: the dispatch grid is cut into vertical tiles that
	// are tile_max_width work groups wide and gl_NumWorkGroups.y work groups tall, and
	// the row-major work-group index is reinterpreted so that consecutive indices walk
	// one tile row by row before moving on to the next tile. Per the linked article the
	// purpose is better L2 cache locality.
	//
	// NOTE(review): `immut` is not a GLSL keyword; presumably a project macro defined
	// elsewhere (e.g. an alias for `const`) — confirm before re-enabling.
	uvec2 invocation_id() {
		// Tile width in work groups. NOTE(review): the bracketed list looks like a set
		// of selectable option values for external tooling — confirm.
		const uint tile_max_width = 8u; // [8 16 32]

		// Work groups contained in one full-width tile, and how many full-width tiles fit.
		immut uint perfect_tile_size = tile_max_width * gl_NumWorkGroups.y;
		immut uint perfect_tile_count = gl_NumWorkGroups.x / tile_max_width;

		// Total work groups covered by full-width tiles, and this work group's
		// row-major index within the dispatch grid.
		immut uint perfect_tile_work_groups = perfect_tile_count * perfect_tile_size;
		immut uint flattened_id = gl_NumWorkGroups.x * gl_WorkGroupID.y + gl_WorkGroupID.x;

		// Which tile the flattened index falls in, and its offset inside that tile.
		immut uint tile_id = flattened_id / perfect_tile_size;
		immut uint flattened_local_work_group_id = flattened_id % perfect_tile_size;

		uvec2 local_work_group_id;

		if (perfect_tile_work_groups <= flattened_id) {
			// Index lies past every full-width tile, so it belongs to the narrower
			// remainder tile. This branch is only reachable when gl_NumWorkGroups.x is
			// not a multiple of tile_max_width, so last_tile_x is non-zero here.
			immut uint last_tile_x = gl_NumWorkGroups.x % tile_max_width;

			local_work_group_id = uvec2(
				flattened_local_work_group_id % last_tile_x,
				flattened_local_work_group_id / last_tile_x
			);
		} else local_work_group_id = uvec2(
			// Full-width tile: unpack the in-tile offset row-major with the tile width.
			flattened_local_work_group_id % tile_max_width,
			flattened_local_work_group_id / tile_max_width
		);

		// Rebuild a row-major grid index: tile_id * tile_max_width is the tile's first
		// column, then add the in-tile row (scaled by the grid width) and column.
		immut uint swizzled_flattened_id = tile_id * tile_max_width + local_work_group_id.y * gl_NumWorkGroups.x + local_work_group_id.x;

		// Unpack the remapped index back into 2D work-group coordinates.
		immut uvec2 swizzled_work_group_id = uvec2(
			swizzled_flattened_id % gl_NumWorkGroups.x,
			swizzled_flattened_id / gl_NumWorkGroups.x
		);

		// Scale by the work-group edge length (TILE) and add the invocation's position
		// inside its work group.
		return TILE * swizzled_work_group_id + gl_LocalInvocationID.xy;
	}
*/