Autoexposure example restoration #728

devshgraphicsprogramming wants to merge 63 commits into master from autoexposue_ex

Conversation
```cpp
template <typename T>
T morton2d_mask(uint16_t _n)
{
    const static uint64_t mask[5] =
    {
        0x5555555555555555ull,
        0x3333333333333333ull,
        0x0F0F0F0F0F0F0F0Full,
        0x00FF00FF00FF00FFull,
        0x0000FFFF0000FFFFull
    };
    return (T)mask[_n];
}

template <typename T>
T morton3d_mask(uint16_t _n)
{
    const static uint64_t mask[5] =
    {
        0x1249249249249249ull,
        0x10C30C30C30C30C3ull,
        0x010F00F00F00F00Full,
        0x001F0000FF0000FFull,
        0x001F00000000FFFFull
    };
    return (T)mask[_n];
}

template <typename T>
T morton4d_mask(uint16_t _n)
{
    const static uint64_t mask[4] =
    {
        0x1111111111111111ull,
        0x0303030303030303ull,
        0x000F000F000F000Full,
        0x000000FF000000FFull
    };
    return (T)mask[_n];
}
```
couldn't you make them into

```cpp
namespace morton
{
namespace impl
{
template<uint16_t Dims, typename T>
struct mask;
// now the partial specializations
}
}
```

with the masks as `NBL_CONSTEXPR` member variables. This way you can have

```cpp
template<uint16_t Dims, typename T, uint16_t BitDepth>
enable_if_t<is_scalar_v<T>&&is_integral_v<T>,T> decode(T x)
{
    static_assert(BitDepth <= sizeof(T)*8);
    x = x & mask<Dims,T>::value[0];
    // .. rest of if-else statements
}
```
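A minimal sketch of one such partial specialization (the 2D case, reusing the masks from the diff above; plain `constexpr` stands in for the `NBL_CONSTEXPR` macro, and the trait layout is an assumption):

```cpp
#include <cstdint>

namespace morton
{
namespace impl
{
template<uint16_t Dims, typename T>
struct mask;

// 2D specialization carrying the same five masks as morton2d_mask
template<typename T>
struct mask<2, T>
{
    constexpr static uint64_t value[5] =
    {
        0x5555555555555555ull,
        0x3333333333333333ull,
        0x0F0F0F0F0F0F0F0Full,
        0x00FF00FF00FF00FFull,
        0x0000FFFF0000FFFFull
    };
};
}
}
```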
```cpp
template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); }
template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); }

template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); }
template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); }
template<typename T, uint32_t bitDepth = sizeof(T) * 8u>
T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); }
```
imho these should be `template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>` because that way you only need to write the encode and decode once:

```cpp
template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>
vector<T,Dims> decode(const T _morton)
{
    vector<T,Dims> retval;
    for (uint16_t i=0; i<Dims; i++)
        retval[i] = impl::decode<Dims,T,BitDepth>(_morton>>i);
    return retval;
}

template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>
T encode(const vector<T,Dims> coord)
{
    T retval = impl::separate_bits<Dims,T,BitDepth>(coord[0]);
    for (uint16_t i=1; i<Dims; i++)
        retval |= impl::separate_bits<Dims,T,BitDepth>(coord[i])<<i;
    return retval;
}
```
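Usage then collapses to one call per dimension count, e.g. (a sketch, assuming the Nabla `vector<T,N>` alias and that the functions live in the `morton` namespace proposed above):

```cpp
// round-trip a 3D coordinate through the unified API
const vector<uint32_t,3> coord = vector<uint32_t,3>(2u, 5u, 7u);
const uint32_t m = morton::encode<3, uint32_t>(coord);
const vector<uint32_t,3> back = morton::decode<3, uint32_t>(m); // back == coord
```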
```cpp
retval.minLuma = lumaMinimum;
retval.maxLuma = lumaMaximum;
```
typo, you've set the min and max equal to each other

P.S. it's also more useful to take a precomputed minLumaLog2 and lumaLog2Range (the difference between log2 of the max and log2 of the min); a sketch follows below

still outstanding for the geom meter
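A minimal sketch of that precompute (member names hypothetical), done once at creation rather than per shader invocation:

```cpp
// hypothetical creation-time precompute, so the shader never has to
// call log2 on the luminance limits itself
retval.minLumaLog2 = log2(lumaMinimum);
retval.lumaLog2Range = log2(lumaMaximum) - retval.minLumaLog2;
```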
```cpp
luma = clamp(luma, lumaMinMax.x, lumaMinMax.y);

return max(log2(luma), log2(lumaMinMax.x));
```
why max? you already clamped!

btw if you have the log2 already precomputed then you can do

```cpp
return min(spirv::nMax(log2(luma),lumaMinLog2),lumaMaxLog2);
```

nMax is a special SPIR-V version of max that will return the other operand if one is NaN (which happens on log2 of a negative value or 0)
```cpp
float_t luma = 0.0f;
float_t2 shiftedCoord = (tileOffset + (float32_t2)(coord)) / viewportSize;
luma = computeLumaLog2(window, tex, shiftedCoord);
```
`luma` should be called `lumaLog2`
```cpp
luma = computeLumaLog2(window, tex, shiftedCoord);
float_t lumaSum = reduction(luma, sdata);

if (tid == GroupSize - 1) {
```
it's somewhat semantically cleaner to pick the first thread instead of the last, especially since it's a reduction you performed before
```cpp
void uploadFloat(
    NBL_REF_ARG(ValueAccessor) val_accessor,
    uint32_t index,
```
don't give bad names to variables, this needs to be called `workGroupIndex`

this variable shouldn't even exist, because the MEAN meter didn't output to a different address per X,Y workgroup coordinate
```cpp
if (tid == GroupSize - 1) {
    uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
    uint32_t workgroupIndex = (workgroupCount.x * workgroupCount.y * workgroupCount.z) / 64;
```
you're computing the wrong thing, every workgroup gets the same index 🤦
Also the original code was touching the same address with every workgroup for the MEAN meter mode; see the sketch below
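For reference, a flat per-workgroup index is normally derived from the workgroup's own coordinate, not from the total count; a sketch (assuming a `glsl::gl_WorkGroupID()` accessor exists alongside the `gl_NumWorkGroups()` the snippet already uses):

```cpp
// sketch: flat index of THIS workgroup, different for every workgroup
const uint32_t3 id = glsl::gl_WorkGroupID();
const uint32_t3 count = glsl::gl_NumWorkGroups();
const uint32_t workgroupIndex = (id.z * count.y + id.y) * count.x + id.x;
```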
```cpp
    float_t rangeLog2
)
{
    uint32_t3 workGroupCount = glsl::gl_NumWorkGroups();
```
take the workgroup count and workgroup XYZ coordinate (or workgroup index) from outside (as function arguments), otherwise in the presence of solutions such as virtual workgroups or persistent threads this whole thing will fall apart; a signature sketch follows below
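A sketch of the suggested signature change (parameter names hypothetical, body abbreviated), so a virtual-workgroup or persistent-thread dispatcher can feed its own topology:

```cpp
// hypothetical: the function never touches gl_NumWorkGroups() itself,
// the caller supplies the (possibly virtual) workgroup count and index
void uploadFloat(
    NBL_REF_ARG(ValueAccessor) val_accessor,
    const float_t val,
    const uint32_t workGroupCount, // flat count, feeds the fixed-point precompute
    const uint32_t workGroupIndex  // flat index, real or virtual
)
{
    // same fixed-point encode as suggested further down, fed purely from
    // arguments and precomputed members instead of builtins
}
```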
```cpp
float_t lumaSum = reduction(luma, sdata);

if (tid == GroupSize - 1) {
    uint32_t3 workgroupCount = glsl::gl_NumWorkGroups();
```
take the workgroup count and workgroup XYZ coordinate (or workgroup index) from outside (as function arguments), otherwise in the presence of solutions such as virtual workgroups or persistent threads this whole thing will fall apart
```cpp
    float_t minLog2,
    float_t rangeLog2
```
should already be precomputed as members
```cpp
uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2();

uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));
```
let's write some docs for this....

The val was produced by a workgroup reduction of values in the [MinLog2,MaxLog2] range, which makes the scaledLogLuma (the variable that should hold `(val-minLog2)*rangeLog2`) lie between 0 and WorkGroupSize.

This value is atomic-added by N workgroups. You now want to represent it in fixed point during the atomic add, but not be vulnerable to overflow, which means the worst case is adding N times WorkGroupSize.

This means that we need to multiply by (2^32-1)/N precomputed as a float, or if you must, round N up to a PoT and see how many bits are left (512 workgroups means 9 bits, so 23 are left). To avoid rounding precision errors, the PoT method is chosen.

I have no clue where you're getting the +SubgroupSizeLog2 from.

Also the value of `(1<<fixedPointBitsLeft)-1` must be precomputed in create and stored as a member.

It should be as easy as

```cpp
const uint32_t scaledLumaLog2BitPattern = uint32_t((val-lumaMinLog2)*maxIncrement_over_lumaRangeLog2+float_t(0.5));
```

where `maxIncrement = (0x1u<<(32u-uint32_t(ceil(log2(WorkGroupCount*WorkGroupSize)))))-1;`
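To make the fixed-point scheme concrete, here is a small host-side round-trip sketch (plain C++, hypothetical dispatch parameters) covering the encode, the worst-case overflow bound, and the decode:

```cpp
#include <cassert>
#include <cmath>
#include <cstdint>

int main()
{
    // hypothetical dispatch parameters
    const uint32_t workGroupCount = 512u;
    const uint32_t workGroupSize = 256u;
    // bits needed to count every sample: ceil(log2(N*WorkGroupSize)) = 17 here
    const uint32_t sampleBits = uint32_t(std::ceil(std::log2(double(workGroupCount)*workGroupSize)));
    // largest per-sample increment that cannot overflow 32 bits even if
    // every sample contributes the maximum scaled value of 1.0
    const uint32_t maxIncrement = (0x1u<<(32u-sampleBits))-1u;

    // worst-case accumulation stays below 2^32
    assert(uint64_t(workGroupCount)*workGroupSize*maxIncrement < (uint64_t(1)<<32u));

    // encode: a scaled log-luma in [0,1] becomes a fixed-point increment
    const float scaledLumaLog2 = 0.75f;
    const uint32_t bitPattern = uint32_t(scaledLumaLog2*float(maxIncrement)+0.5f);

    // decode: divide by maxIncrement, do NOT just cast the integer to float
    const float decoded = float(bitPattern)/float(maxIncrement);
    assert(std::fabs(decoded-scaledLumaLog2) < 1e-4f);
    return 0;
}
```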
```cpp
uint32_t lumaSumBitPattern = uint32_t(clamp((val - minLog2) * rangeLog2, 0.f, float32_t((1 << fixedPointBitsLeft) - 1)));

val_accessor.atomicAdd(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern);
```
no, the same address should always be added to; if you wanted to stagger, then you should stagger based on a modulo of the workgroup index
```cpp
float_t luma = (float_t)val_accessor.get(index & ((1 << glsl::gl_SubgroupSizeLog2()) - 1));
return luma / rangeLog2 + minLog2;
```
again, you're getting random floats based on a workgroup index which thankfully was always the same (a rare case of two wrongs making a right).
Again, if you wanted to stagger, you should use the entire subgroup to load the values, then subgroup-reduce them.
Just converting to float_t is not the correct way to decode, you should divide by the maxIncrement; see the sketch below
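A sketch of the decode this comment asks for (hypothetical helper; `maxIncrement` as defined in the comment above, `sampleCount` being the total number of samples that went into the accumulated sum):

```cpp
// hypothetical corrected decode: undo the fixed-point scale by dividing
// by maxIncrement, then map the mean back into log2-luma units
float_t decodeMeanLumaLog2(const uint32_t accumulated, const float_t maxIncrement,
    const float_t sampleCount, const float_t rangeLog2, const float_t minLog2)
{
    const float_t scaledSum = float_t(accumulated) / maxIncrement; // sum of scaled lumas in [0,1]
    return (scaledSum / sampleCount) * rangeLog2 + minLog2;        // mean, unscaled
}
```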
```cpp
int_t lower, upper;
if (tid == 0)
{
    const uint32_t lowerPercentile = uint32_t(BinCount * lowerBoundPercentile);
```
use totalSamples * percentile, and totalSamples can be histo[BinCount-1].

lowerPercentile is in bin units while the histo prefix sums are in samples; you compare different units, hence the threshold is tiny and the median collapses into the first bins, your EV moves toward log2(lumaMin), and the tonemapping is biased dark. A sketch of the fix follows after the counterexample link.

counterexample: https://godbolt.org/z/49WjcazMr
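A minimal sketch of the suggested fix (assuming, as the comment implies, that `histo` holds inclusive prefix sums so its last entry is the total sample count; `upperBoundPercentile` is assumed symmetric to the lower one):

```cpp
// hypothetical: thresholds expressed in SAMPLE units, matching the prefix sums
const uint32_t totalSamples = histo[BinCount - 1]; // last inclusive prefix sum
const uint32_t lowerThreshold = uint32_t(float32_t(totalSamples) * lowerBoundPercentile);
const uint32_t upperThreshold = uint32_t(float32_t(totalSamples) * upperBoundPercentile);
```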