-
Notifications
You must be signed in to change notification settings - Fork 59
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Autoexposure example restoration #728
base: master
Are you sure you want to change the base?
Conversation
…into autoexposue_ex
…into autoexposue_ex
…into autoexposue_ex
…into autoexposue_ex
…into autoexposue_ex
struct LumaMeteringWindow | ||
{ | ||
float32_t2 meteringWindowScale; | ||
float32_t2 meteringWindowOffset; | ||
}; | ||
|
||
template<uint32_t GroupSize, typename ValueAccessor, typename SharedAccessor, typename TexAccessor> | ||
struct geom_luma_meter { | ||
using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you want to split off the cross-C++-and-HLSL code like LumaMeteringWindow
from GPU specific code like geom_luma_meter
also having structs with names and a namespace that has luma
and meter
in both is redundant, I'd accept luma_meter::MeteringWindow
but everything else I'd kill the duplicates
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
#else | ||
template <typename T> | ||
constexpr T morton2d_mask(uint8_t _n) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
why two different versions for HLSL and C++ ?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
HLSL doesn't have constexpr
, right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
add a #define NBL_CONSTEXPR_FUNC
to https://github.com/Devsh-Graphics-Programming/Nabla/blob/master/include/nbl/builtin/hlsl/cpp_compat.hlsl
then you can do an empty define of NBL_CONSTEXPR_FUNC
for HLSL_VERSION
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
template <typename T> | ||
T morton2d_mask(uint16_t _n) | ||
{ | ||
const static uint64_t mask[5] = | ||
{ | ||
0x5555555555555555ull, | ||
0x3333333333333333ull, | ||
0x0F0F0F0F0F0F0F0Full, | ||
0x00FF00FF00FF00FFull, | ||
0x0000FFFF0000FFFFull | ||
}; | ||
return (T)mask[_n]; | ||
} | ||
|
||
template <typename T> | ||
T morton3d_mask(uint16_t _n) | ||
{ | ||
const static uint64_t mask[5] = | ||
{ | ||
0x1249249249249249ull, | ||
0x10C30C30C30C30C3ull, | ||
0x010F00F00F00F00Full, | ||
0x001F0000FF0000FFull, | ||
0x001F00000000FFFFull | ||
}; | ||
return (T)mask[_n]; | ||
} | ||
template <typename T> | ||
T morton4d_mask(uint16_t _n) | ||
{ | ||
const static uint64_t mask[4] = | ||
{ | ||
0x1111111111111111ull, | ||
0x0303030303030303ull, | ||
0x000F000F000F000Full, | ||
0x000000FF000000FFull | ||
}; | ||
return (T)mask[_n]; | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
couldn't you make them into
namespace morton
{
namespace impl
{
template<uint16_t Dims, typename T>
struct mask;
// now the partial specializations
}
}
with the masks as NBL_CONSTEXPR
member variables
This way you can have
template<uint16_t Dims, typename T, uint16_t BitDepth>
enable_if_t<is_scalar_v<T>&&is_integral_v<T>,T> decode(T x)
{
static_assert(BitDepth <= sizeof(T)*8);
x = x & mask<Dims,T>::value[0];
.. rest of if-else statements
}
template<typename T, uint32_t bitDepth = sizeof(T) * 8u> | ||
T morton2d_decode_x(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton); } | ||
template<typename T, uint32_t bitDepth = sizeof(T) * 8u> | ||
T morton2d_decode_y(T _morton) { return impl::morton2d_decode<T, bitDepth>(_morton >> 1); } | ||
|
||
template<typename T, uint32_t bitDepth = sizeof(T) * 8u> | ||
T morton2d_encode(T x, T y) { return impl::separate_bits_2d<T, bitDepth>(x) | (impl::separate_bits_2d<T, bitDepth>(y) << 1); } | ||
template<typename T, uint32_t bitDepth = sizeof(T) * 8u> | ||
T morton3d_encode(T x, T y, T z) { return impl::separate_bits_3d<T, bitDepth>(x) | (impl::separate_bits_3d<T, bitDepth>(y) << 1) | (impl::separate_bits_3d<T, bitDepth>(z) << 2); } | ||
template<typename T, uint32_t bitDepth = sizeof(T) * 8u> | ||
T morton4d_encode(T x, T y, T z, T w) { return impl::separate_bits_4d<T, bitDepth>(x) | (impl::separate_bits_4d<T, bitDepth>(y) << 1) | (impl::separate_bits_4d<T, bitDepth>(z) << 2) | (impl::separate_bits_4d<T, bitDepth>(w) << 3); } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
imho these should be template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>
because that way you only need to write the encode and decode once
template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>
vector<T,Dims> decode(const T _morton)
{
vector<T,Dims> retval;
for (uint16_t i=0; i<Dims; i++)
retval[i] = impl::decode<Dims,T,BitDepth>(_morton>>i);
return retval;
}
template<uint16_t Dims, typename T, uint16_t BitDepth=sizeof(T)*8>
T encode(const vector<T,Dims> coord)
{
T retval = impl::separate_bits<Dims,T,BitDepth>(coord[0]);
for (uint16_t i=1; i<Dims; i++)
retval |= impl::separate_bits<Dims,T,BitDepth>(coord[1])<<i;
return retval;
}
static this_t create(float EV, float key = 0.18f, float Contrast = 1.f) { | ||
this_t retval; | ||
retval.gamma = Contrast; | ||
retval.exposure = EV + log2(key * 0.77321666f); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
did the GLSL have any comment (or git blame) about what the 0.77321666f
is?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
struct ReinhardParams | ||
{ | ||
using this_t = ReinhardParams; | ||
static this_t create(float EV, float key = 0.18f, float WhitePointRelToEV = 16.f) | ||
{ | ||
this_t retval; | ||
retval.keyAndManualLinearExposure = key * exp2(EV); | ||
retval.rcpWhite2 = 1.f / (WhitePointRelToEV * WhitePointRelToEV); | ||
return retval; | ||
} | ||
|
||
float32_t keyAndManualLinearExposure; | ||
float32_t rcpWhite2; | ||
}; | ||
|
||
struct ACESParams |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
- drop
Params
from the name - put the actual tonemapping as an
operator()
- [Optional] template them on
float_t
, so one has choice of using float16_t or float32_t
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
float32_t3 reinhard(ReinhardParams params, float32_t3 rawCIEXYZcolor) | ||
{ | ||
float32_t exposureFactors = params.keyAndManualLinearExposure; | ||
float32_t exposedLuma = rawCIEXYZcolor.y * exposureFactors; | ||
float32_t colorMultiplier = (exposureFactors * (1.0 + exposedLuma * params.rcpWhite2) / (1.0 + exposedLuma)); | ||
return rawCIEXYZcolor * colorMultiplier; | ||
} | ||
|
||
float32_t3 aces(ACESParams params, float32_t3 rawCIEXYZcolor) | ||
{ |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
these should be operator()(const vector<float_t,3> rawCIEXYZcolor)
inside the respective Reinhard and ACES structs
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
float32_t3 a = v * (v + float32_t3(0.0245786, 0.0245786, 0.0245786)) - float32_t3(0.000090537, 0.000090537, 0.000090537); | ||
float32_t3 b = v * (v * float32_t3(0.983729, 0.983729, 0.983729) + float32_t3(0.4329510, 0.4329510, 0.4329510)) + float32_t3(0.238081, 0.238081, 0.238081); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
btw we have some promote
thing made by @Przemog1 you can use for making vectors full of the same scalar
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
struct geom_luma_meter { | ||
using this_t = geom_luma_meter<GroupSize, ValueAccessor, SharedAccessor, TexAccessor>; | ||
|
||
static this_t create(NBL_REF_ARG(LumaMeteringWindow) window, float32_t lumaMinimum, float32_t lumaMaximum) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
CONST_REF nor just REF
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
uint32_t2 coord = { | ||
morton2d_decode_x(glsl::gl_LocalInvocationIndex()), | ||
morton2d_decode_y(glsl::gl_LocalInvocationIndex()) | ||
}; | ||
uint32_t tid = workgroup::SubgroupContiguousIndex(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you should use the tid
instead of LocalInvocationIndex for the morton code
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
uint32_t2 sampleIndex = coord * GroupSize + float32_t2(glsl::gl_SubgroupID() + 1, glsl::gl_SubgroupInvocationID() + 1); | ||
float32_t luma = 0.0f; | ||
|
||
if (sampleIndex.x <= sampleCount.x && sampleIndex.y <= sampleCount.y) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the computation of sampleIndex
makes no sense here.
Also you shouldn't be bound testing anything, every invocation must sample otherwise your histogram/average is wrong.
Please compute a normalized uv
coordinate relative to a full texture like this
const float32_t2 uv = (tileOffset+float32_t2(coord))*window.scale/totalInvocationsInDispatch+window.offset;
where tileOffset
is basically gl_WorkgroupID*gl_WorkgroupSize
but passed in from the outside
and window.scale/totalInvocationsInDispatch
is precomputed obviously
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
retval.minLuma = lumaMinimum; | ||
retval.maxLuma = lumaMaximum; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
typo, you've set the min and max equal to each other
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
P.S. its also more useful to take a precomputed minLumaLog2
and lumaLog2Range
(diff between log of max and log of min)
{ | ||
float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f)); | ||
float32_t2 samplePos = stride * sampleIndex; | ||
float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the windowOffset should have had the 0.5 texel shift inside itself, so viewportSize not needed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f)); | ||
float32_t2 samplePos = stride * sampleIndex; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just compute the UV this way https://github.com/Devsh-Graphics-Programming/Nabla/pull/728/files#r1719941761
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
float32_t2 stride = window.meteringWindowScale / (sampleCount + float32_t2(1.0f, 1.0f)); | ||
float32_t2 samplePos = stride * sampleIndex; | ||
float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize; | ||
float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos)); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this is wrong, the HDR image you're reading (the noises example with FLOAT16 format) is scene referred in linear space
let the TextureAccessor worry about applying an EOTF if there is one needed.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Fixed
float32_t2 samplePos = stride * sampleIndex; | ||
float32_t2 uvPos = (samplePos + float32_t2(0.5f, 0.5f)) / viewportSize; | ||
float32_t3 color = colorspace::oetf::sRGB(tex.get(uvPos)); | ||
float32_t luma = dot(colorspace::sRGBtoXYZ[1], color); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
actually require/expect a TexAccessor::toXYZ
static member float32_t3x3
variable :P
think of it as the TexAccessor
telling everyone what the colorspace of the image is
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
luma = clamp(luma, minLuma, maxLuma); | ||
|
||
return log2(luma / minLuma) / log2(maxLuma / minLuma); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you want to just do a max(log2(luma),minLumaLog2)
here without any normalization
also rename the function to computeLumaLog2
or something
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
luma = computeLuma(tex, sampleCount, sampleIndex, viewportSize); | ||
float32_t lumaSum = reduction(luma, sdata); | ||
|
||
sdata.workgroupExecutionAndMemoryBarrier(); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
not sure you need the barrier, the last invocation will have the reduction value already
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
actually every invocation will
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
} | ||
} | ||
|
||
LumaMeteringWindow window; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
the window shouldn't be stored, just fed to gatherLuma
, see my point about gatherLuma
naming
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
uint32_t fixedPointBitsLeft = 32 - uint32_t(ceil(log2(workGroupCount.x * workGroupCount.y * workGroupCount.z))) + glsl::gl_SubgroupSizeLog2(); | ||
|
||
uint32_t lumaSumBitPattern = uint32_t(clamp((lumaSum - log2(minLuma)) * (log2(maxLuma) - log2(minLuma)), 0.f, float32_t((1 << fixedPointBitsLeft) - 1))); | ||
|
||
uint32_t3 workgroupSize = glsl::gl_WorkGroupSize(); | ||
uint32_t workgroupIndex = dot(uint32_t3(workgroupSize.y * workgroupSize.z, workgroupSize.z, 1), glsl::gl_WorkGroupID()); | ||
|
||
val.atomicAdd(workgroupIndex & ((1 << glsl::gl_SubgroupSizeLog2()) - 1), lumaSumBitPattern); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there's a number of bugs here that we went over on discord.
- your
lumaSum
should be just a sum ofmax(log2(luma),minLumaLog2)
fixedPointBitsLeft
needs a better name and needs to come from the outside (precomputed)- the bitpattern needs to map
lumaSum==minLumaLog2*WorkgroupSize
to 0 andlumaSum==maxLumaLog2*WorkgroupSize
to(1<<fixedPointBitsLeft)-1
return log2(luma / minLuma) / log2(maxLuma / minLuma); | ||
} | ||
|
||
void gatherLuma( |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'd rename this to sampleLuma
because you want another method thats called either gather
(because whole workgroup gathers partial averages or histograms) or get
to actually obtain your measurement in the second dispatch
I recommend having the getter in the same struct, because you need much of the state and precomputed constants to be able to joint and decode the measurements to produce a single EV value.
(you can copy paste your struct initialization from push constants or whatever in both the measurement and tonemapping shader)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done
…into autoexposue_ex
What are the action points before completion? |
Description
Testing
TODO list: