Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work on property pool HLSL impl #649

Draft
wants to merge 19 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 19 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions include/nbl/builtin/hlsl/concepts.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h

#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
#define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_

#include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
#include <nbl/builtin/hlsl/type_traits.hlsl>


#if (__cplusplus >= 202002L && __cpp_concepts)

// Macros for declaring concepts portably: in C++20 builds they expand to real
// `concept` declarations, in the fallback branch below they expand to nothing.
#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__>
#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__)
#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ };
#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__;
#define NBL_REQUIRES(...) requires __VA_ARGS__

#include <concepts>

namespace nbl
{
namespace hlsl
{
namespace concepts
{

// Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use
// the macros here.
template <typename T, typename U>
concept same_as = std::same_as<T, U>;

template <typename D, typename B>
concept derived_from = std::derived_from<D, B>;

template <typename F, typename T>
concept convertible_to = std::convertible_to<F, T>;

template <typename T, typename F>
concept assignable_from = std::assignable_from<T, F>;

template <typename T, typename U>
concept common_with = std::common_with<T, U>;

template <typename T>
concept integral = std::integral<T>;

template <typename T>
concept signed_integral = std::signed_integral<T>;

template <typename T>
concept unsigned_integral = std::unsigned_integral<T>;

template <typename T>
concept floating_point = std::floating_point<T>;


// Some other useful concepts.

// True when T is the same type as any one of Ts.
template<typename T, typename... Ts>
concept any_of = (same_as<T, Ts> || ...);

template <typename T>
concept scalar = floating_point<T> || integral<T>;

template <typename T>
concept vectorial = is_vector<T>::value;

template <typename T>
concept matricial = is_matrix<T>::value;

}
}
}

#else

// No C++20 support. Do nothing.
#define NBL_CONCEPT_TYPE_PARAMS(...)
#define NBL_CONCEPT_SIGNATURE(NAME, ...)
#define NBL_CONCEPT_BODY(...)
// Fix: this macro was missing from the fallback branch, so any translation unit
// using NBL_CONCEPT_ASSIGN failed to preprocess without C++20 concepts support.
#define NBL_CONCEPT_ASSIGN(NAME, ...)
#define NBL_REQUIRES(...)

#endif

#endif
148 changes: 148 additions & 0 deletions include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"

namespace nbl
{
namespace hlsl
{
namespace property_pools
{

[[vk::push_constant]] GlobalPushContants globals;
deprilula28 marked this conversation as resolved.
Show resolved Hide resolved

// Innermost transfer loop, fully specialized on the per-request constants so the
// hot loop contains no data-dependent branching.
// Fill: source index does not advance per element (fill/broadcast semantics)
// SrcIndexIota/DstIndexIota: treat the index "buffer" as the identity mapping
// SrcIndexSizeLog2/DstIndexSizeLog2: log2 of the index element size in bytes
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2>
struct TransferLoop
{
    // Transfers a single element for the given invocation.
    void iteration(uint propertyId, TransferRequest transferRequest, uint64_t invocationIndex)
    {
        const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2;
        const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2;

        // NOTE(review): offsets into the *index* buffers are scaled by propertySize
        // here — confirm that is intended (propertySize looks like a payload quantity).
        const uint64_t srcOffset = invocationIndex * srcIndexSize * transferRequest.propertySize;
        const uint64_t dstOffset = invocationIndex * dstIndexSize * transferRequest.propertySize;

        // NOTE(review): the Fill branch makes the index address ADVANCE per
        // invocation while non-Fill keeps it fixed — this looks inverted; verify.
        const uint64_t srcIndexAddress = Fill ? transferRequest.srcIndexAddr + srcOffset : transferRequest.srcIndexAddr;
        const uint64_t dstIndexAddress = Fill ? transferRequest.dstIndexAddr + dstOffset : transferRequest.dstIndexAddr;

        // NOTE(review): always loads a 32-bit index even when
        // Src/DstIndexSizeLog2 == 3 declares 64-bit indices — confirm.
        const uint64_t srcAddressBufferOffset = SrcIndexIota ? srcIndexAddress : vk::RawBufferLoad<uint32_t>(srcIndexAddress);
        const uint64_t dstAddressBufferOffset = DstIndexIota ? dstIndexAddress : vk::RawBufferLoad<uint32_t>(dstIndexAddress);

        const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
        const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;

        // Copy one atom; width is selected at compile time.
        // NOTE(review): the copy width switches on SrcIndexSizeLog2 (an index
        // size), not on the property/payload size — confirm this is intended.
        if (SrcIndexSizeLog2 == 0) {} // we can't write individual bytes
        else if (SrcIndexSizeLog2 == 1) vk::RawBufferStore<uint16_t>(dstAddressMapped, vk::RawBufferLoad<uint16_t>(srcAddressMapped));
        else if (SrcIndexSizeLog2 == 2) vk::RawBufferStore<uint32_t>(dstAddressMapped, vk::RawBufferLoad<uint32_t>(srcAddressMapped));
        else if (SrcIndexSizeLog2 == 3) vk::RawBufferStore<uint64_t>(dstAddressMapped, vk::RawBufferLoad<uint64_t>(srcAddressMapped));
    }

    // Grid-strided iteration over the [globals.beginOffset, lastInvocation) range,
    // where lastInvocation is clamped to the request's element count.
    void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // Reassemble the 35-bit element count from its two bitfield pieces.
        uint64_t elementCount = uint64_t(transferRequest.elementCount32)
            | uint64_t(transferRequest.elementCountExtra) << 32;
        uint64_t lastInvocation = min(elementCount, globals.endOffset);
        for (uint64_t invocationIndex = globals.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
        {
            iteration(propertyId, transferRequest, invocationIndex);
        }
    }
};

// For creating permutations of the functions based on parameters that are constant over the transfer request
// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any
// branching within them
//
// Permutations:
// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size)
// Total amount of permutations: 128

// Scalar dispatch on dstIndexSizeLog2 (0..3): selects the fully-specialized
// TransferLoop so the inner loop itself is branch-free on this value.
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2>
struct TransferLoopPermutationSrcIndexSizeLog
{
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
{
// Each branch instantiates one compile-time permutation of the loop.
if (transferRequest.dstIndexSizeLog2 == 0) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 0> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
else if (transferRequest.dstIndexSizeLog2 == 1) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 1> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
else if (transferRequest.dstIndexSizeLog2 == 2) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 2> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 3> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
}
};

// Scalar dispatch on srcIndexSizeLog2 (0..3), forwarding to the next
// permutation level in the compile-time specialization chain.
template<bool Fill, bool SrcIndexIota, bool DstIndexIota>
struct TransferLoopPermutationDstIota
{
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
{
if (transferRequest.srcIndexSizeLog2 == 0) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 0> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
else if (transferRequest.srcIndexSizeLog2 == 1) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 1> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
else if (transferRequest.srcIndexSizeLog2 == 2) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 2> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 3> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
}
};

// Scalar dispatch on whether the destination index buffer is absent
// (dstIndexAddr == 0 means identity/IOTA indexing).
template<bool Fill, bool SrcIndexIota>
struct TransferLoopPermutationSrcIota
{
void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
{
bool dstIota = transferRequest.dstIndexAddr == 0;
if (dstIota) { TransferLoopPermutationDstIota<Fill, SrcIndexIota, true> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
else { TransferLoopPermutationDstIota<Fill, SrcIndexIota, false> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
}
};

// Scalar dispatch on whether the source index buffer is absent
// (srcIndexAddr == 0 means identity/IOTA indexing). Outermost level of the
// compile-time permutation chain (2*2*2*4*4 loop specializations total).
template<bool Fill>
struct TransferLoopPermutationFill
{
    void copyLoop(uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        bool srcIota = transferRequest.srcIndexAddr == 0;
        if (srcIota) { TransferLoopPermutationSrcIota<Fill, true> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
        else { TransferLoopPermutationSrcIota<Fill, false> loop; loop.copyLoop(baseInvocationIndex, propertyId, transferRequest, dispatchSize); }
    }
};

// Decodes one TransferRequest from the BDA commands buffer and (once the debug
// code below is removed) kicks off the permuted transfer loop for this dispatch.
template<typename device_capabilities>
void main(uint32_t3 dispatchId)
{
    const uint propertyId = dispatchId.y;
    const uint invocationIndex = dispatchId.x;

    // Loading transfer request from the pointer (can't use struct
    // with BDA on HLSL SPIRV)
    TransferRequest transferRequest;
    // Fix: the high 32-bit half must be widened to 64 bits BEFORE shifting;
    // shifting a 32-bit value by 32 is undefined (yields 0 on most targets),
    // which silently truncated srcAddr to its low dword.
    transferRequest.srcAddr = uint64_t(vk::RawBufferLoad<uint>(globals.transferCommandsAddress))
        | (uint64_t(vk::RawBufferLoad<uint>(globals.transferCommandsAddress + sizeof(uint))) << 32);
    transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof(uint64_t));
    transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof(uint64_t) * 2);
    transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof(uint64_t) * 3);
    // Remaining elements are part of the same bitfield
    // TODO: Do this only using raw buffer load?
    uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(globals.transferCommandsAddress + sizeof(uint64_t) * 4);
    // Fix: the extraction shifts previously omitted the 32 bits already consumed
    // by elementCount32, so every field aliased the low dword. Layout (see
    // TransferRequest): elementCount32 (32) | elementCountExtra (3) |
    // propertySize (24) | fill (1) | srcIndexSizeLog2 (2) | dstIndexSizeLog2 (2).
    // The destination bitfield widths mask off the unwanted high bits on store.
    transferRequest.elementCount32 = uint32_t(bitfieldType);
    transferRequest.elementCountExtra = uint32_t(bitfieldType >> 32);
    transferRequest.propertySize = uint32_t(bitfieldType >> (32 + 3));
    transferRequest.fill = uint32_t(bitfieldType >> (32 + 3 + 24));
    transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1));
    transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2));

    const uint dispatchSize = nbl::hlsl::device_capabilities_traits<device_capabilities>::maxOptimallyResidentWorkgroupInvocations;
    const bool fill = transferRequest.fill == 1;

    // NOTE(review): debug leftovers — these stores clobber the transfer commands
    // buffer; remove them (and re-enable the loop below) before merging.
    vk::RawBufferStore<uint64_t>(globals.transferCommandsAddress + 40 * 3, transferRequest.srcAddr);
    vk::RawBufferStore<uint64_t>(globals.transferCommandsAddress + 40 * 4, transferRequest.dstAddr);
    vk::RawBufferStore<uint>(globals.transferCommandsAddress + 40 * 5, vk::RawBufferLoad<uint>(transferRequest.srcAddr + sizeof(uint16_t) * 3));
    //if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
    //else { TransferLoopPermutationFill<false> loop; loop.copyLoop(invocationIndex, propertyId, transferRequest, dispatchSize); }
}

}
}
}

// Compute entry point: forwards the dispatch-thread ID to the templated
// implementation with the JIT-provided device capabilities.
// TODO(review): workgroup size of 1 was flagged in review — should use a real
// workgroup size (e.g. the optimal resident invocation count).
[numthreads(1,1,1)]
void main(uint32_t3 dispatchId : SV_DispatchThreadID)
{
    nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId);
}

57 changes: 57 additions & 0 deletions include/nbl/builtin/hlsl/property_pool/transfer.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_

#include "nbl/builtin/hlsl/cpp_compat.hlsl"

namespace nbl
{
namespace hlsl
{
namespace property_pools
{

// One transfer command/request, read from the BDA commands buffer.
struct TransferRequest
{
    // This represents a transfer command/request
    uint64_t srcAddr;
    uint64_t dstAddr;
    uint64_t srcIndexAddr; // IOTA default (0 means identity indexing)
    uint64_t dstIndexAddr; // IOTA default (0 means identity indexing)
    // TODO: go back to this ideal layout when things work
    // (Getting a fatal error from DXC when using 64-bit bitfields:)
    // fatal error: generated SPIR-V is invalid: [VUID-StandaloneSpirv-Base-04781] Expected 32-bit int type for Base operand: BitFieldInsert
    // %58 = OpBitFieldInsert %ulong %42 %57 %uint_0 %uint_35
    //
    //uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers
    //uint64_t propertySize : 24; // all the leftover bits (just use bytes now)
    //uint64_t fill : 1;
    //// 0=uint8, 1=uint16, 2=uint32, 3=uint64
    //uint64_t srcIndexSizeLog2 : 2;
    //uint64_t dstIndexSizeLog2 : 2;
    // Workaround: split the 64-bit packed word into 32-bit bitfields.
    uint32_t elementCount32; // 32 first bits
    uint32_t elementCountExtra : 3; // 3 last bits
    uint32_t propertySize : 24;
    uint32_t fill: 1;
    uint32_t srcIndexSizeLog2 : 2;
    uint32_t dstIndexSizeLog2 : 2;
};

// Push-constant data shared by every invocation of the copy dispatch.
// NOTE(review): name carries a typo ("Contants") — kept because the shader
// declaration in copy.comp.hlsl references it; rename both together.
struct GlobalPushContants
{
    // BDA address (GPU pointer) into the transfer commands buffer
    uint64_t transferCommandsAddress;
    // Define the range of invocations (X axis) that will be transferred over in this dispatch
    // May be sectioned off in the case of overflow or any other situation that doesn't allow
    // for a full transfer
    // NOTE(review): per copy.comp.hlsl these are compared against the element
    // count, so they appear to count elements/invocations — confirm the unit
    // (elements vs DWORDs/shorts) and document it.
    uint64_t beginOffset;
    uint64_t endOffset;
};

NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128;

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is there any reason to keep this around anymore?


}
}
}

#endif

20 changes: 20 additions & 0 deletions include/nbl/core/alloc/address_allocator_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,18 @@ namespace nbl::core
}
}

// Allocates an address for every slot in `outAddresses` that is still invalid,
// using a single shared alignment for all requests. Slots that already hold a
// valid address are left untouched. `hint` (optional) supplies per-slot
// allocation hints; 0 is used when absent.
static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses, const size_type* bytes,
const size_type alignment, const size_type* hint=nullptr) noexcept
{
for (uint32_t ix=0u; ix<count; ++ix)
{
// only fill slots that don't already contain a valid address
if (outAddresses[ix]==AddressAlloc::invalid_address)
outAddresses[ix] = alloc.alloc_addr(bytes[ix],alignment,hint ? hint[ix]:0ull);
}
}

static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept
{
for (uint32_t i=0; i<count; i++)
Expand Down Expand Up @@ -186,6 +198,14 @@ namespace nbl::core
alloc,std::min(count-i,maxMultiOps),outAddresses+i,bytes+i,alignment+i,hint ? (hint+i):nullptr);
}

// Chunked front-end for the shared-alignment multi_alloc_addr: processes the
// request in batches of at most maxMultiOps, forwarding each batch (and the
// matching slice of the optional per-slot hints) to the base implementation.
static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses,
const size_type* bytes, const size_type alignment, const size_type* hint=nullptr) noexcept
{
// note: alignment is scalar here (not advanced per batch), unlike the
// per-element-alignment overload above
for (uint32_t i=0; i<count; i+=maxMultiOps)
impl::address_allocator_traits_base<AddressAlloc,has_func_multi_alloc_addr<AddressAlloc>::value>::multi_alloc_addr(
alloc,std::min(count-i,maxMultiOps),outAddresses+i,bytes+i,alignment,hint ? (hint+i):nullptr);
}

static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept
{
for (uint32_t i=0; i<count; i+=maxMultiOps)
Expand Down
2 changes: 2 additions & 0 deletions include/nbl/scene/ITransformTreeManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted
return true;
}

#if 0 // TODO: upstreaming cpropertypoolhandler
//
struct UpstreamRequestBase : RequestBase
{
Expand Down Expand Up @@ -497,6 +498,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted
waitSemaphoreCount,semaphoresToWaitBeforeOverwrite,stagesToWaitForPerSemaphore,request.logger,maxWaitPoint
);
}
#endif



Expand Down
Loading