Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work on property pool HLSL impl #649

Draft
wants to merge 19 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions include/nbl/builtin/hlsl/concepts.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h

#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
#define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_

#include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
#include <nbl/builtin/hlsl/type_traits.hlsl>


#if (__cplusplus >= 202002L && __cpp_concepts)

// Macro wrappers so headers shared between C++ and HLSL can declare concepts
// that compile away on toolchains without C++20 concept support (see the
// matching empty stubs in the #else branch below).
#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__>
#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__)
#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ };
#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__;
#define NBL_REQUIRES(...) requires __VA_ARGS__

#include <concepts>

namespace nbl
{
namespace hlsl
{
namespace concepts
{

// Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use
// the macros here.
template <typename T, typename U>
concept same_as = std::same_as<T, U>;

template <typename D, typename B>
concept derived_from = std::derived_from<D, B>;

template <typename F, typename T>
concept convertible_to = std::convertible_to<F, T>;

template <typename T, typename F>
concept assignable_from = std::assignable_from<T, F>;

template <typename T, typename U>
concept common_with = std::common_with<T, U>;

template <typename T>
concept integral = std::integral<T>;

template <typename T>
concept signed_integral = std::signed_integral<T>;

template <typename T>
concept unsigned_integral = std::unsigned_integral<T>;

template <typename T>
concept floating_point = std::floating_point<T>;


// Some other useful concepts.

// Satisfied when T is the same type as any one of Ts...
template<typename T, typename... Ts>
concept any_of = (same_as<T, Ts> || ...);

template <typename T>
concept scalar = floating_point<T> || integral<T>;

template <typename T>
concept vectorial = is_vector<T>::value;

template <typename T>
concept matricial = is_matrix<T>::value;

}
}
}

#else

// No C++20 support. Do nothing.
#define NBL_CONCEPT_TYPE_PARAMS(...)
#define NBL_CONCEPT_SIGNATURE(NAME, ...)
#define NBL_CONCEPT_BODY(...)
// Must be stubbed out as well: it is defined in the C++20 branch above, so any
// header using NBL_CONCEPT_ASSIGN would otherwise fail to preprocess here.
#define NBL_CONCEPT_ASSIGN(NAME, ...)
#define NBL_REQUIRES(...)

#endif

#endif
167 changes: 167 additions & 0 deletions include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"

namespace nbl
{
namespace hlsl
{
namespace property_pools
{

// Per-dispatch transfer parameters supplied by the host via push constants
[[vk::push_constant]] TransferDispatchInfo globals;

// Innermost copy kernel. All dispatch-constant decisions (fill mode, IOTA
// addressing, index element sizes) are template parameters so the per-element
// loop body compiles without divergent branching.
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2>
struct TransferLoop
{
    // Performs the transfer for a single `invocationIndex` element.
    void iteration(uint propertyId, TransferRequest transferRequest, uint64_t invocationIndex)
    {
        const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2;
        const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2;

        // Fill: Always use offset 0 on src
        const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize;
        const uint64_t dstOffset = invocationIndex * transferRequest.propertySize;

        // IOTA: Use the index as the fetching offset
        // Non IOTA: Read the address buffer ("index buffer") to select fetching offset
        uint64_t srcAddressBufferOffset;
        uint64_t dstAddressBufferOffset;

        // NOTE(review): when SrcIndexSizeLog2 == 0 (and not IOTA) the offset is
        // left uninitialized since there's no byte-granular RawBufferLoad -
        // confirm callers can never reach that combination
        if (SrcIndexIota) srcAddressBufferOffset = srcOffset;
        else
        {
            if (SrcIndexSizeLog2 == 0) {} // we can't read individual byte
            else if (SrcIndexSizeLog2 == 1) srcAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint16_t));
            else if (SrcIndexSizeLog2 == 2) srcAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t));
            else if (SrcIndexSizeLog2 == 3) srcAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint64_t));
        }

        if (DstIndexIota) dstAddressBufferOffset = dstOffset;
        else
        {
            if (DstIndexSizeLog2 == 0) {} // we can't read individual byte
            else if (DstIndexSizeLog2 == 1) dstAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint16_t));
            else if (DstIndexSizeLog2 == 2) dstAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t));
            else if (DstIndexSizeLog2 == 3) dstAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint64_t));
        }

        // NOTE(review): offsets are scaled by the *index* size here, not by
        // propertySize - verify this is the intended addressing scheme
        const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
        const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;

        // NOTE(review): exactly one DWORD is moved per iteration - confirm
        // properties larger than 4 bytes are covered by additional invocations
        vk::RawBufferStore<uint32_t>(dstAddressMapped, vk::RawBufferLoad<uint32_t>(srcAddressMapped));
    }

    // Strided loop over the [beginOffset, min(elementCount, endOffset)) range:
    // each invocation starts at its base index and advances by the total
    // dispatch size until the range is exhausted.
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // Reassemble the 35-bit element count split across the two bitfield members
        uint64_t elementCount = uint64_t(transferRequest.elementCount32)
            | uint64_t(transferRequest.elementCountExtra) << 32;
        uint64_t lastInvocation = min(elementCount, dispatchInfo.endOffset);
        for (uint64_t invocationIndex = dispatchInfo.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
        {
            iteration(propertyId, transferRequest, invocationIndex);
        }
    }
};

// For creating permutations of the functions based on parameters that are constant over the transfer request
// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any
// branching within them
//
// Permutations:
// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size)
// Total amount of permutations: 128

// Expands the runtime destination-index size (log2) into a compile-time
// template argument of TransferLoop.
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2>
struct TransferLoopPermutationSrcIndexSizeLog
{
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // dstIndexSizeLog2 is a 2-bit field, so 3 is the only value left for `default`
        switch (transferRequest.dstIndexSizeLog2)
        {
            case 0:
            {
                TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 0> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            case 1:
            {
                TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 1> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            case 2:
            {
                TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 2> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            default:
            {
                TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 3> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
        }
    }
};

// Expands the runtime source-index size (log2) into a compile-time
// template argument.
template<bool Fill, bool SrcIndexIota, bool DstIndexIota>
struct TransferLoopPermutationDstIota
{
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // srcIndexSizeLog2 is a 2-bit field, so 3 is the only value left for `default`
        switch (transferRequest.srcIndexSizeLog2)
        {
            case 0:
            {
                TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 0> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            case 1:
            {
                TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 1> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            case 2:
            {
                TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 2> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            default:
            {
                TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 3> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
        }
    }
};

// Turns "is the destination index buffer null?" (IOTA addressing) into a
// compile-time template argument.
template<bool Fill, bool SrcIndexIota>
struct TransferLoopPermutationSrcIota
{
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // A null dstIndexAddr means identity (IOTA) destination addressing
        if (transferRequest.dstIndexAddr == 0)
        {
            TransferLoopPermutationDstIota<Fill, SrcIndexIota, true> impl;
            impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
        }
        else
        {
            TransferLoopPermutationDstIota<Fill, SrcIndexIota, false> impl;
            impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
        }
    }
};

// Turns "is the source index buffer null?" (IOTA addressing) into a
// compile-time template argument.
template<bool Fill>
struct TransferLoopPermutationFill
{
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // A null srcIndexAddr means identity (IOTA) source addressing
        if (transferRequest.srcIndexAddr == 0)
        {
            TransferLoopPermutationSrcIota<Fill, true> impl;
            impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
        }
        else
        {
            TransferLoopPermutationSrcIota<Fill, false> impl;
            impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
        }
    }
};

// Loading transfer request from the pointer (can't use struct
// with BDA on HLSL SPIRV), so every member is fetched individually.
// NOTE(review): consider a wrapper for vk::RawBufferLoad/Store that defaults
// alignment via type traits, and keeping this definition with the struct
// behind an #ifndef __HLSL_VERSION for the C++ side.
static TransferRequest TransferRequest::newFromAddress(const uint64_t transferCmdAddr)
{
    TransferRequest transferRequest;
    // Four consecutive 8-byte-aligned uint64_t addresses: src, dst, srcIndex, dstIndex
    transferRequest.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr,8);
    transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t),8);
    transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 2,8);
    transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 3,8);

    // Remaining elements are part of the same bitfield
    // TODO: Do this only using raw buffer load?
    uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 4,8);
    // Unpack by shifting; assignment to each bitfield member truncates to its
    // width: elementCount32:32 | elementCountExtra:3 | propertySize:24 | fill:1
    // | srcIndexSizeLog2:2 | dstIndexSizeLog2:2
    transferRequest.elementCount32 = uint32_t(bitfieldType);
    transferRequest.elementCountExtra = uint32_t(bitfieldType >> 32);
    transferRequest.propertySize = uint32_t(bitfieldType >> (32 + 3));
    transferRequest.fill = uint32_t(bitfieldType >> (32 + 3 + 24));
    transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1));
    transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2));

    return transferRequest;
}

// Per-invocation entry point: fetches the TransferRequest for this property
// (Y axis) and dispatches into the statically specialized permutation tree.
template<typename device_capabilities>
void main(uint32_t3 dispatchId, const uint dispatchSize)
{
    const uint propertyId = dispatchId.y;
    const uint invocationIndex = dispatchId.x;

    const uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId;
    const TransferRequest transferRequest = TransferRequest::newFromAddress(transferCmdAddr);

    if (transferRequest.fill == 1)
    {
        TransferLoopPermutationFill<true> impl;
        impl.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize);
    }
    else
    {
        TransferLoopPermutationFill<false> impl;
        impl.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize);
    }
}

}
}
}

// Compute shader entry point; X axis walks elements, Y axis selects the property
[numthreads(nbl::hlsl::property_pools::OptimalDispatchSize,1,1)]
void main(uint32_t3 dispatchId : SV_DispatchThreadID)
{
    nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId, nbl::hlsl::property_pools::OptimalDispatchSize);
}

63 changes: 63 additions & 0 deletions include/nbl/builtin/hlsl/property_pool/transfer.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_

#include "nbl/builtin/hlsl/cpp_compat.hlsl"

namespace nbl
{
namespace hlsl
{
namespace property_pools
{

// This represents a single transfer command/request, shared between C++ and HLSL.
struct TransferRequest
{
    uint64_t srcAddr;
    uint64_t dstAddr;
    uint64_t srcIndexAddr; // IOTA default
    uint64_t dstIndexAddr; // IOTA default
    // TODO: go back to this ideal layout when things work
    // (Getting a fatal error from DXC when using 64-bit bitfields:)
    // fatal error: generated SPIR-V is invalid: [VUID-StandaloneSpirv-Base-04781] Expected 32-bit int type for Base operand: BitFieldInsert
    // %58 = OpBitFieldInsert %ulong %42 %57 %uint_0 %uint_35
    //
    //uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers
    //uint64_t propertySize : 24; // all the leftover bits (just use bytes now)
    //uint64_t fill : 1;
    //// 0=uint8, 1=uint16, 2=uint32, 3=uint64
    //uint64_t srcIndexSizeLog2 : 2;
    //uint64_t dstIndexSizeLog2 : 2;
    uint32_t elementCount32; // 32 first bits
    uint32_t elementCountExtra : 3; // 3 last bits
    uint32_t propertySize : 24;
    uint32_t fill: 1;
    // 0=uint8, 1=uint16, 2=uint32, 3=uint64
    uint32_t srcIndexSizeLog2 : 2;
    uint32_t dstIndexSizeLog2 : 2;

    // Reads a TransferRequest from a BDA
    static TransferRequest newFromAddress(const uint64_t address);
};

// Per-dispatch parameters pushed by the host (bound as the push constant in
// the transfer compute shader).
struct TransferDispatchInfo
{
    // BDA address (GPU pointer) into the transfer commands buffer
    uint64_t transferCommandsAddress;
    // Define the range of invocations (X axis) that will be transfered over in this dispatch
    // May be sectioned off in the case of overflow or any other situation that doesn't allow
    // for a full transfer
    // NOTE(review): the unit of these offsets (elements vs DWORDs/shorts) is
    // not pinned down here - confirm and document before stabilizing the ABI
    uint64_t beginOffset;
    uint64_t endOffset;
};

// Upper bound on the number of properties (Y axis) a single dispatch may transfer.
NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128;

// TODO: instead use some sort of replace function for getting optimal size?
// NOTE(review): could be derived from the device's max compute dispatch size
// (rounded down to a power of two) via the JIT device capabilities instead
NBL_CONSTEXPR uint32_t OptimalDispatchSize = 256;

}
}
}

#endif

20 changes: 20 additions & 0 deletions include/nbl/core/alloc/address_allocator_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,18 @@ namespace nbl::core
}
}

// Allocates an address for every not-yet-allocated slot in `outAddresses`;
// entries that already hold a valid address are left untouched.
static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses, const size_type* bytes,
    const size_type alignment, const size_type* hint=nullptr) noexcept
{
    for (uint32_t ix=0u; ix<count; ++ix)
    {
        // only fill slots still marked invalid
        if (outAddresses[ix]==AddressAlloc::invalid_address)
            outAddresses[ix] = alloc.alloc_addr(bytes[ix],alignment,hint ? hint[ix]:0ull);
    }
}

static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept
{
for (uint32_t i=0; i<count; i++)
Expand Down Expand Up @@ -186,6 +198,14 @@ namespace nbl::core
alloc,std::min(count-i,maxMultiOps),outAddresses+i,bytes+i,alignment+i,hint ? (hint+i):nullptr);
}

// Forwards to the base-trait implementation in chunks of at most `maxMultiOps`
// allocations at a time, with a single shared alignment for all of them.
static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses,
    const size_type* bytes, const size_type alignment, const size_type* hint=nullptr) noexcept
{
    for (uint32_t done=0u; done<count; done+=maxMultiOps)
    {
        const uint32_t chunk = std::min(count-done,maxMultiOps);
        impl::address_allocator_traits_base<AddressAlloc,has_func_multi_alloc_addr<AddressAlloc>::value>::multi_alloc_addr(
            alloc,chunk,outAddresses+done,bytes+done,alignment,hint ? (hint+done):nullptr);
    }
}

static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept
{
for (uint32_t i=0; i<count; i+=maxMultiOps)
Expand Down
2 changes: 2 additions & 0 deletions include/nbl/scene/ITransformTreeManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted
return true;
}

#if 0 // TODO: upstreaming cpropertypoolhandler
//
struct UpstreamRequestBase : RequestBase
{
Expand Down Expand Up @@ -497,6 +498,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted
waitSemaphoreCount,semaphoresToWaitBeforeOverwrite,stagesToWaitForPerSemaphore,request.logger,maxWaitPoint
);
}
#endif



Expand Down
Loading