Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Work on property pool HLSL impl #649

Draft
wants to merge 19 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 88 additions & 0 deletions include/nbl/builtin/hlsl/concepts.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O.
// This file is part of the "Nabla Engine".
// For conditions of distribution and use, see copyright notice in nabla.h

#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_
#define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_

#include <nbl/builtin/hlsl/cpp_compat/vector.hlsl>
#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl>
#include <nbl/builtin/hlsl/type_traits.hlsl>


#if (__cplusplus >= 202002L && __cpp_concepts)

// Macro wrappers so headers shared between C++ and HLSL can declare concepts
// that compile away on toolchains without C++20 concept support (see the
// matching empty stubs in the #else branch below).
#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__>
#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__)
#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ };
#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__;
#define NBL_REQUIRES(...) requires __VA_ARGS__

#include <concepts>

namespace nbl
{
namespace hlsl
{
namespace concepts
{

// Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use
// the macros here.
template <typename T, typename U>
concept same_as = std::same_as<T, U>;

template <typename D, typename B>
concept derived_from = std::derived_from<D, B>;

template <typename F, typename T>
concept convertible_to = std::convertible_to<F, T>;

template <typename T, typename F>
concept assignable_from = std::assignable_from<T, F>;

template <typename T, typename U>
concept common_with = std::common_with<T, U>;

template <typename T>
concept integral = std::integral<T>;

template <typename T>
concept signed_integral = std::signed_integral<T>;

template <typename T>
concept unsigned_integral = std::unsigned_integral<T>;

template <typename T>
concept floating_point = std::floating_point<T>;


// Some other useful concepts.

// Satisfied when T is the same type as any one of Ts...
template<typename T, typename... Ts>
concept any_of = (same_as<T, Ts> || ...);

template <typename T>
concept scalar = floating_point<T> || integral<T>;

template <typename T>
concept vectorial = is_vector<T>::value;

template <typename T>
concept matricial = is_matrix<T>::value;

}
}
}

#else

// No C++20 support. Do nothing.
#define NBL_CONCEPT_TYPE_PARAMS(...)
#define NBL_CONCEPT_SIGNATURE(NAME, ...)
#define NBL_CONCEPT_BODY(...)
// Must be stubbed out as well: it is defined in the C++20 branch above, so any
// header using NBL_CONCEPT_ASSIGN would otherwise fail to preprocess here.
#define NBL_CONCEPT_ASSIGN(NAME, ...)
#define NBL_REQUIRES(...)

#endif

#endif
167 changes: 167 additions & 0 deletions include/nbl/builtin/hlsl/property_pool/copy.comp.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl"
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl"
#include "nbl/builtin/hlsl/property_pool/transfer.hlsl"

namespace nbl
{
namespace hlsl
{
namespace property_pools
{

// Per-dispatch transfer parameters supplied by the host via push constants
[[vk::push_constant]] TransferDispatchInfo globals;

// Innermost copy kernel. All dispatch-constant decisions (fill mode, IOTA
// addressing, index element sizes) are template parameters so the per-element
// loop body compiles without divergent branching.
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2>
struct TransferLoop
{
    // Performs the transfer for a single `invocationIndex` element.
    void iteration(uint propertyId, TransferRequest transferRequest, uint64_t invocationIndex)
    {
        const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2;
        const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2;

        // Fill: Always use offset 0 on src
        const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize;
        const uint64_t dstOffset = invocationIndex * transferRequest.propertySize;

        // IOTA: Use the index as the fetching offset
        // Non IOTA: Read the address buffer ("index buffer") to select fetching offset
        uint64_t srcAddressBufferOffset;
        uint64_t dstAddressBufferOffset;

        // NOTE(review): when SrcIndexSizeLog2 == 0 (and not IOTA) the offset is
        // left uninitialized since there's no byte-granular RawBufferLoad -
        // confirm callers can never reach that combination
        if (SrcIndexIota) srcAddressBufferOffset = srcOffset;
        else
        {
            if (SrcIndexSizeLog2 == 0) {} // we can't read individual byte
            else if (SrcIndexSizeLog2 == 1) srcAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint16_t));
            else if (SrcIndexSizeLog2 == 2) srcAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t));
            else if (SrcIndexSizeLog2 == 3) srcAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint64_t));
        }

        if (DstIndexIota) dstAddressBufferOffset = dstOffset;
        else
        {
            if (DstIndexSizeLog2 == 0) {} // we can't read individual byte
            else if (DstIndexSizeLog2 == 1) dstAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint16_t));
            else if (DstIndexSizeLog2 == 2) dstAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t));
            else if (DstIndexSizeLog2 == 3) dstAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint64_t));
        }

        // NOTE(review): offsets are scaled by the *index* size here, not by
        // propertySize - verify this is the intended addressing scheme
        const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize;
        const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize;

        // NOTE(review): exactly one DWORD is moved per iteration - confirm
        // properties larger than 4 bytes are covered by additional invocations
        vk::RawBufferStore<uint32_t>(dstAddressMapped, vk::RawBufferLoad<uint32_t>(srcAddressMapped));
    }

    // Strided loop over the [beginOffset, min(elementCount, endOffset)) range:
    // each invocation starts at its base index and advances by the total
    // dispatch size until the range is exhausted.
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // Reassemble the 35-bit element count split across the two bitfield members
        uint64_t elementCount = uint64_t(transferRequest.elementCount32)
            | uint64_t(transferRequest.elementCountExtra) << 32;
        uint64_t lastInvocation = min(elementCount, dispatchInfo.endOffset);
        for (uint64_t invocationIndex = dispatchInfo.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize)
        {
            iteration(propertyId, transferRequest, invocationIndex);
        }
    }
};

// For creating permutations of the functions based on parameters that are constant over the transfer request
// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any
// branching within them
//
// Permutations:
// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size)
// Total amount of permutations: 128

// Expands the runtime destination-index size (log2) into a compile-time
// template argument of TransferLoop.
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2>
struct TransferLoopPermutationSrcIndexSizeLog
{
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // dstIndexSizeLog2 is a 2-bit field, so 3 is the only value left for `default`
        switch (transferRequest.dstIndexSizeLog2)
        {
            case 0:
            {
                TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 0> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            case 1:
            {
                TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 1> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            case 2:
            {
                TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 2> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            default:
            {
                TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 3> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
        }
    }
};

// Expands the runtime source-index size (log2) into a compile-time
// template argument.
template<bool Fill, bool SrcIndexIota, bool DstIndexIota>
struct TransferLoopPermutationDstIota
{
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // srcIndexSizeLog2 is a 2-bit field, so 3 is the only value left for `default`
        switch (transferRequest.srcIndexSizeLog2)
        {
            case 0:
            {
                TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 0> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            case 1:
            {
                TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 1> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            case 2:
            {
                TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 2> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
            default:
            {
                TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 3> impl;
                impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
                break;
            }
        }
    }
};

// Turns "is the destination index buffer null?" (IOTA addressing) into a
// compile-time template argument.
template<bool Fill, bool SrcIndexIota>
struct TransferLoopPermutationSrcIota
{
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // A null dstIndexAddr means identity (IOTA) destination addressing
        if (transferRequest.dstIndexAddr == 0)
        {
            TransferLoopPermutationDstIota<Fill, SrcIndexIota, true> impl;
            impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
        }
        else
        {
            TransferLoopPermutationDstIota<Fill, SrcIndexIota, false> impl;
            impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
        }
    }
};

// Turns "is the source index buffer null?" (IOTA addressing) into a
// compile-time template argument.
template<bool Fill>
struct TransferLoopPermutationFill
{
    void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize)
    {
        // A null srcIndexAddr means identity (IOTA) source addressing
        if (transferRequest.srcIndexAddr == 0)
        {
            TransferLoopPermutationSrcIota<Fill, true> impl;
            impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
        }
        else
        {
            TransferLoopPermutationSrcIota<Fill, false> impl;
            impl.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize);
        }
    }
};

// Loading transfer request from the pointer (can't use struct
// with BDA on HLSL SPIRV), so every member is fetched individually.
// NOTE(review): consider a wrapper for vk::RawBufferLoad/Store that defaults
// alignment via type traits, and keeping this definition with the struct
// behind an #ifndef __HLSL_VERSION for the C++ side.
static TransferRequest TransferRequest::newFromAddress(const uint64_t transferCmdAddr)
{
    TransferRequest transferRequest;
    // Four consecutive 8-byte-aligned uint64_t addresses: src, dst, srcIndex, dstIndex
    transferRequest.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr,8);
    transferRequest.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t),8);
    transferRequest.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 2,8);
    transferRequest.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 3,8);

    // Remaining elements are part of the same bitfield
    // TODO: Do this only using raw buffer load?
    uint64_t bitfieldType = vk::RawBufferLoad<uint64_t>(transferCmdAddr + sizeof(uint64_t) * 4,8);
    // Unpack by shifting; assignment to each bitfield member truncates to its
    // width: elementCount32:32 | elementCountExtra:3 | propertySize:24 | fill:1
    // | srcIndexSizeLog2:2 | dstIndexSizeLog2:2
    transferRequest.elementCount32 = uint32_t(bitfieldType);
    transferRequest.elementCountExtra = uint32_t(bitfieldType >> 32);
    transferRequest.propertySize = uint32_t(bitfieldType >> (32 + 3));
    transferRequest.fill = uint32_t(bitfieldType >> (32 + 3 + 24));
    transferRequest.srcIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1));
    transferRequest.dstIndexSizeLog2 = uint32_t(bitfieldType >> (32 + 3 + 24 + 1 + 2));

    return transferRequest;
}

// Per-invocation entry point: fetches the TransferRequest for this property
// (Y axis) and dispatches into the statically specialized permutation tree.
template<typename device_capabilities>
void main(uint32_t3 dispatchId, const uint dispatchSize)
{
    const uint propertyId = dispatchId.y;
    const uint invocationIndex = dispatchId.x;

    const uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId;
    const TransferRequest transferRequest = TransferRequest::newFromAddress(transferCmdAddr);

    if (transferRequest.fill == 1)
    {
        TransferLoopPermutationFill<true> impl;
        impl.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize);
    }
    else
    {
        TransferLoopPermutationFill<false> impl;
        impl.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize);
    }
}

}
}
}

// Compute shader entry point; X axis walks elements, Y axis selects the property
[numthreads(nbl::hlsl::property_pools::OptimalDispatchSize,1,1)]
void main(uint32_t3 dispatchId : SV_DispatchThreadID)
{
    nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId, nbl::hlsl::property_pools::OptimalDispatchSize);
}

63 changes: 63 additions & 0 deletions include/nbl/builtin/hlsl/property_pool/transfer.hlsl
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_
#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_

#include "nbl/builtin/hlsl/cpp_compat.hlsl"

namespace nbl
{
namespace hlsl
{
namespace property_pools
{

// This represents a single transfer command/request, shared between C++ and HLSL.
struct TransferRequest
{
    uint64_t srcAddr;
    uint64_t dstAddr;
    uint64_t srcIndexAddr; // IOTA default
    uint64_t dstIndexAddr; // IOTA default
    // TODO: go back to this ideal layout when things work
    // (Getting a fatal error from DXC when using 64-bit bitfields:)
    // fatal error: generated SPIR-V is invalid: [VUID-StandaloneSpirv-Base-04781] Expected 32-bit int type for Base operand: BitFieldInsert
    // %58 = OpBitFieldInsert %ulong %42 %57 %uint_0 %uint_35
    //
    //uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers
    //uint64_t propertySize : 24; // all the leftover bits (just use bytes now)
    //uint64_t fill : 1;
    //// 0=uint8, 1=uint16, 2=uint32, 3=uint64
    //uint64_t srcIndexSizeLog2 : 2;
    //uint64_t dstIndexSizeLog2 : 2;
    uint32_t elementCount32; // 32 first bits
    uint32_t elementCountExtra : 3; // 3 last bits
    uint32_t propertySize : 24;
    uint32_t fill: 1;
    // 0=uint8, 1=uint16, 2=uint32, 3=uint64
    uint32_t srcIndexSizeLog2 : 2;
    uint32_t dstIndexSizeLog2 : 2;

    // Reads a TransferRequest from a BDA
    static TransferRequest newFromAddress(const uint64_t address);
};

// Per-dispatch parameters pushed by the host (bound as the push constant in
// the transfer compute shader).
struct TransferDispatchInfo
{
    // BDA address (GPU pointer) into the transfer commands buffer
    uint64_t transferCommandsAddress;
    // Define the range of invocations (X axis) that will be transfered over in this dispatch
    // May be sectioned off in the case of overflow or any other situation that doesn't allow
    // for a full transfer
    // NOTE(review): the unit of these offsets (elements vs DWORDs/shorts) is
    // not pinned down here - confirm and document before stabilizing the ABI
    uint64_t beginOffset;
    uint64_t endOffset;
};

// Upper bound on the number of properties (Y axis) a single dispatch may transfer.
NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128;

// TODO: instead use some sort of replace function for getting optimal size?
// NOTE(review): could be derived from the device's max compute dispatch size
// (rounded down to a power of two) via the JIT device capabilities instead
NBL_CONSTEXPR uint32_t OptimalDispatchSize = 256;

}
}
}

#endif

20 changes: 20 additions & 0 deletions include/nbl/core/alloc/address_allocator_traits.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,18 @@ namespace nbl::core
}
}

// Allocates an address for every not-yet-allocated slot in `outAddresses`;
// entries that already hold a valid address are left untouched.
static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses, const size_type* bytes,
    const size_type alignment, const size_type* hint=nullptr) noexcept
{
    for (uint32_t ix=0u; ix<count; ++ix)
    {
        // only fill slots still marked invalid
        if (outAddresses[ix]==AddressAlloc::invalid_address)
            outAddresses[ix] = alloc.alloc_addr(bytes[ix],alignment,hint ? hint[ix]:0ull);
    }
}

static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept
{
for (uint32_t i=0; i<count; i++)
Expand Down Expand Up @@ -186,6 +198,14 @@ namespace nbl::core
alloc,std::min(count-i,maxMultiOps),outAddresses+i,bytes+i,alignment+i,hint ? (hint+i):nullptr);
}

// Forwards to the base-trait implementation in chunks of at most `maxMultiOps`
// allocations at a time, with a single shared alignment for all of them.
static inline void multi_alloc_addr(AddressAlloc& alloc, uint32_t count, size_type* outAddresses,
    const size_type* bytes, const size_type alignment, const size_type* hint=nullptr) noexcept
{
    for (uint32_t done=0u; done<count; done+=maxMultiOps)
    {
        const uint32_t chunk = std::min(count-done,maxMultiOps);
        impl::address_allocator_traits_base<AddressAlloc,has_func_multi_alloc_addr<AddressAlloc>::value>::multi_alloc_addr(
            alloc,chunk,outAddresses+done,bytes+done,alignment,hint ? (hint+done):nullptr);
    }
}

static inline void multi_free_addr(AddressAlloc& alloc, uint32_t count, const size_type* addr, const size_type* bytes) noexcept
{
for (uint32_t i=0; i<count; i+=maxMultiOps)
Expand Down
2 changes: 2 additions & 0 deletions include/nbl/scene/ITransformTreeManager.h
Original file line number Diff line number Diff line change
Expand Up @@ -254,6 +254,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted
return true;
}

#if 0 // TODO: upstreaming cpropertypoolhandler
//
struct UpstreamRequestBase : RequestBase
{
Expand Down Expand Up @@ -497,6 +498,7 @@ class ITransformTreeManager : public virtual core::IReferenceCounted
waitSemaphoreCount,semaphoresToWaitBeforeOverwrite,stagesToWaitForPerSemaphore,request.logger,maxWaitPoint
);
}
#endif



Expand Down
Loading