-
Notifications
You must be signed in to change notification settings - Fork 62
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Work on property pool HLSL impl #649
base: master
Are you sure you want to change the base?
Changes from all commits
a1747c6
adc4d57
d9ddf41
1707158
279c220
4be1a3c
3570c03
c44bb49
9460e24
88d1d00
52d6972
706000d
b625153
ef4b779
b8db8c9
1a0c998
99d80a7
61604ee
7ac728b
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
// Copyright (C) 2023 - DevSH Graphics Programming Sp. z O.O. | ||
// This file is part of the "Nabla Engine". | ||
// For conditions of distribution and use, see copyright notice in nabla.h | ||
|
||
#ifndef _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_ | ||
#define _NBL_BUILTIN_HLSL_CONCEPTS_INCLUDED_ | ||
|
||
#include <nbl/builtin/hlsl/cpp_compat/vector.hlsl> | ||
#include <nbl/builtin/hlsl/cpp_compat/matrix.hlsl> | ||
#include <nbl/builtin/hlsl/type_traits.hlsl> | ||
|
||
|
||
#if (__cplusplus >= 202002L && __cpp_concepts) | ||
|
||
// Helper macros for declaring concepts; this set is used when the C++20 concepts check above succeeds.
// NBL_CONCEPT_TYPE_PARAMS: expands to the `template <...>` header of a concept.
#define NBL_CONCEPT_TYPE_PARAMS(...) template <__VA_ARGS__> | ||
// NBL_CONCEPT_SIGNATURE: expands to `concept NAME = requires(<parameters>)`, to be followed by NBL_CONCEPT_BODY.
#define NBL_CONCEPT_SIGNATURE(NAME, ...) concept NAME = requires(__VA_ARGS__) | ||
// NBL_CONCEPT_BODY: wraps the requirements in braces and terminates the concept definition.
#define NBL_CONCEPT_BODY(...) { __VA_ARGS__ }; | ||
// NBL_CONCEPT_ASSIGN: defines concept NAME directly from a constraint expression.
#define NBL_CONCEPT_ASSIGN(NAME, ...) concept NAME = __VA_ARGS__; | ||
// NBL_REQUIRES: expands to a `requires` clause with the given constraint.
#define NBL_REQUIRES(...) requires __VA_ARGS__ | ||
|
||
#include <concepts> | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace concepts | ||
{ | ||
|
||
// Alias some of the std concepts in nbl. As this is C++20 only, we don't need to use | ||
// the macros here. | ||
template <typename T, typename U> | ||
concept same_as = std::same_as<T, U>; | ||
|
||
template <typename D, typename B> | ||
concept derived_from = std::derived_from<D, B>; | ||
|
||
template <typename F, typename T> | ||
concept convertible_to = std::convertible_to<F, T>; | ||
|
||
template <typename T, typename F> | ||
concept assignable_from = std::assignable_from<T, F>; | ||
|
||
template <typename T, typename U> | ||
concept common_with = std::common_with<T, U>; | ||
|
||
template <typename T> | ||
concept integral = std::integral<T>; | ||
|
||
template <typename T> | ||
concept signed_integral = std::signed_integral<T>; | ||
|
||
template <typename T> | ||
concept unsigned_integral = std::unsigned_integral<T>; | ||
|
||
template <typename T> | ||
concept floating_point = std::floating_point<T>; | ||
|
||
|
||
// Some other useful concepts. | ||
|
||
template<typename T, typename... Ts> | ||
concept any_of = (same_as<T, Ts> || ...); | ||
|
||
template <typename T> | ||
concept scalar = floating_point<T> || integral<T>; | ||
|
||
template <typename T> | ||
concept vectorial = is_vector<T>::value; | ||
|
||
template <typename T> | ||
concept matricial = is_matrix<T>::value; | ||
|
||
} | ||
} | ||
} | ||
|
||
#else | ||
|
||
// No C++20 concepts support: every concept helper macro expands to nothing, so code that
// uses them still compiles, only without the constraints.
// This list must mirror the macros defined in the C++20 branch above; NBL_CONCEPT_ASSIGN
// was previously missing here, which made any use of it a compile error in this configuration.
#define NBL_CONCEPT_TYPE_PARAMS(...)
#define NBL_CONCEPT_SIGNATURE(NAME, ...)
#define NBL_CONCEPT_BODY(...)
#define NBL_CONCEPT_ASSIGN(NAME, ...)
#define NBL_REQUIRES(...)
|
||
#endif | ||
|
||
#endif |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,167 @@ | ||
#include "nbl/builtin/hlsl/jit/device_capabilities.hlsl" | ||
#include "nbl/builtin/hlsl/glsl_compat/core.hlsl" | ||
#include "nbl/builtin/hlsl/property_pool/transfer.hlsl" | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace property_pools | ||
{ | ||
|
||
// Push-constant block holding the TransferDispatchInfo of the current dispatch:
// the address of the transfer commands and the begin/end offsets consumed by the copy loops.
[[vk::push_constant]] TransferDispatchInfo globals; | ||
|
||
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2, uint64_t DstIndexSizeLog2> | ||
struct TransferLoop | ||
{ | ||
void iteration(uint propertyId, TransferRequest transferRequest, uint64_t invocationIndex) | ||
{ | ||
const uint64_t srcIndexSize = uint64_t(1) << SrcIndexSizeLog2; | ||
const uint64_t dstIndexSize = uint64_t(1) << DstIndexSizeLog2; | ||
|
||
// Fill: Always use offset 0 on src | ||
const uint64_t srcOffset = Fill ? 0 : invocationIndex * transferRequest.propertySize; | ||
const uint64_t dstOffset = invocationIndex * transferRequest.propertySize; | ||
|
||
// IOTA: Use the index as the fetching offset | ||
// Non IOTA: Read the address buffer ("index buffer") to select fetching offset | ||
uint64_t srcAddressBufferOffset; | ||
uint64_t dstAddressBufferOffset; | ||
|
||
if (SrcIndexIota) srcAddressBufferOffset = srcOffset; | ||
else | ||
{ | ||
if (SrcIndexSizeLog2 == 0) {} // we can't read individual byte | ||
else if (SrcIndexSizeLog2 == 1) srcAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint16_t)); | ||
else if (SrcIndexSizeLog2 == 2) srcAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint32_t)); | ||
else if (SrcIndexSizeLog2 == 3) srcAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.srcIndexAddr + srcOffset * sizeof(uint64_t)); | ||
} | ||
|
||
if (DstIndexIota) dstAddressBufferOffset = dstOffset; | ||
else | ||
{ | ||
if (DstIndexSizeLog2 == 0) {} // we can't read individual byte | ||
else if (DstIndexSizeLog2 == 1) dstAddressBufferOffset = vk::RawBufferLoad<uint16_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint16_t)); | ||
else if (DstIndexSizeLog2 == 2) dstAddressBufferOffset = vk::RawBufferLoad<uint32_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint32_t)); | ||
else if (DstIndexSizeLog2 == 3) dstAddressBufferOffset = vk::RawBufferLoad<uint64_t>(transferRequest.dstIndexAddr + dstOffset * sizeof(uint64_t)); | ||
} | ||
|
||
const uint64_t srcAddressMapped = transferRequest.srcAddr + srcAddressBufferOffset * srcIndexSize; | ||
const uint64_t dstAddressMapped = transferRequest.dstAddr + dstAddressBufferOffset * dstIndexSize; | ||
|
||
vk::RawBufferStore<uint32_t>(dstAddressMapped, vk::RawBufferLoad<uint32_t>(srcAddressMapped)); | ||
} | ||
|
||
void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
uint64_t elementCount = uint64_t(transferRequest.elementCount32) | ||
| uint64_t(transferRequest.elementCountExtra) << 32; | ||
uint64_t lastInvocation = min(elementCount, dispatchInfo.endOffset); | ||
for (uint64_t invocationIndex = dispatchInfo.beginOffset + baseInvocationIndex; invocationIndex < lastInvocation; invocationIndex += dispatchSize) | ||
{ | ||
iteration(propertyId, transferRequest, invocationIndex); | ||
} | ||
} | ||
}; | ||
|
||
// For creating permutations of the functions based on parameters that are constant over the transfer request | ||
// These branches should all be scalar, and because of how templates are compiled statically, the loops shouldn't have any | ||
// branching within them | ||
// | ||
// Permutations: | ||
// 2 (fill or not) * 2 (src index iota or not) * 2 (dst index iota or not) * 4 (src index size) * 4 (dst index size) | ||
// Total amount of permutations: 128 | ||
|
||
template<bool Fill, bool SrcIndexIota, bool DstIndexIota, uint64_t SrcIndexSizeLog2> | ||
struct TransferLoopPermutationSrcIndexSizeLog | ||
{ | ||
void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
if (transferRequest.dstIndexSizeLog2 == 0) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 0> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else if (transferRequest.dstIndexSizeLog2 == 1) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 1> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else if (transferRequest.dstIndexSizeLog2 == 2) { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 2> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else /*if (transferRequest.dstIndexSizeLog2 == 3)*/ { TransferLoop<Fill, SrcIndexIota, DstIndexIota, SrcIndexSizeLog2, 3> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
}; | ||
|
||
template<bool Fill, bool SrcIndexIota, bool DstIndexIota> | ||
struct TransferLoopPermutationDstIota | ||
{ | ||
void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
if (transferRequest.srcIndexSizeLog2 == 0) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 0> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else if (transferRequest.srcIndexSizeLog2 == 1) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 1> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else if (transferRequest.srcIndexSizeLog2 == 2) { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 2> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else /*if (transferRequest.srcIndexSizeLog2 == 3)*/ { TransferLoopPermutationSrcIndexSizeLog<Fill, SrcIndexIota, DstIndexIota, 3> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
}; | ||
|
||
template<bool Fill, bool SrcIndexIota> | ||
struct TransferLoopPermutationSrcIota | ||
{ | ||
void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
bool dstIota = transferRequest.dstIndexAddr == 0; | ||
if (dstIota) { TransferLoopPermutationDstIota<Fill, SrcIndexIota, true> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else { TransferLoopPermutationDstIota<Fill, SrcIndexIota, false> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
}; | ||
|
||
template<bool Fill> | ||
struct TransferLoopPermutationFill | ||
{ | ||
void copyLoop(NBL_CONST_REF_ARG(TransferDispatchInfo) dispatchInfo, uint baseInvocationIndex, uint propertyId, TransferRequest transferRequest, uint dispatchSize) | ||
{ | ||
bool srcIota = transferRequest.srcIndexAddr == 0; | ||
if (srcIota) { TransferLoopPermutationSrcIota<Fill, true> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else { TransferLoopPermutationSrcIota<Fill, false> loop; loop.copyLoop(dispatchInfo, baseInvocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
}; | ||
|
||
// Builds a TransferRequest by loading its fields one by one from the address `transferCmdAddr`
// (a struct can't be used with BDA on HLSL SPIRV, hence the individual raw loads).
static TransferRequest TransferRequest::newFromAddress(const uint64_t transferCmdAddr)
{
    const uint64_t fieldStride = sizeof(uint64_t);

    TransferRequest request;

    // The first four 64-bit words are the addresses, in declaration order.
    // NOTE(review): a reviewer commented on these four loads asking to "make a wrapper for" them
    // (the remainder of that remark is not visible here).
    request.srcAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr,8);
    request.dstAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + fieldStride,8);
    request.srcIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + fieldStride * 2,8);
    request.dstIndexAddr = vk::RawBufferLoad<uint64_t>(transferCmdAddr + fieldStride * 3,8);

    // The fifth 64-bit word holds all remaining fields packed together.
    // TODO: Do this only using raw buffer load?
    const uint64_t packedWord = vk::RawBufferLoad<uint64_t>(transferCmdAddr + fieldStride * 4,8);
    const uint32_t upperHalf = uint32_t(packedWord >> 32);

    // Low half: the first 32 bits of the element count.
    request.elementCount32 = uint32_t(packedWord);
    // High half, starting at its lowest bit: 3 + 24 + 1 + 2 + 2 bits. Each assignment keeps only
    // as many low bits as the destination bitfield is wide.
    request.elementCountExtra = upperHalf;
    request.propertySize = upperHalf >> 3;
    request.fill = upperHalf >> (3 + 24);
    request.srcIndexSizeLog2 = upperHalf >> (3 + 24 + 1);
    request.dstIndexSizeLog2 = upperHalf >> (3 + 24 + 1 + 2);

    return request;
}
|
||
template<typename device_capabilities> | ||
void main(uint32_t3 dispatchId, const uint dispatchSize) | ||
{ | ||
const uint propertyId = dispatchId.y; | ||
const uint invocationIndex = dispatchId.x; | ||
|
||
uint64_t transferCmdAddr = globals.transferCommandsAddress + sizeof(TransferRequest) * propertyId; | ||
TransferRequest transferRequest = TransferRequest::newFromAddress(transferCmdAddr); | ||
|
||
const bool fill = transferRequest.fill == 1; | ||
|
||
if (fill) { TransferLoopPermutationFill<true> loop; loop.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize); } | ||
else { TransferLoopPermutationFill<false> loop; loop.copyLoop(globals, invocationIndex, propertyId, transferRequest, dispatchSize); } | ||
} | ||
|
||
} | ||
} | ||
} | ||
|
||
// Compute entry point: forwards the dispatch thread id to the templated implementation, using
// OptimalDispatchSize both as the thread-group width and as the copy-loop stride.
[numthreads(nbl::hlsl::property_pools::OptimalDispatchSize,1,1)]
void main(uint32_t3 dispatchId : SV_DispatchThreadID)
{
    const uint loopStride = nbl::hlsl::property_pools::OptimalDispatchSize;
    nbl::hlsl::property_pools::main<nbl::hlsl::jit::device_capabilities>(dispatchId, loopStride);
}
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
#ifndef _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_ | ||
#define _NBL_BUILTIN_HLSL_GLSL_PROPERTY_POOLS_TRANSFER_ | ||
|
||
#include "nbl/builtin/hlsl/cpp_compat.hlsl" | ||
|
||
namespace nbl | ||
{ | ||
namespace hlsl | ||
{ | ||
namespace property_pools | ||
{ | ||
|
||
// A single transfer command/request: source and destination addresses, optional index buffers
// and a packed description of how many elements to move and how to address them.
struct TransferRequest | ||
{ | ||
// Addresses of the source and destination data.
uint64_t srcAddr; | ||
uint64_t dstAddr; | ||
// Addresses of the optional index buffers; 0 is the default and means IOTA (no index buffer is read).
uint64_t srcIndexAddr; | ||
uint64_t dstIndexAddr; | ||
// TODO: go back to the ideal layout below (a single 64-bit bitfield) when things work.
// DXC currently fails with a fatal error when 64-bit bitfields are used:
devshgraphicsprogramming marked this conversation as resolved.
Show resolved
Hide resolved
|
||
// fatal error: generated SPIR-V is invalid: [VUID-StandaloneSpirv-Base-04781] Expected 32-bit int type for Base operand: BitFieldInsert
// %58 = OpBitFieldInsert %ulong %42 %57 %uint_0 %uint_35
//
//uint64_t elementCount : 35; // allow up to 64GB IGPUBuffers
//uint64_t propertySize : 24; // all the leftover bits (just use bytes now)
//uint64_t fill : 1;
//// 0=uint8, 1=uint16, 2=uint32, 3=uint64
//uint64_t srcIndexSizeLog2 : 2;
//uint64_t dstIndexSizeLog2 : 2;
// Workaround layout: the 35-bit element count is split into its 32 low bits ...
uint32_t elementCount32; | ||
// ... and its 3 high bits.
uint32_t elementCountExtra : 3; | ||
// Size of one property; per the ideal layout above this is expressed in bytes.
uint32_t propertySize : 24; | ||
// 1 when the request is a fill.
uint32_t fill: 1; | ||
// Size of one source/destination index as log2: 0=uint8, 1=uint16, 2=uint32, 3=uint64.
uint32_t srcIndexSizeLog2 : 2; | ||
uint32_t dstIndexSizeLog2 : 2; | ||
|
||
// Reads a TransferRequest from a BDA (buffer device address).
static TransferRequest newFromAddress(const uint64_t address); | ||
}; | ||
|
||
// Per-dispatch parameters of the transfer shader.
struct TransferDispatchInfo | ||
{ | ||
// BDA address (GPU pointer) into the transfer commands buffer
uint64_t transferCommandsAddress; | ||
// Define the range of invocations (X axis) that will be transferred over in this dispatch.
// May be sectioned off in the case of overflow or any other situation that doesn't allow
// for a full transfer.
// NOTE(review): the unit of these offsets is not stated here (a reviewer asked whether they
// count DWORDs or shorts) - confirm and document it.
uint64_t beginOffset; | ||
uint64_t endOffset; | ||
Comment on lines
+46
to
+50
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. would be useful to make it clear we're counting in DWORDs or shorts (if you want to do 16bit transfer atoms instead) |
||
}; | ||
|
||
// Upper bound on the number of properties handled by one dispatch.
// NOTE(review): a reviewer asked whether this constant is still needed - confirm before relying on it.
NBL_CONSTEXPR uint32_t MaxPropertiesPerDispatch = 128; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. is there any reason to keep this around anymore? |
||
|
||
// Dispatch size used by the transfer shader, currently hard-coded.
// TODO: instead use some sort of replace function for getting optimal size?
NBL_CONSTEXPR uint32_t OptimalDispatchSize = 256; | ||
Comment on lines
+55
to
+56
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. you can use the device JIT to query the max compute dispatch size, I'd round it down to nearest PoT though, so the divisions aren't expensive |
||
|
||
} | ||
} | ||
} | ||
|
||
#endif | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
keep it with the struct
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The struct is shared with C++ code, so I wouldn't be able to use vk::RawBufferLoad there; I could take the 64-bit value instead, though.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
you can use
#ifndef __HLSL_VERSION
in the impl of the method