From fccbfa3752abc4c599a58be32573246a6215b747 Mon Sep 17 00:00:00 2001 From: Yineng Zhang Date: Sat, 14 Dec 2024 22:36:04 +0800 Subject: [PATCH] format: add clang-format for sgl-kernel (#2483) --- sgl-kernel/.clang-format | 8 +++++ sgl-kernel/Makefile | 5 ++- sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc | 10 +++--- .../src/sgl-kernel/csrc/warp_reduce_kernel.cu | 32 +++++++------------ 4 files changed, 28 insertions(+), 27 deletions(-) create mode 100644 sgl-kernel/.clang-format diff --git a/sgl-kernel/.clang-format b/sgl-kernel/.clang-format new file mode 100644 index 00000000000..5e690c02885 --- /dev/null +++ b/sgl-kernel/.clang-format @@ -0,0 +1,8 @@ +BasedOnStyle: Google +IndentWidth: 2 +ColumnLimit: 120 +AllowShortFunctionsOnASingleLine: Empty +DerivePointerAlignment: false +PointerAlignment: Left +NamespaceIndentation: None +SortIncludes: true diff --git a/sgl-kernel/Makefile b/sgl-kernel/Makefile index 3186031acc7..ec86d684877 100644 --- a/sgl-kernel/Makefile +++ b/sgl-kernel/Makefile @@ -1,4 +1,4 @@ -.PHONY: tree ln install build clean test +.PHONY: tree ln install build clean test format tree: @tree --prune -I "__pycache__|*.egg-info|*.so|build" @@ -17,3 +17,6 @@ clean: test: @pytest tests/ + +format: + @find src tests -name '*.cc' -o -name '*.cu' -o -name '*.cuh' -o -name '*.h' | xargs clang-format -i && find src tests -name '*.py' | xargs isort && find src tests -name '*.py' | xargs black diff --git a/sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc b/sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc index 46c6a41c3ac..efc2f0cd951 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc +++ b/sgl-kernel/src/sgl-kernel/csrc/warp_reduce.cc @@ -2,12 +2,10 @@ torch::Tensor warp_reduce_cuda(torch::Tensor input); -#define CHECK_CUDA(x) \ - TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") -#define CHECK_CONTIGUOUS(x) \ - TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") -#define CHECK_INPUT(x) \ - CHECK_CUDA(x); \ +#define CHECK_CUDA(x) TORCH_CHECK(x.device().is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) \ + CHECK_CUDA(x); \ CHECK_CONTIGUOUS(x) torch::Tensor warp_reduce(torch::Tensor input) { diff --git a/sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu b/sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu index c547682f60d..d75cc9bee26 100644 --- a/sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu +++ b/sgl-kernel/src/sgl-kernel/csrc/warp_reduce_kernel.cu @@ -25,34 +25,28 @@ __device__ __forceinline__ scalar_t blockReduceSum(scalar_t val) { int lane = threadIdx.x % 32; int wid = threadIdx.x / 32; - val = warpReduceSum(val); // First reduce within warp + val = warpReduceSum(val); // First reduce within warp - if (lane == 0) - shared[wid] = val; // Write reduced value to shared memory + if (lane == 0) shared[wid] = val; // Write reduced value to shared memory - __syncthreads(); // Wait for all partial reductions + __syncthreads(); // Wait for all partial reductions // Read from shared memory only if that warp existed val = (threadIdx.x < (blockDim.x / 32)) ? shared[lane] : 0; - if (wid == 0) - val = warpReduceSum(val); // Final reduce within first warp + if (wid == 0) val = warpReduceSum(val); // Final reduce within first warp return val; } template __global__ void warp_reduce_cuda_kernel( - const torch::PackedTensorAccessor32 - input, - torch::PackedTensorAccessor32 output, - int N) { - + const torch::PackedTensorAccessor32 input, + torch::PackedTensorAccessor32 output, int N) { scalar_t sum = 0; // Grid-stride loop - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; - i += blockDim.x * gridDim.x) { + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N; i += blockDim.x * gridDim.x) { sum += input[i]; } @@ -84,13 +78,11 @@ torch::Tensor warp_reduce_cuda(torch::Tensor input) { // Allocate output tensor for partial sums auto output = torch::empty({blocks}, input.options()); - AT_DISPATCH_FLOATING_TYPES( - input.scalar_type(), "warp_reduce_cuda", ([&] { - warp_reduce_cuda_kernel<<>>( - input.packed_accessor32(), - output.packed_accessor32(), - N); - })); + AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "warp_reduce_cuda", ([&] { + warp_reduce_cuda_kernel<<>>( + input.packed_accessor32(), + output.packed_accessor32(), N); + })); // Sum the partial results return output.sum();