From af93fd4a6c73e3b5a338d91874ba4f58ce837672 Mon Sep 17 00:00:00 2001 From: Bruno Date: Sat, 16 Nov 2024 23:56:22 +0100 Subject: [PATCH] WIP: AMGCL backend using OpenNL cuda interface --- src/lib/geogram/NL/nl_amgcl.cpp | 444 ++++++++++++++++++++++++++++++-- src/lib/geogram/NL/nl_blas.h | 7 +- src/lib/geogram/NL/nl_cuda.c | 30 ++- src/lib/geogram/NL/nl_cuda.h | 13 + 4 files changed, 460 insertions(+), 34 deletions(-) diff --git a/src/lib/geogram/NL/nl_amgcl.cpp b/src/lib/geogram/NL/nl_amgcl.cpp index b02c184fa1d5..f3ec1d1b2799 100644 --- a/src/lib/geogram/NL/nl_amgcl.cpp +++ b/src/lib/geogram/NL/nl_amgcl.cpp @@ -7,6 +7,7 @@ extern "C" { #include #include +#include } extern "C" { @@ -17,6 +18,7 @@ extern "C" { #include #include #include +#include #ifdef NL_WITH_AMGCL @@ -71,6 +73,7 @@ extern "C" { #include #include #include +#include // for cuda backend #ifdef AMGCL_PROFILING #include @@ -79,6 +82,8 @@ namespace amgcl { } #endif +/*************************************************************************/ + // Types for column index and row pointers, as expected // by AMGCL. Note: AMGCL prefers signed indices. // In standard mode, 32-bit indices everywhere @@ -92,33 +97,35 @@ typedef ptrdiff_t rowptr_t; typedef int rowptr_t; #endif -typedef amgcl::backend::builtin Backend; + +/*************************************************************************/ + +static NLboolean nlSolveAMGCL_CPU(void) { + + typedef amgcl::backend::builtin Backend; #ifdef WITH_BOOST -typedef amgcl::runtime::preconditioner Precond; -typedef amgcl::runtime::solver::wrapper IterativeSolver; -typedef amgcl::make_solver Solver; -typedef boost::property_tree::ptree Params; + typedef amgcl::runtime::preconditioner Precond; + typedef amgcl::runtime::solver::wrapper IterativeSolver; + typedef amgcl::make_solver Solver; + typedef boost::property_tree::ptree Params; #else -typedef amgcl::amg< - Backend, - amgcl::coarsening::smoothed_aggregation, - amgcl::relaxation::spai0 - > Precond; + typedef amgcl::amg< + Backend, + amgcl::coarsening::smoothed_aggregation, + amgcl::relaxation::spai0 + > Precond; -typedef amgcl::solver::cg IterativeSolver; -typedef amgcl::make_solver Solver; -typedef Solver::params Params; + typedef amgcl::solver::cg IterativeSolver; + typedef amgcl::make_solver Solver; + typedef Solver::params Params; #endif - -/*************************************************************************/ - -NLboolean nlSolveAMGCL() { + /********************************/ if( GEO::CmdLine::get_arg_bool("OT:verbose") || @@ -137,7 +144,7 @@ NLboolean nlSolveAMGCL() { nlMatrixCompress(&ctxt->M); } - geo_assert(ctxt->M->type == NL_MATRIX_CRS); + nl_assert(ctxt->M->type == NL_MATRIX_CRS); NLCRSMatrix* M = (NLCRSMatrix*)(ctxt->M); size_t n = size_t(M->m); NLdouble* b = ctxt->b; @@ -186,7 +193,7 @@ NLboolean nlSolveAMGCL() { if(ctxt->no_variables_indirection) { x = (double*)ctxt->variable_buffer[k].base_address; - geo_assert( + nl_assert( ctxt->variable_buffer[k].stride == sizeof(double) ); } @@ -211,6 +218,405 @@ NLboolean nlSolveAMGCL() { /*************************************************************************/ +namespace nlcuda_adapters { + typedef double value_type; + typedef colind_t col_type; + typedef rowptr_t ptr_type; + typedef colind_t index_type; + + /** + * \brief a matrix, stored in device memory + */ + struct matrix { + matrix() : impl_(nullptr) { + } + + matrix(NLMatrix M) : impl_(M) { + } + + ~matrix() { + nlDeleteMatrix(impl_); + impl_ = nullptr; + } + + matrix(const matrix& rhs) = delete; + matrix& operator=(const 
matrix& rhs) = delete; + + NLMatrix impl_; + }; + + /** + * \brief a diagonal matrix, stored in device memory + */ + typedef matrix matrix_diagonal; + + /** + * \brief a vector, stored in device memory + */ + struct vector { + + vector() : n_(0), data_(nullptr) { + } + + vector(index_type n) { + n_ = n; + // TODO: should we clear it ? + data_ = NL_NEW_VECTOR( + nlCUDABlas(), NL_DEVICE_MEMORY, n_ + ); + clear(); + } + + vector(const value_type* x_on_host, index_type n) { + n_ = n; + data_ = NL_NEW_VECTOR( + nlCUDABlas(), NL_DEVICE_MEMORY, n_ + ); + copy_from_host(x_on_host, n); + } + + ~vector() { + if(data_ != nullptr) { + NL_DELETE_VECTOR( + nlCUDABlas(), NL_DEVICE_MEMORY, n_, data_ + ); + n_ = 0; + data_ = nullptr; + } + } + + vector(const vector& rhs) = delete; + vector& operator=(const vector& rhs) = delete; + + void clear() { + // TODO: smarter method for clearing a vector + double* tmp = new double[n_]; + memset(tmp, 0, bytes()); + copy_from_host(tmp, n_); + delete[] tmp; + } + + size_t bytes() const { + return sizeof(value_type) * size_t(n_); + } + + void copy_from_host(const value_type* x_on_host, index_type n) { + nl_assert(n == n_); + NLBlas_t blas = nlCUDABlas(); + blas->Memcpy( + blas, + const_cast(x_on_host), NL_HOST_MEMORY, + data_, NL_DEVICE_MEMORY, + bytes() + ); + } + + void copy_from_host(const std::vector& x_on_host) { + copy_from_host(x_on_host.data(), index_type(x_on_host.size())); + } + + void copy_to_host(value_type* x_on_host, index_type n) const { + nl_assert(n == n_); + NLBlas_t blas = nlCUDABlas(); + blas->Memcpy( + blas, + data_, NL_DEVICE_MEMORY, + x_on_host, NL_HOST_MEMORY, + bytes() + ); + } + + void copy_to_host(std::vector& x_on_host) const { + copy_to_host(x_on_host.data(), index_type(x_on_host.size())); + } + + void copy_from_device(const value_type* x_on_device, index_type n) { + nl_assert(n == n_); + NLBlas_t blas = nlCUDABlas(); + blas->Memcpy( + blas, + const_cast(x_on_device), NL_DEVICE_MEMORY, + data_, NL_DEVICE_MEMORY, + bytes() + ); + } + + void copy_from(const vector& rhs) { + nl_assert(rhs.n_ == n_); + copy_from_device(rhs.data_, rhs.n_); + } + + index_type n_; + double* data_; + }; + + /** + * Wrapper around solver::skyline_lu for use with the NLCUDA backend. + * Inspired from AMGCL cuda wrapper + * Copies the rhs to the host memory, solves the problem using the host CPU, + * then copies the solution back to the compute device(s). 
+ */ + struct cuda_skyline_lu : amgcl::solver::skyline_lu { + typedef amgcl::solver::skyline_lu Base; + + template + cuda_skyline_lu(const Matrix &A, const Params&): + Base(*A), + rhs_on_host_(amgcl::backend::rows(*A)), + x_on_host_(amgcl::backend::rows(*A)) { + } + + void operator()(const vector &rhs, vector &x) const { + rhs.copy_to_host(rhs_on_host_); + static_cast(this)->operator()(rhs_on_host_, x_on_host_); + x.copy_from_host(x_on_host_); + } + + mutable std::vector rhs_on_host_; + mutable std::vector x_on_host_; + }; +} + +namespace amgcl { namespace backend { + /** + * \brief nlcuda backend for AMGCL + */ + struct nlcuda { + typedef nlcuda_adapters::value_type value_type; + typedef nlcuda_adapters::col_type col_type; + typedef nlcuda_adapters::ptr_type ptr_type; + typedef nlcuda_adapters::index_type index_type; + + typedef nlcuda_adapters::matrix matrix; + typedef nlcuda_adapters::matrix matrix_diagonal; + typedef nlcuda_adapters::vector vector; + + typedef nlcuda_adapters::cuda_skyline_lu direct_solver; + + struct provides_row_iterator : std::false_type {}; + + typedef amgcl::detail::empty_params params; + + static std::string name() { return "nlcuda"; } + + typedef builtin builtin_backend; + + static std::shared_ptr copy_matrix( + std::shared_ptr A, const params& + ) { + const typename builtin_backend::matrix &a = *A; + col_type* rowptr = const_cast(a.ptr); + index_type* colind = const_cast(a.col); + value_type* val = const_cast(a.val); + + // Sanity checks: signedness differ, but sizes should + // be the same ! + static_assert(sizeof(NLuint_big) == sizeof(col_type)); + static_assert(sizeof(NLuint) == sizeof(index_type)); + static_assert(sizeof(double) == sizeof(value_type)); + + // Create NLCRSMatrix with rowptr, colind and val arrays + // pointing to arrays in AMGCL matrix. Zero copy. 
+ NLCRSMatrix CRS; + memset(&CRS, 0, sizeof(NLCRSMatrix)); + CRS.type = NL_MATRIX_CRS; + CRS.symmetric_storage = NL_FALSE; + CRS.m = NLuint(a.nrows); + CRS.n = NLuint(a.ncols); + CRS.rowptr = reinterpret_cast(rowptr); + CRS.colind = reinterpret_cast(colind); + CRS.val = val; + CRS.nslices = 0; + CRS.sliceptr = nullptr; + + return std::shared_ptr( + new matrix(nlCUDAMatrixNewFromCRSMatrix(NLMatrix(&CRS))) + ); + } + + static std::shared_ptr copy_vector( + typename builtin_backend::vector const &x, const params& + ) { + return std::make_shared(x.data(), x.size()); + } + + static std::shared_ptr copy_vector( + std::shared_ptr< typename builtin_backend::vector > x, + const params &prm + ) { + return copy_vector(*x, prm); + } + + static std::shared_ptr create_vector( + size_t size, const params& + ) { + return std::make_shared(size); + } + + static std::shared_ptr create_solver( + std::shared_ptr A, + const params &prm + ) { + return std::make_shared(A, prm); + } + }; + + /****************************************************************/ + + template <> struct spmv_impl< + double, + nlcuda_adapters::matrix, + nlcuda_adapters::vector, + double, nlcuda_adapters::vector + > { + static void apply( + double alpha, + const nlcuda_adapters::matrix& A, + const nlcuda_adapters::vector& x, + double beta, + nlcuda_adapters::vector& y + ) { + nlCUDAMatrixSpMV(A.impl_, x.data_, y.data_, alpha, beta); + } + }; + + template <> struct residual_impl< + nlcuda_adapters::matrix, + nlcuda_adapters::vector, + nlcuda_adapters::vector, + nlcuda_adapters::vector + > { + static void apply( + const nlcuda_adapters::vector &rhs, + const nlcuda_adapters::matrix &A, + const nlcuda_adapters::vector &x, + nlcuda_adapters::vector &r + ) { + // r = rhs - A * x; + r.copy_from(rhs); + nlCUDAMatrixSpMV(A.impl_, x.data_, r.data_, -1.0, 1.0); + } + }; + + template <> struct clear_impl { + static void apply(nlcuda_adapters::vector& x) { + x.clear(); + } + }; + + template <> struct inner_product_impl< + nlcuda_adapters::vector, + nlcuda_adapters::vector + > { + static double get( + const nlcuda_adapters::vector& x, + const nlcuda_adapters::vector& y + ) { + nl_assert(x.n_ == y.n_); + NLBlas_t blas = nlCUDABlas(); + return blas->Ddot(blas,x.n_,x.data_,1,y.data_,1); + } + }; + + template <> struct axpby_impl< + double, + nlcuda_adapters::vector, + double, + nlcuda_adapters::vector + > { + static void apply( + double a, + const nlcuda_adapters::vector& x, + double b, + const nlcuda_adapters::vector& y + ) { + nl_assert(x.n_ == y.n_); + NLBlas_t blas = nlCUDABlas(); + if(b != 1.0) { + blas->Dscal(blas, x.n_, b, y.data_, 1); + } + blas->Daxpy(blas, x.n_, a, x.data_, 1, y.data_, 1); + } + }; + + template <> struct axpbypcz_impl< + double, + nlcuda_adapters::vector, + double, + nlcuda_adapters::vector, + double, + nlcuda_adapters::vector + > { + static void apply( + double a, const nlcuda_adapters::vector &x, + double b, const nlcuda_adapters::vector &y, + double c, nlcuda_adapters::vector &z + ) { + nl_assert(x.n_ == y.n_ && y.n_ && z.n_); + NLBlas_t blas = nlCUDABlas(); + + if(c != 1.0) { + blas->Dscal(blas, x.n_, c, z.data_, 1); + } + + blas->Daxpy(blas, x.n_, a, x.data_, 1, z.data_, 1); + blas->Daxpy(blas, x.n_, b, y.data_, 1, z.data_, 1); + } + }; + + template <> struct vmul_impl< + double, + nlcuda_adapters::vector, + nlcuda_adapters::vector, + double, + nlcuda_adapters::vector + > { + static void apply( + double a, + const nlcuda_adapters::vector &x, + const nlcuda_adapters::vector &y, + double b, + nlcuda_adapters::vector &z + ) { + 
// TODO + nl_assert(false); + } + }; + + template <> struct copy_impl< + nlcuda_adapters::vector, + nlcuda_adapters::vector + > { + static void apply( + const nlcuda_adapters::vector &x, + nlcuda_adapters::vector &y + ) { + y.copy_from(x); + } + }; + +}} + + + + +/*************************************************************************/ + +static NLboolean nlSolveAMGCL_GPU(void) { + + + return NL_FALSE; +} + +/*************************************************************************/ + +NLboolean nlSolveAMGCL() { + return nlExtensionIsInitialized_CUDA() ? + nlSolveAMGCL_GPU() : nlSolveAMGCL_CPU(); +} + +/*************************************************************************/ + #else nlBoolean nlSolveAMGCL() { diff --git a/src/lib/geogram/NL/nl_blas.h b/src/lib/geogram/NL/nl_blas.h index 0a5360bc19dc..765e0379aa14 100644 --- a/src/lib/geogram/NL/nl_blas.h +++ b/src/lib/geogram/NL/nl_blas.h @@ -182,7 +182,8 @@ extern "C" { * \return the dot product between \p x and \p y */ typedef double (*FUNPTR_ddot)( - NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy + NLBlas_t blas, + int n, const double *x, int incx, const double *y, int incy ); /** @@ -194,7 +195,9 @@ extern "C" { * of the vector * \return the norm of \p x */ - typedef double (*FUNPTR_dnrm2)(NLBlas_t blas, int n, const double *x, int incx); + typedef double (*FUNPTR_dnrm2)( + NLBlas_t blas, int n, const double *x, int incx + ); /** * \brief Computes a linear combination of two vectors diff --git a/src/lib/geogram/NL/nl_cuda.c b/src/lib/geogram/NL/nl_cuda.c index ab7caeeea524..e5dcd1989417 100644 --- a/src/lib/geogram/NL/nl_cuda.c +++ b/src/lib/geogram/NL/nl_cuda.c @@ -1107,7 +1107,7 @@ typedef struct NLCUDASparseMatrixStruct { double* val; /* Management of multi-slice matrices, - * used in GARGANTUA mode when NNZ is larger than 2^31 + * used when NNZ is larger than NL_MAX_SLICE_SIZE * then matrix is stored as a linked-list of "slices" * each slice corresponds to a slice with the rows * [row_offset .. row_offset+m] of the matrix. @@ -1269,12 +1269,13 @@ static void nlCRSMatrixCUDASliceSpMV( nlCUDABlas()->flops += (NLulong)(2*Mcuda->nnz); } -static void nlCRSMatrixCUDAMult( - NLCUDASparseMatrix* Mcuda, const double* x, double* y +void nlCUDAMatrixSpMV( + NLMatrix M, const double* x, double* y, double alpha, double beta ) { + NLCUDASparseMatrix* Mcuda = (NLCUDASparseMatrix*)M; /* single-slice matrix */ if(Mcuda->next_slice == NULL) { - nlCRSMatrixCUDASliceSpMV(Mcuda, x, y, 1.0, 0.0); + nlCRSMatrixCUDASliceSpMV(Mcuda, x, y, alpha, beta); return; } @@ -1285,13 +1286,18 @@ static void nlCRSMatrixCUDAMult( Mcuda = Mcuda->next_slice ) { /* - * Note: beta=0.0, because we are not adding to y, - * since y is computed slice-per-slice ! + * Note: y is computed slice-per-slice ! */ - nlCRSMatrixCUDASliceSpMV(Mcuda, x, y, 1.0, 0.0); + nlCRSMatrixCUDASliceSpMV(Mcuda, x, y, alpha, beta); } } +static void nlCRSMatrixCUDAMult( + NLCUDASparseMatrix* Mcuda, const double* x, double* y +) { + nlCUDAMatrixSpMV((NLMatrix)Mcuda, x, y, 1.0, 0.0); +} + #ifdef GARGANTUA /** * \brief Converts in-place an array of 64-bit ints to 32-bit ints @@ -1314,12 +1320,14 @@ static void int32_to_int64(void* data, size_t N) { *to-- = (NLuint_big)*from--; } } +#endif /* * Maximum slice size. 
It is limited by the maximum size that
- * one can CudaMalloc (2 GB)
+ * one can CudaMalloc (2 GB), this makes 256 M entries
+ * (size is determined by the VAL array)
  */
-#define NL_MAX_SLICE_SIZE (2u*1024u*1024u*1024u/8u)
+#define NL_MAX_SLICE_SIZE (256u*1024u*1024u)
 
 /**
  * \brief Decomposes a CRS matrix into multiple slices.
@@ -1399,8 +1407,6 @@ NLCUDASparseMatrix* CreateCUDASlicesFromCRSMatrixSlices(
     return Mcuda;
 }
 
-#endif
-
 NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) {
     NLCUDASparseMatrix* Mcuda = NL_NEW(NLCUDASparseMatrix);
     NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
@@ -1426,7 +1432,6 @@ NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) {
             Mcuda->val, M->val, val_sz, cudaMemcpyHostToDevice)
     );
 
-#ifdef GARGANTUA
     /* Need to decompose matrix into slices if arrays are too large */
     if(Mcuda->nnz > NL_MAX_SLICE_SIZE) {
         Mcuda->next_slice = CreateCUDASlicesFromCRSMatrixSlices(
             M, NL_MAX_SLICE_SIZE, Mcuda
         );
         nl_printf("Matrix has %d slices\n", Mcuda->nb_slices);
         return (NLMatrix)Mcuda;
     }
-#endif
 
     /*
      * At this point, everything can fit in a single slice, stored
diff --git a/src/lib/geogram/NL/nl_cuda.h b/src/lib/geogram/NL/nl_cuda.h
index f5ba7873dc2f..b80ea8f9dd1e 100644
--- a/src/lib/geogram/NL/nl_cuda.h
+++ b/src/lib/geogram/NL/nl_cuda.h
@@ -91,6 +91,19 @@ NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M);
  */
 NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M);
 
+
+/**
+ * \brief Computes a sparse matrix-vector product
+ * \details Computes y <- alpha M x + beta y
+ * \param[in] M a matrix created from nlCUDAMatrixNewFromCRSMatrix()
+ * \param[in] x device pointer
+ * \param[in,out] y device pointer
+ * \param[in] alpha , beta two scalars
+ */
+void nlCUDAMatrixSpMV(
+    NLMatrix M, const double* x, double* y, double alpha, double beta
+);
+
 /**
  * \brief Gets a pointer to the BLAS abstraction layer for
  *  BLAS operation on the GPU using CUDA.
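
Note on the CPU path (nlSolveAMGCL_CPU): the patch keeps the two flavours of Params, a boost::property_tree::ptree when WITH_BOOST is defined and Solver::params otherwise, but the hunks elide how they are filled before the solver is constructed. A minimal sketch of that step, assuming the standard AMGCL parameter names and OpenNL context fields threshold and max_iterations (these fields are an assumption, they do not appear in the hunks above):

    Params prm;
#ifdef WITH_BOOST
    // Runtime interface: solver and preconditioner are selected by name.
    prm.put("solver.type", "cg");
    prm.put("solver.tol", ctxt->threshold);
    prm.put("solver.maxiter", int(ctxt->max_iterations));
    prm.put("precond.class", "amg");
#else
    // Compile-time interface: plain fields of Solver::params.
    prm.solver.tol = ctxt->threshold;
    prm.solver.maxiter = int(ctxt->max_iterations);
#endif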
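
nlSolveAMGCL_GPU() is still a stub in this WIP patch (it returns NL_FALSE). For reference, here is a minimal sketch of what it could grow into, mirroring nlSolveAMGCL_CPU() but instantiating the solver on the nlcuda backend defined above, so that the iterations run through nlCUDAMatrixSpMV() and the nlCUDABlas() kernels. This is not part of the patch: it assumes the remaining TODOs (vmul_impl in particular) get implemented, a single linear system, the non-Boost flavour of the parameters, and OpenNL context members (nlCurrentContext, threshold, max_iterations, used_iterations) that are not shown in the hunks above.

static NLboolean nlSolveAMGCL_GPU(void) {
    typedef amgcl::backend::nlcuda Backend;
    typedef amgcl::amg<
        Backend,
        amgcl::coarsening::smoothed_aggregation,
        amgcl::relaxation::spai0
    > Precond;
    typedef amgcl::solver::cg<Backend> IterativeSolver;
    typedef amgcl::make_solver<Precond, IterativeSolver> Solver;
    typedef Solver::params Params;

    NLContextStruct* ctxt = nlCurrentContext;

    if(ctxt->M->type != NL_MATRIX_CRS) {
        nlMatrixCompress(&ctxt->M);
    }
    nl_assert(ctxt->M->type == NL_MATRIX_CRS);
    NLCRSMatrix* M = (NLCRSMatrix*)(ctxt->M);
    size_t n = size_t(M->m);

    // Host-side CRS arrays, reinterpreted with the signed types expected
    // by AMGCL (same trick as in nlcuda::copy_matrix() above).
    rowptr_t* rowptr = reinterpret_cast<rowptr_t*>(M->rowptr);
    colind_t* colind = reinterpret_cast<colind_t*>(M->colind);
    double*   val    = M->val;

    Params prm;
    prm.solver.tol     = ctxt->threshold;           // assumed context field
    prm.solver.maxiter = int(ctxt->max_iterations); // assumed context field

    // Setup: the hierarchy is built on the host, then transferred to the
    // device through nlcuda::copy_matrix() / copy_vector().
    Solver solver(
        std::make_tuple(
            n,
            amgcl::make_iterator_range(rowptr, rowptr + n + 1),
            amgcl::make_iterator_range(colind, colind + rowptr[n]),
            amgcl::make_iterator_range(val,    val    + rowptr[n])
        ),
        prm
    );

    // Right-hand side and unknowns live in device memory.
    nlcuda_adapters::vector rhs(ctxt->b, colind_t(n));
    nlcuda_adapters::vector x(colind_t(n));

    size_t iters;
    double resid;
    std::tie(iters, resid) = solver(rhs, x);

    // Bring the solution back to the host buffer expected by OpenNL.
    x.copy_to_host(ctxt->x, colind_t(n));
    ctxt->used_iterations = NLuint(iters);  // assumed context field

    return NL_TRUE;
}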