From af93fd4a6c73e3b5a338d91874ba4f58ce837672 Mon Sep 17 00:00:00 2001 From: Bruno Date: Sat, 16 Nov 2024 23:56:22 +0100 Subject: [PATCH] WIP: AMGCL backend using OpenNL cuda interface --- src/lib/geogram/NL/nl_amgcl.cpp | 444 ++++++++++++++++++++++++++++++-- src/lib/geogram/NL/nl_blas.h | 7 +- src/lib/geogram/NL/nl_cuda.c | 30 ++- src/lib/geogram/NL/nl_cuda.h | 13 + 4 files changed, 460 insertions(+), 34 deletions(-) diff --git a/src/lib/geogram/NL/nl_amgcl.cpp b/src/lib/geogram/NL/nl_amgcl.cpp index b02c184fa1d5..f3ec1d1b2799 100644 --- a/src/lib/geogram/NL/nl_amgcl.cpp +++ b/src/lib/geogram/NL/nl_amgcl.cpp @@ -7,6 +7,7 @@ extern "C" { #include #include +#include } extern "C" { @@ -17,6 +18,7 @@ extern "C" { #include #include #include +#include #ifdef NL_WITH_AMGCL @@ -71,6 +73,7 @@ extern "C" { #include #include #include +#include // for cuda backend #ifdef AMGCL_PROFILING #include @@ -79,6 +82,8 @@ namespace amgcl { } #endif +/*************************************************************************/ + // Types for column index and row pointers, as expected // by AMGCL. Note: AMGCL prefers signed indices. // In standard mode, 32-bit indices everywhere @@ -92,33 +97,35 @@ typedef ptrdiff_t rowptr_t; typedef int rowptr_t; #endif -typedef amgcl::backend::builtin Backend; + +/*************************************************************************/ + +static NLboolean nlSolveAMGCL_CPU(void) { + + typedef amgcl::backend::builtin Backend; #ifdef WITH_BOOST -typedef amgcl::runtime::preconditioner Precond; -typedef amgcl::runtime::solver::wrapper IterativeSolver; -typedef amgcl::make_solver Solver; -typedef boost::property_tree::ptree Params; + typedef amgcl::runtime::preconditioner Precond; + typedef amgcl::runtime::solver::wrapper IterativeSolver; + typedef amgcl::make_solver Solver; + typedef boost::property_tree::ptree Params; #else -typedef amgcl::amg< - Backend, - amgcl::coarsening::smoothed_aggregation, - amgcl::relaxation::spai0 - > Precond; + typedef amgcl::amg< + Backend, + amgcl::coarsening::smoothed_aggregation, + amgcl::relaxation::spai0 + > Precond; -typedef amgcl::solver::cg IterativeSolver; -typedef amgcl::make_solver Solver; -typedef Solver::params Params; + typedef amgcl::solver::cg IterativeSolver; + typedef amgcl::make_solver Solver; + typedef Solver::params Params; #endif - -/*************************************************************************/ - -NLboolean nlSolveAMGCL() { + /********************************/ if( GEO::CmdLine::get_arg_bool("OT:verbose") || @@ -137,7 +144,7 @@ NLboolean nlSolveAMGCL() { nlMatrixCompress(&ctxt->M); } - geo_assert(ctxt->M->type == NL_MATRIX_CRS); + nl_assert(ctxt->M->type == NL_MATRIX_CRS); NLCRSMatrix* M = (NLCRSMatrix*)(ctxt->M); size_t n = size_t(M->m); NLdouble* b = ctxt->b; @@ -186,7 +193,7 @@ NLboolean nlSolveAMGCL() { if(ctxt->no_variables_indirection) { x = (double*)ctxt->variable_buffer[k].base_address; - geo_assert( + nl_assert( ctxt->variable_buffer[k].stride == sizeof(double) ); } @@ -211,6 +218,405 @@ NLboolean nlSolveAMGCL() { /*************************************************************************/ +namespace nlcuda_adapters { + typedef double value_type; + typedef colind_t col_type; + typedef rowptr_t ptr_type; + typedef colind_t index_type; + + /** + * \brief a matrix, stored in device memory + */ + struct matrix { + matrix() : impl_(nullptr) { + } + + matrix(NLMatrix M) : impl_(M) { + } + + ~matrix() { + nlDeleteMatrix(impl_); + impl_ = nullptr; + } + + matrix(const matrix& rhs) = delete; + matrix& operator=(const 
matrix& rhs) = delete; + + NLMatrix impl_; + }; + + /** + * \brief a diagonal matrix, stored in device memory + */ + typedef matrix matrix_diagonal; + + /** + * \brief a vector, stored in device memory + */ + struct vector { + + vector() : n_(0), data_(nullptr) { + } + + vector(index_type n) { + n_ = n; + // TODO: should we clear it ? + data_ = NL_NEW_VECTOR( + nlCUDABlas(), NL_DEVICE_MEMORY, n_ + ); + clear(); + } + + vector(const value_type* x_on_host, index_type n) { + n_ = n; + data_ = NL_NEW_VECTOR( + nlCUDABlas(), NL_DEVICE_MEMORY, n_ + ); + copy_from_host(x_on_host, n); + } + + ~vector() { + if(data_ != nullptr) { + NL_DELETE_VECTOR( + nlCUDABlas(), NL_DEVICE_MEMORY, n_, data_ + ); + n_ = 0; + data_ = nullptr; + } + } + + vector(const vector& rhs) = delete; + vector& operator=(const vector& rhs) = delete; + + void clear() { + // TODO: smarter method for clearing a vector + double* tmp = new double[n_]; + memset(tmp, 0, bytes()); + copy_from_host(tmp, n_); + delete[] tmp; + } + + size_t bytes() const { + return sizeof(value_type) * size_t(n_); + } + + void copy_from_host(const value_type* x_on_host, index_type n) { + nl_assert(n == n_); + NLBlas_t blas = nlCUDABlas(); + blas->Memcpy( + blas, + const_cast(x_on_host), NL_HOST_MEMORY, + data_, NL_DEVICE_MEMORY, + bytes() + ); + } + + void copy_from_host(const std::vector& x_on_host) { + copy_from_host(x_on_host.data(), index_type(x_on_host.size())); + } + + void copy_to_host(value_type* x_on_host, index_type n) const { + nl_assert(n == n_); + NLBlas_t blas = nlCUDABlas(); + blas->Memcpy( + blas, + data_, NL_DEVICE_MEMORY, + x_on_host, NL_HOST_MEMORY, + bytes() + ); + } + + void copy_to_host(std::vector& x_on_host) const { + copy_to_host(x_on_host.data(), index_type(x_on_host.size())); + } + + void copy_from_device(const value_type* x_on_device, index_type n) { + nl_assert(n == n_); + NLBlas_t blas = nlCUDABlas(); + blas->Memcpy( + blas, + const_cast(x_on_device), NL_DEVICE_MEMORY, + data_, NL_DEVICE_MEMORY, + bytes() + ); + } + + void copy_from(const vector& rhs) { + nl_assert(rhs.n_ == n_); + copy_from_device(rhs.data_, rhs.n_); + } + + index_type n_; + double* data_; + }; + + /** + * Wrapper around solver::skyline_lu for use with the NLCUDA backend. + * Inspired from AMGCL cuda wrapper + * Copies the rhs to the host memory, solves the problem using the host CPU, + * then copies the solution back to the compute device(s). 
+ */ + struct cuda_skyline_lu : amgcl::solver::skyline_lu { + typedef amgcl::solver::skyline_lu Base; + + template + cuda_skyline_lu(const Matrix &A, const Params&): + Base(*A), + rhs_on_host_(amgcl::backend::rows(*A)), + x_on_host_(amgcl::backend::rows(*A)) { + } + + void operator()(const vector &rhs, vector &x) const { + rhs.copy_to_host(rhs_on_host_); + static_cast(this)->operator()(rhs_on_host_, x_on_host_); + x.copy_from_host(x_on_host_); + } + + mutable std::vector rhs_on_host_; + mutable std::vector x_on_host_; + }; +} + +namespace amgcl { namespace backend { + /** + * \brief nlcuda backend for AMGCL + */ + struct nlcuda { + typedef nlcuda_adapters::value_type value_type; + typedef nlcuda_adapters::col_type col_type; + typedef nlcuda_adapters::ptr_type ptr_type; + typedef nlcuda_adapters::index_type index_type; + + typedef nlcuda_adapters::matrix matrix; + typedef nlcuda_adapters::matrix matrix_diagonal; + typedef nlcuda_adapters::vector vector; + + typedef nlcuda_adapters::cuda_skyline_lu direct_solver; + + struct provides_row_iterator : std::false_type {}; + + typedef amgcl::detail::empty_params params; + + static std::string name() { return "nlcuda"; } + + typedef builtin builtin_backend; + + static std::shared_ptr copy_matrix( + std::shared_ptr A, const params& + ) { + const typename builtin_backend::matrix &a = *A; + col_type* rowptr = const_cast(a.ptr); + index_type* colind = const_cast(a.col); + value_type* val = const_cast(a.val); + + // Sanity checks: signedness differ, but sizes should + // be the same ! + static_assert(sizeof(NLuint_big) == sizeof(col_type)); + static_assert(sizeof(NLuint) == sizeof(index_type)); + static_assert(sizeof(double) == sizeof(value_type)); + + // Create NLCRSMatrix with rowptr, colind and val arrays + // pointing to arrays in AMGCL matrix. Zero copy. 
+ NLCRSMatrix CRS; + memset(&CRS, 0, sizeof(NLCRSMatrix)); + CRS.type = NL_MATRIX_CRS; + CRS.symmetric_storage = NL_FALSE; + CRS.m = NLuint(a.nrows); + CRS.n = NLuint(a.ncols); + CRS.rowptr = reinterpret_cast(rowptr); + CRS.colind = reinterpret_cast(colind); + CRS.val = val; + CRS.nslices = 0; + CRS.sliceptr = nullptr; + + return std::shared_ptr( + new matrix(nlCUDAMatrixNewFromCRSMatrix(NLMatrix(&CRS))) + ); + } + + static std::shared_ptr copy_vector( + typename builtin_backend::vector const &x, const params& + ) { + return std::make_shared(x.data(), x.size()); + } + + static std::shared_ptr copy_vector( + std::shared_ptr< typename builtin_backend::vector > x, + const params &prm + ) { + return copy_vector(*x, prm); + } + + static std::shared_ptr create_vector( + size_t size, const params& + ) { + return std::make_shared(size); + } + + static std::shared_ptr create_solver( + std::shared_ptr A, + const params &prm + ) { + return std::make_shared(A, prm); + } + }; + + /****************************************************************/ + + template <> struct spmv_impl< + double, + nlcuda_adapters::matrix, + nlcuda_adapters::vector, + double, nlcuda_adapters::vector + > { + static void apply( + double alpha, + const nlcuda_adapters::matrix& A, + const nlcuda_adapters::vector& x, + double beta, + nlcuda_adapters::vector& y + ) { + nlCUDAMatrixSpMV(A.impl_, x.data_, y.data_, alpha, beta); + } + }; + + template <> struct residual_impl< + nlcuda_adapters::matrix, + nlcuda_adapters::vector, + nlcuda_adapters::vector, + nlcuda_adapters::vector + > { + static void apply( + const nlcuda_adapters::vector &rhs, + const nlcuda_adapters::matrix &A, + const nlcuda_adapters::vector &x, + nlcuda_adapters::vector &r + ) { + // r = rhs - A * x; + r.copy_from(rhs); + nlCUDAMatrixSpMV(A.impl_, x.data_, r.data_, -1.0, 1.0); + } + }; + + template <> struct clear_impl { + static void apply(nlcuda_adapters::vector& x) { + x.clear(); + } + }; + + template <> struct inner_product_impl< + nlcuda_adapters::vector, + nlcuda_adapters::vector + > { + static double get( + const nlcuda_adapters::vector& x, + const nlcuda_adapters::vector& y + ) { + nl_assert(x.n_ == y.n_); + NLBlas_t blas = nlCUDABlas(); + return blas->Ddot(blas,x.n_,x.data_,1,y.data_,1); + } + }; + + template <> struct axpby_impl< + double, + nlcuda_adapters::vector, + double, + nlcuda_adapters::vector + > { + static void apply( + double a, + const nlcuda_adapters::vector& x, + double b, + const nlcuda_adapters::vector& y + ) { + nl_assert(x.n_ == y.n_); + NLBlas_t blas = nlCUDABlas(); + if(b != 1.0) { + blas->Dscal(blas, x.n_, b, y.data_, 1); + } + blas->Daxpy(blas, x.n_, a, x.data_, 1, y.data_, 1); + } + }; + + template <> struct axpbypcz_impl< + double, + nlcuda_adapters::vector, + double, + nlcuda_adapters::vector, + double, + nlcuda_adapters::vector + > { + static void apply( + double a, const nlcuda_adapters::vector &x, + double b, const nlcuda_adapters::vector &y, + double c, nlcuda_adapters::vector &z + ) { + nl_assert(x.n_ == y.n_ && y.n_ && z.n_); + NLBlas_t blas = nlCUDABlas(); + + if(c != 1.0) { + blas->Dscal(blas, x.n_, c, z.data_, 1); + } + + blas->Daxpy(blas, x.n_, a, x.data_, 1, z.data_, 1); + blas->Daxpy(blas, x.n_, b, y.data_, 1, z.data_, 1); + } + }; + + template <> struct vmul_impl< + double, + nlcuda_adapters::vector, + nlcuda_adapters::vector, + double, + nlcuda_adapters::vector + > { + static void apply( + double a, + const nlcuda_adapters::vector &x, + const nlcuda_adapters::vector &y, + double b, + nlcuda_adapters::vector &z + ) { + 
// TODO + nl_assert(false); + } + }; + + template <> struct copy_impl< + nlcuda_adapters::vector, + nlcuda_adapters::vector + > { + static void apply( + const nlcuda_adapters::vector &x, + nlcuda_adapters::vector &y + ) { + y.copy_from(x); + } + }; + +}} + + + + +/*************************************************************************/ + +static NLboolean nlSolveAMGCL_GPU(void) { + + + return NL_FALSE; +} + +/*************************************************************************/ + +NLboolean nlSolveAMGCL() { + return nlExtensionIsInitialized_CUDA() ? + nlSolveAMGCL_GPU() : nlSolveAMGCL_CPU(); +} + +/*************************************************************************/ + #else nlBoolean nlSolveAMGCL() { diff --git a/src/lib/geogram/NL/nl_blas.h b/src/lib/geogram/NL/nl_blas.h index 0a5360bc19dc..765e0379aa14 100644 --- a/src/lib/geogram/NL/nl_blas.h +++ b/src/lib/geogram/NL/nl_blas.h @@ -182,7 +182,8 @@ extern "C" { * \return the dot product between \p x and \p y */ typedef double (*FUNPTR_ddot)( - NLBlas_t blas, int n, const double *x, int incx, const double *y, int incy + NLBlas_t blas, + int n, const double *x, int incx, const double *y, int incy ); /** @@ -194,7 +195,9 @@ extern "C" { * of the vector * \return the norm of \p x */ - typedef double (*FUNPTR_dnrm2)(NLBlas_t blas, int n, const double *x, int incx); + typedef double (*FUNPTR_dnrm2)( + NLBlas_t blas, int n, const double *x, int incx + ); /** * \brief Computes a linear combination of two vectors diff --git a/src/lib/geogram/NL/nl_cuda.c b/src/lib/geogram/NL/nl_cuda.c index ab7caeeea524..e5dcd1989417 100644 --- a/src/lib/geogram/NL/nl_cuda.c +++ b/src/lib/geogram/NL/nl_cuda.c @@ -1107,7 +1107,7 @@ typedef struct NLCUDASparseMatrixStruct { double* val; /* Management of multi-slice matrices, - * used in GARGANTUA mode when NNZ is larger than 2^31 + * used when NNZ is larger than NL_MAX_SLICE_SIZE * then matrix is stored as a linked-list of "slices" * each slice corresponds to a slice with the rows * [row_offset .. row_offset+m] of the matrix. @@ -1269,12 +1269,13 @@ static void nlCRSMatrixCUDASliceSpMV( nlCUDABlas()->flops += (NLulong)(2*Mcuda->nnz); } -static void nlCRSMatrixCUDAMult( - NLCUDASparseMatrix* Mcuda, const double* x, double* y +void nlCUDAMatrixSpMV( + NLMatrix M, const double* x, double* y, double alpha, double beta ) { + NLCUDASparseMatrix* Mcuda = (NLCUDASparseMatrix*)M; /* single-slice matrix */ if(Mcuda->next_slice == NULL) { - nlCRSMatrixCUDASliceSpMV(Mcuda, x, y, 1.0, 0.0); + nlCRSMatrixCUDASliceSpMV(Mcuda, x, y, alpha, beta); return; } @@ -1285,13 +1286,18 @@ static void nlCRSMatrixCUDAMult( Mcuda = Mcuda->next_slice ) { /* - * Note: beta=0.0, because we are not adding to y, - * since y is computed slice-per-slice ! + * Note: y is computed slice-per-slice ! */ - nlCRSMatrixCUDASliceSpMV(Mcuda, x, y, 1.0, 0.0); + nlCRSMatrixCUDASliceSpMV(Mcuda, x, y, alpha, beta); } } +static void nlCRSMatrixCUDAMult( + NLCUDASparseMatrix* Mcuda, const double* x, double* y +) { + nlCUDAMatrixSpMV((NLMatrix)Mcuda, x, y, 1.0, 0.0); +} + #ifdef GARGANTUA /** * \brief Converts in-place an array of 64-bit ints to 32-bit ints @@ -1314,12 +1320,14 @@ static void int32_to_int64(void* data, size_t N) { *to-- = (NLuint_big)*from--; } } +#endif /* * Maximum slice size. 
It is limited by the maximum size that
- * one can CudaMalloc (2 GB)
+ * one can CudaMalloc (2 GB), this makes 256 M entries
+ * (size is determined by the VAL array)
  */
-#define NL_MAX_SLICE_SIZE (2u*1024u*1024u*1024u/8u)
+#define NL_MAX_SLICE_SIZE (256u*1024u*1024u)
 
 /**
  * \brief Decomposes a CRS matrix into multiple slices.
@@ -1399,8 +1407,6 @@ NLCUDASparseMatrix* CreateCUDASlicesFromCRSMatrixSlices(
     return Mcuda;
 }
 
-#endif
-
 NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) {
     NLCUDASparseMatrix* Mcuda = NL_NEW(NLCUDASparseMatrix);
     NLCRSMatrix* M = (NLCRSMatrix*)(M_in);
@@ -1426,7 +1432,6 @@ NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M_in) {
             Mcuda->val, M->val, val_sz, cudaMemcpyHostToDevice)
     );
 
-#ifdef GARGANTUA
     /* Need to decompose matrix into slices if arrays are too large */
     if(Mcuda->nnz > NL_MAX_SLICE_SIZE) {
         Mcuda->next_slice = CreateCUDASlicesFromCRSMatrixSlices(
             M, NL_MAX_SLICE_SIZE, Mcuda
         );
         nl_printf("Matrix has %d slices\n", Mcuda->nb_slices);
         return (NLMatrix)Mcuda;
     }
-#endif
 
     /*
      * At this point, everything can fit in a single slice, stored
diff --git a/src/lib/geogram/NL/nl_cuda.h b/src/lib/geogram/NL/nl_cuda.h
index f5ba7873dc2f..b80ea8f9dd1e 100644
--- a/src/lib/geogram/NL/nl_cuda.h
+++ b/src/lib/geogram/NL/nl_cuda.h
@@ -91,6 +91,19 @@ NLMatrix nlCUDAMatrixNewFromCRSMatrix(NLMatrix M);
  */
 NLMatrix nlCUDAJacobiPreconditionerNewFromCRSMatrix(NLMatrix M);
 
+
+/**
+ * \brief Computes a sparse matrix-vector product
+ * \details Computes y <- alpha M x + beta y
+ * \param[in] M a matrix created from nlCUDAMatrixNewFromCRSMatrix()
+ * \param[in] x device pointer
+ * \param[in,out] y device pointer
+ * \param[in] alpha , beta two scalars
+ */
+void nlCUDAMatrixSpMV(
+    NLMatrix M, const double* x, double* y, double alpha, double beta
+);
+
 /**
  * \brief Gets a pointer to the BLAS abstraction layer for
  *  BLAS operation on the GPU using CUDA.
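
Note on the CPU path (nlSolveAMGCL_CPU): the patch keeps the two flavours of Params, a boost::property_tree::ptree when WITH_BOOST is defined and Solver::params otherwise, but the hunks elide how they are filled before the solver is constructed. A minimal sketch of that step, assuming the standard AMGCL parameter names and OpenNL context fields threshold and max_iterations (these fields are an assumption, they do not appear in the hunks above):

    Params prm;
#ifdef WITH_BOOST
    // Runtime interface: solver and preconditioner are selected by name.
    prm.put("solver.type", "cg");
    prm.put("solver.tol", ctxt->threshold);
    prm.put("solver.maxiter", int(ctxt->max_iterations));
    prm.put("precond.class", "amg");
#else
    // Compile-time interface: plain fields of Solver::params.
    prm.solver.tol = ctxt->threshold;
    prm.solver.maxiter = int(ctxt->max_iterations);
#endif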
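
nlSolveAMGCL_GPU() is still a stub in this WIP patch (it returns NL_FALSE). For reference, here is a minimal sketch of what it could grow into, mirroring nlSolveAMGCL_CPU() but instantiating the solver on the nlcuda backend defined above, so that the iterations run through nlCUDAMatrixSpMV() and the nlCUDABlas() kernels. This is not part of the patch: it assumes the remaining TODOs (vmul_impl in particular) get implemented, a single linear system, the non-Boost flavour of the parameters, and OpenNL context members (nlCurrentContext, threshold, max_iterations, used_iterations) that are not shown in the hunks above.

static NLboolean nlSolveAMGCL_GPU(void) {
    typedef amgcl::backend::nlcuda Backend;
    typedef amgcl::amg<
        Backend,
        amgcl::coarsening::smoothed_aggregation,
        amgcl::relaxation::spai0
    > Precond;
    typedef amgcl::solver::cg<Backend> IterativeSolver;
    typedef amgcl::make_solver<Precond, IterativeSolver> Solver;
    typedef Solver::params Params;

    NLContextStruct* ctxt = nlCurrentContext;

    if(ctxt->M->type != NL_MATRIX_CRS) {
        nlMatrixCompress(&ctxt->M);
    }
    nl_assert(ctxt->M->type == NL_MATRIX_CRS);
    NLCRSMatrix* M = (NLCRSMatrix*)(ctxt->M);
    size_t n = size_t(M->m);

    // Host-side CRS arrays, reinterpreted with the signed types expected
    // by AMGCL (same trick as in nlcuda::copy_matrix() above).
    rowptr_t* rowptr = reinterpret_cast<rowptr_t*>(M->rowptr);
    colind_t* colind = reinterpret_cast<colind_t*>(M->colind);
    double*   val    = M->val;

    Params prm;
    prm.solver.tol     = ctxt->threshold;           // assumed context field
    prm.solver.maxiter = int(ctxt->max_iterations); // assumed context field

    // Setup: the hierarchy is built on the host, then transferred to the
    // device through nlcuda::copy_matrix() / copy_vector().
    Solver solver(
        std::make_tuple(
            n,
            amgcl::make_iterator_range(rowptr, rowptr + n + 1),
            amgcl::make_iterator_range(colind, colind + rowptr[n]),
            amgcl::make_iterator_range(val,    val    + rowptr[n])
        ),
        prm
    );

    // Right-hand side and unknowns live in device memory.
    nlcuda_adapters::vector rhs(ctxt->b, colind_t(n));
    nlcuda_adapters::vector x(colind_t(n));

    size_t iters;
    double resid;
    std::tie(iters, resid) = solver(rhs, x);

    // Bring the solution back to the host buffer expected by OpenNL.
    x.copy_to_host(ctxt->x, colind_t(n));
    ctxt->used_iterations = NLuint(iters);  // assumed context field

    return NL_TRUE;
}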