refactor: use welford's algorithm for stddev computation

Also update the functions to use c++17 return value sugar
jolars · Dec 6, 2023 · f856ef4 · f856ef4
1 parent cb6fe5b
commit f856ef4
Show file tree

Hide file tree

Showing 5 changed files with 142 additions and 79 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -34,8 +34,9 @@ if(BUILD_TESTING)
     FetchContent_MakeAvailable(Catch2)
   endif()
 
-  add_executable(tests tests/gaussian.cpp tests/prox.cpp tests/qnorm.cpp
-                       tests/lambda_sequence.cpp tests/path.cpp)
+  add_executable(
+    tests tests/gaussian.cpp tests/prox.cpp tests/qnorm.cpp
+          tests/lambda_sequence.cpp tests/path.cpp tests/standardize.cpp)
   target_link_libraries(tests PRIVATE libslope Catch2::Catch2WithMain)
   target_include_directories(tests PUBLIC tests/ "${PROJECT_BINARY_DIR}")
 

diff --git a/src/slope/fit_slope.h b/src/slope/fit_slope.h
@@ -47,13 +47,7 @@ fitSlope(const T& x,
   const int n = x.rows();
   const int p = x.cols();
 
-  // standardize
-  VectorXd x_centers(p);
-  VectorXd x_scales(p);
-
-  if (params.standardize) {
-    std::tie(x_centers, x_scales) = computeMeanAndStdDev(x);
-  }
+  auto [x_centers, x_scales] = standardize(x, params.standardize);
 
   std::unique_ptr<Objective> objective = setupObjective(params.objective);
 
@@ -247,18 +241,10 @@ fitSlope(const T& x,
     }
 
     // Store everything for this step of the path
-    double beta0_out;
-    VectorXd beta_out;
-
-    if (params.standardize) {
-      std::tie(beta0_out, beta_out) = unstandardizeCoefficients(
-        beta0, beta, x_centers, x_scales, params.intercept);
-    } else {
-      beta0_out = beta0;
-      beta_out = beta;
-    }
+    auto [beta0_out, beta_out] =
+      rescaleCoefficients(beta0, beta, x_centers, x_scales, params);
 
-    beta0s(path_step) = beta0_out;
+    beta0s(path_step) = std::move(beta0_out);
 
     for (int j = 0; j < p; ++j) {
       if (beta_out(j) != 0) {

diff --git a/src/slope/standardize.cpp b/src/slope/standardize.cpp
@@ -2,62 +2,29 @@
 
 namespace slope {
 
-std::pair<Eigen::VectorXd, Eigen::VectorXd>
-computeMeanAndStdDev(const Eigen::MatrixXd& x)
-{
-  const int n = x.rows();
-  const int p = x.cols();
-
-  Eigen::VectorXd x_means = x.colwise().mean();
-  Eigen::VectorXd x_stddevs(p);
-
-  for (int j = 0; j < p; ++j) {
-    x_stddevs(j) =
-      std::sqrt((x.col(j).array() - x_means(j)).square().sum() / n);
-  }
-
-  return { x_means, x_stddevs };
-}
-
-std::pair<Eigen::VectorXd, Eigen::VectorXd>
-computeMeanAndStdDev(const Eigen::SparseMatrix<double>& x)
-{
-  const int n = x.rows();
-  const int p = x.cols();
-
-  Eigen::VectorXd x_means(p);
-  Eigen::VectorXd x_stddevs(p);
-
-  for (int j = 0; j < p; ++j) {
-    x_means(j) = x.col(j).sum() / n;
-    // TODO: Reconsider this implementation since it might overflow.
-    x_stddevs(j) =
-      std::sqrt(x.col(j).squaredNorm() / n - std::pow(x_means(j), 2));
-  }
-
-  return { x_means, x_stddevs };
-}
-
-std::pair<double, Eigen::VectorXd>
-unstandardizeCoefficients(double beta0,
-                          Eigen::VectorXd beta,
-                          const Eigen::VectorXd& x_means,
-                          const Eigen::VectorXd& x_stddevs,
-                          const bool intercept)
+std::tuple<double, Eigen::VectorXd>
+rescaleCoefficients(double beta0,
+                    Eigen::VectorXd beta,
+                    const Eigen::VectorXd& x_centers,
+                    const Eigen::VectorXd& x_scales,
+                    const SlopeParameters& params)
 {
   const int p = beta.rows();
 
   double x_bar_beta_sum = 0;
 
-  for (int j = 0; j < p; ++j) {
-    beta(j) /= x_stddevs(j);
-    x_bar_beta_sum += x_means(j) * beta(j);
-  }
-  if (intercept) {
-    beta0 -= x_bar_beta_sum;
+  if (params.standardize) {
+    for (int j = 0; j < p; ++j) {
+      beta(j) /= x_scales(j);
+      x_bar_beta_sum += x_centers(j) * beta(j);
+    }
+
+    if (params.intercept) {
+      beta0 -= x_bar_beta_sum;
+    }
   }
 
   return { beta0, beta };
 }
 
-}
+} // namespace slope
diff --git a/src/slope/standardize.h b/src/slope/standardize.h
@@ -1,20 +1,60 @@
 #pragma once
 
+#include "parameters.h"
 #include <Eigen/Sparse>
 
 namespace slope {
 
-std::pair<Eigen::VectorXd, Eigen::VectorXd>
-computeMeanAndStdDev(const Eigen::MatrixXd& x);
+/**
+ * Standardizes the given matrix column-wise.
+ *
+ * This function uses Welford's algorithm to compute the means and standard
+ * deviation.
+ *
+ * @tparam T The type of the input matrix.
+ * @param x The input matrix.
+ * @param standardize Flag indicating whether to standardize the matrix.
+ * @return A tuple containing the means and standard deviations of the columns.
+ */
+template<typename T>
+std::tuple<Eigen::VectorXd, Eigen::VectorXd>
+standardize(const T& x, const bool standardize)
+{
+  const int n = x.rows();
+  const int p = x.cols();
 
-std::pair<Eigen::VectorXd, Eigen::VectorXd>
-computeMeanAndStdDev(const Eigen::SparseMatrix<double>& x);
+  Eigen::VectorXd x_means(p);
+  Eigen::VectorXd x_stddevs(p);
 
-std::pair<double, Eigen::VectorXd>
-unstandardizeCoefficients(double beta0,
-                          Eigen::VectorXd beta,
-                          const Eigen::VectorXd& x_means,
-                          const Eigen::VectorXd& x_stddevs,
-                          const bool intercept);
+  for (int j = 0; j < p; ++j) {
+    double mean = 0.0;
+    double m2 = 0.0;
+    int count = 0;
 
+    for (typename T::InnerIterator it(x, j); it; ++it) {
+      double delta = it.value() - mean;
+      mean += delta / (++count);
+      m2 += delta * (it.value() - mean);
+    }
+
+    // Account for zeros in the column
+    double delta = -mean;
+    while (count < n) {
+      count++;
+      mean += delta / count;
+      m2 -= mean * delta;
+    }
+
+    x_means(j) = mean;
+    x_stddevs(j) = std::sqrt(m2 / n);
+  }
+  return { x_means, x_stddevs };
 }
+
+std::tuple<double, Eigen::VectorXd>
+rescaleCoefficients(double beta0,
+                    Eigen::VectorXd beta,
+                    const Eigen::VectorXd& x_centers,
+                    const Eigen::VectorXd& x_scales,
+                    const SlopeParameters& params);
+} // namespace slope
diff --git a/tests/standardize.cpp b/tests/standardize.cpp
@@ -0,0 +1,69 @@
+#include "test_helpers.hpp"
+#include <Eigen/Core>
+#include <catch2/catch_test_macros.hpp>
+#include <catch2/matchers/catch_matchers.hpp>
+#include <catch2/matchers/catch_matchers_floating_point.hpp>
+#include <slope/standardize.h>
+
+std::tuple<Eigen::VectorXd, Eigen::VectorXd>
+computeMeanAndStdDev(const Eigen::MatrixXd& x)
+{
+  const int n = x.rows();
+  const int p = x.cols();
+
+  Eigen::VectorXd x_means = x.colwise().mean();
+  Eigen::VectorXd x_stddevs(p);
+
+  for (int j = 0; j < p; ++j) {
+    x_stddevs(j) =
+      std::sqrt((x.col(j).array() - x_means(j)).square().sum() / n);
+  }
+
+  return { x_means, x_stddevs };
+}
+
+std::tuple<Eigen::VectorXd, Eigen::VectorXd>
+computeMeanAndStdDev(const Eigen::SparseMatrix<double>& x)
+{
+  const int n = x.rows();
+  const int p = x.cols();
+
+  Eigen::VectorXd x_means(p);
+  Eigen::VectorXd x_stddevs(p);
+
+  for (int j = 0; j < p; ++j) {
+    x_means(j) = x.col(j).sum() / n;
+    // TODO: Reconsider this implementation since it might overflow.
+    x_stddevs(j) =
+      std::sqrt(x.col(j).squaredNorm() / n - std::pow(x_means(j), 2));
+  }
+
+  return { x_means, x_stddevs };
+}
+
+TEST_CASE("Check that standardization algorithm works",
+          "[utils][standardization]")
+{
+  using Catch::Matchers::WithinAbs;
+
+  Eigen::SparseMatrix<double> x(3, 3);
+
+  x.coeffRef(0, 0) = 1;
+  x.coeffRef(1, 0) = 98.2;
+  x.coeffRef(2, 0) = -1007;
+  x.coeffRef(0, 2) = 1000;
+  x.coeffRef(1, 2) = 34;
+
+  Eigen::MatrixXd x_dense = x;
+
+  auto [x_centers_ref, x_scales_ref] = computeMeanAndStdDev(x);
+
+  auto [x_centers, x_scales] = slope::standardize(x, true);
+  auto [x_centers_dense, x_scales_dense] = slope::standardize(x_dense, true);
+
+  REQUIRE_THAT(x_centers, VectorApproxEqual(x_centers_ref));
+  REQUIRE_THAT(x_scales, VectorApproxEqual(x_scales_ref));
+
+  REQUIRE_THAT(x_centers_dense, VectorApproxEqual(x_centers_ref));
+  REQUIRE_THAT(x_scales, VectorApproxEqual(x_scales_ref));
+}