Skip to content

Commit

Permalink
fix: fix error in Welford's standardization algorithm
Browse files Browse the repository at this point in the history
  • Loading branch information
jolars committed Feb 7, 2024
1 parent 9252d4d commit 3d70d06
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 24 deletions.
11 changes: 7 additions & 4 deletions src/slope/standardize.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,17 +36,20 @@ computeCentersAndScales(const T& x, const bool standardize)
int count = 0;

for (typename T::InnerIterator it(x, j); it; ++it) {
count++;
double delta = it.value() - mean;
mean += delta / (++count);
m2 += delta * (it.value() - mean);
mean += delta / count;
double delta2 = it.value() - mean;
m2 += delta * delta2;
}

// Account for zeros in the column
double delta = -mean;
while (count < n) {
count++;
double delta = -mean;
mean += delta / count;
m2 -= mean * delta;
double delta2 = -mean;
m2 += delta * delta2;
}

x_means(j) = mean;
Expand Down
23 changes: 16 additions & 7 deletions tests/sparse.cpp
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
#include "../src/slope/slope.h"
#include "../src/slope/standardize.h"
#include "test_helpers.hpp"
#include <Eigen/Core>
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers.hpp>
#include <slope/slope.h>

TEST_CASE("Sparse and dense methods agree", "[gaussian][sparse]")
{
Expand All @@ -16,29 +17,37 @@ TEST_CASE("Sparse and dense methods agree", "[gaussian][sparse]")
0.0, 0.0, 0.0, 0.0, 0.0, 0.94280368, 0.0, 0.0, 0.3499374, 0.0, 0.22377115,
0.0, 0.0, 0.96893287, 0.95858229, 0.70486475, 0.60885162, 0.0, 0.0,
0.92902639, 0.0, 0.4978676, 0.0, 0.50022619;
Eigen::SparseMatrix<double> x_sparse = x_dense.sparseView();
Eigen::SparseMatrix<double> x_sparse = x_dense.sparseView().eval();

auto [x_centers_sparse, x_scales_sparse] =
slope::computeCentersAndScales(x_sparse, true);
auto [x_centers_dense, x_scales_dense] =
slope::computeCentersAndScales(x_dense, true);

REQUIRE_THAT(x_centers_dense, VectorApproxEqual(x_centers_sparse));
REQUIRE_THAT(x_scales_dense, VectorApproxEqual(x_scales_sparse));

beta << 1, 2, -0.9;

y = x_dense * beta;

Eigen::ArrayXd alpha = Eigen::ArrayXd::Zero(1);
Eigen::ArrayXd alpha = Eigen::ArrayXd::Ones(1);
Eigen::Array<double, 3, 1> lambda;
lambda << 0.5, 0.5, 0.2;

slope::Slope model;

model.setIntercept(true);
model.setIntercept(false);
model.setStandardize(true);

model.fit(x_sparse, y, alpha, lambda);
auto coefs_sparse = model.getCoefs();
Eigen::VectorXd coefs_sparse = model.getCoefs();

model.fit(x_dense, y, alpha, lambda);
auto coefs_dense = model.getCoefs();
Eigen::VectorXd coefs_dense = model.getCoefs();

Eigen::VectorXd coef_sparse = coefs_sparse.col(0);
Eigen::VectorXd coef_dense = coefs_dense.col(0);

REQUIRE_THAT(coef_sparse, VectorApproxEqual(coef_dense, 1e-4));
REQUIRE_THAT(coef_sparse, VectorApproxEqual(coef_dense, 1e-6));
}
60 changes: 47 additions & 13 deletions tests/standardize.cpp
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
#include "../src/slope/standardize.h"
#include "../src/slope/math.h"
#include "test_helpers.hpp"
#include <Eigen/Core>
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <slope/standardize.h>

std::tuple<Eigen::VectorXd, Eigen::VectorXd>
computeMeanAndStdDev(const Eigen::MatrixXd& x)
Expand All @@ -25,19 +26,9 @@ computeMeanAndStdDev(const Eigen::MatrixXd& x)
std::tuple<Eigen::VectorXd, Eigen::VectorXd>
computeMeanAndStdDev(const Eigen::SparseMatrix<double>& x)
{
const int n = x.rows();
const int p = x.cols();

Eigen::VectorXd x_means(p);
Eigen::VectorXd x_stddevs(p);

for (int j = 0; j < p; ++j) {
x_means(j) = x.col(j).sum() / n;
x_stddevs(j) =
std::sqrt(x.col(j).squaredNorm() / n - std::pow(x_means(j), 2));
}
Eigen::MatrixXd x_dense = x;

return { x_means, x_stddevs };
return computeMeanAndStdDev(x_dense);
}

TEST_CASE("Check that standardization algorithm works",
Expand Down Expand Up @@ -67,3 +58,46 @@ TEST_CASE("Check that standardization algorithm works",
REQUIRE_THAT(x_centers_dense, VectorApproxEqual(x_centers_ref));
REQUIRE_THAT(x_scales, VectorApproxEqual(x_scales_ref));
}

TEST_CASE("Check that in-place standardization works",
"[utils][standardization]")
{
using Catch::Matchers::WithinAbs;

Eigen::MatrixXd x_dense(3, 3);
x_dense << 1, 0, 1000, 98.2, 0, 34, -1007, 0, 0;
Eigen::SparseMatrix<double> x_sparse = x_dense.sparseView();
Eigen::VectorXd beta(3);
beta << 1, 0, -1.8;
Eigen::VectorXd y = x_dense * beta;

Eigen::VectorXd residual = y;
residual(0) += 1;
residual(1) += -0.2;
residual(2) += 0.9;

auto [x_centers_dense, x_scales_dense] = computeMeanAndStdDev(x_dense);
auto [x_centers_sparse, x_scales_sparse] = computeMeanAndStdDev(x_sparse);

REQUIRE_THAT(x_centers_dense, VectorApproxEqual(x_centers_sparse));
REQUIRE_THAT(x_scales_dense, VectorApproxEqual(x_scales_sparse));

Eigen::VectorXd residual_dense =
y - x_dense * beta.cwiseQuotient(x_scales_dense);
residual_dense.array() +=
x_centers_dense.cwiseQuotient(x_scales_dense).dot(beta);

Eigen::VectorXd residual_sparse =
y - x_sparse * beta.cwiseQuotient(x_scales_sparse);
residual_sparse.array() +=
x_centers_sparse.cwiseQuotient(x_scales_sparse).dot(beta);

REQUIRE_THAT(residual_dense, VectorApproxEqual(residual_sparse));

auto gradient_dense = slope::computeGradient(
x_dense, residual_dense, x_centers_dense, x_scales_dense, true);
auto gradient_sparse = slope::computeGradient(
x_sparse, residual_sparse, x_centers_sparse, x_scales_sparse, true);

REQUIRE_THAT(gradient_dense, VectorApproxEqual(gradient_sparse));
}

0 comments on commit 3d70d06

Please sign in to comment.