Skip to content

Commit

Permalink
refactor: use welford's algorithm for stddev computation
Browse files Browse the repository at this point in the history
Also update the functions to use c++17 return value sugar
  • Loading branch information
jolars committed Dec 6, 2023
1 parent cb6fe5b commit f856ef4
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 79 deletions.
5 changes: 3 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,9 @@ if(BUILD_TESTING)
FetchContent_MakeAvailable(Catch2)
endif()

add_executable(tests tests/gaussian.cpp tests/prox.cpp tests/qnorm.cpp
tests/lambda_sequence.cpp tests/path.cpp)
add_executable(
tests tests/gaussian.cpp tests/prox.cpp tests/qnorm.cpp
tests/lambda_sequence.cpp tests/path.cpp tests/standardize.cpp)
target_link_libraries(tests PRIVATE libslope Catch2::Catch2WithMain)
target_include_directories(tests PUBLIC tests/ "${PROJECT_BINARY_DIR}")

Expand Down
22 changes: 4 additions & 18 deletions src/slope/fit_slope.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,13 +47,7 @@ fitSlope(const T& x,
const int n = x.rows();
const int p = x.cols();

// standardize
VectorXd x_centers(p);
VectorXd x_scales(p);

if (params.standardize) {
std::tie(x_centers, x_scales) = computeMeanAndStdDev(x);
}
auto [x_centers, x_scales] = standardize(x, params.standardize);

std::unique_ptr<Objective> objective = setupObjective(params.objective);

Expand Down Expand Up @@ -247,18 +241,10 @@ fitSlope(const T& x,
}

// Store everything for this step of the path
double beta0_out;
VectorXd beta_out;

if (params.standardize) {
std::tie(beta0_out, beta_out) = unstandardizeCoefficients(
beta0, beta, x_centers, x_scales, params.intercept);
} else {
beta0_out = beta0;
beta_out = beta;
}
auto [beta0_out, beta_out] =
rescaleCoefficients(beta0, beta, x_centers, x_scales, params);

beta0s(path_step) = beta0_out;
beta0s(path_step) = std::move(beta0_out);

for (int j = 0; j < p; ++j) {
if (beta_out(j) != 0) {
Expand Down
65 changes: 16 additions & 49 deletions src/slope/standardize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,29 @@

namespace slope {

std::pair<Eigen::VectorXd, Eigen::VectorXd>
computeMeanAndStdDev(const Eigen::MatrixXd& x)
{
const int n = x.rows();
const int p = x.cols();

Eigen::VectorXd x_means = x.colwise().mean();
Eigen::VectorXd x_stddevs(p);

for (int j = 0; j < p; ++j) {
x_stddevs(j) =
std::sqrt((x.col(j).array() - x_means(j)).square().sum() / n);
}

return { x_means, x_stddevs };
}

std::pair<Eigen::VectorXd, Eigen::VectorXd>
computeMeanAndStdDev(const Eigen::SparseMatrix<double>& x)
{
const int n = x.rows();
const int p = x.cols();

Eigen::VectorXd x_means(p);
Eigen::VectorXd x_stddevs(p);

for (int j = 0; j < p; ++j) {
x_means(j) = x.col(j).sum() / n;
// TODO: Reconsider this implementation since it might overflow.
x_stddevs(j) =
std::sqrt(x.col(j).squaredNorm() / n - std::pow(x_means(j), 2));
}

return { x_means, x_stddevs };
}

std::pair<double, Eigen::VectorXd>
unstandardizeCoefficients(double beta0,
Eigen::VectorXd beta,
const Eigen::VectorXd& x_means,
const Eigen::VectorXd& x_stddevs,
const bool intercept)
std::tuple<double, Eigen::VectorXd>
rescaleCoefficients(double beta0,
Eigen::VectorXd beta,
const Eigen::VectorXd& x_centers,
const Eigen::VectorXd& x_scales,
const SlopeParameters& params)
{
const int p = beta.rows();

double x_bar_beta_sum = 0;

for (int j = 0; j < p; ++j) {
beta(j) /= x_stddevs(j);
x_bar_beta_sum += x_means(j) * beta(j);
}
if (intercept) {
beta0 -= x_bar_beta_sum;
if (params.standardize) {
for (int j = 0; j < p; ++j) {
beta(j) /= x_scales(j);
x_bar_beta_sum += x_centers(j) * beta(j);
}

if (params.intercept) {
beta0 -= x_bar_beta_sum;
}
}

return { beta0, beta };
}

}
} // namespace slope
60 changes: 50 additions & 10 deletions src/slope/standardize.h
Original file line number Diff line number Diff line change
@@ -1,20 +1,60 @@
#pragma once

#include "parameters.h"
#include <Eigen/Sparse>

namespace slope {

std::pair<Eigen::VectorXd, Eigen::VectorXd>
computeMeanAndStdDev(const Eigen::MatrixXd& x);
/**
* Standardizes the given matrix column-wise.
*
* This function uses Welford's algorithm to compute the means and standard
* deviation.
*
* @tparam T The type of the input matrix.
* @param x The input matrix.
* @param standardize Flag indicating whether to standardize the matrix.
* @return A tuple containing the means and standard deviations of the columns.
*/
template<typename T>
std::tuple<Eigen::VectorXd, Eigen::VectorXd>
standardize(const T& x, const bool standardize)
{
const int n = x.rows();
const int p = x.cols();

std::pair<Eigen::VectorXd, Eigen::VectorXd>
computeMeanAndStdDev(const Eigen::SparseMatrix<double>& x);
Eigen::VectorXd x_means(p);
Eigen::VectorXd x_stddevs(p);

std::pair<double, Eigen::VectorXd>
unstandardizeCoefficients(double beta0,
Eigen::VectorXd beta,
const Eigen::VectorXd& x_means,
const Eigen::VectorXd& x_stddevs,
const bool intercept);
for (int j = 0; j < p; ++j) {
double mean = 0.0;
double m2 = 0.0;
int count = 0;

for (typename T::InnerIterator it(x, j); it; ++it) {
double delta = it.value() - mean;
mean += delta / (++count);
m2 += delta * (it.value() - mean);
}

// Account for zeros in the column
double delta = -mean;
while (count < n) {
count++;
mean += delta / count;
m2 -= mean * delta;
}

x_means(j) = mean;
x_stddevs(j) = std::sqrt(m2 / n);
}
return { x_means, x_stddevs };
}

std::tuple<double, Eigen::VectorXd>
rescaleCoefficients(double beta0,
Eigen::VectorXd beta,
const Eigen::VectorXd& x_centers,
const Eigen::VectorXd& x_scales,
const SlopeParameters& params);
} // namespace slope
69 changes: 69 additions & 0 deletions tests/standardize.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
#include "test_helpers.hpp"
#include <Eigen/Core>
#include <catch2/catch_test_macros.hpp>
#include <catch2/matchers/catch_matchers.hpp>
#include <catch2/matchers/catch_matchers_floating_point.hpp>
#include <slope/standardize.h>

std::tuple<Eigen::VectorXd, Eigen::VectorXd>
computeMeanAndStdDev(const Eigen::MatrixXd& x)
{
const int n = x.rows();
const int p = x.cols();

Eigen::VectorXd x_means = x.colwise().mean();
Eigen::VectorXd x_stddevs(p);

for (int j = 0; j < p; ++j) {
x_stddevs(j) =
std::sqrt((x.col(j).array() - x_means(j)).square().sum() / n);
}

return { x_means, x_stddevs };
}

std::tuple<Eigen::VectorXd, Eigen::VectorXd>
computeMeanAndStdDev(const Eigen::SparseMatrix<double>& x)
{
const int n = x.rows();
const int p = x.cols();

Eigen::VectorXd x_means(p);
Eigen::VectorXd x_stddevs(p);

for (int j = 0; j < p; ++j) {
x_means(j) = x.col(j).sum() / n;
// TODO: Reconsider this implementation since it might overflow.
x_stddevs(j) =
std::sqrt(x.col(j).squaredNorm() / n - std::pow(x_means(j), 2));
}

return { x_means, x_stddevs };
}

TEST_CASE("Check that standardization algorithm works",
"[utils][standardization]")
{
using Catch::Matchers::WithinAbs;

Eigen::SparseMatrix<double> x(3, 3);

x.coeffRef(0, 0) = 1;
x.coeffRef(1, 0) = 98.2;
x.coeffRef(2, 0) = -1007;
x.coeffRef(0, 2) = 1000;
x.coeffRef(1, 2) = 34;

Eigen::MatrixXd x_dense = x;

auto [x_centers_ref, x_scales_ref] = computeMeanAndStdDev(x);

auto [x_centers, x_scales] = slope::standardize(x, true);
auto [x_centers_dense, x_scales_dense] = slope::standardize(x_dense, true);

REQUIRE_THAT(x_centers, VectorApproxEqual(x_centers_ref));
REQUIRE_THAT(x_scales, VectorApproxEqual(x_scales_ref));

REQUIRE_THAT(x_centers_dense, VectorApproxEqual(x_centers_ref));
REQUIRE_THAT(x_scales, VectorApproxEqual(x_scales_ref));
}

0 comments on commit f856ef4

Please sign in to comment.