Remove batched mode as it compromises result quality too much.
More specifically, the outcome from batch updates is too different from that of
the iterative updates, such that it's difficult to recommend the former as a
drop-in replacement for the latter. Even though batch mode is easier to parallelize
per epoch, it converges slowly enough that we need to spend more epochs (briefly
discussed in jlmelville/uwot#83), which defeats the purpose.

By removing batch processing, we also simplify the code a bit, which is nice.
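To illustrate the difference in update order (a minimal sketch, not the actual umappp kernel; the gradient function and step size are made up for illustration):

#include <cstddef>
#include <vector>

// Stand-in for the real attractive/repulsive gradient computation.
double toy_gradient(double xi, double xj) {
    return 0.01 * (xj - xi);
}

// Iterative (retained) mode: each point moves as soon as its force is
// computed, so later points see already-updated positions.
void iterative_epoch(std::vector<double>& x) {
    for (std::size_t i = 1; i < x.size(); ++i) {
        x[i] += toy_gradient(x[i], x[i - 1]); // reads the updated x[i-1]
    }
}

// Batched (removed) mode: forces are computed against a fixed snapshot and
// applied at the end of the epoch. Each iteration is independent and thus
// easy to parallelize, but the optimization trajectory differs.
void batched_epoch(std::vector<double>& x) {
    std::vector<double> delta(x.size(), 0.0);
    for (std::size_t i = 1; i < x.size(); ++i) {
        delta[i] = toy_gradient(x[i], x[i - 1]); // reads the stale x[i-1]
    }
    for (std::size_t i = 0; i < x.size(); ++i) {
        x[i] += delta[i];
    }
}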
LTLA authored Mar 17, 2023
1 parent 76dd3d3 commit d579059
Showing 7 changed files with 62 additions and 217 deletions.
2 changes: 1 addition & 1 deletion extern/CMakeLists.txt
@@ -11,7 +11,7 @@ FetchContent_MakeAvailable(knncolle)
FetchContent_Declare(
aarand
GIT_REPOSITORY https://github.com/LTLA/aarand
GIT_TAG 2a8509c499f668bf424306f1aa986da429902c71
GIT_TAG e8397e9e648379f6086b20a83910c39ed8e4dd45
)

FetchContent_MakeAvailable(aarand)
61 changes: 11 additions & 50 deletions include/umappp/Umap.hpp
@@ -156,11 +156,6 @@ class Umap {
*/
static constexpr uint64_t seed = 1234567890;

/**
* See `set_batch()`.
*/
static constexpr bool batch = false;

/**
* See `set_num_threads()`.
*/
@@ -184,7 +179,6 @@ Float b = Defaults::b;
Float b = Defaults::b;
Float repulsion_strength = Defaults::repulsion_strength;
Float learning_rate = Defaults::learning_rate;
bool batch = Defaults::batch;
int nthreads = Defaults::num_threads;
};

@@ -359,29 +353,12 @@
return *this;
}

/**
* @param b Whether to optimize in batch mode.
* Batch mode is required for effective parallelization via OpenMP but may reduce the stability of the gradient descent.
*
* Batch mode involves computing forces for all observations and applying them simultaneously.
* This is in contrast to the default where the location of each observation is updated before the forces are computed for the next observation.
* As each observation's forces are computed independently, batch mode is more amenable to parallelization;
* however, this comes at the cost of stability as the force calculations for later observations are not aware of updates to the positions of earlier observations.
*
* @return A reference to this `Umap` object.
*/
Umap& set_batch(bool b = Defaults::batch) {
rparams.batch = b;
return *this;
}

/**
* @param n Number of threads to use.
*
* @return A reference to this `Umap` object.
*
* This setting affects nearest neighbor detection (if an existing list of neighbors is not supplied in `initialize()` or `run()`) and spectral initialization.
* If `set_batch()` is `true`, multiple threads will also be used during layout optimization.
*
* The `UMAPPP_CUSTOM_PARALLEL` macro can be set to a function that specifies a custom parallelization scheme.
* This function should be a template that accepts three arguments:
@@ -471,33 +448,17 @@
*
*/
void run(int epoch_limit = 0) {
if (!rparams.batch) {
optimize_layout(
ndim_,
embedding_,
epochs,
rparams.a,
rparams.b,
rparams.repulsion_strength,
rparams.learning_rate,
engine,
epoch_limit
);
} else {
optimize_layout_batched(
ndim_,
embedding_,
epochs,
rparams.a,
rparams.b,
rparams.repulsion_strength,
rparams.learning_rate,
[&]() -> auto { return engine(); },
[](decltype(engine()) s) -> auto { return std::mt19937_64(s); },
epoch_limit,
rparams.nthreads
);
}
optimize_layout(
ndim_,
embedding_,
epochs,
rparams.a,
rparams.b,
rparams.repulsion_strength,
rparams.learning_rate,
engine,
epoch_limit
);
return;
}
};
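For context, a minimal usage sketch of the simplified interface after this change, based on the calls exercised in the tests below. The exact element type of umappp::NeighborList and the toy neighbor values are assumptions for illustration:

#include "umappp/Umap.hpp"
#include <utility>
#include <vector>

int main() {
    // Hypothetical precomputed neighbors: for each observation, a list of
    // (neighbor index, distance) pairs.
    int nobs = 3, ndim = 2;
    umappp::NeighborList<> neighbors(nobs);
    neighbors[0].emplace_back(1, 0.5); neighbors[0].emplace_back(2, 1.0);
    neighbors[1].emplace_back(0, 0.5); neighbors[1].emplace_back(2, 0.8);
    neighbors[2].emplace_back(1, 0.8); neighbors[2].emplace_back(0, 1.0);

    std::vector<double> embedding(nobs * ndim);
    umappp::Umap<> runner;
    runner.set_a(2).set_b(1).set_seed(42); // set_batch() no longer exists
    runner.run(std::move(neighbors), ndim, embedding.data());
    return 0;
}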
182 changes: 44 additions & 138 deletions include/umappp/optimize_layout.hpp
@@ -105,75 +105,6 @@ void optimize_sample(
auto& epoch_of_next_sample = setup.epoch_of_next_sample;
auto& epoch_of_next_negative_sample = setup.epoch_of_next_negative_sample;

const size_t num_obs = head.size();
const Float negative_sample_rate = setup.negative_sample_rate;

size_t start = (i == 0 ? 0 : setup.head[i-1]), end = setup.head[i];
Float* left = embedding + i * ndim;

for (size_t j = start; j < end; ++j) {
if (epoch_of_next_sample[j] > epoch) {
continue;
}

Float* right = embedding + tail[j] * ndim;
Float dist2 = quick_squared_distance(left, right, ndim);
const Float pd2b = std::pow(dist2, b);
const Float grad_coef = (-2 * a * b * pd2b) / (dist2 * (a * pd2b + 1.0));
{
Float* lcopy = left;
Float* rcopy = right;

for (int d = 0; d < ndim; ++d, ++lcopy, ++rcopy) {
Float gradient = alpha * clamp(grad_coef * (*lcopy - *rcopy));
if constexpr(!batch) {
*lcopy += gradient;
*rcopy -= gradient;
} else {
// Doubling as we'll assume symmetry from the same
// force applied by the right node. This allows us to
// avoid worrying about accounting for modifications to
// the right node.
buffer[d] += 2 * gradient;
}
}
}

// Remember that 'epochs_per_negative_sample' is defined as 'epochs_per_sample[j] / negative_sample_rate'.
// We just use it inline below rather than defining a new variable and suffering floating-point round-off.
const size_t num_neg_samples = (epoch - epoch_of_next_negative_sample[j]) *
negative_sample_rate / epochs_per_sample[j]; // i.e., 1/epochs_per_negative_sample.

for (size_t p = 0; p < num_neg_samples; ++p) {
size_t sampled = aarand::discrete_uniform(rng, num_obs);
if (sampled == i) {
continue;
}

Float* right = embedding + sampled * ndim;
Float dist2 = quick_squared_distance(left, right, ndim);
const Float grad_coef = 2 * gamma * b / ((0.001 + dist2) * (a * std::pow(dist2, b) + 1.0));
{
Float* lcopy = left;
const Float* rcopy = right;
for (int d = 0; d < ndim; ++d, ++lcopy, ++rcopy) {
Float gradient = alpha * clamp(grad_coef * (*lcopy - *rcopy));
if constexpr(!batch) {
*lcopy += gradient;
} else {
buffer[d] += gradient;
}
}
}
}

epoch_of_next_sample[j] += epochs_per_sample[j];

// The update to 'epoch_of_next_negative_sample' involves adding
// 'num_neg_samples * epochs_per_negative_sample', which eventually boils
// down to setting epoch_of_next_negative_sample to 'epoch'.
epoch_of_next_negative_sample[j] = epoch;
}
}
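As a worked example of the negative-sampling bookkeeping in the block above (the numbers are purely illustrative, not taken from the code): with epochs_per_sample[j] = 2 and negative_sample_rate = 5, the implied epochs_per_negative_sample is 0.4. At epoch = 10 with epoch_of_next_negative_sample[j] = 8.2, the count is num_neg_samples = (10 - 8.2) * 5 / 2 = 4.5, truncated to 4, after which epoch_of_next_negative_sample[j] is effectively reset to the current epoch.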

template<typename Float, class Setup, class Rng>
@@ -194,89 +125,64 @@ void optimize_layout(
if (epoch_limit> 0) {
limit_epochs = std::min(epoch_limit, num_epochs);
}


const size_t num_obs = setup.head.size();
for (; n < limit_epochs; ++n) {
const Float epoch = n;
const Float alpha = initial_alpha * (1.0 - epoch / num_epochs);
for (size_t i = 0; i < setup.head.size(); ++i) {
optimize_sample<false>(i, ndim, embedding, static_cast<Float*>(NULL), setup, a, b, gamma, alpha, rng, epoch);
}
}

return;
}
for (size_t i = 0; i < num_obs; ++i) {
size_t start = (i == 0 ? 0 : setup.head[i-1]), end = setup.head[i];
Float* left = embedding + i * ndim;

template<typename Float, class Setup, class SeedFunction, class EngineFunction>
inline void optimize_layout_batched(
int ndim,
Float* embedding,
Setup& setup,
Float a,
Float b,
Float gamma,
Float initial_alpha,
SeedFunction seeder,
EngineFunction creator,
int epoch_limit,
int nthreads
) {
auto& n = setup.current_epoch;
auto num_epochs = setup.total_epochs;
auto limit_epochs = num_epochs;
if (epoch_limit > 0) {
limit_epochs = std::min(epoch_limit, num_epochs);
}
for (size_t j = start; j < end; ++j) {
if (setup.epoch_of_next_sample[j] > epoch) {
continue;
}

const size_t num_obs = setup.head.size();
std::vector<decltype(seeder())> seeds(num_obs);
std::vector<Float> replace_buffer(num_obs * ndim);
Float* replacement = replace_buffer.data();
bool using_replacement = false;
{
Float* right = embedding + setup.tail[j] * ndim;
Float dist2 = quick_squared_distance(left, right, ndim);
const Float pd2b = std::pow(dist2, b);
const Float grad_coef = (-2 * a * b * pd2b) / (dist2 * (a * pd2b + 1.0));

for (; n < limit_epochs; ++n) {
const Float epoch = n;
const Float alpha = initial_alpha * (1.0 - epoch / num_epochs);
Float* lcopy = left;
for (int d = 0; d < ndim; ++d, ++lcopy, ++right) {
Float gradient = alpha * clamp(grad_coef * (*lcopy - *right));
*lcopy += gradient;
*right -= gradient;
}
}

// Fill the seeds.
for (auto& s : seeds) {
s = seeder();
}
// Remember that 'epochs_per_negative_sample' is defined as 'epochs_per_sample[j] / negative_sample_rate'.
// We just use it inline below rather than defining a new variable and suffering floating-point round-off.
const size_t num_neg_samples = (epoch - setup.epoch_of_next_negative_sample[j]) *
setup.negative_sample_rate / setup.epochs_per_sample[j]; // i.e., 1/epochs_per_negative_sample.

// Input and output alternate between epochs, to avoid the need for a
// copy operation on the entire embedding at the end of each epoch.
Float* reference = (using_replacement ? replacement : embedding);
Float* output = (using_replacement ? embedding : replacement);
using_replacement = !using_replacement;
for (size_t p = 0; p < num_neg_samples; ++p) {
size_t sampled = aarand::discrete_uniform(rng, num_obs);
if (sampled == i) {
continue;
}

#ifndef UMAPPP_CUSTOM_PARALLEL
#pragma omp parallel num_threads(nthreads)
{
std::vector<Float> buffer(ndim);
#pragma omp for
for (size_t i = 0; i < setup.head.size(); ++i) {
#else
UMAPPP_CUSTOM_PARALLEL(setup.head.size(), [&](size_t first, size_t last) -> void {
std::vector<Float> buffer(ndim);
for (size_t i = first; i < last; ++i) {
#endif
const Float* right = embedding + sampled * ndim;
Float dist2 = quick_squared_distance(left, right, ndim);
const Float grad_coef = 2 * gamma * b / ((0.001 + dist2) * (a * std::pow(dist2, b) + 1.0));

size_t shift = i * ndim;
std::copy(reference + shift, reference + shift + ndim, buffer.data());
auto rng = creator(seeds[i]);
optimize_sample<true>(i, ndim, reference, buffer.data(), setup, a, b, gamma, alpha, rng, epoch);
std::copy(buffer.begin(), buffer.end(), output + shift);
Float* lcopy = left;
for (int d = 0; d < ndim; ++d, ++lcopy, ++right) {
*lcopy += alpha * clamp(grad_coef * (*lcopy - *right));
}
}

#ifndef UMAPPP_CUSTOM_PARALLEL
setup.epoch_of_next_sample[j] += setup.epochs_per_sample[j];

// The update to 'epoch_of_next_negative_sample' involves adding
// 'num_neg_samples * epochs_per_negative_sample', which eventually boils
// down to setting epoch_of_next_negative_sample to 'epoch'.
setup.epoch_of_next_negative_sample[j] = epoch;
}
}
#else
}
}, nthreads);
#endif
}

if (using_replacement) {
std::copy(replace_buffer.begin(), replace_buffer.end(), embedding);
}

return;
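For reference, the attractive and repulsive gradient coefficients computed in this file (visible in both the old and new code above) can be written, for squared distance d^2 between two embedded points, as:

\[
\text{grad\_coef}_{\text{attr}} = \frac{-2\,a\,b\,(d^2)^{b}}{d^2\,\bigl(a\,(d^2)^{b} + 1\bigr)},
\qquad
\text{grad\_coef}_{\text{rep}} = \frac{2\,\gamma\,b}{(0.001 + d^2)\,\bigl(a\,(d^2)^{b} + 1\bigr)},
\]

with each coordinate then updated by alpha * clamp(grad_coef * (x_i - x_j)). The commit changes how these updates are applied (always iteratively), not the coefficients themselves.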
6 changes: 2 additions & 4 deletions tests/R/run.R
@@ -42,17 +42,15 @@ test_that("initialization is done correctly", {

test_that("general run is not too inconsistent", {
ref <- uwot::umap(X = mat, nn_method=list(idx=cbind(1:nrow(mat), res$nn.index), dist=cbind(0, res$nn.dist)), a=2, b=1)
obs <- run_umap(t(res$nn.index - 1L), t(res$nn.dist), 2, 2, 1, FALSE, 123)
obs2 <- run_umap(t(res$nn.index - 1L), t(res$nn.dist), 2, 2, 1, TRUE, 123)
obs <- run_umap(t(res$nn.index - 1L), t(res$nn.dist), 2, 2, 1, 123)

# Values are within range.
expect_true(all(obs < 10 & obs > -10))
# expect_true(all(obs2 < 10 & obs2 > -10)) # Who knows why GHA doesn't like this, but oh well.

png("demo.png", width=10, height=5, units="in", res=120)
par(mfrow=c(1,3))
par(mfrow=c(1,2))
plot(ref[,1], ref[,2], col=id, main="uwot")
plot(obs[,1], obs[,2], col=id, main="umappp")
plot(obs2[,1], obs2[,2], col=id, main="umappp (batched)")
dev.off()
})
4 changes: 2 additions & 2 deletions tests/R/test.cpp
@@ -34,7 +34,7 @@ Rcpp::List initialize_umap(Rcpp::IntegerMatrix indices, Rcpp::NumericMatrix dist
}

// [[Rcpp::export(rng=false)]]
Rcpp::NumericMatrix run_umap(Rcpp::IntegerMatrix indices, Rcpp::NumericMatrix distances, int ndim, double a, double b, bool batch, int seed) {
Rcpp::NumericMatrix run_umap(Rcpp::IntegerMatrix indices, Rcpp::NumericMatrix distances, int ndim, double a, double b, int seed) {
int nr = indices.nrow(), nc = indices.ncol();
umappp::NeighborList<> x(nc);
for (int i = 0; i < nc; ++i) {
Expand All @@ -47,7 +47,7 @@ Rcpp::NumericMatrix run_umap(Rcpp::IntegerMatrix indices, Rcpp::NumericMatrix di

Rcpp::NumericMatrix output(ndim, nc);
umappp::Umap<> runner;
runner.set_a(a).set_b(b).set_batch(batch).set_seed(seed);
runner.set_a(a).set_b(b).set_seed(seed);
auto status = runner.run(std::move(x), ndim, (double*)output.begin());

return Rcpp::transpose(output);
7 changes: 2 additions & 5 deletions tests/src/Umap.cpp
@@ -12,7 +12,7 @@
#include <random>
#include <cmath>

class UmapTest : public ::testing::TestWithParam<std::tuple<int, int, int> > {
class UmapTest : public ::testing::TestWithParam<std::tuple<int, int> > {
protected:
template<class Param>
void assemble(Param p) {
@@ -46,8 +46,6 @@ TEST_P(UmapTest, Basic) {
assemble(param);

umappp::Umap<> runner;
bool use_batch = std::get<2>(param);
runner.set_batch(use_batch);

std::vector<double> output(nobs * ndim);
auto status = runner.initialize(std::move(stored), ndim, output.data());
@@ -76,8 +74,7 @@ INSTANTIATE_TEST_SUITE_P(
UmapTest,
::testing::Combine(
::testing::Values(50, 100, 200), // number of observations
::testing::Values(5, 10, 15), // number of neighbors
::testing::Values(false, true) // use batching mode
::testing::Values(5, 10, 15) // number of neighbors
)
);
