Remove batched mode as it compromises result quality too much.
More specifically, the outcome from batch updates is too different from that of
the iterative updates, such that it's difficult to recommend the former as a
drop-in replacement for the latter. Even though batch mode is easier to parallelize
per epoch, it converges slowly enough that we need to spend more epochs (briefly
discussed in jlmelville/uwot#83), which defeats the purpose.

By removing batch processing, we also simplify the code a bit, which is nice.
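To illustrate the difference in update order (a minimal sketch, not the actual umappp kernel; the gradient function and step size are made up for illustration):

#include <cstddef>
#include <vector>

// Stand-in for the real attractive/repulsive gradient computation.
double toy_gradient(double xi, double xj) {
    return 0.01 * (xj - xi);
}

// Iterative (retained) mode: each point moves as soon as its force is
// computed, so later points see already-updated positions.
void iterative_epoch(std::vector<double>& x) {
    for (std::size_t i = 1; i < x.size(); ++i) {
        x[i] += toy_gradient(x[i], x[i - 1]); // reads the updated x[i-1]
    }
}

// Batched (removed) mode: forces are computed against a fixed snapshot and
// applied at the end of the epoch. Each iteration is independent and thus
// easy to parallelize, but the optimization trajectory differs.
void batched_epoch(std::vector<double>& x) {
    std::vector<double> delta(x.size(), 0.0);
    for (std::size_t i = 1; i < x.size(); ++i) {
        delta[i] = toy_gradient(x[i], x[i - 1]); // reads the stale x[i-1]
    }
    for (std::size_t i = 0; i < x.size(); ++i) {
        x[i] += delta[i];
    }
}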
LTLA authored Mar 17, 2023
1 parent 76dd3d3 commit d579059
Showing 7 changed files with 62 additions and 217 deletions.
2 changes: 1 addition & 1 deletion extern/CMakeLists.txt
@@ -11,7 +11,7 @@ FetchContent_MakeAvailable(knncolle)
FetchContent_Declare(
aarand
GIT_REPOSITORY https://github.com/LTLA/aarand
GIT_TAG 2a8509c499f668bf424306f1aa986da429902c71
GIT_TAG e8397e9e648379f6086b20a83910c39ed8e4dd45
)

FetchContent_MakeAvailable(aarand)
61 changes: 11 additions & 50 deletions include/umappp/Umap.hpp
@@ -156,11 +156,6 @@ class Umap {
*/
static constexpr uint64_t seed = 1234567890;

/**
* See `set_batch()`.
*/
static constexpr bool batch = false;

/**
* See `set_num_threads()`.
*/
@@ -184,7 +179,6 @@ Float b = Defaults::b;
Float b = Defaults::b;
Float repulsion_strength = Defaults::repulsion_strength;
Float learning_rate = Defaults::learning_rate;
bool batch = Defaults::batch;
int nthreads = Defaults::num_threads;
};

@@ -359,29 +353,12 @@
return *this;
}

/**
* @param b Whether to optimize in batch mode.
* Batch mode is required for effective parallelization via OpenMP but may reduce the stability of the gradient descent.
*
* Batch mode involves computing forces for all observations and applying them simultaneously.
* This is in contrast to the default where the location of each observation is updated before the forces are computed for the next observation.
* As each observation's forces are computed independently, batch mode is more amenable to parallelization;
* however, this comes at the cost of stability as the force calculations for later observations are not aware of updates to the positions of earlier observations.
*
* @return A reference to this `Umap` object.
*/
Umap& set_batch(bool b = Defaults::batch) {
rparams.batch = b;
return *this;
}

/**
* @param n Number of threads to use.
*
* @return A reference to this `Umap` object.
*
* This setting affects nearest neighbor detection (if an existing list of neighbors is not supplied in `initialize()` or `run()`) and spectral initialization.
* If `set_batch()` is `true`, multiple threads will also be used during layout optimization.
*
* The `UMAPPP_CUSTOM_PARALLEL` macro can be set to a function that specifies a custom parallelization scheme.
* This function should be a template that accepts three arguments:
@@ -471,33 +448,17 @@
*
*/
void run(int epoch_limit = 0) {
if (!rparams.batch) {
optimize_layout(
ndim_,
embedding_,
epochs,
rparams.a,
rparams.b,
rparams.repulsion_strength,
rparams.learning_rate,
engine,
epoch_limit
);
} else {
optimize_layout_batched(
ndim_,
embedding_,
epochs,
rparams.a,
rparams.b,
rparams.repulsion_strength,
rparams.learning_rate,
[&]() -> auto { return engine(); },
[](decltype(engine()) s) -> auto { return std::mt19937_64(s); },
epoch_limit,
rparams.nthreads
);
}
optimize_layout(
ndim_,
embedding_,
epochs,
rparams.a,
rparams.b,
rparams.repulsion_strength,
rparams.learning_rate,
engine,
epoch_limit
);
return;
}
};
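For context, a minimal usage sketch of the simplified interface after this change, based on the calls exercised in the tests below. The exact element type of umappp::NeighborList and the toy neighbor values are assumptions for illustration:

#include "umappp/Umap.hpp"
#include <utility>
#include <vector>

int main() {
    // Hypothetical precomputed neighbors: for each observation, a list of
    // (neighbor index, distance) pairs.
    int nobs = 3, ndim = 2;
    umappp::NeighborList<> neighbors(nobs);
    neighbors[0].emplace_back(1, 0.5); neighbors[0].emplace_back(2, 1.0);
    neighbors[1].emplace_back(0, 0.5); neighbors[1].emplace_back(2, 0.8);
    neighbors[2].emplace_back(1, 0.8); neighbors[2].emplace_back(0, 1.0);

    std::vector<double> embedding(nobs * ndim);
    umappp::Umap<> runner;
    runner.set_a(2).set_b(1).set_seed(42); // set_batch() no longer exists
    runner.run(std::move(neighbors), ndim, embedding.data());
    return 0;
}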
182 changes: 44 additions & 138 deletions include/umappp/optimize_layout.hpp
@@ -105,75 +105,6 @@ void optimize_sample(
auto& epoch_of_next_sample = setup.epoch_of_next_sample;
auto& epoch_of_next_negative_sample = setup.epoch_of_next_negative_sample;

const size_t num_obs = head.size();
const Float negative_sample_rate = setup.negative_sample_rate;

size_t start = (i == 0 ? 0 : setup.head[i-1]), end = setup.head[i];
Float* left = embedding + i * ndim;

for (size_t j = start; j < end; ++j) {
if (epoch_of_next_sample[j] > epoch) {
continue;
}

Float* right = embedding + tail[j] * ndim;
Float dist2 = quick_squared_distance(left, right, ndim);
const Float pd2b = std::pow(dist2, b);
const Float grad_coef = (-2 * a * b * pd2b) / (dist2 * (a * pd2b + 1.0));
{
Float* lcopy = left;
Float* rcopy = right;

for (int d = 0; d < ndim; ++d, ++lcopy, ++rcopy) {
Float gradient = alpha * clamp(grad_coef * (*lcopy - *rcopy));
if constexpr(!batch) {
*lcopy += gradient;
*rcopy -= gradient;
} else {
// Doubling as we'll assume symmetry from the same
// force applied by the right node. This allows us to
// avoid worrying about accounting for modifications to
// the right node.
buffer[d] += 2 * gradient;
}
}
}

// Remember that 'epochs_per_negative_sample' is defined as 'epochs_per_sample[j] / negative_sample_rate'.
// We just use it inline below rather than defining a new variable and suffering floating-point round-off.
const size_t num_neg_samples = (epoch - epoch_of_next_negative_sample[j]) *
negative_sample_rate / epochs_per_sample[j]; // i.e., 1/epochs_per_negative_sample.

for (size_t p = 0; p < num_neg_samples; ++p) {
size_t sampled = aarand::discrete_uniform(rng, num_obs);
if (sampled == i) {
continue;
}

Float* right = embedding + sampled * ndim;
Float dist2 = quick_squared_distance(left, right, ndim);
const Float grad_coef = 2 * gamma * b / ((0.001 + dist2) * (a * std::pow(dist2, b) + 1.0));
{
Float* lcopy = left;
const Float* rcopy = right;
for (int d = 0; d < ndim; ++d, ++lcopy, ++rcopy) {
Float gradient = alpha * clamp(grad_coef * (*lcopy - *rcopy));
if constexpr(!batch) {
*lcopy += gradient;
} else {
buffer[d] += gradient;
}
}
}
}

epoch_of_next_sample[j] += epochs_per_sample[j];

// The update to 'epoch_of_next_negative_sample' involves adding
// 'num_neg_samples * epochs_per_negative_sample', which eventually boils
// down to setting epoch_of_next_negative_sample to 'epoch'.
epoch_of_next_negative_sample[j] = epoch;
}
}
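As a worked example of the negative-sampling bookkeeping in the block above (the numbers are purely illustrative, not taken from the code): with epochs_per_sample[j] = 2 and negative_sample_rate = 5, the implied epochs_per_negative_sample is 0.4. At epoch = 10 with epoch_of_next_negative_sample[j] = 8.2, the count is num_neg_samples = (10 - 8.2) * 5 / 2 = 4.5, truncated to 4, after which epoch_of_next_negative_sample[j] is effectively reset to the current epoch.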

template<typename Float, class Setup, class Rng>
@@ -194,89 +125,64 @@ void optimize_layout(
if (epoch_limit> 0) {
limit_epochs = std::min(epoch_limit, num_epochs);
}


const size_t num_obs = setup.head.size();
for (; n < limit_epochs; ++n) {
const Float epoch = n;
const Float alpha = initial_alpha * (1.0 - epoch / num_epochs);
for (size_t i = 0; i < setup.head.size(); ++i) {
optimize_sample<false>(i, ndim, embedding, static_cast<Float*>(NULL), setup, a, b, gamma, alpha, rng, epoch);
}
}

return;
}
for (size_t i = 0; i < num_obs; ++i) {
size_t start = (i == 0 ? 0 : setup.head[i-1]), end = setup.head[i];
Float* left = embedding + i * ndim;

template<typename Float, class Setup, class SeedFunction, class EngineFunction>
inline void optimize_layout_batched(
int ndim,
Float* embedding,
Setup& setup,
Float a,
Float b,
Float gamma,
Float initial_alpha,
SeedFunction seeder,
EngineFunction creator,
int epoch_limit,
int nthreads
) {
auto& n = setup.current_epoch;
auto num_epochs = setup.total_epochs;
auto limit_epochs = num_epochs;
if (epoch_limit > 0) {
limit_epochs = std::min(epoch_limit, num_epochs);
}
for (size_t j = start; j < end; ++j) {
if (setup.epoch_of_next_sample[j] > epoch) {
continue;
}

const size_t num_obs = setup.head.size();
std::vector<decltype(seeder())> seeds(num_obs);
std::vector<Float> replace_buffer(num_obs * ndim);
Float* replacement = replace_buffer.data();
bool using_replacement = false;
{
Float* right = embedding + setup.tail[j] * ndim;
Float dist2 = quick_squared_distance(left, right, ndim);
const Float pd2b = std::pow(dist2, b);
const Float grad_coef = (-2 * a * b * pd2b) / (dist2 * (a * pd2b + 1.0));

for (; n < limit_epochs; ++n) {
const Float epoch = n;
const Float alpha = initial_alpha * (1.0 - epoch / num_epochs);
Float* lcopy = left;
for (int d = 0; d < ndim; ++d, ++lcopy, ++right) {
Float gradient = alpha * clamp(grad_coef * (*lcopy - *right));
*lcopy += gradient;
*right -= gradient;
}
}

// Fill the seeds.
for (auto& s : seeds) {
s = seeder();
}
// Remember that 'epochs_per_negative_sample' is defined as 'epochs_per_sample[j] / negative_sample_rate'.
// We just use it inline below rather than defining a new variable and suffering floating-point round-off.
const size_t num_neg_samples = (epoch - setup.epoch_of_next_negative_sample[j]) *
setup.negative_sample_rate / setup.epochs_per_sample[j]; // i.e., 1/epochs_per_negative_sample.

// Input and output alternate between epochs, to avoid the need for a
// copy operation on the entire embedding at the end of each epoch.
Float* reference = (using_replacement ? replacement : embedding);
Float* output = (using_replacement ? embedding : replacement);
using_replacement = !using_replacement;
for (size_t p = 0; p < num_neg_samples; ++p) {
size_t sampled = aarand::discrete_uniform(rng, num_obs);
if (sampled == i) {
continue;
}

#ifndef UMAPPP_CUSTOM_PARALLEL
#pragma omp parallel num_threads(nthreads)
{
std::vector<Float> buffer(ndim);
#pragma omp for
for (size_t i = 0; i < setup.head.size(); ++i) {
#else
UMAPPP_CUSTOM_PARALLEL(setup.head.size(), [&](size_t first, size_t last) -> void {
std::vector<Float> buffer(ndim);
for (size_t i = first; i < last; ++i) {
#endif
const Float* right = embedding + sampled * ndim;
Float dist2 = quick_squared_distance(left, right, ndim);
const Float grad_coef = 2 * gamma * b / ((0.001 + dist2) * (a * std::pow(dist2, b) + 1.0));

size_t shift = i * ndim;
std::copy(reference + shift, reference + shift + ndim, buffer.data());
auto rng = creator(seeds[i]);
optimize_sample<true>(i, ndim, reference, buffer.data(), setup, a, b, gamma, alpha, rng, epoch);
std::copy(buffer.begin(), buffer.end(), output + shift);
Float* lcopy = left;
for (int d = 0; d < ndim; ++d, ++lcopy, ++right) {
*lcopy += alpha * clamp(grad_coef * (*lcopy - *right));
}
}

#ifndef UMAPPP_CUSTOM_PARALLEL
setup.epoch_of_next_sample[j] += setup.epochs_per_sample[j];

// The update to 'epoch_of_next_negative_sample' involves adding
// 'num_neg_samples * epochs_per_negative_sample', which eventually boils
// down to setting epoch_of_next_negative_sample to 'epoch'.
setup.epoch_of_next_negative_sample[j] = epoch;
}
}
#else
}
}, nthreads);
#endif
}

if (using_replacement) {
std::copy(replace_buffer.begin(), replace_buffer.end(), embedding);
}

return;
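For reference, the attractive and repulsive gradient coefficients computed in this file (visible in both the old and new code above) can be written, for squared distance d^2 between two embedded points, as:

\[
\text{grad\_coef}_{\text{attr}} = \frac{-2\,a\,b\,(d^2)^{b}}{d^2\,\bigl(a\,(d^2)^{b} + 1\bigr)},
\qquad
\text{grad\_coef}_{\text{rep}} = \frac{2\,\gamma\,b}{(0.001 + d^2)\,\bigl(a\,(d^2)^{b} + 1\bigr)},
\]

with each coordinate then updated by alpha * clamp(grad_coef * (x_i - x_j)). The commit changes how these updates are applied (always iteratively), not the coefficients themselves.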
6 changes: 2 additions & 4 deletions tests/R/run.R
@@ -42,17 +42,15 @@ test_that("initialization is done correctly", {

test_that("general run is not too inconsistent", {
ref <- uwot::umap(X = mat, nn_method=list(idx=cbind(1:nrow(mat), res$nn.index), dist=cbind(0, res$nn.dist)), a=2, b=1)
obs <- run_umap(t(res$nn.index - 1L), t(res$nn.dist), 2, 2, 1, FALSE, 123)
obs2 <- run_umap(t(res$nn.index - 1L), t(res$nn.dist), 2, 2, 1, TRUE, 123)
obs <- run_umap(t(res$nn.index - 1L), t(res$nn.dist), 2, 2, 1, 123)

# Values are within range.
expect_true(all(obs < 10 & obs > -10))
# expect_true(all(obs2 < 10 & obs2 > -10)) # Who knows why GHA doesn't like this, but oh well.

png("demo.png", width=10, height=5, units="in", res=120)
par(mfrow=c(1,3))
par(mfrow=c(1,2))
plot(ref[,1], ref[,2], col=id, main="uwot")
plot(obs[,1], obs[,2], col=id, main="umappp")
plot(obs2[,1], obs2[,2], col=id, main="umappp (batched)")
dev.off()
})
4 changes: 2 additions & 2 deletions tests/R/test.cpp
@@ -34,7 +34,7 @@ Rcpp::List initialize_umap(Rcpp::IntegerMatrix indices, Rcpp::NumericMatrix dist
}

// [[Rcpp::export(rng=false)]]
Rcpp::NumericMatrix run_umap(Rcpp::IntegerMatrix indices, Rcpp::NumericMatrix distances, int ndim, double a, double b, bool batch, int seed) {
Rcpp::NumericMatrix run_umap(Rcpp::IntegerMatrix indices, Rcpp::NumericMatrix distances, int ndim, double a, double b, int seed) {
int nr = indices.nrow(), nc = indices.ncol();
umappp::NeighborList<> x(nc);
for (int i = 0; i < nc; ++i) {
Expand All @@ -47,7 +47,7 @@ Rcpp::NumericMatrix run_umap(Rcpp::IntegerMatrix indices, Rcpp::NumericMatrix di

Rcpp::NumericMatrix output(ndim, nc);
umappp::Umap<> runner;
runner.set_a(a).set_b(b).set_batch(batch).set_seed(seed);
runner.set_a(a).set_b(b).set_seed(seed);
auto status = runner.run(std::move(x), ndim, (double*)output.begin());

return Rcpp::transpose(output);
7 changes: 2 additions & 5 deletions tests/src/Umap.cpp
@@ -12,7 +12,7 @@
#include <random>
#include <cmath>

class UmapTest : public ::testing::TestWithParam<std::tuple<int, int, int> > {
class UmapTest : public ::testing::TestWithParam<std::tuple<int, int> > {
protected:
template<class Param>
void assemble(Param p) {
@@ -46,8 +46,6 @@ TEST_P(UmapTest, Basic) {
assemble(param);

umappp::Umap<> runner;
bool use_batch = std::get<2>(param);
runner.set_batch(use_batch);

std::vector<double> output(nobs * ndim);
auto status = runner.initialize(std::move(stored), ndim, output.data());
@@ -76,8 +74,7 @@ INSTANTIATE_TEST_SUITE_P(
UmapTest,
::testing::Combine(
::testing::Values(50, 100, 200), // number of observations
::testing::Values(5, 10, 15), // number of neighbors
::testing::Values(false, true) // use batching mode
::testing::Values(5, 10, 15) // number of neighbors
)
);
