
add transform data functionality #26

Merged · 11 commits · May 14, 2020
27 changes: 26 additions & 1 deletion README.md
@@ -26,13 +26,38 @@ UMAP can use a precomputed distance matrix instead of finding the nearest neighbors
embedding = umap(distances, n_components; metric=:precomputed)
```
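For instance, the distance matrix might be computed with Distances.jl (a hedged sketch; the data and dimensions here are made up):
```jl
using Distances, UMAP

X = rand(10, 100)                                # 100 samples, 10 features each
distances = pairwise(Euclidean(), X, dims=2)     # symmetric (100, 100) matrix
embedding = umap(distances, 2; metric=:precomputed)
```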

## Fitting a UMAP model to a dataset and transforming new data

### Constructing a model
To construct a model to use for embedding new data, use the constructor:
```jl
model = UMAP_(X, n_components; <kwargs>)
```
where the constructor takes the same keyword arguments (kwargs) as `umap`. The returned object has the following fields:
```jl
model.graph # The graph of fuzzy simplicial set membership strengths of each point in the dataset
model.embedding # The embedding of the dataset
model.data # A reference to the original dataset
model.knns # A matrix of indices of nearest neighbors of points in the dataset,
# as determined on the original manifold (may be approximate)
model.dists # The distances of the neighbors indicated by model.knns
```
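For example, fitting a model on random data might look like this (a minimal sketch; the data and keyword values are arbitrary):
```jl
using UMAP

X = rand(20, 500)                       # 500 samples with 20 features each
model = UMAP_(X, 2; n_neighbors=15, min_dist=0.1)
size(model.embedding)                   # (2, 500)
```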

### Embedding new data
To transform new data into the existing embedding of a UMAP model, use the `umap_transform` function:
```jl
Q_embedding = umap_transform(Q, model; <kwargs>)
```
where `Q` is a matrix of new query data to embed into the existing embedding, and `model` is the object obtained from the `UMAP_` call above. `Q` must come from a space of the same dimensionality as `model.data` (i.e. `X` in the `UMAP_` call above).

The remaining keyword arguments (kwargs) are the same as for the functions above.
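Continuing the sketch above, 50 new points from the same 20-dimensional space could be embedded like so (keyword values arbitrary):
```jl
Q = rand(20, 50)                        # query data, same dimensionality as X
Q_embedding = umap_transform(Q, model; n_neighbors=15)
size(Q_embedding)                       # (2, 50)
```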

## Implementation Details
There are two main steps involved in UMAP: building a weighted graph with edges connecting points to their nearest neighbors, and optimizing the low-dimensional embedding of that graph. The first step is accomplished either by an exact kNN search (for datasets with `< 4096` points) or by the approximate kNN search algorithm, [NNDescent](https://github.com/dillondaudert/NearestNeighborDescent.jl). This step is also usually the most costly.
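As a rough sketch of what the exact branch does (not the package's actual implementation; `exact_knn` is a hypothetical helper):
```jl
using Distances

# Brute-force kNN: fine for small n, but quadratic in the number of points.
function exact_knn(X::AbstractMatrix, n_neighbors::Integer)
    n = size(X, 2)
    dists = pairwise(Euclidean(), X, dims=2)          # (n, n) distance matrix
    knns = Matrix{Int}(undef, n_neighbors, n)
    knn_dists = Matrix{eltype(dists)}(undef, n_neighbors, n)
    for j in 1:n
        order = sortperm(view(dists, :, j))[2:n_neighbors+1]  # drop self-match
        knns[:, j] = order
        knn_dists[:, j] = dists[order, j]
    end
    return knns, knn_dists
end
```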

The low-dimensional embedding is initialized (by default) with the eigenvectors of the normalized Laplacian of the kNN graph. These are found using ARPACK (via [Arpack.jl](https://github.com/JuliaLinearAlgebra/Arpack.jl)).
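In toy form, that initialization looks roughly like this (dense matrices and an exact eigensolver stand in for the sparse ARPACK computation; illustrative only):
```jl
using LinearAlgebra

A = [0.0 1 1; 1 0 1; 1 1 0]                  # toy symmetric kNN adjacency
D = Diagonal(vec(sum(A, dims=1)))            # degree matrix
L = I - inv(sqrt(D)) * A * inv(sqrt(D))      # normalized Laplacian
vals, vecs = eigen(Symmetric(L))             # eigenvalues in ascending order
layout = permutedims(vecs[:, 2:3])           # skip the trivial eigenvector
```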

## Current Limitations
- **No transform**: Only one-time embeddings are possible at the moment. That is to say, it isn't possible to "fit" UMAP to a dataset and then use it to "transform" new data
- **Input data types**: Only data points that are represented by vectors of numbers (passed in as a matrix) are valid inputs. This is mostly due to a lack of support for other formats in [NNDescent](https://github.com/dillondaudert/NearestNeighborDescent.jl). Support for e.g. string datasets is possible in the future
- **Sequential**: This implementation does not take advantage of any parallelism

2 changes: 1 addition & 1 deletion src/UMAP.jl
@@ -11,6 +11,6 @@ include("utils.jl")
include("embeddings.jl")
include("umap_.jl")

export umap, UMAP_
export umap, UMAP_, umap_transform

end # module
110 changes: 93 additions & 17 deletions src/embeddings.jl
@@ -19,6 +19,24 @@ function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:random})
return [20 .* rand(T, n_components) .- 10 for _ in 1:size(graph, 1)]
end

"""
initialize_embedding(graph::AbstractMatrix{<:Real}, ref_embedding::AbstractMatrix{T<:AbstractFloat}) -> embedding

Initialize an embedding of points corresponding to the columns of the `graph`, by taking weighted means of
the columns of `ref_embedding`, where weights are values from the rows of the `graph`.

The resulting embedding will have shape `(size(ref_embedding, 1), length(query_inds))`, where `size(ref_embedding, 1)`
is the number of components (dimensions) of the `reference embedding`, and `length(query_inds)` is the number of
samples in the resulting embedding. Its elements will have type T.
"""

Owner: Should replace `length(query_inds)` with `size(graph, 2)`, since `query_inds` isn't an argument that's passed to this. Also note that the returned value of this function isn't a matrix, but a vector of vectors (intentionally).

Contributor Author: ah, my bad; forgot to update this documentation.
function initialize_embedding(graph::AbstractMatrix{<:Real}, ref_embedding::AbstractMatrix{T}) where {T<:AbstractFloat}
embed = [zeros(T, size(ref_embedding, 1)) for _ in 1:size(graph, 2)]
Owner: You never use these zero vectors, so this is a bit unnecessary. To create an array with the correct type, you could do something like `embed = Vector{Vector{T}}[]`

for (i, col) in enumerate(eachcol(graph))
Owner: You can simplify this expression, since this is just matrix multiplication: `embed = collect(eachcol((ref_embedding * graph) ./ (sum(graph, dims=1) .+ eps(T))))`. Double-check it, but it should be equivalent. Also, you'll want `eps(T)` over `eps(zero(T))`.

Contributor Author: good catch

embed[i] = vec(sum(transpose(col) .* ref_embedding, dims=2) ./ (sum(col) + eps(zero(T))))
end
return embed
end
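A quick numeric check of the reviewer's equivalence claim (illustrative only, not part of the PR; it uses `eps(T)` on both sides, per the review):
```jl
using LinearAlgebra

T = Float64
graph = rand(T, 4, 3)            # 4 reference points, 3 query points
ref_embedding = rand(T, 2, 4)    # 2-D embedding of the reference points

loop_embed = [vec(sum(transpose(col) .* ref_embedding, dims=2) ./ (sum(col) + eps(T)))
              for col in eachcol(graph)]
matmul_embed = collect(eachcol((ref_embedding * graph) ./ (sum(graph, dims=1) .+ eps(T))))
@assert all(loop_embed .≈ matmul_embed)
```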

"""
spectral_layout(graph, embed_dim) -> embedding

@@ -45,6 +63,13 @@ function spectral_layout(graph::SparseMatrixCSC{T},
return convert.(T, layout)
end


# The optimize_embedding methods have parameters typed as ::AbstractVector{<:AbstractVector{T}}.
# AbstractVector{<:AbstractVector{T}} allows arguments to be views of some other array/matrix
# rather than just vectors themselves (so we can avoid copying model.data and instead just
# create a view to satisfy our reshaping needs). For example, calling collect(eachcol(X)) creates
# an Array of SubArrays, and SubArray is not an Array, but SubArray <: AbstractArray.

"""
optimize_embedding(graph, embedding, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate) -> embedding

Expand All @@ -53,22 +78,71 @@ Optimize an embedding by minimizing the fuzzy set cross entropy between the high
# Arguments
- `graph`: a sparse matrix of shape (n_samples, n_samples)
- `embedding`: a vector of length (n_samples,) of vectors representing the embedded data points
- `n_epochs`: the number of training epochs for optimization
- `n_epochs::Integer`: the number of training epochs for optimization
- `initial_alpha`: the initial learning rate
- `gamma`: the repulsive strength of negative samples
- `neg_sample_rate::Integer`: the number of negative samples per positive sample
"""
function optimize_embedding(graph,
embedding,
n_epochs,
embedding::AbstractVector{<:AbstractVector{<:AbstractFloat}},
Owner: I'm not sure we need these type constraints. I actually think this package is overly type constrained (that's mostly my doing), when it could be much more generic.

Contributor Author: So one issue was having the optimize_embedding methods with both one embedding argument and with two (for query and ref): the call would not be properly dispatched because of the default arguments to `_a` and `_b`, and I was resolving this with type constraints. But I can remove the default arguments or remove the overloaded methods, if you have a preference towards either of those?

Contributor Author: If you have no preference, I will just remove the original optimize_embedding method, so all calls will have to pass two embeddings (or the same one twice); I'm starting to think the overloading is more unnecessary/messy than good.
n_epochs::Integer,
initial_alpha,
min_dist,
spread,
gamma,
neg_sample_rate,
_a=nothing,
_b=nothing)
return optimize_embedding(
graph,
embedding,
embedding,
n_epochs,
initial_alpha,
min_dist,
spread,
gamma,
neg_sample_rate,
_a,
_b,
move_ref=true
)
end

"""
optimize_embedding(graph, query_embedding, ref_embedding, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate, _a=nothing, _b=nothing; move_ref=false) -> embedding

Optimize an embedding by minimizing the fuzzy set cross entropy between the high and low dimensional simplicial sets using stochastic gradient descent.
Optimize "query" samples with respect to "reference" samples.

# Arguments
- `graph`: a sparse matrix of shape (n_samples, n_samples)
- `query_embedding`: a vector of length (n_samples,) of vectors representing the embedded data points to be optimized
- `ref_embedding`: a vector of length (n_samples,) of vectors representing the embedded data points to optimize against
- `n_epochs::Integer`: the number of training epochs for optimization
- `initial_alpha`: the initial learning rate
- `gamma`: the repulsive strength of negative samples
- `neg_sample_rate`: the number of negative samples per positive sample
- `_a`: one of two parameters that control the shape of the embedding curve. If the actual argument is `nothing`, this is determined automatically by `min_dist` and `spread`.
- `_b`: the second such parameter; likewise determined automatically when the actual argument is `nothing`.

# Keyword Arguments
- `move_ref::Bool = false`: if true, also improve the embeddings in `ref_embedding`, else fix them and only improve embeddings in `query_embedding`.
"""
function optimize_embedding(graph,
query_embedding::AbstractVector{<:AbstractVector{T}},
ref_embedding::AbstractVector{<:AbstractVector{T}},
n_epochs::Integer,
initial_alpha,
min_dist,
spread,
gamma,
neg_sample_rate,
_a=nothing,
_b=nothing;
move_ref::Bool=false) where {T <: AbstractFloat}
a, b = fit_ab(min_dist, spread, _a, _b)
self_reference = query_embedding === ref_embedding

alpha = initial_alpha
for e in 1:n_epochs
@@ -77,42 +151,44 @@ function optimize_embedding(graph,
j = rowvals(graph)[ind]
p = nonzeros(graph)[ind]
if rand() <= p
sdist = evaluate(SqEuclidean(), embedding[i], embedding[j])
sdist = evaluate(SqEuclidean(), query_embedding[i], ref_embedding[j])
if sdist > 0
delta = (-2 * a * b * sdist^(b-1))/(1 + a*sdist^b)
else
delta = 0
end
@simd for d in eachindex(embedding[i])
grad = clamp(delta * (embedding[i][d] - embedding[j][d]), -4, 4)
embedding[i][d] += alpha * grad
embedding[j][d] -= alpha * grad
@simd for d in eachindex(query_embedding[i])
grad = clamp(delta * (query_embedding[i][d] - ref_embedding[j][d]), -4, 4)
query_embedding[i][d] += alpha * grad
if move_ref
ref_embedding[j][d] -= alpha * grad
end
end

for _ in 1:neg_sample_rate
k = rand(1:size(graph, 2))
i != k || continue
sdist = evaluate(SqEuclidean(), embedding[i], embedding[k])
k = rand(eachindex(ref_embedding))
if i == k && self_reference
continue
end
sdist = evaluate(SqEuclidean(), query_embedding[i], ref_embedding[k])
if sdist > 0
delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b))
else
delta = 0
end
@simd for d in eachindex(embedding[i])
@simd for d in eachindex(query_embedding[i])
if delta > 0
grad = clamp(delta * (embedding[i][d] - embedding[k][d]), -4, 4)
grad = clamp(delta * (query_embedding[i][d] - ref_embedding[k][d]), -4, 4)
else
grad = 4
end
embedding[i][d] += alpha * grad
query_embedding[i][d] += alpha * grad
end
end

end
end
end
alpha = initial_alpha*(1 - e//n_epochs)
end

return embedding
return query_embedding
end
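An illustrative call with toy inputs (all argument values arbitrary; this assumes the package context, since `optimize_embedding` is not exported and `fit_ab` is internal):
```jl
using SparseArrays

graph = sparse([1, 2], [2, 1], [0.5, 0.5], 2, 2)   # toy membership graph
query = [rand(2) for _ in 1:2]                     # 2 query points in 2-D
ref = [rand(2) for _ in 1:2]                       # 2 reference points in 2-D

# graph, query, ref, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate
embedding = optimize_embedding(graph, query, ref, 100, 1.0, 0.1, 1.0, 1.0, 5;
                               move_ref=false)
```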