Skip to content

Commit

Permalink
Merge pull request #22 from dillondaudert/input_data_fmt
Browse files Browse the repository at this point in the history
Reduce mem usage in optimize_embedding; eigs which=:SM (fix)
  • Loading branch information
dillondaudert authored Dec 27, 2019
2 parents 24adafb + b39b179 commit 3c6987a
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 123 deletions.
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LsqFit = "2fda8390-95c7-5789-9bda-21331edee243"
NearestNeighborDescent = "dd2c4c9e-a32f-5b2f-b342-08c2f244fce8"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

[compat]
Expand Down
1 change: 1 addition & 0 deletions src/UMAP.jl
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ using NearestNeighborDescent
using SparseArrays

include("utils.jl")
include("embeddings.jl")
include("umap_.jl")

export umap, UMAP_
Expand Down
118 changes: 118 additions & 0 deletions src/embeddings.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# initializing and optimizing embeddings

"""
    initialize_embedding(graph, n_components, ::Val{:spectral}) -> embedding

Initialize an embedding for the nodes of `graph` from `spectral_layout`.

The layout is rescaled so that its largest absolute coordinate is 10, and
`randn` noise scaled by 1/10000 is added to every coordinate. The result is a
vector with one entry per column of the layout matrix (each entry is a view
into that matrix).

If any step throws, the error is logged and the `Val(:random)` initialization
is returned instead.
"""
function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:spectral}) where {T}
    local embed
    try
        layout = spectral_layout(graph, n_components)
        # expand: scale by the largest *magnitude*. Using the largest signed
        # value would mis-scale (or even flip) a layout whose dominant
        # coordinates happen to be negative.
        expansion = 10 / maximum(abs, layout)
        layout .= (layout .* expansion) .+ (1//10000) .* randn.(T)
        embed = collect(eachcol(layout))
    catch e
        # best-effort: any failure above falls back to the random layout
        @info "$e\nError encountered in spectral_layout; defaulting to random layout"
        embed = initialize_embedding(graph, n_components, Val(:random))
    end
    return embed
end

"""
    initialize_embedding(graph, n_components, ::Val{:random}) -> embedding

Return one random vector of length `n_components` per row of `graph`. Every
coordinate is computed as `20 * rand(T) - 10`, with `T` the element type of
`graph`.
"""
function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:random}) where {T}
    n_points = size(graph, 1)
    return map(1:n_points) do _
        coords = rand(T, n_components)
        20 .* coords .- 10
    end
end

"""
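    spectral_layout(graph, embed_dim) -> embedding

Initialize the graph layout with a spectral embedding: the eigenvectors of the
normalized Laplacian of `graph` belonging to its 2nd through `embed_dim + 1`-th
smallest eigenvalues. `embedding` is a dense matrix of shape `(embed_dim, n)`,
where `n` is the number of nodes in `graph`, with the same element type as `graph`.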
"""
function spectral_layout(graph::SparseMatrixCSC{T},
                         embed_dim::Integer) where {T<:Real}
    # The eigensolver is started from a Float64 vector and its result is
    # asserted to be Float64, so do the whole computation on a Float64 copy of
    # the graph rather than relying on promotion of `graph` itself.
    graph_f64 = convert.(Float64, graph)
    # row sums of the graph, as a vector
    degrees = dropdims(sum(graph_f64; dims=2); dims=2)
    # D = diag(degrees)^(-1/2)
    D = inv(sqrt(Diagonal(degrees)))
    # normalized laplacian
    L = Symmetric(I - D*graph_f64*D)

    k = embed_dim+1
    num_lanczos_vectors = max(2k+1, round(Int, sqrt(size(L, 1))))
    # get the 2nd - embed_dim+1th smallest eigenvectors
    # (only the eigenvectors, the second returned value, are needed)
    _, eigenvecs = eigs(L; nev=k,
                        ncv=num_lanczos_vectors,
                        which=:SM,
                        tol=1e-4,
                        v0=ones(Float64, size(L, 1)),
                        maxiter=size(L, 1)*5)
    # drop the first eigenvector and store one node per column
    layout = permutedims(eigenvecs[:, 2:k])::Array{Float64, 2}
    # hand the result back in the element type of the input graph
    return convert.(T, layout)
end

"""
    optimize_embedding(graph, embedding, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate, _a=nothing, _b=nothing) -> embedding

Optimize an embedding by minimizing the fuzzy set cross entropy between the high and low dimensional simplicial sets using stochastic gradient descent. `embedding` is modified in place and returned.

# Arguments
- `graph`: a sparse matrix of shape (n_samples, n_samples)
- `embedding`: a vector of length (n_samples,) of vectors representing the embedded data points
- `n_epochs`: the number of training epochs for optimization
- `initial_alpha`: the initial learning rate
- `min_dist`, `spread`: passed to `fit_ab` to determine the curve parameters `a` and `b`
- `gamma`: the repulsive strength of negative samples
- `neg_sample_rate::Integer`: the number of negative samples per positive sample
- `_a`, `_b`: optional values forwarded to `fit_ab` together with `min_dist` and `spread`
"""
function optimize_embedding(graph,
                            embedding,
                            n_epochs,
                            initial_alpha,
                            min_dist,
                            spread,
                            gamma,
                            neg_sample_rate,
                            _a=nothing,
                            _b=nothing)
    # `embedding` is updated in place; the same object is returned.
    #
    # Everything below runs under `@inbounds` and indexes `embedding` with both
    # row and column indices of `graph`, so reject inputs that would otherwise
    # be read or written out of bounds.
    n_points = length(embedding)
    if size(graph, 1) > n_points || size(graph, 2) > n_points
        throw(DimensionMismatch(
            "graph has size $(size(graph)) but embedding has only $n_points points"))
    end

    # `a` and `b` come from `fit_ab` (defined elsewhere); presumably the curve
    # parameters of the low dimensional similarity function -- confirm there.
    a, b = fit_ab(min_dist, spread, _a, _b)

    # loop-invariant accessors into the sparse storage of `graph`
    rows = rowvals(graph)
    weights = nonzeros(graph)
    n_cols = size(graph, 2)

    alpha = initial_alpha
    for e in 1:n_epochs
        @inbounds for i in 1:n_cols
            for ind in nzrange(graph, i)
                j = rows[ind]
                p = weights[ind]
                # use the stored entry (j, i) as a positive sample with
                # probability given by its value
                rand() <= p || continue

                # positive sample: i and j receive opposite steps
                sdist = evaluate(SqEuclidean(), embedding[i], embedding[j])
                if sdist > 0
                    delta = (-2 * a * b * sdist^(b-1))/(1 + a*sdist^b)
                else
                    delta = zero(sdist)
                end
                @simd for d in eachindex(embedding[i])
                    grad = clamp(delta * (embedding[i][d] - embedding[j][d]), -4, 4)
                    embedding[i][d] += alpha * grad
                    embedding[j][d] -= alpha * grad
                end

                # negative samples: only point i is moved
                for _ in 1:neg_sample_rate
                    k = rand(1:n_cols)
                    i != k || continue
                    sdist = evaluate(SqEuclidean(), embedding[i], embedding[k])
                    if sdist > 0
                        delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b))
                    else
                        delta = zero(sdist)
                    end
                    @simd for d in eachindex(embedding[i])
                        if delta > 0
                            grad = clamp(delta * (embedding[i][d] - embedding[k][d]), -4, 4)
                        else
                            # no positive coefficient was computed (e.g. the
                            # two points coincide): use a fixed step of 4
                            grad = 4
                        end
                        embedding[i][d] += alpha * grad
                    end
                end
            end
        end
        # learning rate decays linearly with the epoch number
        alpha = initial_alpha*(1 - e//n_epochs)
    end

    return embedding
end
121 changes: 1 addition & 120 deletions src/umap_.jl
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ function UMAP_(X::AbstractMatrix{S},
# TODO: if target variable y is passed, then construct target graph
# in the same manner and do a fuzzy simpl set intersection

return UMAP_(graph, embedding)
return UMAP_(graph, hcat(embedding...))
end

"""
Expand Down Expand Up @@ -201,122 +201,3 @@ function compute_membership_strengths(knns::AbstractMatrix{S},
end
return rows, cols, vals
end

"""
    initialize_embedding(graph, n_components, ::Val{:spectral}) -> embedding

Initialize an embedding matrix for the nodes of `graph` from `spectral_layout`.

The layout is rescaled so that its largest absolute coordinate is 10, and
`randn` noise scaled by 1/10000 is added to every coordinate. If any step
throws, a message is printed and the `Val(:random)` initialization is returned
instead.
"""
function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:spectral}) where {T}
    local embed
    try
        embed = spectral_layout(graph, n_components)
        # expand: scale by the largest *magnitude*. Using the largest signed
        # value would mis-scale (or even flip) a layout whose dominant
        # coordinates happen to be negative.
        expansion = 10 / maximum(abs, embed)
        embed .= (embed .* expansion) .+ (1//10000) .* randn.(T)
    catch e
        # best-effort: any failure above falls back to the random layout
        print("Error encountered in spectral_layout; defaulting to random layout\n")
        embed = initialize_embedding(graph, n_components, Val(:random))
    end
    return embed
end

"""
    initialize_embedding(graph, n_components, ::Val{:random}) -> embedding

Return a random matrix of shape `(n_components, size(graph, 1))`. Every entry
is computed as `20 * rand(T) - 10`, with `T` the element type of `graph`.
"""
function initialize_embedding(graph::AbstractMatrix{T}, n_components, ::Val{:random}) where {T}
    n_points = size(graph, 1)
    unit = rand(T, n_components, n_points)
    return @. 20 * unit - 10
end

"""
    optimize_embedding(graph, embedding, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate, _a=nothing, _b=nothing) -> embedding

Optimize an embedding by minimizing the fuzzy set cross entropy between the high and low dimensional simplicial sets using stochastic gradient descent. `embedding` is modified in place and returned.

# Arguments
- `graph`: a sparse matrix of shape (n_samples, n_samples)
- `embedding`: a dense matrix of shape (n_components, n_samples)
- `n_epochs`: the number of training epochs for optimization
- `initial_alpha`: the initial learning rate
- `min_dist`, `spread`: passed to `fit_ab` to determine the curve parameters `a` and `b`
- `gamma`: the repulsive strength of negative samples
- `neg_sample_rate::Integer`: the number of negative samples per positive sample
- `_a`, `_b`: optional values forwarded to `fit_ab` together with `min_dist` and `spread`
"""
function optimize_embedding(graph,
                            embedding,
                            n_epochs,
                            initial_alpha,
                            min_dist,
                            spread,
                            gamma,
                            neg_sample_rate,
                            _a=nothing,
                            _b=nothing)
    # `embedding` is updated in place; the same object is returned.
    #
    # Everything below runs under `@inbounds` and indexes the columns of
    # `embedding` with both row and column indices of `graph`, so reject inputs
    # that would otherwise be read or written out of bounds.
    n_points = size(embedding, 2)
    if size(graph, 1) > n_points || size(graph, 2) > n_points
        throw(DimensionMismatch(
            "graph has size $(size(graph)) but embedding has only $n_points columns"))
    end

    # `a` and `b` come from `fit_ab` (defined elsewhere); presumably the curve
    # parameters of the low dimensional similarity function -- confirm there.
    a, b = fit_ab(min_dist, spread, _a, _b)

    # loop-invariant accessors into the sparse storage of `graph`
    rows = rowvals(graph)
    weights = nonzeros(graph)
    n_cols = size(graph, 2)
    n_dims = size(embedding, 1)

    alpha = initial_alpha
    for e in 1:n_epochs
        @inbounds for i in 1:n_cols
            for ind in nzrange(graph, i)
                j = rows[ind]
                p = weights[ind]
                # use the stored entry (j, i) as a positive sample with
                # probability given by its value
                rand() <= p || continue

                # positive sample: columns i and j receive opposite steps
                @views sdist = evaluate(SqEuclidean(), embedding[:, i], embedding[:, j])
                if sdist > 0
                    delta = (-2 * a * b * sdist^(b-1))/(1 + a*sdist^b)
                else
                    delta = zero(sdist)
                end
                @simd for d in 1:n_dims
                    grad = clamp(delta * (embedding[d,i] - embedding[d,j]), -4, 4)
                    embedding[d,i] += alpha * grad
                    embedding[d,j] -= alpha * grad
                end

                # negative samples: only column i is moved
                for _ in 1:neg_sample_rate
                    k = rand(1:n_cols)
                    # a point is never used as its own negative sample
                    i != k || continue
                    @views sdist = evaluate(SqEuclidean(),
                                            embedding[:, i], embedding[:, k])
                    if sdist > 0
                        delta = (2 * gamma * b) / ((1//1000 + sdist)*(1 + a*sdist^b))
                    else
                        delta = zero(sdist)
                    end
                    @simd for d in 1:n_dims
                        if delta > 0
                            grad = clamp(delta * (embedding[d, i] - embedding[d, k]), -4, 4)
                        else
                            # no positive coefficient was computed (e.g. the
                            # two points coincide): use a fixed step of 4
                            grad = 4
                        end
                        embedding[d, i] += alpha * grad
                    end
                end
            end
        end
        # learning rate decays linearly with the epoch number
        alpha = initial_alpha*(1 - e//n_epochs)
    end

    return embedding
end

"""
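    spectral_layout(graph, embed_dim) -> embedding

Initialize the graph layout with a spectral embedding: the eigenvectors of the
normalized Laplacian of `graph` belonging to its 2nd through `embed_dim + 1`-th
smallest eigenvalues. `embedding` is a dense matrix of shape `(embed_dim, n)`,
where `n` is the number of nodes in `graph`.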
"""
function spectral_layout(graph::SparseMatrixCSC{T},
                         embed_dim::Integer) where {T<:Real}
    # inverse square root of the diagonal matrix of row sums
    degrees = dropdims(sum(graph; dims=2); dims=2)
    inv_sqrt_deg = inv(sqrt(Diagonal(degrees)))
    # normalized laplacian
    # TODO: remove sparse() when PR #30018 is merged
    laplacian = sparse(Symmetric(I - inv_sqrt_deg*graph*inv_sqrt_deg))

    n = size(laplacian, 1)
    nev = embed_dim + 1
    ncv = max(2nev + 1, round(Int, sqrt(n)))
    # get the 2nd - embed_dim+1th smallest eigenvectors
    result = eigs(laplacian; nev=nev,
                  ncv=ncv,
                  which=:SR,
                  tol=1e-4,
                  v0=ones(T, n),
                  maxiter=n*5)
    # the eigenvectors are the second returned value
    vecs = result[2]
    # drop the first eigenvector and store one node per column
    return permutedims(vecs[:, 2:nev])::Array{T, 2}
end
3 changes: 2 additions & 1 deletion test/runtests.jl
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
using Test
using Distances: Euclidean, CosineDist
using Random
using SparseArrays
using LinearAlgebra
using UMAP
using UMAP: fuzzy_simplicial_set, compute_membership_strengths, smooth_knn_dists, smooth_knn_dist, spectral_layout, optimize_embedding, knn_search, combine_fuzzy_sets, fit_ab, SMOOTH_K_TOLERANCE
using UMAP: initialize_embedding, fuzzy_simplicial_set, compute_membership_strengths, smooth_knn_dists, smooth_knn_dist, spectral_layout, optimize_embedding, knn_search, combine_fuzzy_sets, fit_ab, SMOOTH_K_TOLERANCE


include("utils_tests.jl")
Expand Down
6 changes: 4 additions & 2 deletions test/umap_tests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -87,17 +87,18 @@
end

@testset "optimize_embedding" begin
Random.seed!(0)
A = sprand(10000, 10000, 0.001)
B = dropzeros(A + A' - A .* A')
layout = spectral_layout(B, 5)
layout = initialize_embedding(B, 5, Val(:random))
n_epochs = 1
initial_alpha = 1.
min_dist = 1.
spread = 1.
gamma = 1.
neg_sample_rate = 5
embedding = optimize_embedding(B, layout, n_epochs, initial_alpha, min_dist, spread, gamma, neg_sample_rate)
@test embedding isa Array{Float64, 2}
@test embedding isa Array{Array{Float64, 1}, 1}
end

@testset "spectral_layout" begin
Expand All @@ -108,6 +109,7 @@
@inferred spectral_layout(B, 5)
layout32 = spectral_layout(convert(SparseMatrixCSC{Float32}, B), 5)
@test layout32 isa Array{Float32, 2}
@inferred spectral_layout(convert(SparseMatrixCSC{Float32}, B), 5)
end

end

0 comments on commit 3c6987a

Please sign in to comment.