Remove Annoy indexes

Annoy indexes not popular these days, at least when it comes to vector databases. Such indexes work okay-ish low dimensions but they suffers badly from a curse of dimensionality which makes them inapt for a high number of dimensions. Now that Annoy is gone, issue (*) also disappears and we can drop 'no-ubsan', 'no-cpu-aarch64', and 'no-asan' from tests. (*) spotify/annoy#456
rschu1ze · Jun 10, 2024 · 60efa4f · 60efa4f
1 parent e8d282a
commit 60efa4f
Show file tree

Hide file tree

Showing 26 changed files with 31 additions and 932 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -230,9 +230,6 @@
 [submodule "contrib/minizip-ng"]
 	path = contrib/minizip-ng
 	url = https://github.com/zlib-ng/minizip-ng
-[submodule "contrib/annoy"]
-	path = contrib/annoy
-	url = https://github.com/ClickHouse/annoy
 [submodule "contrib/qpl"]
 	path = contrib/qpl
 	url = https://github.com/intel/qpl

diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt
@@ -205,7 +205,6 @@ add_contrib (morton-nd-cmake morton-nd)
 if (ARCH_S390X)
     add_contrib(crc32-s390x-cmake crc32-s390x)
 endif()
-add_contrib (annoy-cmake annoy)
 
 option(ENABLE_USEARCH "Enable USearch" ${ENABLE_LIBRARIES})
 if (ENABLE_USEARCH)

diff --git a/contrib/annoy b/contrib/annoy
diff --git a/contrib/annoy-cmake/CMakeLists.txt b/contrib/annoy-cmake/CMakeLists.txt
diff --git a/docs/en/engines/table-engines/mergetree-family/annindexes.md b/docs/en/engines/table-engines/mergetree-family/annindexes.md
@@ -126,81 +126,8 @@ was specified for ANN indexes, the default value is 100 million.
 
 # Available ANN Indexes {#available_ann_indexes}
 
-- [Annoy](/docs/en/engines/table-engines/mergetree-family/annindexes.md#annoy-annoy)
-
 - [USearch](/docs/en/engines/table-engines/mergetree-family/annindexes.md#usearch-usearch)
 
-## Annoy {#annoy}
-
-Annoy indexes are currently experimental, to use them you first need to `SET allow_experimental_annoy_index = 1`. They are also currently
-disabled on ARM due to memory safety problems with the algorithm.
-
-This type of ANN index is based on the [Annoy library](https://github.com/spotify/annoy) which recursively divides the space into random
-linear surfaces (lines in 2D, planes in 3D etc.).
-
-<div class='vimeo-container'>
-  <iframe src="//www.youtube.com/embed/QkCCyLW0ehU"
-    width="640"
-    height="360"
-    frameborder="0"
-    allow="autoplay;
-    fullscreen;
-    picture-in-picture"
-    allowfullscreen>
-  </iframe>
-</div>
-
-Syntax to create an Annoy index over an [Array(Float32)](../../../sql-reference/data-types/array.md) column:
-
-```sql
-CREATE TABLE table_with_annoy_index
-(
-  id Int64,
-  vectors Array(Float32),
-  INDEX [ann_index_name] vectors TYPE annoy([Distance[, NumTrees]]) [GRANULARITY N]
-)
-ENGINE = MergeTree
-ORDER BY id;
-```
-
-Annoy currently supports two distance functions:
-- `L2Distance`, also called Euclidean distance, is the length of a line segment between two points in Euclidean space
-  ([Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)).
-- `cosineDistance`, also called cosine similarity, is the cosine of the angle between two (non-zero) vectors
-  ([Wikipedia](https://en.wikipedia.org/wiki/Cosine_similarity)).
-
-For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
-distance function was specified during index creation, `L2Distance` is used as default.
-
-Parameter `NumTrees` is the number of trees which the algorithm creates (default if not specified: 100). Higher values of `NumTree` mean
-more accurate search results but slower index creation / query times (approximately linearly) as well as larger index sizes.
-
-:::note
-All arrays must have same length. To avoid errors, you can use a
-[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK
-length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported.
-:::
-
-The creation of Annoy indexes (whenever a new part is build, e.g. at the end of a merge) is a relatively slow process. You can increase
-setting `max_threads_for_annoy_index_creation` (default: 4) which controls how many threads are used to create an Annoy index. Please be
-careful with this setting, it is possible that multiple indexes are created in parallel in which case there can be overparallelization.
-
-Setting `annoy_index_search_k_nodes` (default: `NumTrees * LIMIT`) determines how many tree nodes are inspected during SELECTs. Larger
-values mean more accurate results at the cost of longer query runtime:
-
-```sql
-SELECT *
-FROM table_name
-ORDER BY L2Distance(vectors, Point)
-LIMIT N
-SETTINGS annoy_index_search_k_nodes=100;
-```
-
-:::note
-The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
-[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
-:::
-
 ## USearch {#usearch}
 
 This type of ANN index is based on the [USearch library](https://github.com/unum-cloud/usearch), which implements the [HNSW
@@ -247,3 +174,15 @@ was specified during index creation, `f16` is used as default.
 
 For normalized data, `L2Distance` is usually a better choice, otherwise `cosineDistance` is recommended to compensate for scale. If no
 distance function was specified during index creation, `L2Distance` is used as default.
+
+:::note
+All arrays must have same length. To avoid errors, you can use a
+[CONSTRAINT](/docs/en/sql-reference/statements/create/table.md#constraints), for example, `CONSTRAINT constraint_name_1 CHECK
+length(vectors) = 256`. Also, empty `Arrays` and unspecified `Array` values in INSERT statements (i.e. default values) are not supported.
+:::
+
+:::note
+The USearch index currently does not work with per-table, non-default `index_granularity` settings (see
+[here](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If necessary, the value must be changed in config.xml.
+:::
+
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -598,10 +598,6 @@ endif()
 
 dbms_target_link_libraries(PUBLIC ch_contrib::consistent_hashing)
 
-if (TARGET ch_contrib::annoy)
-    dbms_target_link_libraries(PUBLIC ch_contrib::annoy)
-endif()
-
 if (TARGET ch_contrib::usearch)
     dbms_target_link_libraries(PUBLIC ch_contrib::usearch)
 endif()

diff --git a/src/Core/Settings.h b/src/Core/Settings.h
@@ -903,11 +903,8 @@ class IColumn;
     M(Bool, allow_experimental_object_type, false, "Allow Object and JSON data types", 0) \
     M(Bool, allow_experimental_variant_type, false, "Allow Variant data type", 0) \
     M(Bool, allow_experimental_dynamic_type, false, "Allow Dynamic data type", 0) \
-    M(Bool, allow_experimental_annoy_index, false, "Allows to use Annoy index. Disabled by default because this feature is experimental", 0) \
     M(Bool, allow_experimental_usearch_index, false, "Allows to use USearch index. Disabled by default because this feature is experimental", 0) \
     M(UInt64, max_limit_for_ann_queries, 1'000'000, "SELECT queries with LIMIT bigger than this setting cannot use ANN indexes. Helps to prevent memory overflows in ANN search indexes.", 0) \
-    M(UInt64, max_threads_for_annoy_index_creation, 4, "Number of threads used to build Annoy indexes (0 means all cores, not recommended)", 0) \
-    M(Int64, annoy_index_search_k_nodes, -1, "SELECT queries search up to this many nodes in Annoy indexes.", 0) \
     M(Bool, throw_on_unsupported_query_inside_transaction, true, "Throw exception if unsupported query is used inside transaction", 0) \
     M(TransactionsWaitCSNMode, wait_changes_become_visible_after_commit_mode, TransactionsWaitCSNMode::WAIT_UNKNOWN, "Wait for committed changes to become actually visible in the latest snapshot", 0) \
     M(Bool, implicit_transaction, false, "If enabled and not already inside a transaction, wraps the query inside a full transaction (begin + commit or rollback)", 0) \
@@ -996,6 +993,9 @@ class IColumn;
     MAKE_OBSOLETE(M, Bool, query_plan_optimize_projection, true) \
     MAKE_OBSOLETE(M, Bool, query_cache_store_results_of_queries_with_nondeterministic_functions, false) \
     MAKE_OBSOLETE(M, Bool, optimize_move_functions_out_of_any, false) \
+    MAKE_OBSOLETE(M, Bool, allow_experimental_annoy_index, false) \
+    MAKE_OBSOLETE(M, UInt64, max_threads_for_annoy_index_creation, 4) \
+    MAKE_OBSOLETE(M, Int64, annoy_index_search_k_nodes, -1) \
     MAKE_OBSOLETE(M, Bool, allow_experimental_undrop_table_query, true) \
     MAKE_OBSOLETE(M, Bool, allow_experimental_s3queue, true) \
     MAKE_OBSOLETE(M, Bool, query_plan_optimize_primary_key, true) \

diff --git a/src/Databases/DatabaseReplicated.cpp b/src/Databases/DatabaseReplicated.cpp
@@ -977,7 +977,6 @@ void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeep
         query_context->setSetting("allow_experimental_object_type", 1);
         query_context->setSetting("allow_experimental_variant_type", 1);
         query_context->setSetting("allow_experimental_dynamic_type", 1);
-        query_context->setSetting("allow_experimental_annoy_index", 1);
         query_context->setSetting("allow_experimental_usearch_index", 1);
         query_context->setSetting("allow_experimental_bigint_types", 1);
         query_context->setSetting("allow_experimental_window_functions", 1);

diff --git a/src/Interpreters/InterpreterCreateQuery.cpp b/src/Interpreters/InterpreterCreateQuery.cpp
@@ -759,8 +759,6 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::getTableProperti
                 if (index_desc.type == INVERTED_INDEX_NAME && !settings.allow_experimental_inverted_index)
                     throw Exception(ErrorCodes::ILLEGAL_INDEX, "Please use index type 'full_text' instead of 'inverted'");
                 /// ----
-                if (index_desc.type == "annoy" && !settings.allow_experimental_annoy_index)
-                    throw Exception(ErrorCodes::INCORRECT_QUERY, "Annoy index is disabled. Turn on allow_experimental_annoy_index");
                 if (index_desc.type == "usearch" && !settings.allow_experimental_usearch_index)
                     throw Exception(ErrorCodes::INCORRECT_QUERY, "USearch index is disabled. Turn on allow_experimental_usearch_index");
 

diff --git a/src/Parsers/ASTIndexDeclaration.h b/src/Parsers/ASTIndexDeclaration.h
@@ -13,7 +13,6 @@ class ASTIndexDeclaration : public IAST
 {
 public:
     static const auto DEFAULT_INDEX_GRANULARITY = 1uz;
-    static const auto DEFAULT_ANNOY_INDEX_GRANULARITY = 100'000'000uz;
     static const auto DEFAULT_USEARCH_INDEX_GRANULARITY = 100'000'000uz;
 
     ASTIndexDeclaration(ASTPtr expression, ASTPtr type, const String & name_);

diff --git a/src/Parsers/ParserCreateIndexQuery.cpp b/src/Parsers/ParserCreateIndexQuery.cpp
@@ -89,9 +89,7 @@ bool ParserCreateIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected
     else
     {
         auto index_type = index->getType();
-        if (index_type && index_type->name == "annoy")
-            index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
-        else if (index_type && index_type->name == "usearch")
+        if (index_type && index_type->name == "usearch")
             index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY;
         else
             index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;

diff --git a/src/Parsers/ParserCreateQuery.cpp b/src/Parsers/ParserCreateQuery.cpp
@@ -212,9 +212,7 @@ bool ParserIndexDeclaration::parseImpl(Pos & pos, ASTPtr & node, Expected & expe
     else
     {
         auto index_type = index->getType();
-        if (index_type->name == "annoy")
-            index->granularity = ASTIndexDeclaration::DEFAULT_ANNOY_INDEX_GRANULARITY;
-        else if (index_type->name == "usearch")
+        if (index_type->name == "usearch")
             index->granularity = ASTIndexDeclaration::DEFAULT_USEARCH_INDEX_GRANULARITY;
         else
             index->granularity = ASTIndexDeclaration::DEFAULT_INDEX_GRANULARITY;

diff --git a/src/Processors/QueryPlan/ReadFromMergeTree.cpp b/src/Processors/QueryPlan/ReadFromMergeTree.cpp
@@ -24,7 +24,6 @@
 #include <Processors/Transforms/SelectByIndicesTransform.h>
 #include <QueryPipeline/QueryPipelineBuilder.h>
 #include <Storages/MergeTree/MergeTreeDataSelectExecutor.h>
-#include <Storages/MergeTree/MergeTreeIndexAnnoy.h>
 #include <Storages/MergeTree/MergeTreeIndexUSearch.h>
 #include <Storages/MergeTree/MergeTreeReadPool.h>
 #include <Storages/MergeTree/MergeTreePrefetchedReadPool.h>
@@ -1476,10 +1475,6 @@ static void buildIndexes(
                 MergeTreeIndexConditionPtr condition;
                 if (index_helper->isVectorSearch())
                 {
-#ifdef ENABLE_ANNOY
-                    if (const auto * annoy = typeid_cast<const MergeTreeIndexAnnoy *>(index_helper.get()))
-                        condition = annoy->createIndexCondition(query_info, context);
-#endif
 #ifdef ENABLE_USEARCH
                     if (const auto * usearch = typeid_cast<const MergeTreeIndexUSearch *>(index_helper.get()))
                         condition = usearch->createIndexCondition(query_info, context);

diff --git a/src/Storages/MergeTree/MergeTreeIOSettings.h b/src/Storages/MergeTree/MergeTreeIOSettings.h
@@ -73,7 +73,6 @@ struct MergeTreeWriterSettings
         , rewrite_primary_key(rewrite_primary_key_)
         , blocks_are_granules_size(blocks_are_granules_size_)
         , query_write_settings(query_write_settings_)
-        , max_threads_for_annoy_index_creation(global_settings.max_threads_for_annoy_index_creation)
         , low_cardinality_max_dictionary_size(global_settings.low_cardinality_max_dictionary_size)
         , low_cardinality_use_single_dictionary_for_part(global_settings.low_cardinality_use_single_dictionary_for_part != 0)
     {
@@ -94,8 +93,6 @@ struct MergeTreeWriterSettings
     bool blocks_are_granules_size;
     WriteSettings query_write_settings;
 
-    size_t max_threads_for_annoy_index_creation;
-
     size_t low_cardinality_max_dictionary_size;
     bool low_cardinality_use_single_dictionary_for_part;
 };