Skip to content

Commit

Permalink
Do not reset Annoy index during build-up with > 1 mark
Browse files Browse the repository at this point in the history
  • Loading branch information
rschu1ze committed Aug 21, 2023
1 parent b074e44 commit 066ec55
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 4 deletions.
5 changes: 5 additions & 0 deletions docs/en/engines/table-engines/mergetree-family/annindexes.md
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,8 @@ ORDER BY L2Distance(vectors, Point)
LIMIT N
SETTINGS annoy_index_search_k_nodes=100;
```
:::note
The Annoy index currently does not work with per-table, non-default `index_granularity` settings (see
[this discussion](https://github.com/ClickHouse/ClickHouse/pull/51325#issuecomment-1605920475)). If a different granularity is needed, the value must be changed in config.xml instead.
:::
2 changes: 1 addition & 1 deletion src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1642,7 +1642,7 @@ MarkRanges MergeTreeDataSelectExecutor::filterMarksUsingIndex(
{
if (index_mark != index_range.begin || !granule || last_index_mark != index_range.begin)
granule = reader.read();
// Cast to Ann condition

auto ann_condition = std::dynamic_pointer_cast<IMergeTreeIndexConditionApproximateNearestNeighbor>(condition);
if (ann_condition != nullptr)
{
Expand Down
6 changes: 4 additions & 2 deletions src/Storages/MergeTree/MergeTreeIndexAnnoy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -172,7 +172,8 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
if (offsets[i + 1] - offsets[i] != size)
throw Exception(ErrorCodes::INCORRECT_DATA, "All arrays in column {} must have equal length", index_column_name);

index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(size);
if (!index)
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(size);

/// Add all rows of block
index->add_item(index->get_n_items(), array.data());
Expand All @@ -195,7 +196,8 @@ void MergeTreeIndexAggregatorAnnoy<Distance>::update(const Block & block, size_t
if (data.empty())
throw Exception(ErrorCodes::LOGICAL_ERROR, "Tuple has 0 rows, {} rows expected", rows_read);

index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(data[0].size());
if (!index)
index = std::make_shared<AnnoyIndexWithSerialization<Distance>>(data[0].size());

for (const auto & item : data)
index->add_item(index->get_n_items(), item.data());
Expand Down
5 changes: 5 additions & 0 deletions tests/queries/0_stateless/02354_annoy_index.reference
Original file line number Diff line number Diff line change
Expand Up @@ -142,3 +142,8 @@ Expression (Projection)
Description: annoy GRANULARITY 4
Parts: 1/1
Granules: 4/4
--- Test correctness of Annoy index with > 1 mark
1 [1,0,0,0]
9000 [9000,0,0,0]
1 (1,0,0,0)
9000 (9000,0,0,0)
34 changes: 33 additions & 1 deletion tests/queries/0_stateless/02354_annoy_index.sql
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64
-- Tags: no-fasttest, no-ubsan, no-cpu-aarch64, no-ordinary-database

SET allow_experimental_annoy_index = 1;
SET allow_experimental_analyzer = 0;
Expand Down Expand Up @@ -249,3 +249,35 @@ DROP TABLE tab;

-- (*) Storage and search in Annoy indexes are inherently random. Tests which check for exact row matches would be unstable. Therefore,
-- they are commented out.

SELECT '--- Test correctness of Annoy index with > 1 mark';

CREATE TABLE tab(id Int32, vector Array(Float32), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0; -- disable adaptive granularity due to bug
INSERT INTO tab SELECT number, [toFloat32(number), 0., 0., 0.] from numbers(10000);

SELECT *
FROM tab
ORDER BY L2Distance(vector, [1.0, 0.0, 0.0, 0.0])
LIMIT 1;

SELECT *
FROM tab
ORDER BY L2Distance(vector, [9000.0, 0.0, 0.0, 0.0])
LIMIT 1;

DROP TABLE tab;

CREATE TABLE tab(id Int32, vector Tuple(Float32, Float32, Float32, Float32), INDEX annoy_index vector TYPE annoy()) ENGINE = MergeTree ORDER BY id SETTINGS index_granularity_bytes=0, min_rows_for_wide_part = 0, min_bytes_for_wide_part = 0; -- disable adaptive granularity due to bug
INSERT INTO tab SELECT number, (toFloat32(number), 0., 0., 0.) from numbers(10000);

SELECT *
FROM tab
ORDER BY L2Distance(vector, (1.0, 0.0, 0.0, 0.0))
LIMIT 1;

SELECT *
FROM tab
ORDER BY L2Distance(vector, (9000.0, 0.0, 0.0, 0.0))
LIMIT 1;

DROP TABLE tab;

0 comments on commit 066ec55

Please sign in to comment.