From ebf1979bb22fd7ac9c85bb391a695e3a008ccfa0 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Mon, 4 Mar 2019 19:21:02 -0500
Subject: [PATCH 1/2] Add documentation for min_hash filter

Closes #20757
---
 .../tokenfilters/minhash-tokenfilter.asciidoc | 115 +++++++++++++++++-
 1 file changed, 113 insertions(+), 2 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
index eb6a4d820ef1b..06fa09b8aa89a 100644
--- a/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
@@ -1,7 +1,41 @@
 [[analysis-minhash-tokenfilter]]
-=== Minhash Token Filter
+=== MinHash Token Filter

-A token filter of type `min_hash` hashes each token of the token stream and divides
+==== Theory
+The MinHash token filter allows you to hash documents for similarity search.
+Similarity search, or nearest neighbor search, is a complex problem.
+A naive solution requires an exhaustive pairwise comparison between a query
+document and every document in an index. This is a prohibitive operation
+if the index is large. A number of approximate nearest neighbor search
+solutions have been developed to make similarity search more practical and
+computationally feasible. One of these solutions involves hashing of documents.
+
+Documents are hashed in such a way that similar documents are more likely
+to produce the same hash code and are put into the same hash bucket,
+while dissimilar documents are more likely to be hashed into
+different hash buckets. This type of hashing is known as
+locality sensitive hashing (LSH).
+
+Depending on what constitutes the similarity between documents,
+various LSH functions https://arxiv.org/abs/1408.2927[have been proposed].
+For Jaccard similarity, a popular LSH function is
+https://en.wikipedia.org/wiki/MinHash[MinHash].
+The general idea of how MinHash produces a signature for a document
+is to apply a random permutation over the whole index vocabulary (a random
+numbering of the vocabulary) and to record the minimum value of this permutation
+for the document (the minimum number among the vocabulary words that are present
+in the document). The permutations are run several times;
+the minimum values from all of them are combined into a
+signature for the document.
+
+In practice, instead of random permutations, a number of hash functions
+are chosen. A hash function calculates a hash code for each of a
+document's tokens, and the minimum hash code among them is chosen.
+The minimum hash codes from all hash functions are combined
+to form a signature for the document.
+
+==== MinHash Token Filter in Elasticsearch
+The `min_hash` token filter hashes each token of the token stream and divides
 the resulting hashes into buckets, keeping the lowest-valued hashes per bucket.
 It then returns these hashes as tokens.
@@ -20,3 +54,80 @@ The following are settings that can be set for a `min_hash` token filter.
 bucket to its circular right. Only takes effect if hash_set_size is equal to one.
 Defaults to `true` if bucket_count is greater than one, else `false`.
 |=======================================================================
+
+Some points to consider while setting up a `min_hash` filter:
+
+* `min_hash` filter input tokens should typically be k-word shingles produced
+from the <<analysis-shingle-tokenfilter,shingle token filter>>. You should
+choose `k` large enough so that the probability of any given shingle
+occurring in a document is low.
+At the same time, as
+internally each shingle is hashed into a 128-bit hash, you should choose
+`k` small enough so that all possible
+different k-word shingles can be hashed to 128-bit hashes with
+minimal collisions. 5-word shingles typically work well.
+
+* choosing the right settings for `hash_count`, `bucket_count` and
+`hash_set_size` needs some experimentation.
+** To improve the precision, you should increase `bucket_count` or
+`hash_set_size`. Higher values of `bucket_count` or `hash_set_size`
+will provide a higher guarantee that different tokens are
+indexed to different buckets.
+** To improve the recall,
+you should increase the `hash_count` parameter. For example,
+setting `hash_count=2` will make each token be hashed in
+two different ways, thus increasing the number of potential
+candidates for search.
+
+* the default settings make the `min_hash` filter produce 512
+`min_hash` tokens for each document, each of size 16 bytes.
+Thus, each document's size will be increased by around 8KB.
+
+* the `min_hash` filter is used to hash for Jaccard similarity. This means
+that it doesn't matter how many times a document contains a certain token,
+only whether it contains it or not.
+
+Here is an example of setting up a `min_hash` filter:
+
+[source,js]
+--------------------------------------------------
+POST /index1
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_shingle_filter": {
+          "type": "shingle",
+          "min_shingle_size": 5,
+          "max_shingle_size": 5,
+          "output_unigrams": false
+        },
+        "my_minhash_filter": {
+          "type": "min_hash",
+          "hash_count": 1,
+          "bucket_count": 512,
+          "hash_set_size": 1,
+          "with_rotation": true
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "standard",
+          "filter": [
+            "my_shingle_filter",
+            "my_minhash_filter"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "text": {
+        "type": "text",
+        "analyzer": "my_analyzer"
+      }
+    }
+  }
+}
+--------------------------------------------------
+// NOTCONSOLE
\ No newline at end of file

From a27305056f1bb8263a8cb6cc485515eda0d0ea39 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Tue, 5 Mar 2019 14:13:28 -0500
Subject: [PATCH 2/2] Address feedback

---
 .../tokenfilters/minhash-tokenfilter.asciidoc | 92 ++++++++++---------
 1 file changed, 49 insertions(+), 43 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
index 06fa09b8aa89a..21c7387e0f7f5 100644
--- a/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
@@ -1,40 +1,6 @@
 [[analysis-minhash-tokenfilter]]
 === MinHash Token Filter

-==== Theory
-The MinHash token filter allows you to hash documents for similarity search.
-Similarity search, or nearest neighbor search, is a complex problem.
-A naive solution requires an exhaustive pairwise comparison between a query
-document and every document in an index. This is a prohibitive operation
-if the index is large. A number of approximate nearest neighbor search
-solutions have been developed to make similarity search more practical and
-computationally feasible. One of these solutions involves hashing of documents.
-
-Documents are hashed in such a way that similar documents are more likely
-to produce the same hash code and are put into the same hash bucket,
-while dissimilar documents are more likely to be hashed into
-different hash buckets. This type of hashing is known as
-locality sensitive hashing (LSH).
-
-Depending on what constitutes the similarity between documents,
-various LSH functions https://arxiv.org/abs/1408.2927[have been proposed].
-For Jaccard similarity, a popular LSH function is
-https://en.wikipedia.org/wiki/MinHash[MinHash].
-The general idea of how MinHash produces a signature for a document
-is to apply a random permutation over the whole index vocabulary (a random
-numbering of the vocabulary) and to record the minimum value of this permutation
-for the document (the minimum number among the vocabulary words that are present
-in the document). The permutations are run several times;
-the minimum values from all of them are combined into a
-signature for the document.
-
-In practice, instead of random permutations, a number of hash functions
-are chosen. A hash function calculates a hash code for each of a
-document's tokens, and the minimum hash code among them is chosen.
-The minimum hash codes from all hash functions are combined
-to form a signature for the document.
-
-==== MinHash Token Filter in Elasticsearch
 The `min_hash` token filter hashes each token of the token stream and divides
 the resulting hashes into buckets, keeping the lowest-valued hashes per bucket.
 It then returns these hashes as tokens.
@@ -68,11 +34,11 @@ minimal collisions. 5-word shingles typically work well.

 * choosing the right settings for `hash_count`, `bucket_count` and
 `hash_set_size` needs some experimentation.
-** To improve the precision, you should increase `bucket_count` or
+** to improve the precision, you should increase `bucket_count` or
 `hash_set_size`. Higher values of `bucket_count` or `hash_set_size`
 will provide a higher guarantee that different tokens are
 indexed to different buckets.
-** To improve the recall,
+** to improve the recall,
 you should increase the `hash_count` parameter. For example,
 setting `hash_count=2` will make each token be hashed in
 two different ways, thus increasing the number of potential
 candidates for search.
@@ -86,6 +52,41 @@ Thus, each document's size will be increased by around 8KB.
 that it doesn't matter how many times a document contains a certain token,
 only whether it contains it or not.

+==== Theory
+The MinHash token filter allows you to hash documents for similarity search.
+Similarity search, or nearest neighbor search, is a complex problem.
+A naive solution requires an exhaustive pairwise comparison between a query
+document and every document in an index. This is a prohibitive operation
+if the index is large. A number of approximate nearest neighbor search
+solutions have been developed to make similarity search more practical and
+computationally feasible. One of these solutions involves hashing of documents.
+
+Documents are hashed in such a way that similar documents are more likely
+to produce the same hash code and are put into the same hash bucket,
+while dissimilar documents are more likely to be hashed into
+different hash buckets. This type of hashing is known as
+locality sensitive hashing (LSH).
+
+Depending on what constitutes the similarity between documents,
+various LSH functions https://arxiv.org/abs/1408.2927[have been proposed].
+For https://en.wikipedia.org/wiki/Jaccard_index[Jaccard similarity], a popular
+LSH function is https://en.wikipedia.org/wiki/MinHash[MinHash].
+The general idea of how MinHash produces a signature for a document
+is to apply a random permutation over the whole index vocabulary (a random
+numbering of the vocabulary) and to record the minimum value of this permutation
+for the document (the minimum number among the vocabulary words that are present
+in the document). The permutations are run several times;
+the minimum values from all of them are combined into a
+signature for the document.
+
+In practice, instead of random permutations, a number of hash functions
+are chosen. A hash function calculates a hash code for each of a
+document's tokens, and the minimum hash code among them is chosen.
+The minimum hash codes from all hash functions are combined
+to form a signature for the document.
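+
+To make this procedure concrete, here is a toy Python sketch of MinHash
+signatures over 5-word shingles. It is an illustration only: the sample
+texts, the helper names, and the use of `hashlib` in place of the 128-bit
+hashing used internally are all assumptions made for the example, not how
+Elasticsearch implements the filter.
+
+[source,python]
+--------------------------------------------------
+import hashlib
+
+def shingles(text, k=5):
+    """Return the set of k-word shingles of a text."""
+    words = text.split()
+    return {" ".join(words[i:i + k]) for i in range(len(words) - k + 1)}
+
+def min_hashes(tokens, num_hashes=128):
+    """Keep one minimum hash code per hash function; together
+    the minimums form the document's signature."""
+    signature = []
+    for seed in range(num_hashes):
+        # Derive a family of hash functions by mixing a seed into the input.
+        signature.append(min(
+            int.from_bytes(hashlib.md5(f"{seed}:{t}".encode()).digest()[:8], "big")
+            for t in tokens))
+    return signature
+
+def estimate_jaccard(sig_a, sig_b):
+    """The fraction of matching signature positions estimates the
+    Jaccard similarity of the two shingle sets."""
+    return sum(a == b for a, b in zip(sig_a, sig_b)) / len(sig_a)
+
+a = shingles("the quick brown fox jumps over the lazy dog today")
+b = shingles("the quick brown fox jumps over the lazy cat today")
+print(estimate_jaccard(min_hashes(a), min_hashes(b)))
+--------------------------------------------------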
+
+==== Example of setting up the MinHash Token Filter in Elasticsearch
 Here is an example of setting up a `min_hash` filter:

 [source,js]
 --------------------------------------------------
@@ -95,7 +96,7 @@ POST /index1
   "settings": {
     "analysis": {
       "filter": {
-        "my_shingle_filter": {
+        "my_shingle_filter": { <1>
           "type": "shingle",
           "min_shingle_size": 5,
           "max_shingle_size": 5,
@@ -103,10 +104,10 @@ POST /index1
         },
         "my_minhash_filter": {
           "type": "min_hash",
-          "hash_count": 1,
-          "bucket_count": 512,
-          "hash_set_size": 1,
-          "with_rotation": true
+          "hash_count": 1, <2>
+          "bucket_count": 512, <3>
+          "hash_set_size": 1, <4>
+          "with_rotation": true <5>
         }
       },
       "analyzer": {
@@ -123,11 +124,16 @@ POST /index1
   "mappings": {
     "properties": {
       "text": {
         "type": "text",
         "analyzer": "my_analyzer"
       }
     }
   }
 }
 --------------------------------------------------
-// NOTCONSOLE
\ No newline at end of file
+// NOTCONSOLE
+<1> setting a shingle filter with 5-word shingles
+<2> setting the min_hash filter to hash each token with one hash function
+<3> setting the min_hash filter to hash tokens into 512 buckets
+<4> setting the min_hash filter to keep only the smallest hash in each bucket
+<5> setting the min_hash filter to fill empty buckets with values from neighboring buckets
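+
+To make the effect of these settings more concrete, here is a rough Python
+sketch of the bucketing behavior described above, using the settings from the
+example. The helper name and the modulo-based bucket assignment are
+assumptions made for illustration; this is not Lucene's actual implementation.
+
+[source,python]
+--------------------------------------------------
+def min_hash_tokens(hashes, bucket_count=512, with_rotation=True):
+    # Keep only the smallest hash seen in each bucket (hash_set_size=1).
+    buckets = [None] * bucket_count
+    for h in hashes:
+        i = h % bucket_count  # assumed bucket assignment, for illustration
+        if buckets[i] is None or h < buckets[i]:
+            buckets[i] = h
+    # with_rotation: fill each empty bucket with the value of the first
+    # non-empty bucket to its circular right.
+    if with_rotation and any(b is not None for b in buckets):
+        filled = list(buckets)
+        for i in range(bucket_count):
+            j = i
+            while buckets[j] is None:
+                j = (j + 1) % bucket_count
+            filled[i] = buckets[j]
+        buckets = filled
+    return [b for b in buckets if b is not None]
+
+print(len(min_hash_tokens([11, 529, 1042])))  # toy hash codes -> 512 tokens
+--------------------------------------------------
+
+With these settings a document yields `hash_count` * `bucket_count` *
+`hash_set_size` = 1 * 512 * 1 = 512 tokens of 16 bytes each, which is the
+roughly 8KB overhead mentioned in the list above.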