From ebf1979bb22fd7ac9c85bb391a695e3a008ccfa0 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Mon, 4 Mar 2019 19:21:02 -0500
Subject: [PATCH 1/2] Add documentation for min_hash filter

Closes #20757
---
 .../tokenfilters/minhash-tokenfilter.asciidoc | 115 +++++++++++++++++-
 1 file changed, 113 insertions(+), 2 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
index eb6a4d820ef1b..06fa09b8aa89a 100644
--- a/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
@@ -1,7 +1,41 @@
 [[analysis-minhash-tokenfilter]]
-=== Minhash Token Filter
+=== MinHash Token Filter

-A token filter of type `min_hash` hashes each token of the token stream and divides
+==== Theory
+The MinHash token filter allows you to hash documents for similarity search.
+Similarity search, or nearest neighbor search, is a complex problem.
+A naive solution requires an exhaustive pairwise comparison between a query
+document and every document in an index. This is a prohibitive operation
+if the index is large. A number of approximate nearest neighbor search
+solutions have been developed to make similarity search more practical and
+computationally feasible. One of these solutions involves hashing of documents.
+
+Documents are hashed in such a way that similar documents are more likely
+to produce the same hash code and are put into the same hash bucket,
+while dissimilar documents are more likely to be hashed into
+different hash buckets. This type of hashing is known as
+locality sensitive hashing (LSH).
+
+Depending on what constitutes the similarity between documents,
+various LSH functions https://arxiv.org/abs/1408.2927[have been proposed].
+For Jaccard similarity, a popular LSH function is
+https://en.wikipedia.org/wiki/MinHash[MinHash].
+The general idea of how MinHash produces a signature for a document
+is to apply a random permutation over the whole index vocabulary (a random
+numbering of the vocabulary) and to record the minimum value of this permutation
+for the document (the minimum number among the vocabulary words that are present
+in the document). The permutations are run several times;
+the minimum values from all of them are combined into a
+signature for the document.
+
+In practice, instead of random permutations, a number of hash functions
+are chosen. A hash function calculates a hash code for each of a
+document's tokens, and the minimum hash code among them is chosen.
+The minimum hash codes from all hash functions are combined
+to form a signature for the document.
+
+==== MinHash Token Filter in Elasticsearch
+The `min_hash` token filter hashes each token of the token stream and divides
 the resulting hashes into buckets, keeping the lowest-valued hashes per bucket.
 It then returns these hashes as tokens.
@@ -20,3 +54,80 @@ The following are settings that can be set for a `min_hash` token filter.
 bucket to its circular right. Only takes effect if hash_set_size is equal to one.
 Defaults to `true` if bucket_count is greater than one, else `false`.
 |=======================================================================
+
+Some points to consider while setting up a `min_hash` filter:
+
+* `min_hash` filter input tokens should typically be k-word shingles produced
+from the <<analysis-shingle-tokenfilter,shingle token filter>>. You should
+choose `k` large enough so that the probability of any given shingle
+occurring in a document is low.
+At the same time, as
+internally each shingle is hashed into a 128-bit hash, you should choose
+`k` small enough so that all possible
+different k-word shingles can be hashed to 128-bit hashes with
+minimal collisions. 5-word shingles typically work well.
+
+* choosing the right settings for `hash_count`, `bucket_count` and
+`hash_set_size` needs some experimentation.
+** To improve the precision, you should increase `bucket_count` or
+`hash_set_size`. Higher values of `bucket_count` or `hash_set_size`
+will provide a higher guarantee that different tokens are
+indexed to different buckets.
+** To improve the recall,
+you should increase the `hash_count` parameter. For example,
+setting `hash_count=2` will make each token be hashed in
+two different ways, thus increasing the number of potential
+candidates for search.
+
+* the default settings make the `min_hash` filter produce 512
+`min_hash` tokens for each document, each of size 16 bytes.
+Thus, each document's size will be increased by around 8KB.
+
+* the `min_hash` filter is used to hash for Jaccard similarity. This means
+that it doesn't matter how many times a document contains a certain token,
+only whether it contains it or not.
+
+Here is an example of setting up a `min_hash` filter:
+
+[source,js]
+--------------------------------------------------
+POST /index1
+{
+  "settings": {
+    "analysis": {
+      "filter": {
+        "my_shingle_filter": {
+          "type": "shingle",
+          "min_shingle_size": 5,
+          "max_shingle_size": 5,
+          "output_unigrams": false
+        },
+        "my_minhash_filter": {
+          "type": "min_hash",
+          "hash_count": 1,
+          "bucket_count": 512,
+          "hash_set_size": 1,
+          "with_rotation": true
+        }
+      },
+      "analyzer": {
+        "my_analyzer": {
+          "tokenizer": "standard",
+          "filter": [
+            "my_shingle_filter",
+            "my_minhash_filter"
+          ]
+        }
+      }
+    }
+  },
+  "mappings": {
+    "properties": {
+      "text": {
+        "type": "text",
+        "analyzer": "my_analyzer"
+      }
+    }
+  }
+}
+--------------------------------------------------
+// NOTCONSOLE
\ No newline at end of file

From a27305056f1bb8263a8cb6cc485515eda0d0ea39 Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Tue, 5 Mar 2019 14:13:28 -0500
Subject: [PATCH 2/2] Address feedback

---
 .../tokenfilters/minhash-tokenfilter.asciidoc | 92 ++++++++++---------
 1 file changed, 49 insertions(+), 43 deletions(-)

diff --git a/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc b/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
index 06fa09b8aa89a..21c7387e0f7f5 100644
--- a/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
+++ b/docs/reference/analysis/tokenfilters/minhash-tokenfilter.asciidoc
@@ -1,40 +1,6 @@
 [[analysis-minhash-tokenfilter]]
 === MinHash Token Filter

-==== Theory
-The MinHash token filter allows you to hash documents for similarity search.
-Similarity search, or nearest neighbor search, is a complex problem.
-A naive solution requires an exhaustive pairwise comparison between a query
-document and every document in an index. This is a prohibitive operation
-if the index is large. A number of approximate nearest neighbor search
-solutions have been developed to make similarity search more practical and
-computationally feasible. One of these solutions involves hashing of documents.
-
-Documents are hashed in such a way that similar documents are more likely
-to produce the same hash code and are put into the same hash bucket,
-while dissimilar documents are more likely to be hashed into
-different hash buckets. This type of hashing is known as
-locality sensitive hashing (LSH).
-
-Depending on what constitutes the similarity between documents,
-various LSH functions https://arxiv.org/abs/1408.2927[have been proposed].
-For Jaccard similarity, a popular LSH function is
-https://en.wikipedia.org/wiki/MinHash[MinHash].
-The general idea of how MinHash produces a signature for a document
-is to apply a random permutation over the whole index vocabulary (a random
-numbering of the vocabulary) and to record the minimum value of this permutation
-for the document (the minimum number among the vocabulary words that are present
-in the document). The permutations are run several times;
-the minimum values from all of them are combined into a
-signature for the document.
-
-In practice, instead of random permutations, a number of hash functions
-are chosen. A hash function calculates a hash code for each of a
-document's tokens, and the minimum hash code among them is chosen.
-The minimum hash codes from all hash functions are combined
-to form a signature for the document.
-
-==== MinHash Token Filter in Elasticsearch
 The `min_hash` token filter hashes each token of the token stream and divides
 the resulting hashes into buckets, keeping the lowest-valued hashes per bucket.
 It then returns these hashes as tokens.
@@ -68,11 +34,11 @@ minimal collisions. 5-word shingles typically work well.

 * choosing the right settings for `hash_count`, `bucket_count` and
 `hash_set_size` needs some experimentation.
-** To improve the precision, you should increase `bucket_count` or
+** to improve the precision, you should increase `bucket_count` or
 `hash_set_size`. Higher values of `bucket_count` or `hash_set_size`
 will provide a higher guarantee that different tokens are
 indexed to different buckets.
-** To improve the recall,
+** to improve the recall,
 you should increase the `hash_count` parameter. For example,
 setting `hash_count=2` will make each token be hashed in
 two different ways, thus increasing the number of potential
 candidates for search.
@@ -86,6 +52,41 @@ Thus, each document's size will be increased by around 8KB.
 that it doesn't matter how many times a document contains a certain token,
 only whether it contains it or not.

+==== Theory
+The MinHash token filter allows you to hash documents for similarity search.
+Similarity search, or nearest neighbor search, is a complex problem.
+A naive solution requires an exhaustive pairwise comparison between a query
+document and every document in an index. This is a prohibitive operation
+if the index is large. A number of approximate nearest neighbor search
+solutions have been developed to make similarity search more practical and
+computationally feasible. One of these solutions involves hashing of documents.
+
+Documents are hashed in such a way that similar documents are more likely
+to produce the same hash code and are put into the same hash bucket,
+while dissimilar documents are more likely to be hashed into
+different hash buckets. This type of hashing is known as
+locality sensitive hashing (LSH).
+
+Depending on what constitutes the similarity between documents,
+various LSH functions https://arxiv.org/abs/1408.2927[have been proposed].
+For https://en.wikipedia.org/wiki/Jaccard_index[Jaccard similarity], a popular
+LSH function is https://en.wikipedia.org/wiki/MinHash[MinHash].
+The general idea of how MinHash produces a signature for a document
+is to apply a random permutation over the whole index vocabulary (a random
+numbering of the vocabulary) and to record the minimum value of this permutation
+for the document (the minimum number among the vocabulary words that are present
+in the document). The permutations are run several times;
+the minimum values from all of them are combined into a
+signature for the document.
+
+In practice, instead of random permutations, a number of hash functions
+are chosen. A hash function calculates a hash code for each of a
+document's tokens, and the minimum hash code among them is chosen.
+The minimum hash codes from all hash functions are combined
+to form a signature for the document.
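+
+To make this procedure concrete, here is a toy Python sketch of MinHash
+signatures over 5-word shingles. It is an illustration only: the sample
+texts, the helper names, and the use of `hashlib` in place of the 128-bit
+hashing used internally are all assumptions made for the example, not how
+Elasticsearch implements the filter.
+
+[source,python]
+--------------------------------------------------
+import hashlib
+
+def shingles(text, k=5):
+    """Return the set of k-word shingles of a text."""
+    words = text.split()
+    return {" ".join(words[i:i + k]) for i in range(len(words) - k + 1)}
+
+def min_hashes(tokens, num_hashes=128):
+    """Keep one minimum hash code per hash function; together
+    the minimums form the document's signature."""
+    signature = []
+    for seed in range(num_hashes):
+        # Derive a family of hash functions by mixing a seed into the input.
+        signature.append(min(
+            int.from_bytes(hashlib.md5(f"{seed}:{t}".encode()).digest()[:8], "big")
+            for t in tokens))
+    return signature
+
+def estimate_jaccard(sig_a, sig_b):
+    """The fraction of matching signature positions estimates the
+    Jaccard similarity of the two shingle sets."""
+    return sum(a == b for a, b in zip(sig_a, sig_b)) / len(sig_a)
+
+a = shingles("the quick brown fox jumps over the lazy dog today")
+b = shingles("the quick brown fox jumps over the lazy cat today")
+print(estimate_jaccard(min_hashes(a), min_hashes(b)))
+--------------------------------------------------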
+
+==== Example of setting up the MinHash Token Filter in Elasticsearch
 Here is an example of setting up a `min_hash` filter:

 [source,js]
 --------------------------------------------------
@@ -95,7 +96,7 @@ POST /index1
   "settings": {
     "analysis": {
       "filter": {
-        "my_shingle_filter": {
+        "my_shingle_filter": { <1>
           "type": "shingle",
           "min_shingle_size": 5,
           "max_shingle_size": 5,
@@ -103,10 +104,10 @@ POST /index1
         },
         "my_minhash_filter": {
           "type": "min_hash",
-          "hash_count": 1,
-          "bucket_count": 512,
-          "hash_set_size": 1,
-          "with_rotation": true
+          "hash_count": 1, <2>
+          "bucket_count": 512, <3>
+          "hash_set_size": 1, <4>
+          "with_rotation": true <5>
         }
       },
       "analyzer": {
@@ -123,11 +124,16 @@ POST /index1
   "mappings": {
     "properties": {
       "text": {
         "type": "text",
         "analyzer": "my_analyzer"
       }
     }
   }
 }
 --------------------------------------------------
-// NOTCONSOLE
\ No newline at end of file
+// NOTCONSOLE
+<1> setting a shingle filter with 5-word shingles
+<2> setting the min_hash filter to hash each token with one hash function
+<3> setting the min_hash filter to hash tokens into 512 buckets
+<4> setting the min_hash filter to keep only the smallest hash in each bucket
+<5> setting the min_hash filter to fill empty buckets with values from neighboring buckets
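+
+To make the effect of these settings more concrete, here is a rough Python
+sketch of the bucketing behavior described above, using the settings from the
+example. The helper name and the modulo-based bucket assignment are
+assumptions made for illustration; this is not Lucene's actual implementation.
+
+[source,python]
+--------------------------------------------------
+def min_hash_tokens(hashes, bucket_count=512, with_rotation=True):
+    # Keep only the smallest hash seen in each bucket (hash_set_size=1).
+    buckets = [None] * bucket_count
+    for h in hashes:
+        i = h % bucket_count  # assumed bucket assignment, for illustration
+        if buckets[i] is None or h < buckets[i]:
+            buckets[i] = h
+    # with_rotation: fill each empty bucket with the value of the first
+    # non-empty bucket to its circular right.
+    if with_rotation and any(b is not None for b in buckets):
+        filled = list(buckets)
+        for i in range(bucket_count):
+            j = i
+            while buckets[j] is None:
+                j = (j + 1) % bucket_count
+            filled[i] = buckets[j]
+        buckets = filled
+    return [b for b in buckets if b is not None]
+
+print(len(min_hash_tokens([11, 529, 1042])))  # toy hash codes -> 512 tokens
+--------------------------------------------------
+
+With these settings a document yields `hash_count` * `bucket_count` *
+`hash_set_size` = 1 * 512 * 1 = 512 tokens of 16 bytes each, which is the
+roughly 8KB overhead mentioned in the list above.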