Benchmark ELSER (elastic#438)
maxhniebergall authored and inqueue committed Dec 6, 2023
1 parent 54de171 commit 4f9df84
Showing 11 changed files with 30,951 additions and 0 deletions.
19 changes: 19 additions & 0 deletions elser-ingest-speedtest/README.md
@@ -0,0 +1,19 @@
## ELSER Speed Test Track

### Prerequisites
#### Set up ES cloud deployment
Create a deployment that includes an ML node with at least 4 GB of memory.

### Parameters
This track accepts the following parameters with Rally 0.8.0+ using `--track-params`:
* `number_of_allocations` (default: 1)
* `threads_per_allocation` (default: 2)
* `queue_capacity` (default: 1024)
* `bulk_size` (default: 100)
* `bulk_indexing_clients` (default: 1)
* `ingest_percentage` (default: 100)
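
Any of these defaults can be overridden on the command line. A hypothetical invocation might look like the following (the track path and parameter values here are illustrative, not part of the track):

```shell
# Sketch: run the track with overridden parameters.
# --track-path points at a local checkout of this track (path is an assumption).
esrally race --track-path=./elser-ingest-speedtest \
  --track-params="number_of_allocations:2,threads_per_allocation:4,bulk_size:200"
```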

### Data Setup
Every document has the same fixed-length body, equivalent to 256 WordPiece tokens. The corpus was created by filtering the BERT vocabulary down to words that tokenize as a single token, then sampling that word list at random to build the fixed-length inputs.

See `_support/generate_fixed_length_docs.py` and other files in the `_support` folder.


16 changes: 16 additions & 0 deletions elser-ingest-speedtest/_support/filter_vocab.py
@@ -0,0 +1,16 @@
import json
import re

# Keep only purely alphabetic vocabulary entries of two or more letters.
# This drops WordPiece continuation pieces ("##ing"), special tokens
# ("[CLS]"), punctuation, digits, and single characters.
pattern = re.compile(r"^[a-zA-Z]{2,}$")

with open("huggingface.co_bert-base-uncased_raw_main_vocab.txt") as raw_vocab:
    lines = raw_vocab.readlines()

whole_words = [s.strip() for s in lines if pattern.match(s)]

# Write the filtered vocabulary as a JSON array of strings.
with open("bert_vocab_whole_words.json", "w") as outfile:
    json.dump(whole_words, outfile)
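The regex above keeps only alphabetic entries of two or more characters, which is what excludes WordPiece continuation pieces and special tokens. A quick self-contained illustration on a handful of sample vocabulary entries:

```python
import re

pattern = re.compile(r"^[a-zA-Z]{2,}$")

# Representative entries one might see in a BERT vocab file.
samples = ["hello", "##ing", "[CLS]", "a", "world", "1st"]
kept = [s for s in samples if pattern.match(s)]
print(kept)  # -> ['hello', 'world']
```

Only `hello` and `world` survive: the continuation piece, the special token, the single letter, and the entry containing a digit are all filtered out.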
15 changes: 15 additions & 0 deletions elser-ingest-speedtest/_support/generate_fixed_length_docs.py
@@ -0,0 +1,15 @@
import json
import random

DOCUMENT_FIXED_LENGTH = 256  # words per document; each word is a single WordPiece token
DOCUMENT_COUNT = 1000000

# Load the filtered vocabulary produced by filter_vocab.py.
with open("bert_vocab_whole_words.json") as word_file:
    word_list = json.load(word_file)

# Write one JSON document per line (NDJSON). Words are sampled with
# replacement, so every document has exactly the same token length.
with open("../document_set.json", "w") as doc_file:
    for _ in range(DOCUMENT_COUNT):
        doc_words = random.choices(word_list, k=DOCUMENT_FIXED_LENGTH)
        doc = {"body": " ".join(doc_words)}
        doc_file.write(json.dumps(doc) + "\n")
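The script emits newline-delimited JSON, one `{"body": ...}` object per line. A minimal sketch of the per-document logic, using a hypothetical four-word stand-in for the real vocabulary file:

```python
import json
import random

word_list = ["alpha", "beta", "gamma", "delta"]  # stand-in for bert_vocab_whole_words.json
doc_words = random.choices(word_list, k=8)       # sampling with replacement, as in the script
doc = {"body": " ".join(doc_words)}
line = json.dumps(doc)

# Each output line is a standalone JSON object, so it round-trips cleanly:
parsed = json.loads(line)
print(len(parsed["body"].split()))  # 8
```

Because `random.choices` samples with replacement, the word count (and hence the token count) of every document is exactly `k`, regardless of vocabulary size.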
13 changes: 13 additions & 0 deletions elser-ingest-speedtest/_support/google-storage-uploader.py
@@ -0,0 +1,13 @@
# Imports the Google Cloud client library
from google.cloud import storage

# Instantiates a client
storage_client = storage.Client()

# The name for the new bucket
bucket_name = "ml-elser-benchmark-data"

# Creates the new bucket
bucket = storage_client.create_bucket(bucket_name)

print(f"Bucket {bucket.name} created.")