Skip to content

Commit

Permalink
Added epoch_md5 document ID type (elastic#28)
Browse files Browse the repository at this point in the history
* Added epoch_md5 document id type

* Updated README and added SHA1 id to the document id benchmark

* Refactoring of document id challenge following review

* Updated note about disk space requirements

* Updated note about index size and disk space requirements

* Updates following review
  • Loading branch information
cdahlqvist authored Sep 21, 2018
1 parent aa3f097 commit f07365f
Show file tree
Hide file tree
Showing 6 changed files with 682 additions and 137 deletions.
28 changes: 28 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,34 @@ The table below shows the track parameters that can be adjusted along with defau
| --------- | ----------- | ---- | ------------- |
| `bulk_indexing_clients` | Number of bulk indexing clients/connections | `int` | `32` |

### 10) document_id_evaluation

This challenge examines the indexing throughput as a function of shard size as well as the resulting storage requirements for a set of different types of document IDs. For each document ID type, it indexes 200 million documents into a single-shard index, which should be about 40GB in size. Once all data has been indexed, index statistics are recorded before and after a forcemerge down to a single segment.

The following document id types are benchmarked:

`auto` - This test uses document IDs autogenerated by Elasticsearch. This allows Elasticsearch to optimize for indexing speed, as the operation cannot be an update.

`uuid` - This test uses a UUID4 as the document ID. It is largely random in nature; the `-` separator characters, which never change, have been removed to make it a bit shorter.

`sha1` - This test uses a SHA1 hash formatted as a hexadecimal string as document ID.

`epoch_uuid` - This test uses a UUID string prefixed by the hexadecimal representation of an epoch timestamp. This makes identifiers largely ordered over time, which can have a positive impact on indexing throughput.

`epoch_md5` - This test uses a base64-encoded MD5 hash prefixed by the hexadecimal representation of an epoch timestamp. This makes identifiers largely ordered over time, which can have a positive impact on indexing throughput.

`epoch_md5-10pct_60s` - This test uses the `epoch_md5` identifier described above, but simulates a portion of events arriving delayed by setting the timestamp to 60s (1 minute) in the past for 10% of events.

`epoch_md5-10pct_300s` - This test uses the `epoch_md5` identifier described above, but simulates a portion of events arriving delayed by setting the timestamp to 300s (5 minutes) in the past for 10% of events.

Note that this challenge will generate up to ~300GB of data on disk and will require additional space for merging and overhead. Make sure around 400GB of disk space is available before running this to be on the safe side.

The table below shows the track parameters that can be adjusted along with default values:

| Parameter | Explanation | Type | Default Value |
| --------- | ----------- | ---- | ------------- |
| `bulk_indexing_clients` | Number of bulk indexing clients/connections | `int` | `20` |

## Custom parameter sources

### elasticlogs\_bulk\_source
Expand Down
224 changes: 92 additions & 132 deletions eventdata/challenges/document_id_benchmark.json
Original file line number Diff line number Diff line change
@@ -1,222 +1,182 @@
{% set p_bulk_indexing_clients = (bulk_indexing_clients | default(20)) %}
{% set p_iterations = bulk_indexing_iterations | default(100000) %}
{% set p_iterations = bulk_indexing_iterations | default(200000) %}
{% set p_iterations_per_client = (p_iterations / p_bulk_indexing_clients) | int %}

{
"name": "document_id_evaluation",
"description": "Index about 20GB of data into five single shard indices using different document ID types. IDs are autogenerated by Elasticsearch, meaning there are no conflicts.",
"description": "Indexes about 40GB of data into seven single shard indices using different document ID types. IDs are autogenerated by Elasticsearch, meaning there are no conflicts.",
"meta": {
"client_count": {{ p_bulk_indexing_clients }},
"benchmark_type": "document-id-evaluation"
},
"schedule": [
{% for id in [{ 'type': 'auto', 'desc': 'auto' },
{ 'type': 'uuid', 'desc': 'uuid' },
{ 'type': 'sha1', 'desc': 'sha1' },
{ 'type': 'epoch_uuid', 'desc': 'epoch_uuid' },
{ 'type': 'epoch_md5', 'desc': 'epoch_md5'} ] %}
{
"name": "deleteindex_elasticlogs-auto",
"name": "deleteindex_elasticlogs-{{ id['desc'] }}",
"operation": {
"operation-type": "delete-index",
"index": "elasticlogs-auto"
}
},
{
"name": "deleteindex_elasticlogs-uuid",
"operation": {
"operation-type": "delete-index",
"index": "elasticlogs-uuid"
}
},
{
"name": "deleteindex_elasticlogs-epoch-no_delay",
"operation": {
"operation-type": "delete-index",
"index": "elasticlogs-epoch-no_delay"
}
},
{
"name": "deleteindex_elasticlogs-epoch-10pct_60s",
"operation": {
"operation-type": "delete-index",
"index": "elasticlogs-epoch-10pct_60s"
}
},
{
"name": "deleteindex_elasticlogs-epoch-10pct_300s",
"operation": {
"operation-type": "delete-index",
"index": "elasticlogs-epoch-10pct_300s"
}
"index": "elasticlogs-{{ id['desc'] }}"
},
"include-in-reporting": false
},
{
"name": "create_elasticlogs-auto",
"name": "create_elasticlogs-{{ id['desc'] }}",
"operation": {
"operation-type": "createindex",
"index_name": "elasticlogs-auto",
"index_name": "elasticlogs-{{ id['desc'] }}",
"index_template_body": {
"template": "elasticlogs-auto",
"template": "elasticlogs-{{ id['desc'] }}",
"settings": {
"index.refresh_interval": "5s",
"index.codec": "best_compression",
"index.translog.retention.size": "10mb",
"index.number_of_replicas": 0,
"index.number_of_shards": 1
},
"mappings": "mappings.json",
"aliases": {}
},
"index_template_name": "elasticlogs-auto"
}
"index_template_name": "elasticlogs-{{ id['desc'] }}"
},
"include-in-reporting": false
},
{
"name": "index-append-1000-elasticlogs-auto",
"name": "index-append-1000-elasticlogs-{{ id['desc'] }}",
"operation": {
"operation-type": "bulk",
"param-source": "elasticlogs_bulk",
"index": "elasticlogs-auto",
"bulk-size": 1000
"index": "elasticlogs-{{ id['desc'] }}",
"bulk-size": 1000,
"id_type": "{{ id['type'] }}"
},
"iterations": {{ p_iterations_per_client }},
"clients": {{ p_bulk_indexing_clients }},
"meta": {
"id_mode": "auto"
"id_mode": "{{ id['desc'] }}"
}
},
{
"name": "create_elasticlogs-uuid",
"name": "indicesstats-elasticlogs-{{ id['desc'] }}",
"operation": {
"operation-type": "createindex",
"index_name": "elasticlogs-uuid",
"index_template_body": {
"template": "elasticlogs-uuid",
"settings": {
"index.refresh_interval": "5s",
"index.codec": "best_compression",
"index.number_of_replicas": 0,
"index.number_of_shards": 1
},
"mappings": "mappings.json",
"aliases": {}
},
"index_template_name": "elasticlogs-uuid"
}
},
{
"name": "index-append-1000-elasticlogs-uuid",
"operation": {
"operation-type": "bulk",
"param-source": "elasticlogs_bulk",
"index": "elasticlogs-uuid",
"bulk-size": 1000,
"id_type": "uuid"
"operation-type": "indicesstats",
"index_pattern": "elasticlogs-{{ id['desc'] }}"
},
"iterations": {{ p_iterations_per_client }},
"clients": {{ p_bulk_indexing_clients }},
"meta": {
"id_mode": "uuid"
"forcemerged": "no",
"id_mode": "{{ id['desc'] }}"
}
},

{
"name": "create_elasticlogs-epoch-no_delay",
"name": "force-merge-{{ id['desc'] }}",
"operation": {
"operation-type": "createindex",
"index_name": "elasticlogs-epoch-no_delay",
"index_template_body": {
"template": "elasticlogs-epoch-no_delay",
"settings": {
"index.refresh_interval": "5s",
"index.codec": "best_compression",
"index.number_of_replicas": 0,
"index.number_of_shards": 1
},
"mappings": "mappings.json",
"aliases": {}
},
"index_template_name": "elasticlogs-epoch-no_delay"
}
"operation-type": "force-merge",
"max-num-segments": 1
},
"clients": 1
},
{
"name": "index-append-1000-elasticlogs-epoch-no_delay",
{
"name": "indicesstats-elasticlogs-fm-{{ id['desc'] }}",
"operation": {
"operation-type": "bulk",
"param-source": "elasticlogs_bulk",
"index": "elasticlogs-epoch-no_delay",
"bulk-size": 1000,
"id_type": "epoch_uuid"
"operation-type": "indicesstats",
"index_pattern": "elasticlogs-{{ id['desc'] }}"
},
"iterations": {{ p_iterations_per_client }},
"clients": {{ p_bulk_indexing_clients }},
"meta": {
"id_mode": "epoch_uuid-no_delay"
"forcemerged": "yes",
"id_mode": "{{ id['desc'] }}"
}
},
{% endfor %}
{% for id in [{ 'type': 'epoch_md5', 'desc': 'epoch_md5-10pct_60s', 'delay': 60 },
{ 'type': 'epoch_md5', 'desc': 'epoch_md5-10pct_300s', 'delay': 300 }] %}
{
"name": "deleteindex_elasticlogs-{{ id['desc'] }}",
"operation": {
"operation-type": "delete-index",
"index": "elasticlogs-{{ id['desc'] }}"
},
"include-in-reporting": false
},
{
"name": "create_elasticlogs-epoch-10pct_60s",
"name": "create_elasticlogs-{{ id['desc'] }}",
"operation": {
"operation-type": "createindex",
"index_name": "elasticlogs-epoch-10pct_60s",
"index_name": "elasticlogs-{{ id['desc'] }}",
"index_template_body": {
"template": "elasticlogs-epoch-10pct_60s",
"template": "elasticlogs-{{ id['desc'] }}",
"settings": {
"index.refresh_interval": "5s",
"index.codec": "best_compression",
"index.translog.retention.size": "10mb",
"index.number_of_replicas": 0,
"index.number_of_shards": 1
},
"mappings": "mappings.json",
"aliases": {}
},
"index_template_name": "elasticlogs-epoch-10pct_60s"
}
"index_template_name": "elasticlogs-{{ id['desc'] }}"
},
"include-in-reporting": false
},
{
"name": "index-append-1000-elasticlogs-epoch-10pct_60s",
"name": "index-append-1000-elasticlogs-{{ id['desc'] }}",
"operation": {
"operation-type": "bulk",
"param-source": "elasticlogs_bulk",
"index": "elasticlogs-epoch-10pct_60s",
"index": "elasticlogs-{{ id['desc'] }}",
"bulk-size": 1000,
"id_type": "epoch_uuid",
"id_type": "{{ id['type'] }}",
"id_delay_probability": 0.1,
"id_delay_secs": 60
"id_delay_secs": {{ id['delay'] }}
},
"iterations": {{ p_iterations_per_client }},
"clients": {{ p_bulk_indexing_clients }},
"meta": {
"id_mode": "epoch_uuid-10pct/60s"
"id_mode": "{{ id['desc'] }}"
}
},
{
"name": "create_elasticlogs-epoch-10pct_300s",
"name": "indicesstats-elasticlogs-{{ id['desc'] }}",
"operation": {
"operation-type": "createindex",
"index_name": "elasticlogs-epoch-10pct_300s",
"index_template_body": {
"template": "elasticlogs-epoch-10pct_300s",
"settings": {
"index.refresh_interval": "5s",
"index.codec": "best_compression",
"index.number_of_replicas": 0,
"index.number_of_shards": 1
},
"mappings": "mappings.json",
"aliases": {}
},
"index_template_name": "elasticlogs-epoch-10pct_300s"
"operation-type": "indicesstats",
"index_pattern": "elasticlogs-{{ id['desc'] }}"
},
"meta": {
"forcemerged": "no",
"id_mode": "{{ id['desc'] }}"
}
},

{
"name": "index-append-1000-elasticlogs-epoch-10pct_300s",
"name": "force-merge-{{ id['desc'] }}",
"operation": {
"operation-type": "bulk",
"param-source": "elasticlogs_bulk",
"index": "elasticlogs-epoch-10pct_300s",
"bulk-size": 1000,
"id_type": "epoch_uuid",
"id_delay_probability": 0.1,
"id_delay_secs": 300
"operation-type": "force-merge",
"max-num-segments": 1
},
"clients": 1
},
{
"name": "indicesstats-elasticlogs-fm-{{ id['desc'] }}",
"operation": {
"operation-type": "indicesstats",
"index_pattern": "elasticlogs-{{ id['desc'] }}"
},
"iterations": {{ p_iterations_per_client }},
"clients": {{ p_bulk_indexing_clients }},
"meta": {
"id_mode": "epoch_uuid-10pct/300s"
"forcemerged": "yes",
"id_mode": "{{ id['desc'] }}"
}
},
{% endfor %}
{
"name": "refresh-final",
"operation": "refresh",
"iterations": 1,
"clients": 1,
"include-in-reporting": false
}
]
}
Loading

0 comments on commit f07365f

Please sign in to comment.