From 31e445922b8e6c140543f87e7b60dd82f30129f2 Mon Sep 17 00:00:00 2001 From: Daniel Mican Date: Fri, 1 Dec 2023 16:15:12 -0500 Subject: [PATCH] CSV benchmarks --- README.md | 18 ++++++++++++++++-- cmd/publish-test-data.py | 10 +++++++++- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 0ec6390..925c4a4 100644 --- a/README.md +++ b/README.md @@ -133,8 +133,8 @@ Hardware: |--------------------|-------------------|------------|-------------------| | Simple Aggregation | 36,000 msgs / sec | 256 MiB | 102 MiB | | Enrichment | 13,000 msgs /sec | 368 MiB | 124 MiB | -| CSV Disk | | | | -| CSV Memory | | | | +| CSV Disk Join | 11,500 msgs / sec | 312 MiB | 152 MiB | +| CSV Memory Join | 33,200 msgs / sec | 300 MiB | 107 MiB | ### Simple Aggregate @@ -158,5 +158,19 @@ python3 cmd/publish-test-data.py --num-messages=1000000 --topic="topic-enrich" /usr/bin/time -l python3 cmd/sql-flow.py run /Users/danielmican/code/github.com/turbolytics/sql-flow/dev/config/benchmarks/enrich.yml ``` +### CSV Disk Join + +``` +python3 cmd/publish-test-data.py --num-messages=1000000 --topic="topic-csv-filesystem-join" +SQLFLOW_STATIC_ROOT=/Users/danielmican/code/github.com/turbolytics/sql-flow/dev /usr/bin/time -l python3 cmd/sql-flow.py run /Users/danielmican/code/github.com/turbolytics/sql-flow/dev/config/examples/csv.filesystem.join.yml +``` + +### CSV Memory Join + +``` +SQLFLOW_STATIC_ROOT=/Users/danielmican/code/github.com/turbolytics/sql-flow/dev /usr/bin/time -l python3 cmd/sql-flow.py run /Users/danielmican/code/github.com/turbolytics/sql-flow/dev/config/examples/csv.mem.join.yml +python3 cmd/publish-test-data.py --num-messages=1000000 --topic="topic-csv-mem-join" +``` + --- Like SQLFlow? Use SQLFlow? Feature Requests? Please let us know! 
danny@turbolytics.io \ No newline at end of file diff --git a/cmd/publish-test-data.py b/cmd/publish-test-data.py index 64fbaea..b31d20a 100644 --- a/cmd/publish-test-data.py +++ b/cmd/publish-test-data.py @@ -32,6 +32,14 @@ "originalTimestamp": "2015-12-12T19:11:01.152Z" } +cities = [ + 'San Fransisco', + 'Baltimore', + 'New York', + 'Miami', + 'Asheville', +] + @click.command() @click.option('--num-messages', default=1001, type=int) @@ -45,7 +53,7 @@ def main(num_messages, topic): producer = Producer(conf) for i in range(num_messages): e = copy.deepcopy(event) - e['properties']['city'] = e['properties']['city'] + str(random.randrange(0, 1000)) + e['properties']['city'] = random.choice(cities) j_event = json.dumps(e) producer.produce(topic, value=j_event) if i % 1000 == 0: