From 9d4216f3abd08bc9dbdc20e8518131502c197916 Mon Sep 17 00:00:00 2001
From: turbolytics <151242797+turbolytics@users.noreply.github.com>
Date: Sun, 26 Nov 2023 20:22:45 -0500
Subject: [PATCH] Documentation (#15)

* Documentation

* images

* more

---------

Co-authored-by: Daniel Mican
---
 README.md                            | 108 ++++++++++++++++++++++-----
 dev/config/benchmarks/enrich.yml     |   2 -
 dev/config/benchmarks/simple_agg.yml |   4 +-
 dev/config/inferred_schema.yml       |   2 -
 dev/config/local_docker.yml          |  24 ++++++
 5 files changed, 114 insertions(+), 26 deletions(-)
 create mode 100644 dev/config/local_docker.yml

diff --git a/README.md b/README.md
index f1aa598..364ec39 100644
--- a/README.md
+++ b/README.md
@@ -1,44 +1,111 @@
-# sql-flow
-DuckDB for streaming data.
+# SQLFlow: DuckDB for streaming data.
 
-SQL Flow enables SQL-based streaming transformations, powered by DuckDB.
+SQLFlow enables SQL-based stream processing, powered by [DuckDB](https://duckdb.org/).
+Write Kafka stream processing logic in pure SQL.
 
-SQL Flow enables writing SQL against a stream of kafka data.
+SQLFlow supports:
+- Kafka streaming: a consumer that performs SQL-based transformations and publishes the output to another Kafka topic
+- JSON on the wire
+- Stream transformations written in pure SQL, powered by [DuckDB](https://duckdb.org/)
+- A performant [librdkafka](https://github.com/confluentinc/librdkafka)-based [Python consumer](https://github.com/confluentinc/confluent-kafka-python)
 
-SQL Flow has a number of goals:
-- Make it trivial to use SQL for streaming data transformations.
-- Support high performance kafka streaming.
+SQLFlow is currently not a good fit for:
+- Stateful stream processing
+- Wire protocols other than JSON
 
-What SQL Flow isn't:
-- A stateful streaming engine like Flink.
+SQLFlow is a Kafka consumer that embeds SQL for stream transformation:
+
+*(screenshot: SQLFlow consumer embedding SQL for stream transformation)*
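+
+To make the architecture concrete, the following is a minimal sketch of the consume -> transform -> publish loop that such a consumer runs. It is illustrative only, not SQLFlow's actual implementation: the broker address, topic names, and the tempfile/`read_json_auto` batching are assumptions that mirror the example configs in this patch.
+
+```python
+# Illustrative sketch of a batching Kafka consumer that embeds DuckDB SQL.
+# NOT SQLFlow's actual code; names and settings here are assumptions.
+import json
+import tempfile
+
+import duckdb
+from confluent_kafka import Consumer, Producer
+
+consumer = Consumer({
+    "bootstrap.servers": "localhost:9092",
+    "group.id": "test",
+    "auto.offset.reset": "earliest",
+})
+consumer.subscribe(["input-topic"])
+producer = Producer({"bootstrap.servers": "localhost:9092"})
+con = duckdb.connect()
+
+# The pipeline SQL reads from a table named `batch` (see the configs below).
+SQL = "SELECT properties.city AS city, count(*) AS city_count FROM batch GROUP BY city"
+BATCH_SIZE = 1000  # mirrors the `batch_size` pipeline setting
+
+batch = []
+while True:
+    msg = consumer.poll(timeout=1.0)
+    if msg is None or msg.error():
+        continue
+    batch.append(msg.value().decode("utf-8"))
+    if len(batch) < BATCH_SIZE:
+        continue
+
+    # Expose the buffered messages to DuckDB as the `batch` table, letting
+    # DuckDB infer the schema from the raw newline-delimited JSON.
+    with tempfile.NamedTemporaryFile("w", suffix=".json") as f:
+        f.write("\n".join(batch))
+        f.flush()
+        con.execute(
+            f"CREATE OR REPLACE VIEW batch AS SELECT * FROM read_json_auto('{f.name}')"
+        )
+        rows = con.execute(SQL).fetchall()
+        cols = [d[0] for d in con.description]
+
+    # Publish each transformed row to the output topic as JSON.
+    for row in rows:
+        producer.produce("output-1", json.dumps(dict(zip(cols, row))))
+    producer.flush()
+    batch.clear()
+```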
 
 ## Getting Started
 
+### Docker
+
+[Docker is the easiest way to get started.](https://hub.docker.com/r/turbolytics/sql-flow)
+
+- Pull the sql-flow Docker image
+```
+docker pull turbolytics/sql-flow:latest
+```
-## Development
+
+- Validate a config by invoking it on test data
+```
+docker run -v $(PWD)/dev:/tmp/conf -v /tmp/sqlflow:/tmp/sqlflow sql-flow dev invoke /tmp/conf/config/inferred_schema.yml /tmp/conf/fixtures/simple.json
+['{"city":"New York","city_count":28672}', '{"city":"Baltimore","city_count":28672}']
+```
-C_INCLUDE_PATH=/opt/homebrew/Cellar/librdkafka/2.3.0/include LIBRARY_PATH=/opt/homebrew/Cellar/librdkafka/2.3.0/lib pip install confluent-kafka
+
+- Start Kafka locally using Docker
+```
+cd dev && docker-compose -f kafka-single.yml up
+```
+
+- Publish test messages to Kafka
+```
-cd dev
-docker-compose -f kafka-single.yml up
+python3 cmd/publish-test-data.py --num-messages=10000 --topic="topic-local-docker"
+```
+
+- Start a Kafka console consumer inside the docker-compose broker container (the container is assumed to be named `kafka1`, matching the `kafka1:9092` bootstrap address)
+```
+docker exec -it kafka1 kafka-console-consumer --bootstrap-server=kafka1:9092 --topic=output-1
+```
-python3 cmd/sql-flow.py run /Users/danielmican/code/github.com/turbolytics/sql-flow/dev/config/inferred_schema.yml
-python publish-test-data.py
+
+- Start SQLFlow in Docker
+```
+docker run -v $(PWD)/dev:/tmp/conf -v /tmp/sqlflow:/tmp/sqlflow sql-flow run /tmp/conf/config/local_docker.yml
+```
+
+- Verify output in the Kafka console consumer
+```
-docker run -v $(PWD)/dev:/tmp/conf -v /tmp/sqlflow:/tmp/sqlflow sql-flow dev invoke /tmp/conf/config/inferred_schema.yml /tmp/conf/fixtures/simple.json
-['{"city":"New York","city_count":28672}', '{"city":"Baltimore","city_count":28672}']
+...
+...
+{"city":"San Francisco504","city_count":1}
+{"city":"San Francisco735","city_count":1}
+{"city":"San Francisco533","city_count":1}
+{"city":"San Francisco556","city_count":1}
+```
+
+The `dev invoke` command tests a SQLFlow pipeline configuration against a batch of test data, enabling fast local feedback before launching a SQLFlow consumer that reads from Kafka.
+
+## Configuration
+
+The heart of SQLFlow is the pipeline configuration file. Each configuration file specifies:
+
+- Kafka configuration
+- Pipeline configuration
+  - Input configuration
+  - SQL transformation
+  - Output configuration
+
+*(screenshot: annotated pipeline configuration file)*
+
+Every instance of SQLFlow needs a pipeline configuration file; an annotated example follows.
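+
+The configuration below is the `dev/config/local_docker.yml` pipeline added by this patch, reproduced with editorial comments on each section. The comments, and the interpretation of each field, are annotations added here for illustration; they are not part of the shipped file.
+
+```yaml
+kafka:
+  brokers: [host.docker.internal:29092]  # Kafka bootstrap broker(s)
+  group_id: test                         # consumer group id
+  auto_offset_reset: earliest            # where to begin when no committed offset exists
+
+pipeline:
+  input:
+    batch_size: 1000          # messages buffered per SQL invocation
+    topics:
+      - "topic-local-docker"  # Kafka topic(s) to consume
+
+  # The SQL transformation; each micro-batch is queried as the `batch` table.
+  sql: |
+    SELECT
+      properties.city as city,
+      count(*) as city_count
+    FROM batch
+    GROUP BY
+      city
+
+  output:
+    type: kafka               # publish results to Kafka
+    topic: output-1           # output topic
+```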
+
+## Recipes
+
+Coming soon. Until then, check out:
+
+- [Benchmark configurations](./dev/config/benchmarks)
+- [Unit test configurations](./tests/)
+
+## Development
+
+- Install Python dependencies
+```
+pip install -r requirements.txt
+pip install -r requirements.dev.txt
+
+# macOS / Homebrew: point the confluent-kafka build at the Homebrew librdkafka
+C_INCLUDE_PATH=/opt/homebrew/Cellar/librdkafka/2.3.0/include LIBRARY_PATH=/opt/homebrew/Cellar/librdkafka/2.3.0/lib pip install confluent-kafka
+```
+
+- Run tests
+```
+pytest tests
+```
+
 ## Benchmarks
 
 **Methodology**
 
@@ -76,7 +143,7 @@ python3 cmd/publish-test-data.py --num-messages=1000000 --topic="topic-simple-ag
 ### Enriches
 
 Performs an enrichment. Output is 1:1 records with input, but
-each output record is enchanced with additional information.
+each output record is enhanced with additional information.
 ```
 python3 cmd/publish-test-data.py --num-messages=1000000 --topic="topic-enrich"
 
@@ -86,4 +153,7 @@ python3 cmd/publish-test-data.py --num-messages=1000000 --topic="topic-enrich"
 13k messages / second
 368 MiB Max - maximum resident size
 124 MiB - peak memory footprint
-```
\ No newline at end of file
+```
+
+---
+Like SQLFlow? Use SQLFlow? Have feature requests? Please let us know: danny@turbolytics.io
\ No newline at end of file
diff --git a/dev/config/benchmarks/enrich.yml b/dev/config/benchmarks/enrich.yml
index d21b831..12d7718 100644
--- a/dev/config/benchmarks/enrich.yml
+++ b/dev/config/benchmarks/enrich.yml
@@ -1,11 +1,9 @@
-# Kafka stuff
 kafka:
   brokers: [localhost:9092]
   group_id: test
   auto_offset_reset: earliest
 
 pipeline:
-  # input stuff
   input:
     batch_size: 1000
     topics:
diff --git a/dev/config/benchmarks/simple_agg.yml b/dev/config/benchmarks/simple_agg.yml
index 4d708ff..8f0d7f6 100644
--- a/dev/config/benchmarks/simple_agg.yml
+++ b/dev/config/benchmarks/simple_agg.yml
@@ -1,11 +1,9 @@
-# Kafka stuff
 kafka:
   brokers: [localhost:9092]
   group_id: test
   auto_offset_reset: earliest
 
 pipeline:
-  # input stuff
   input:
     batch_size: 1000
     topics:
@@ -21,4 +19,4 @@ pipeline:
 
   output:
     type: kafka
-    topic: output-1
\ No newline at end of file
+    topic: output-1
diff --git a/dev/config/inferred_schema.yml b/dev/config/inferred_schema.yml
index 3c3bc05..f07f64a 100644
--- a/dev/config/inferred_schema.yml
+++ b/dev/config/inferred_schema.yml
@@ -1,13 +1,11 @@
-# Kafka stuff
 kafka:
   brokers: [localhost:9092]
   group_id: test
   auto_offset_reset: earliest
 
 pipeline:
-  # input stuff
   input:
     batch_size: 1000
     topics:
diff --git a/dev/config/local_docker.yml b/dev/config/local_docker.yml
new file mode 100644
index 0000000..909c1f9
--- /dev/null
+++ b/dev/config/local_docker.yml
@@ -0,0 +1,24 @@
+
+
+kafka:
+  brokers: [host.docker.internal:29092]
+  group_id: test
+  auto_offset_reset: earliest
+
+pipeline:
+  input:
+    batch_size: 1000
+    topics:
+      - "topic-local-docker"
+
+  sql: |
+    SELECT
+      properties.city as city,
+      count(*) as city_count
+    FROM batch
+    GROUP BY
+      city
+
+  output:
+    type: kafka
+    topic: output-1
\ No newline at end of file