From 5d40a6c7ee4e3f3b7fc708fb38e4b13a7173e0a5 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Wed, 8 May 2024 02:46:49 +0200 Subject: [PATCH] Documentation: Improve structure and layout --- README.md | 7 +- cratedb_toolkit/datasets/README.md | 20 ----- doc/datasets.md | 66 +++++++++++++++ doc/index.md | 10 +-- .../io/README.md => doc/io/index.md | 84 ++++--------------- doc/io/influxdb/index.md | 12 +++ doc/io/influxdb/loader.md | 52 ++++++++++++ doc/io/mongodb/index.md | 13 +++ doc/io/mongodb/loader.md | 58 +++++++++++++ .../README.md => doc/io/mongodb/migr8.md | 14 ++-- doc/retention.md | 4 +- doc/sandbox.md | 2 +- pyproject.toml | 13 +-- 13 files changed, 240 insertions(+), 115 deletions(-) delete mode 100644 cratedb_toolkit/datasets/README.md create mode 100644 doc/datasets.md rename cratedb_toolkit/io/README.md => doc/io/index.md (52%) create mode 100644 doc/io/influxdb/index.md create mode 100644 doc/io/influxdb/loader.md create mode 100644 doc/io/mongodb/index.md create mode 100644 doc/io/mongodb/loader.md rename cratedb_toolkit/io/mongodb/README.md => doc/io/mongodb/migr8.md (96%) diff --git a/README.md b/README.md index 153764a7..51403098 100644 --- a/README.md +++ b/README.md @@ -41,7 +41,7 @@ Contributions of all kinds are much welcome, in order to make it more solid, and to add features. Breaking changes should be expected until a 1.0 release, so version pinning is -strongly recommended, especially when you use it as a library. +strongly recommended, especially when using it as a library. ## Install @@ -53,12 +53,13 @@ pip install --upgrade cratedb-toolkit Verify installation. ```shell -cratedb-toolkit --version +ctk --version ``` Run with Docker. 
```shell -docker run --rm "ghcr.io/crate-workbench/cratedb-toolkit" cratedb-toolkit --version +alias ctk='docker run --rm "ghcr.io/crate-workbench/cratedb-toolkit" ctk' +ctk --version ``` diff --git a/cratedb_toolkit/datasets/README.md b/cratedb_toolkit/datasets/README.md deleted file mode 100644 index 3a0f7ab4..00000000 --- a/cratedb_toolkit/datasets/README.md +++ /dev/null @@ -1,20 +0,0 @@ -# CrateDB Datasets API - -Provide access to datasets, to be easily consumed by tutorials -and/or production applications. - -## Synopsis - -```python -from cratedb_toolkit.datasets import load_dataset - -dataset = load_dataset("tutorial/weather-basic") -dataset.dbtable(dburi="crate://crate@localhost/", table="weather_data").load() -``` - -```python -from cratedb_toolkit.datasets import load_dataset - -dataset = load_dataset("kaggle://guillemservera/global-daily-climate-data/daily_weather.parquet") -dataset.dbtable(dburi="crate://crate@localhost/", table="kaggle_daily_weather").create() -``` diff --git a/doc/datasets.md b/doc/datasets.md new file mode 100644 index 00000000..8d531261 --- /dev/null +++ b/doc/datasets.md @@ -0,0 +1,66 @@ +# Datasets API + +Provide access to datasets, to be easily consumed by tutorials +and/or production applications. + +## Install +```shell +pip install --upgrade 'cratedb-toolkit[datasets]' +``` + +## Synopsis + +```python +from cratedb_toolkit.datasets import load_dataset + +dataset = load_dataset("tutorial/weather-basic") +print(dataset.ddl) +``` + +## Usage + +### Built-in datasets +Load an example dataset into a CrateDB database table. +```python +from cratedb_toolkit.datasets import load_dataset + +dataset = load_dataset("tutorial/weather-basic") +dataset.dbtable(dburi="crate://crate@localhost/", table="weather_data").load() +``` + +### Kaggle +For accessing datasets on Kaggle, you will need an account on their platform. 
+ +#### Authentication +Either create a configuration file `~/.kaggle/kaggle.json` in JSON format, +```json +{"username":"acme","key":"134af98bdb0bd0fa92078d9c37ac8f78"} +``` +or, alternatively, use these environment variables. +```shell +export KAGGLE_USERNAME=acme +export KAGGLE_KEY=134af98bdb0bd0fa92078d9c37ac8f78 +``` + +#### Acquisition +Load a dataset from Kaggle into a CrateDB database table. +```python +from cratedb_toolkit.datasets import load_dataset + +dataset = load_dataset("kaggle://guillemservera/global-daily-climate-data/daily_weather.parquet") +dataset.dbtable(dburi="crate://crate@localhost/", table="kaggle_daily_weather").load() +``` + + +## In Practice + +Please refer to these notebooks to learn how `load_dataset` works in practice. + +- [How to Build Time Series Applications in CrateDB] +- [Exploratory data analysis with CrateDB] +- [Time series decomposition with CrateDB] + + +[Exploratory data analysis with CrateDB]: https://github.com/crate/cratedb-examples/blob/main/topic/timeseries/exploratory_data_analysis.ipynb +[How to Build Time Series Applications in CrateDB]: https://github.com/crate/cratedb-examples/blob/main/topic/timeseries/dask-weather-data-import.ipynb +[Time series decomposition with CrateDB]: https://github.com/crate/cratedb-examples/blob/main/topic/timeseries/time-series-decomposition.ipynb diff --git a/doc/index.md b/doc/index.md index b410690c..07b04889 100644 --- a/doc/index.md +++ b/doc/index.md @@ -89,7 +89,9 @@ about a possible feature. 
:hidden: install +io/index retention +datasets ``` ```{toctree} @@ -103,14 +105,6 @@ backlog ``` - -[Changelog]: https://github.com/crate-workbench/cratedb-toolkit/blob/main/CHANGES.md -[development documentation]: https://cratedb-toolkit.readthedocs.io/sandbox.html -[Documentation]: https://cratedb-toolkit.readthedocs.io/ -[Issues]: https://github.com/crate-workbench/cratedb-toolkit/issues -[License]: https://github.com/crate-workbench/cratedb-toolkit/blob/main/LICENSE -[PyPI]: https://pypi.org/project/cratedb-toolkit/ -[Source code]: https://github.com/crate-workbench/cratedb-toolkit [cratedb-toolkit]: https://cratedb-toolkit.readthedocs.io/ [influxio]: https://influxio.readthedocs.io/ diff --git a/cratedb_toolkit/io/README.md b/doc/io/index.md similarity index 52% rename from cratedb_toolkit/io/README.md rename to doc/io/index.md index 0898c33b..82822d54 100644 --- a/cratedb_toolkit/io/README.md +++ b/doc/io/index.md @@ -1,9 +1,13 @@ -# Load and extract data into/from CrateDB +# I/O Subsystem +Load and extract data into/from CrateDB. ## About +Using the InfluxDB and MongoDB I/O subsystems, you can transfer data from +[InfluxDB] and [MongoDB] to [CrateDB] and [CrateDB Cloud]. -A one-stop command `ctk load table` to load data into CrateDB database tables. +## What's inside +A one-stop command `ctk load table` to load data into database tables. ## Installation @@ -78,76 +82,16 @@ ctk shell --command="SELECT * FROM data_weather LIMIT 10;" --format=json - Exercise data imports from AWS S3 and other Object Storage providers. -## InfluxDB +```{toctree} +:maxdepth: 2 +:hidden: -Using the adapter to [influxio], you can transfer data from InfluxDB to CrateDB. - -Import two data points into InfluxDB. 
-```shell -export INFLUX_ORG=example -export INFLUX_TOKEN=token -export INFLUX_BUCKET_NAME=testdrive -export INFLUX_MEASUREMENT=demo -influx bucket create -influx write --precision=s "${INFLUX_MEASUREMENT},region=amazonas temperature=42.42,humidity=84.84 1556896326" -influx write --precision=s "${INFLUX_MEASUREMENT},region=amazonas temperature=45.89,humidity=77.23,windspeed=5.4 1556896327" -influx query "from(bucket:\"${INFLUX_BUCKET_NAME}\") |> range(start:-100y)" -``` - -Transfer data. -```shell -export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo -ctk load table influxdb2://example:token@localhost:8086/testdrive/demo -crash --command "SELECT * FROM testdrive.demo;" -``` - -Todo: More convenient table querying. -```shell -export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo -ctk shell --command "SELECT * FROM testdrive.demo;" -ctk show table "testdrive.demo" -``` - - -## MongoDB - -Using the MongoDB subsystem, you can transfer data from MongoDB to CrateDB. - -Import two data points into MongoDB. -```shell -mongosh mongodb://localhost:27017/testdrive < +MongoDB ``` +[CrateDB]: https://github.com/crate/crate [CrateDB Cloud]: https://console.cratedb.cloud/ -[influxio]: https://github.com/daq-tools/influxio +[InfluxDB]: https://github.com/influxdata/influxdb +[MongoDB]: https://github.com/mongodb/mongo diff --git a/doc/io/influxdb/index.md b/doc/io/influxdb/index.md new file mode 100644 index 00000000..759d252f --- /dev/null +++ b/doc/io/influxdb/index.md @@ -0,0 +1,12 @@ +(influxdb)= +# InfluxDB I/O Subsystem + +## About +Import and export data into/from InfluxDB, for humans and machines. 
+ + +```{toctree} +:maxdepth: 1 + +loader +``` diff --git a/doc/io/influxdb/loader.md b/doc/io/influxdb/loader.md new file mode 100644 index 00000000..803bebac --- /dev/null +++ b/doc/io/influxdb/loader.md @@ -0,0 +1,52 @@ +(influxdb-loader)= +# InfluxDB Table Loader + +## About +Load data from InfluxDB into CrateDB using a one-stop command +`ctk load table influxdb2://...`, in order to facilitate convenient +data transfers to be used within data pipelines or ad hoc operations. + +## Details +The InfluxDB table loader is based on the [influxio] package. Please also check +its documentation to learn about more of its capabilities, supporting you when +working with InfluxDB. + +## Install +```shell +pip install --upgrade 'cratedb-toolkit[influxdb]' +``` + +## Example +Import two data points into InfluxDB. + +```shell +export INFLUX_ORG=example +export INFLUX_TOKEN=token +export INFLUX_BUCKET_NAME=testdrive +export INFLUX_MEASUREMENT=demo +influx bucket create +influx write --precision=s "${INFLUX_MEASUREMENT},region=amazonas temperature=42.42,humidity=84.84 1556896326" +influx write --precision=s "${INFLUX_MEASUREMENT},region=amazonas temperature=45.89,humidity=77.23,windspeed=5.4 1556896327" +influx query "from(bucket:\"${INFLUX_BUCKET_NAME}\") |> range(start:-100y)" +``` + +Transfer data from InfluxDB bucket/measurement into CrateDB schema/table. +```shell +export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo +ctk load table influxdb2://example:token@localhost:8086/testdrive/demo +crash --command "SELECT * FROM testdrive.demo;" +``` + +Query data in CrateDB. +```shell +export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo +ctk shell --command "SELECT * FROM testdrive.demo;" +ctk show table "testdrive.demo" +``` + +:::{todo} +- More convenient table querying. 
+::: + + +[influxio]: inv:influxio:*:label#index diff --git a/doc/io/mongodb/index.md b/doc/io/mongodb/index.md new file mode 100644 index 00000000..deef1bf8 --- /dev/null +++ b/doc/io/mongodb/index.md @@ -0,0 +1,13 @@ +(mongodb)= +# MongoDB I/O Subsystem + +## About +Using the MongoDB subsystem, you can transfer data from and to MongoDB. + + +```{toctree} +:maxdepth: 1 + +loader +migr8 +``` diff --git a/doc/io/mongodb/loader.md b/doc/io/mongodb/loader.md new file mode 100644 index 00000000..369dd5a2 --- /dev/null +++ b/doc/io/mongodb/loader.md @@ -0,0 +1,58 @@ +(mongodb-loader)= +# MongoDB Table Loader + +## About +Load data from MongoDB into CrateDB using a one-stop command +`ctk load table mongodb://...`, in order to facilitate convenient +data transfers to be used within data pipelines or ad hoc operations. + +## Install +```shell +pip install --upgrade 'cratedb-toolkit[mongodb]' +``` + +## Example +Import two data points into MongoDB. + +```shell +mongosh mongodb://localhost:27017/testdrive <=1", ] mongodb = [ - "cr8", + "cratedb-toolkit[io]", "orjson<4,>=3.3.1", "pymongo<5,>=3.10.1", "python-bsonjs<0.5", @@ -221,7 +222,7 @@ markers = [ [tool.ruff] line-length = 120 -select = [ +lint.select = [ # Bandit "S", # Bugbear @@ -249,7 +250,7 @@ select = [ "RET", ] -extend-ignore = [ +lint.extend-ignore = [ # zip() without an explicit strict= parameter "B905", # df is a bad variable name. Be kinder to your future self. @@ -267,7 +268,7 @@ extend-exclude = [ "workbench.py", ] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "doc/conf.py" = ["A001", "ERA001"] "tests/*" = ["S101"] # Allow use of `assert`, and `print`. "examples/*" = ["T201"] # Allow `print` @@ -303,12 +304,12 @@ format = [ { cmd = "black ." }, # Configure Ruff not to auto-fix (remove!): # unused imports (F401), unused variables (F841), `print` statements (T201), and commented-out code (ERA001). - { cmd = "ruff --fix --ignore=ERA --ignore=F401 --ignore=F841 --ignore=T20 --ignore=ERA001 ." 
}, + { cmd = "ruff check --fix --ignore=ERA --ignore=F401 --ignore=F841 --ignore=T20 --ignore=ERA001 ." }, { cmd = "pyproject-fmt --keep-full-version pyproject.toml" }, ] lint = [ - { cmd = "ruff ." }, + { cmd = "ruff check ." }, { cmd = "black --check ." }, { cmd = "validate-pyproject pyproject.toml" }, { cmd = "mypy" },