From 5d40a6c7ee4e3f3b7fc708fb38e4b13a7173e0a5 Mon Sep 17 00:00:00 2001
From: Andreas Motl <andreas.motl@crate.io>
Date: Wed, 8 May 2024 02:46:49 +0200
Subject: [PATCH] Documentation: Improve structure and layout

---
 README.md                                     |  7 +-
 cratedb_toolkit/datasets/README.md            | 20 -----
 doc/datasets.md                               | 66 +++++++++++++++
 doc/index.md                                  | 10 +--
 .../io/README.md => doc/io/index.md           | 84 ++++---------------
 doc/io/influxdb/index.md                      | 12 +++
 doc/io/influxdb/loader.md                     | 52 ++++++++++++
 doc/io/mongodb/index.md                       | 13 +++
 doc/io/mongodb/loader.md                      | 58 +++++++++++++
 .../README.md => doc/io/mongodb/migr8.md      | 14 ++--
 doc/retention.md                              |  4 +-
 doc/sandbox.md                                |  2 +-
 pyproject.toml                                | 13 +--
 13 files changed, 240 insertions(+), 115 deletions(-)
 delete mode 100644 cratedb_toolkit/datasets/README.md
 create mode 100644 doc/datasets.md
 rename cratedb_toolkit/io/README.md => doc/io/index.md (52%)
 create mode 100644 doc/io/influxdb/index.md
 create mode 100644 doc/io/influxdb/loader.md
 create mode 100644 doc/io/mongodb/index.md
 create mode 100644 doc/io/mongodb/loader.md
 rename cratedb_toolkit/io/mongodb/README.md => doc/io/mongodb/migr8.md (96%)

diff --git a/README.md b/README.md
index 153764a7..51403098 100644
--- a/README.md
+++ b/README.md
@@ -41,7 +41,7 @@ Contributions of all kinds are much welcome, in order to make it more solid,
 and to add features.
 
 Breaking changes should be expected until a 1.0 release, so version pinning is
-strongly recommended, especially when you use it as a library.
+strongly recommended, especially when using it as a library.
 
 
 ## Install
@@ -53,12 +53,13 @@ pip install --upgrade cratedb-toolkit
 
 Verify installation.
 ```shell
-cratedb-toolkit --version
+ctk --version
 ```
 
 Run with Docker.
 ```shell
-docker run --rm "ghcr.io/crate-workbench/cratedb-toolkit" cratedb-toolkit --version
+alias ctk='docker run --rm "ghcr.io/crate-workbench/cratedb-toolkit" ctk'
+ctk --version
 ```
 
 
diff --git a/cratedb_toolkit/datasets/README.md b/cratedb_toolkit/datasets/README.md
deleted file mode 100644
index 3a0f7ab4..00000000
--- a/cratedb_toolkit/datasets/README.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# CrateDB Datasets API
-
-Provide access to datasets, to be easily consumed by tutorials
-and/or production applications.
-
-## Synopsis
-
-```python
-from cratedb_toolkit.datasets import load_dataset
-
-dataset = load_dataset("tutorial/weather-basic")
-dataset.dbtable(dburi="crate://crate@localhost/", table="weather_data").load()
-```
-
-```python
-from cratedb_toolkit.datasets import load_dataset
-
-dataset = load_dataset("kaggle://guillemservera/global-daily-climate-data/daily_weather.parquet")
-dataset.dbtable(dburi="crate://crate@localhost/", table="kaggle_daily_weather").create()
-```
diff --git a/doc/datasets.md b/doc/datasets.md
new file mode 100644
index 00000000..8d531261
--- /dev/null
+++ b/doc/datasets.md
@@ -0,0 +1,66 @@
+# Datasets API
+
+Provide access to datasets, to be easily consumed by tutorials
+and/or production applications.
+
+## Install
+```shell
+pip install --upgrade 'cratedb-toolkit[datasets]'
+```
+
+## Synopsis
+
+```python
+from cratedb_toolkit.datasets import load_dataset
+
+dataset = load_dataset("tutorial/weather-basic")
+print(dataset.ddl)
+```
+
+## Usage
+
+### Built-in datasets
+Load an example dataset into a CrateDB database table.
+```python
+from cratedb_toolkit.datasets import load_dataset
+
+dataset = load_dataset("tutorial/weather-basic")
+dataset.dbtable(dburi="crate://crate@localhost/", table="weather_data").load()
+```
+
+### Kaggle
+For accessing datasets on Kaggle, you will need an account on their platform.
+
+#### Authentication
+Either create a configuration file `~/.kaggle/kaggle.json` in JSON format,
+```json
+{"username":"acme","key":"134af98bdb0bd0fa92078d9c37ac8f78"}
+```
+or, alternatively, use these environment variables.
+```shell
+export KAGGLE_USERNAME=acme
+export KAGGLE_KEY=134af98bdb0bd0fa92078d9c37ac8f78
+```
+
+#### Acquisition
+Load a dataset from Kaggle into a CrateDB database table.
+```python
+from cratedb_toolkit.datasets import load_dataset
+
+dataset = load_dataset("kaggle://guillemservera/global-daily-climate-data/daily_weather.parquet")
+dataset.dbtable(dburi="crate://crate@localhost/", table="kaggle_daily_weather").load()
+```
+
+
+## In Practice
+
+Please refer to these notebooks to learn how `load_dataset` works in practice.
+
+- [How to Build Time Series Applications in CrateDB]
+- [Exploratory data analysis with CrateDB]
+- [Time series decomposition with CrateDB]
+
+
+[Exploratory data analysis with CrateDB]: https://github.com/crate/cratedb-examples/blob/main/topic/timeseries/exploratory_data_analysis.ipynb
+[How to Build Time Series Applications in CrateDB]: https://github.com/crate/cratedb-examples/blob/main/topic/timeseries/dask-weather-data-import.ipynb
+[Time series decomposition with CrateDB]: https://github.com/crate/cratedb-examples/blob/main/topic/timeseries/time-series-decomposition.ipynb
diff --git a/doc/index.md b/doc/index.md
index b410690c..07b04889 100644
--- a/doc/index.md
+++ b/doc/index.md
@@ -89,7 +89,9 @@ about a possible feature.
 :hidden:
 
 install
+io/index
 retention
+datasets
 ```
 
 ```{toctree}
@@ -103,14 +105,6 @@ backlog
 ```
 
 
-
-[Changelog]: https://github.com/crate-workbench/cratedb-toolkit/blob/main/CHANGES.md
-[development documentation]: https://cratedb-toolkit.readthedocs.io/sandbox.html
-[Documentation]: https://cratedb-toolkit.readthedocs.io/
-[Issues]: https://github.com/crate-workbench/cratedb-toolkit/issues
-[License]: https://github.com/crate-workbench/cratedb-toolkit/blob/main/LICENSE
-[PyPI]: https://pypi.org/project/cratedb-toolkit/
-[Source code]: https://github.com/crate-workbench/cratedb-toolkit
 [cratedb-toolkit]: https://cratedb-toolkit.readthedocs.io/
 [influxio]: https://influxio.readthedocs.io/
 
diff --git a/cratedb_toolkit/io/README.md b/doc/io/index.md
similarity index 52%
rename from cratedb_toolkit/io/README.md
rename to doc/io/index.md
index 0898c33b..82822d54 100644
--- a/cratedb_toolkit/io/README.md
+++ b/doc/io/index.md
@@ -1,9 +1,13 @@
-# Load and extract data into/from CrateDB
+# I/O Subsystem
 
+Load and extract data into/from CrateDB.
 
 ## About
+Using the InfluxDB and MongoDB I/O subsystems, you can transfer data from
+[InfluxDB] and [MongoDB] to [CrateDB] and [CrateDB Cloud].
 
-A one-stop command `ctk load table` to load data into CrateDB database tables.
+## What's inside
+A one-stop command `ctk load table` to load data into database tables.
 
 
 ## Installation
@@ -78,76 +82,16 @@ ctk shell --command="SELECT * FROM data_weather LIMIT 10;" --format=json
 - Exercise data imports from AWS S3 and other Object Storage providers.
 
 
-## InfluxDB
+```{toctree}
+:maxdepth: 2
+:hidden:
 
-Using the adapter to [influxio], you can transfer data from InfluxDB to CrateDB.
-
-Import two data points into InfluxDB.
-```shell
-export INFLUX_ORG=example
-export INFLUX_TOKEN=token
-export INFLUX_BUCKET_NAME=testdrive
-export INFLUX_MEASUREMENT=demo
-influx bucket create
-influx write --precision=s "${INFLUX_MEASUREMENT},region=amazonas temperature=42.42,humidity=84.84 1556896326"
-influx write --precision=s "${INFLUX_MEASUREMENT},region=amazonas temperature=45.89,humidity=77.23,windspeed=5.4 1556896327"
-influx query "from(bucket:\"${INFLUX_BUCKET_NAME}\") |> range(start:-100y)"
-```
-
-Transfer data.
-```shell
-export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo
-ctk load table influxdb2://example:token@localhost:8086/testdrive/demo
-crash --command "SELECT * FROM testdrive.demo;"
-```
-
-Todo: More convenient table querying.
-```shell
-export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo
-ctk shell --command "SELECT * FROM testdrive.demo;"
-ctk show table "testdrive.demo"
-```
-
-
-## MongoDB
-
-Using the MongoDB subsystem, you can transfer data from MongoDB to CrateDB.
-
-Import two data points into MongoDB.
-```shell
-mongosh mongodb://localhost:27017/testdrive <<EOF
-db.demo.remove({})
-db.demo.insertMany([
-  {
-    timestamp: new Date(1556896326),
-    region: "amazonas",
-    temperature: 42.42,
-    humidity: 84.84,
-  },
-  {
-    timestamp: new Date(1556896327),
-    region: "amazonas",
-    temperature: 45.89,
-    humidity: 77.23,
-    windspeed: 5.4,
-  },
-])
-db.demo.find({})
-EOF
-```
-
-Todo: Use `mongoimport`.
-```shell
-mongoimport --uri 'mongodb+srv://MYUSERNAME:SECRETPASSWORD@mycluster-ABCDE.azure.mongodb.net/test?retryWrites=true&w=majority'
-```
-
-Transfer data.
-```shell
-export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo
-ctk load table mongodb://localhost:27017/testdrive/demo
-crash --command "SELECT * FROM testdrive.demo;"
+InfluxDB <influxdb/index>
+MongoDB <mongodb/index>
 ```
 
 
+[CrateDB]: https://github.com/crate/crate
 [CrateDB Cloud]: https://console.cratedb.cloud/
-[influxio]: https://github.com/daq-tools/influxio
+[InfluxDB]: https://github.com/influxdata/influxdb
+[MongoDB]: https://github.com/mongodb/mongo
diff --git a/doc/io/influxdb/index.md b/doc/io/influxdb/index.md
new file mode 100644
index 00000000..759d252f
--- /dev/null
+++ b/doc/io/influxdb/index.md
@@ -0,0 +1,12 @@
+(influxdb)=
+# InfluxDB I/O Subsystem
+
+## About
+Import and export data into/from InfluxDB, for humans and machines.
+
+
+```{toctree}
+:maxdepth: 1
+
+loader
+```
diff --git a/doc/io/influxdb/loader.md b/doc/io/influxdb/loader.md
new file mode 100644
index 00000000..803bebac
--- /dev/null
+++ b/doc/io/influxdb/loader.md
@@ -0,0 +1,52 @@
+(influxdb-loader)=
+# InfluxDB Table Loader
+
+## About
+Load data from InfluxDB into CrateDB using a one-stop command
+`ctk load table influxdb2://...`, in order to facilitate convenient
+data transfers to be used within data pipelines or ad hoc operations.
+
+## Details
+The InfluxDB table loader is based on the [influxio] package. Please also check
+its documentation to learn more about its capabilities, which support you when
+working with InfluxDB.
+
+## Install
+```shell
+pip install --upgrade 'cratedb-toolkit[influxdb]'
+```
+
+## Example
+Import two data points into InfluxDB.
+
+```shell
+export INFLUX_ORG=example
+export INFLUX_TOKEN=token
+export INFLUX_BUCKET_NAME=testdrive
+export INFLUX_MEASUREMENT=demo
+influx bucket create
+influx write --precision=s "${INFLUX_MEASUREMENT},region=amazonas temperature=42.42,humidity=84.84 1556896326"
+influx write --precision=s "${INFLUX_MEASUREMENT},region=amazonas temperature=45.89,humidity=77.23,windspeed=5.4 1556896327"
+influx query "from(bucket:\"${INFLUX_BUCKET_NAME}\") |> range(start:-100y)"
+```
+
+Transfer data from InfluxDB bucket/measurement into CrateDB schema/table.
+```shell
+export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo
+ctk load table influxdb2://example:token@localhost:8086/testdrive/demo
+crash --command "SELECT * FROM testdrive.demo;"
+```
+
+Query data in CrateDB.
+```shell
+export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo
+ctk shell --command "SELECT * FROM testdrive.demo;"
+ctk show table "testdrive.demo"
+```
+
+:::{todo}
+- More convenient table querying.
+:::
+
+
+[influxio]: inv:influxio:*:label#index
diff --git a/doc/io/mongodb/index.md b/doc/io/mongodb/index.md
new file mode 100644
index 00000000..deef1bf8
--- /dev/null
+++ b/doc/io/mongodb/index.md
@@ -0,0 +1,13 @@
+(mongodb)=
+# MongoDB I/O Subsystem
+
+## About
+Using the MongoDB subsystem, you can transfer data from and to MongoDB.
+
+
+```{toctree}
+:maxdepth: 1
+
+loader
+migr8
+```
diff --git a/doc/io/mongodb/loader.md b/doc/io/mongodb/loader.md
new file mode 100644
index 00000000..369dd5a2
--- /dev/null
+++ b/doc/io/mongodb/loader.md
@@ -0,0 +1,58 @@
+(mongodb-loader)=
+# MongoDB Table Loader
+
+## About
+Load data from MongoDB into CrateDB using a one-stop command
+`ctk load table mongodb://...`, in order to facilitate convenient
+data transfers to be used within data pipelines or ad hoc operations.
+
+## Install
+```shell
+pip install --upgrade 'cratedb-toolkit[mongodb]'
+```
+
+## Example
+Import two data points into MongoDB.
+
+```shell
+mongosh mongodb://localhost:27017/testdrive <<EOF
+db.demo.remove({})
+db.demo.insertMany([
+  {
+    timestamp: new Date(1556896326000),
+    region: "amazonas",
+    temperature: 42.42,
+    humidity: 84.84,
+  },
+  {
+    timestamp: new Date(1556896327000),
+    region: "amazonas",
+    temperature: 45.89,
+    humidity: 77.23,
+    windspeed: 5.4,
+  },
+])
+db.demo.find({})
+EOF
+```
+
+Transfer data from MongoDB database/collection into CrateDB schema/table.
+```shell
+export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo
+ctk load table mongodb://localhost:27017/testdrive/demo
+```
+
+Query data in CrateDB.
+```shell
+export CRATEDB_SQLALCHEMY_URL=crate://crate@localhost:4200/testdrive/demo
+ctk shell --command "SELECT * FROM testdrive.demo;"
+ctk show table "testdrive.demo"
+```
+
+
+:::{todo}
+Use `mongoimport`.
+```shell
+mongoimport --uri 'mongodb+srv://MYUSERNAME:SECRETPASSWORD@mycluster-ABCDE.azure.mongodb.net/test?retryWrites=true&w=majority'
+```
+:::
diff --git a/cratedb_toolkit/io/mongodb/README.md b/doc/io/mongodb/migr8.md
similarity index 96%
rename from cratedb_toolkit/io/mongodb/README.md
rename to doc/io/mongodb/migr8.md
index 907ba16c..ecbc48f8 100644
--- a/cratedb_toolkit/io/mongodb/README.md
+++ b/doc/io/mongodb/migr8.md
@@ -1,13 +1,15 @@
-# MongoDB → CrateDB Migration Tool
+(migr8)=
+# migr8
+
+## About
 
 A utility program, called `migr8`, supporting data migrations
 between MongoDB and CrateDB.
 
-A one-stop command `ctk load table mongodb://...`, wrapping the `migr8`
-steps into a complete pipeline, to facilitate convenient data transfers.
-
-
-## About
+:::{tip}
+Please also visit the documentation about the [](#mongodb-loader)
+to learn about a higher-level interface.
+:::
 
 ### Details
 
diff --git a/doc/retention.md b/doc/retention.md
index 3d23c8e9..b0a6c235 100644
--- a/doc/retention.md
+++ b/doc/retention.md
@@ -3,7 +3,9 @@
 ## About
 
 A data retention and expiration management subsystem for CrateDB, implementing
-different retention strategies.
+multiple strategies.
+
+### Details
 
 The application manages the life-cycle of data stored in CrateDB, handling
 concerns of data expiry, size reduction, and archival. Within a system storing
diff --git a/doc/sandbox.md b/doc/sandbox.md
index d6f58caf..45a355f3 100644
--- a/doc/sandbox.md
+++ b/doc/sandbox.md
@@ -18,7 +18,7 @@ source .venv/bin/activate
 
 Install project in sandbox mode.
 ```shell
-pip install --editable='.[all,develop,test]'
+pip install --editable='.[all,develop,docs,test]'
 ```
 
 Run tests. `TC_KEEPALIVE` keeps the auxiliary service containers running, which
diff --git a/pyproject.toml b/pyproject.toml
index f761841e..23d9b1a4 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -127,6 +127,7 @@ docs = [
   "sphinxext-opengraph<1",
 ]
 influxdb = [
+  "cratedb-toolkit[io]",
   "influxio==0.2.0",
 ]
 io = [
@@ -135,7 +136,7 @@ io = [
   "pandas<3,>=1",
 ]
 mongodb = [
-  "cr8",
+  "cratedb-toolkit[io]",
   "orjson<4,>=3.3.1",
   "pymongo<5,>=3.10.1",
   "python-bsonjs<0.5",
@@ -221,7 +222,7 @@ markers = [
 [tool.ruff]
 line-length = 120
 
-select = [
+lint.select = [
   # Bandit
   "S",
   # Bugbear
@@ -249,7 +250,7 @@ select = [
   "RET",
 ]
 
-extend-ignore = [
+lint.extend-ignore = [
   # zip() without an explicit strict= parameter
   "B905",
   # df is a bad variable name. Be kinder to your future self.
@@ -267,7 +268,7 @@ extend-exclude = [
   "workbench.py",
 ]
 
-[tool.ruff.per-file-ignores]
+[tool.ruff.lint.per-file-ignores]
 "doc/conf.py" = ["A001", "ERA001"]
 "tests/*" = ["S101"]  # Allow use of `assert`, and `print`.
 "examples/*" = ["T201"]  # Allow `print`
@@ -303,12 +304,12 @@ format = [
   { cmd = "black ." },
   # Configure Ruff not to auto-fix (remove!):
   # unused imports (F401), unused variables (F841), `print` statements (T201), and commented-out code (ERA001).
-  { cmd = "ruff --fix --ignore=ERA --ignore=F401 --ignore=F841 --ignore=T20 --ignore=ERA001 ." },
+  { cmd = "ruff check --fix --ignore=ERA --ignore=F401 --ignore=F841 --ignore=T20 --ignore=ERA001 ." },
   { cmd = "pyproject-fmt --keep-full-version pyproject.toml" },
 ]
 
 lint = [
-  { cmd = "ruff ." },
+  { cmd = "ruff check ." },
   { cmd = "black --check ." },
   { cmd = "validate-pyproject pyproject.toml" },
   { cmd = "mypy" },