From f3f8b03978b1bd2e6a00fdcc37964e9b761f8805 Mon Sep 17 00:00:00 2001 From: aandrusenko Date: Wed, 23 Sep 2020 04:46:41 +0300 Subject: [PATCH] add example with python parameters file --- .../docs/command-reference/params/index.md | 100 +++++++++++++++++- content/docs/command-reference/run.md | 4 +- content/docs/start/experiments.md | 2 +- .../user-guide/basic-concepts/parameter.md | 4 +- .../user-guide/dvc-files-and-directories.md | 2 +- 5 files changed, 102 insertions(+), 10 deletions(-) diff --git a/content/docs/command-reference/params/index.md b/content/docs/command-reference/params/index.md index a0cb741bd72..cbdae24c821 100644 --- a/content/docs/command-reference/params/index.md +++ b/content/docs/command-reference/params/index.md @@ -24,8 +24,8 @@ dependencies: _parameters_. Parameters are defined using the the `-p` In contrast to a regular dependency, a parameter is not a file (or directory). Instead, it consists of a _parameter name_ (or key) to find inside a -YAML, JSON, or TOML _parameters file_. Multiple parameter dependencies can be -specified from one or more parameters files. +YAML, JSON, TOML, or Python _parameters file_. Multiple parameter dependencies +can be specified from one or more parameters files. The default parameters file name is `params.yaml`. Parameters should be organized as a tree hierarchy inside, as DVC will locate param names by their @@ -91,8 +91,8 @@ $ dvc run -n train -d users.csv -o model.pkl \ python train.py ``` -> Note that we could use the same parameter addressing with JSON or TOML -> parameters files. +> Note that we could use the same parameter addressing with JSON, TOML, or +> Python parameters files. The `train.py` script will have some code to parse the needed parameters. 
For example: @@ -143,6 +143,98 @@ $ dvc run -n train -d logs/ -o users.csv \ python train.py ``` +## Examples: Python parameters file + +Consider this parameters file in Python format, named `params.py`: + +```python +IS_BOOL: bool = True +CONST = 5 + +# All standard variable types are supported +FLOAT = 0.001 +STR = 'abc' +DICT = { + "a": 1, + "b": 2 +} +LIST = [1, 2, 3] +SET = {4, 5, 6} +TUPLE = (10, 100) +NONE = None + + +# It is possible to retrieve either class constants +# or instance attributes defined in __init__ +class TrainConfig: + EPOCHS = 70 + + def __init__(self): + # TrainConfig.layers param will be 9 + self.layers = 5 + self.layers = 9 + # TrainConfig.foo will NOT be found because of the complex expression + self.foo = 1 + 2 + # TrainConfig.bar will NOT be found + bar = 1 + + +class TestConfig: + TEST_DIR = "path" + METRICS = ["metric"] +``` + +The following [stage](/doc/command-reference/run) depends on params `IS_BOOL`, +`CONST`, as well as `TrainConfig`'s `EPOCHS` and `layers`: + +```dvc +$ dvc run -n train -d users.csv -o model.pkl \ + -p params.py:IS_BOOL,CONST,TrainConfig.EPOCHS,TrainConfig.layers \ + python train.py +``` + +Resulting `dvc.yaml` and `dvc.lock` files (notice the `params` list): + +```yaml +stages: + train: + cmd: python train.py + deps: + - users.csv + params: + - IS_BOOL + - CONST + - TrainConfig.EPOCHS + - TrainConfig.layers + outs: + - model.pkl +``` + +```yaml +train: + cmd: python train.py + deps: + - path: users.csv + md5: 23be4307b23dcd740763d5fc67993f11 + params: + CONST: 5 + IS_BOOL: true + TrainConfig.EPOCHS: 70 + TrainConfig.layers: 9 + outs: + - path: model.pkl + md5: 1c06b4756f08203cc496e4061b1e7d67 +``` + +Alternatively, the entire `TestConfig` group can be referenced (also a +dictionary), instead of the parameters in it: + +```dvc +$ dvc run -n train -d users.csv -o model.pkl \ + -p params.py:IS_BOOL,CONST,TestConfig \ + python train.py +``` + ## Examples: Print all parameters Following the previous example, we can use
`dvc params diff` to list all of the diff --git a/content/docs/command-reference/run.md b/content/docs/command-reference/run.md index 0d475bca786..a3f0ffd94a1 100644 --- a/content/docs/command-reference/run.md +++ b/content/docs/command-reference/run.md @@ -114,8 +114,8 @@ Relevant notes: [parameters](/doc/command-reference/params) (`-p`/`--params` option) are a special type of key/value dependencies. Multiple parameter dependencies can be -specified from within one or more YAML, JSON or TOML parameters files (e.g. -`params.yaml`). This allows tracking experimental hyperparameters easily. +specified from within one or more YAML, JSON, TOML, or Python parameters files +(e.g. `params.yaml`). This allows tracking experimental hyperparameters easily. Special types of output files, [metrics](/doc/command-reference/metrics) (`-m` and `-M` options) and [plots](/doc/command-reference/plots) (`--plots` and diff --git a/content/docs/start/experiments.md b/content/docs/start/experiments.md index 3d988002914..fde0ff24d25 100644 --- a/content/docs/start/experiments.md +++ b/content/docs/start/experiments.md @@ -100,7 +100,7 @@ parameters. It's pretty common for data science pipelines to include configuration files that define adjustable parameters to train a model, do pre-processing, etc. DVC provides a mechanism for stages to depend on the values of specific sections of -such a config file (YAML, JSON and TOML formats are supported). +such a config file (YAML, JSON, TOML, and Python formats are supported). 
Luckily, we should already have a stage with [parameters](/doc/command-reference/params) in `dvc.yaml`: diff --git a/content/docs/user-guide/basic-concepts/parameter.md b/content/docs/user-guide/basic-concepts/parameter.md index c2b2e011393..0dd2a90f44a 100644 --- a/content/docs/user-guide/basic-concepts/parameter.md +++ b/content/docs/user-guide/basic-concepts/parameter.md @@ -4,5 +4,5 @@ match: [parameter, parameters, param, params, hyperparameter, hyperparameters] --- Pipeline stages (defined in `dvc.yaml`) can depend on specific values inside an -arbitrary YAML, JSON, or TOML file (`params.yaml` by default). Stages are -invalidated when any of their parameter values change. See `dvc param`. +arbitrary YAML, JSON, TOML, or Python file (`params.yaml` by default). Stages +are invalidated when any of their parameter values change. See `dvc params`. diff --git a/content/docs/user-guide/dvc-files-and-directories.md b/content/docs/user-guide/dvc-files-and-directories.md index ba2cbcb0b96..b3f2bb0d6d2 100644 --- a/content/docs/user-guide/dvc-files-and-directories.md +++ b/content/docs/user-guide/dvc-files-and-directories.md @@ -150,7 +150,7 @@ the possible following fields: - `deps`: List of dependency file or directory paths of this stage (relative to `wdir` which defaults to the file's location) - `params`: List of parameter dependency keys (field names) that - are read from a YAML, JSON, or TOML file (`params.yaml` by default). + are read from a YAML, JSON, TOML, or Python file (`params.yaml` by default). - `outs`: List of output file or directory paths of this stage (relative to `wdir` which defaults to the file's location), and optionally, whether or not this file or directory is cached (`true` by