diff --git a/content/docs/command-reference/params/index.md b/content/docs/command-reference/params/index.md index 992a753630..9fb513da23 100644 --- a/content/docs/command-reference/params/index.md +++ b/content/docs/command-reference/params/index.md @@ -69,8 +69,8 @@ The `dvc params diff` command is available to show parameter changes, displaying their current and previous values. 💡 Parameters can also be used for -[templating](/doc/user-guide/dvc-files/advanced-dvc.yaml#templating) `dvc.yaml` -itself. +[templating](/doc/user-guide/project-structure/pipelines-files#templating) +`dvc.yaml` itself. ## Options diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 4713c94801..720f0f9a14 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -89,34 +89,33 @@ "children": [ { "label": "What is DVC?", - "slug": "what-is-dvc", - "source": "what-is-dvc.md" + "slug": "what-is-dvc" }, { - "label": "DVC Files", - "slug": "dvc-files", + "slug": "project-structure", + "source": "user-guide/project-structure/index.md", "children": [ { - "label": ".dvc files", - "slug": ".dvc" + "label": "Pipelines Files (dvc.yaml)", + "slug": "pipelines-files" }, { - "label": "dvc.yaml & dvc.lock", - "slug": "dvc-yaml" + "label": ".dvc Files", + "slug": "dvc-files" }, { - "label": "Advanced dvc.yaml", - "slug": "advanced-dvc-yaml" + "label": ".dvcignore Files", + "slug": "dvcignore-files", + "tutorials": { + "katacoda": "https://katacoda.com/dvc/courses/examples/dvcignore" + } + }, + { + "label": "Internal Files", + "slug": "internal-files" } ] }, - "dvc-internals", - { - "slug": "dvcignore", - "tutorials": { - "katacoda": "https://katacoda.com/dvc/courses/examples/dvcignore" - } - }, { "label": "How To", "slug": "how-to", diff --git a/content/docs/start/data-versioning.md b/content/docs/start/data-versioning.md index 93c3ec3ce8..aa3d7f22b4 100644 --- a/content/docs/start/data-versioning.md +++ b/content/docs/start/data-versioning.md @@ -50,9 +50,9 @@ $ dvc 
add data/data.xml DVC stores information about the added file (or a directory) in a special `.dvc` file named `data/data.xml.dvc`, a small text file with a human-readable -[format](/doc/user-guide/dvc-files/.dvc). This file can be easily versioned like -source code with Git, as a placeholder for the original data (which gets listed -in `.gitignore`): +[format](/doc/user-guide/project-structure/dvc-files). This file can be easily +versioned like source code with Git, as a placeholder for the original data +(which gets listed in `.gitignore`): ```dvc $ git add data/data.xml.dvc data/.gitignore diff --git a/content/docs/use-cases/versioning-data-and-model-files/tutorial.md b/content/docs/use-cases/versioning-data-and-model-files/tutorial.md index 524a2b9065..493800128e 100644 --- a/content/docs/use-cases/versioning-data-and-model-files/tutorial.md +++ b/content/docs/use-cases/versioning-data-and-model-files/tutorial.md @@ -173,8 +173,8 @@ then `git commit` `.dvc` files that contain file hashes that point to cached data. In this case we created `data.dvc` and `model.h5.dvc`. Refer to -[DVC Files](/doc/user-guide/dvc-files#dvc-files) to learn more about how these -files work. +[DVC Files](/doc/user-guide/project-structure/dvc-files) to learn more about how +these files work. diff --git a/content/docs/user-guide/dvc-files.md b/content/docs/user-guide/dvc-files.md deleted file mode 100644 index d0974712c3..0000000000 --- a/content/docs/user-guide/dvc-files.md +++ /dev/null @@ -1,257 +0,0 @@ -# DVC Files - -There are a few special DVC file formats that enable its features: - -- Files ending with the `.dvc` extension ("dot DVC files") are placeholders to - track data files and directories. A DVC project usually has one - `.dvc` file per large data file or directory being tracked. -- `dvc.yaml` files (or _pipelines files_) specify stages that form the - pipeline(s) of a project, and how they connect (_dependency graph_ or DAG). 
- - These normally have a matching `dvc.lock` file to record the pipeline state - and track its outputs. - -Both `.dvc` files and `dvc.yaml` use human-friendly YAML 1.2 schemas, described -below. We encourage you to get familiar with them so you may create, generate, -and edit them on your own. - -These metafiles should be versioned with Git (in Git-enabled -repositories). - -See the [internals guide](/doc/user-guide/dvc-internals) for the contents of the -`.dvc/` directory. - -> See also [`.dvcignore`](/doc/user-guide/dvcignore). - -## `.dvc` files - -When you add a file or directory to a DVC project with `dvc add`, -`dvc import`, or `dvc import-url`, a `.dvc` file is created based on the data -file name (e.g. `data.xml.dvc`). These files contain the information needed to -track the data with DVC. - -They use a simple [YAML](https://yaml.org/) format, meant to be easy to read, -edit, or even created manually. Here is a sample: - -```yaml -outs: - - md5: a304afb96060aad90176268345e10355 - path: data.xml - desc: Cats and dogs dataset - -# Comments and user metadata are supported. -meta: - name: 'John Doe' - email: john@doe.com -``` - -`.dvc` files can contain the following fields: - -- `outs` (always present): List of [output entries](#output-entries) (details - below) that represent the files or directories tracked with DVC. Typically - there is only one (but several can be added or combined manually). -- `deps`: List of [dependency entries](#dependency-entries) (details below). - Only present when `dvc import` or `dvc import-url` are used to generate this - `.dvc` file. Typically there is only one (but several can be added manually). -- `wdir`: Working directory for the `outs` and `deps` paths (relative to the - `.dvc` file's location). It defaults to `.` (the file's location). -- `md5`: (only for imports) MD5 hash of the import `.dvc` file - itself. -- `meta` (optional): Arbitrary metadata can be added manually with this field. - Any YAML contents is supported. 
`meta` contents are ignored by DVC, but they - can be meaningful for user processes that read `.dvc` files. - -Note that comments can be added to `.dvc` files using the `# comment` syntax. -`meta` fields and `#` comments are preserved among executions of the `dvc repro` -and `dvc commit` commands, but not when a `.dvc` file is overwritten by -`dvc add`, `dvc move`, `dvc import`, or `dvc import-url`. - -### Output entries - -`outs` fields can contain these subfields: - -- `path`: Path to the file or directory (relative to `wdir`, which defaults to - the file's location) -- `md5`, `etag`, or `checksum`: Hash value for the file or directory being - tracked with DVC. MD5 is used for most locations (local file system and SSH); - [ETag](https://en.wikipedia.org/wiki/HTTP_ETag#Strong_and_weak_validation) for - HTTP, S3, or Azure [external outputs](/doc/user-guide/managing-external-data); - and a special _checksum_ for HDFS and WebHDFS. -- `size`: Size of the file or directory (sum of all files). -- `nfiles`: If this output is a directory, the number of files inside - (recursive). -- `isexec`: Whether this is an executable file. DVC preserves execute - permissions upon `dvc checkout` and `dvc pull`. This has no effect on - directories, or in general on Windows. -- `cache`: Whether or not this file or directory is cached (`true` - by default). See the `--no-commit` option of `dvc add`. -- `persist`: Whether the output file/dir should remain in place while - `dvc repro` runs (`false` by default: outputs are deleted when `dvc repro` - starts -- `desc` (optional): User description for this output (supported in metrics and - plots too). This doesn't affect any DVC operations. - -### Dependency entries - -`deps` fields can contain these subfields: - -- `path`: Path to the dependency (relative to `wdir`, which defaults to the - file's location) -- `md5`, `etag`, or `checksum`: Hash value for the file or directory being - tracked with DVC. 
MD5 is used for most locations (local file system and SSH); - [ETag](https://en.wikipedia.org/wiki/HTTP_ETag#Strong_and_weak_validation) for - HTTP, S3, or Azure external dependencies; and a special - _checksum_ for HDFS and WebHDFS. See `dvc import-url` for more information. -- `size`: Size of the file or directory (sum of all files). -- `nfiles`: If this dependency is a directory, the number of files inside - (recursive). -- `repo`: This entry is only for external dependencies created with - `dvc import`, and can contains the following fields: - - - `url`: URL of Git repository with source DVC project - - `rev`: Only present when the `--rev` option of `dvc import` is used. - Specific commit hash, branch or tag name, etc. (a - [Git revision](https://git-scm.com/docs/revisions)) used to import the - dependency from. - - `rev_lock`: Git commit hash of the external DVC repository at - the time of importing or updating the dependency (with `dvc update`) - -## `dvc.yaml` file - -`dvc.yaml` files describe data science or machine learning pipelines, similar to -how [Makefiles](https://www.gnu.org/software/make/manual/make.html#Introduction) -work for building software. Its YAML structure contains a list of stages which -can be written manually or generated by user code. - -> A helper command, `dvc run`, is also available to add or update stages in -> `dvc.yaml`. Additionally, a `dvc.lock` file is also created or updated by -> `dvc run` and `dvc repro`, to record the pipeline state. 
- -Here's a comprehensive `dvc.yaml` example: - -```yaml -stages: - features: - cmd: jupyter nbconvert --execute featurize.ipynb - deps: - - data/clean - params: - - levels.no - outs: - - features - metrics: - - performance.json - training: - desc: Train model with Python - cmd: - - pip install -r requirements.txt - - python train.py --out ${model_file} - deps: - - requirements.txt - - train.py - - features - outs: - - model.pkl: - desc: My model description - plots: - - logs.csv: - x: epoch - x_label: Epoch - - meta: 'For deployment' - # User metadata (per stage) and comments are supported. -``` - -`dvc.yaml` files consists of a group of `stages` with names provided explicitly -by the user with the `--name` (`-n`) option of `dvc run`. Each stage can contain -the following fields: - -- `cmd` (always present): One or more commands executed by the stage (may - contain either a single value, or a list). Commands are executed sequentially - until all are finished or until one of them fails (see `dvc repro` for - details). -- `wdir`: Working directory for the stage command to run in (relative to the - file's location). It defaults to `.` (the file's location). -- `deps`: List of dependency file or directory paths of this stage - (relative to `wdir`). -- `params`: List of parameter dependency keys (field names) to - track in `params.yaml`. The list may also contain other YAML, JSON, TOML, or - Python file names, with a sub-list of the param names to track in them. -- `outs`: List of output file or directory paths of this stage - (relative to `wdir`). See [Output entries](#output-entries) above for more - details. -- `metrics`: List of [metrics files](/doc/command-reference/metrics), and - optionally, whether or not this metrics file is cached (`true` by - default). See the `--metrics-no-cache` (`-M`) option of `dvc run`. 
-- `plots`: List of [plot metrics](/doc/command-reference/plots), and optionally, - their default configuration (subfields matching the options of - `dvc plots modify`), and whether or not this plots file is cached - ( `true` by default). See the `--plots-no-cache` option of `dvc run`. -- `frozen`: Whether or not this stage is frozen from reproduction -- `always_changed`: Whether or not this stage is considered as changed by - commands such as `dvc status` and `dvc repro`. `false` by default -- `meta` (optional): Arbitrary metadata can be added manually with this field. - Any YAML contents is supported. `meta` contents are ignored by DVC, but they - can be meaningful for user processes that read or write `.dvc` files directly. -- `desc` (optional): User description for this stage. This doesn't affect any - DVC operations. - -`dvc.yaml` files also support `# comments`. - -💡 Keep in mind that there may be more than one `dvc.yaml` files in each -DVC project. DVC checks all of them for consistency during -operations that require rebuilding DAGs (like `dvc dag`). - -Note that we maintain a `dvc.yaml` -[schema](https://github.com/iterative/dvcyaml-schema) that can be used by -editors like [VSCode](/doc/install/plugins#visual-studio-code) or -[PyCharm](/doc/install/plugins#pycharmintellij) to enable automatic syntax -checks and auto-completion. - -### `dvc.lock` file - -For every `dvc.yaml` file, a matching `dvc.lock` (YAML) file usually exists. -It's created or updated by DVC commands such as `dvc run` and `dvc repro`. -`dvc.lock` describes the latest pipeline state. It has several purposes: - -- Tracking of intermediate and final outputs of a pipeline — - similar to [`.dvc` files](#dvc-files). -- Allow DVC to detect when stage definitions, or their dependencies have - changed. Such conditions "invalidate" stages, considering them outdated and - requiring their reproduction (see `dvc status`, `dvc repro`). 
-- `dvc.lock` is needed internally for several DVC commands to operate, such as - `dvc checkout`, `dvc get`, and `dvc import`. - -Here's an example `dvc.lock` based on the one in `dvc.yaml` above: - -```yaml -stages: - features: - cmd: jupyter nbconvert --execute featurize.ipynb - deps: - - path: data/clean - md5: d8b874c5fa18c32b2d67f73606a1be60 - params: - params.yaml: - levels.no: 5 - outs: - - path: features - md5: 2119f7661d49546288b73b5730d76485 - - path: performance.json - md5: ea46c1139d771bfeba7942d1fbb5981e - - path: logs.csv - md5: f99aac37e383b422adc76f5f1fb45004 -``` - -Stage commands are listed again in `dvc.lock`, in order to know when their -definitions change in the `dvc.yaml` file. - -Regular dependencies and all kinds of outputs -(including [metrics](/doc/command-reference/metrics) and -[plots](/doc/command-reference/plots) files) are also listed (per stage) in -`dvc.lock`, but with an additional field to store the hash value of each file or -directory tracked by DVC. Specifically: `md5`, `etag`, or `checksum` (same as in -`deps` and `outs` entries of `.dvc` files). - -Full parameters (key and value) are listed separately under -`params`, grouped by parameters file. diff --git a/content/docs/user-guide/dvc-files/.dvc.md b/content/docs/user-guide/dvc-files/.dvc.md deleted file mode 100644 index dea640c962..0000000000 --- a/content/docs/user-guide/dvc-files/.dvc.md +++ /dev/null @@ -1,94 +0,0 @@ -# `.dvc` files - -When you add a file or directory with `dvc add`, `dvc import`, or -`dvc import-url`, a file ending with the `.dvc` extension ("dot DVC file") is -created based on the data file name (e.g. `data.xml.dvc`). It contain the -information needed to track the data with DVC. - -They use a simple [YAML](https://yaml.org/) format, meant to be easy to read, -edit, or even created manually. 
Here is an example: - -```yaml -outs: - - md5: a304afb96060aad90176268345e10355 - path: data.xml - desc: Cats and dogs dataset - -# Comments and user metadata are supported. -meta: - name: 'John Doe' - email: john@doe.com -``` - -## Accepted fields - -`.dvc` files can contain the following fields: - -- `outs` (always present): List of [output entries](#output-entries) (details - below) that represent the files or directories tracked with DVC. Typically - there is only one (but several can be added or combined manually). -- `deps`: List of [dependency entries](#dependency-entries) (details below). - Only present when `dvc import` or `dvc import-url` are used to generate this - `.dvc` file. Typically there is only one (but several can be added manually). -- `wdir`: Working directory for the `outs` and `deps` paths (relative to the - `.dvc` file's location). It defaults to `.` (the file's location). -- `md5`: (only for imports) MD5 hash of the import `.dvc` file - itself. -- `meta` (optional): Arbitrary metadata can be added manually with this field. - Any YAML contents is supported. `meta` contents are ignored by DVC, but they - can be meaningful for user processes that read `.dvc` files. - -Note that comments can be added to `.dvc` files using the `# comment` format. -`meta` fields and `#` comments are preserved among executions of the `dvc repro` -and `dvc commit` commands, but not when a `.dvc` file is overwritten by -`dvc add`, `dvc move`, `dvc import`, or `dvc import-url`. - -### Output entries - -`outs` fields can contain these subfields: - -- `path`: Path to the file or directory (relative to `wdir`, which defaults to - the file's location) -- `md5`, `etag`, or `checksum`: Hash value for the file or directory being - tracked with DVC. 
MD5 is used for most locations (local file system and SSH); - [ETag](https://en.wikipedia.org/wiki/HTTP_ETag#Strong_and_weak_validation) for - HTTP, S3, or Azure [external outputs](/doc/user-guide/managing-external-data); - and a special _checksum_ for HDFS and WebHDFS. -- `size`: Size of the file or directory (sum of all files). -- `nfiles`: If this output is a directory, the number of files inside - (recursive). -- `isexec`: Whether this is an executable file. DVC preserves execute - permissions upon `dvc checkout` and `dvc pull`. This has no effect on - directories, or in general on Windows. -- `cache`: Whether or not this file or directory is cached (`true` - by default). See the `--no-commit` option of `dvc add`. -- `persist`: Whether the output file/dir should remain in place while - `dvc repro` runs (`false` by default: outputs are deleted when `dvc repro` - starts -- `desc` (optional): User description for this output (supported in metrics and - plots too). This doesn't affect any DVC operations. - -### Dependency entries - -`deps` fields can contain these subfields: - -- `path`: Path to the dependency (relative to `wdir`, which defaults to the - file's location) -- `md5`, `etag`, or `checksum`: Hash value for the file or directory being - tracked with DVC. MD5 is used for most locations (local file system and SSH); - [ETag](https://en.wikipedia.org/wiki/HTTP_ETag#Strong_and_weak_validation) for - HTTP, S3, or Azure external dependencies; and a special - _checksum_ for HDFS and WebHDFS. See `dvc import-url` for more information. -- `size`: Size of the file or directory (sum of all files). -- `nfiles`: If this dependency is a directory, the number of files inside - (recursive). -- `repo`: This entry is only for external dependencies created with - `dvc import`, and can contains the following fields: - - - `url`: URL of Git repository with source DVC project - - `rev`: Only present when the `--rev` option of `dvc import` is used. 
- Specific commit hash, branch or tag name, etc. (a - [Git revision](https://git-scm.com/docs/revisions)) used to import the - dependency from. - - `rev_lock`: Git commit hash of the external DVC repository at - the time of importing or updating the dependency (with `dvc update`) diff --git a/content/docs/user-guide/dvc-files/advanced-dvc-yaml.md b/content/docs/user-guide/dvc-files/advanced-dvc-yaml.md deleted file mode 100644 index b90816663e..0000000000 --- a/content/docs/user-guide/dvc-files/advanced-dvc-yaml.md +++ /dev/null @@ -1,229 +0,0 @@ -# Advanced dvc.yaml Usage - -> ⚠️ These features will be released shortly, in DVC 2.0 ⚠️ - -The following features are supported only via manual edition of `dvc.yaml` files -(`dvc run` cannot currently produce them). - -## Templating - -`dvc.yaml` supports a templating format to insert values from different sources -in the YAML structure itself. The sources can be -[parameters files](/doc/command-reference/params), or `vars` defined in -`dvc.yaml` instead. - -Let's say we have `params.yaml` (default params file) with the following -contents: - -```yaml -models: - us: - threshold: 10 - filename: 'model-us.hdf5' -``` - -Those values can be used anywhere in `dvc.yaml` with the `${}` _substitution -expression_: - - -```yaml -stages: - build-us: - cmd: >- - python train.py - --thresh ${models.us.threshold} - --out ${models.us.filename} - outs: - - ${models.us.filename}: - cache: true -``` - - -DVC will track simple param values (numbers, strings, etc.) used in `${}` (they -will be listed by `dvc params diff`). - -Alternatively, values for substitution can be listed as top-level `vars` like -this: - -```yaml -vars: - - models: - us: - threshold: 10 - - desc: 'Reusable description' - -stages: - build-us: - desc: ${desc} - cmd: python train.py --thresh ${models.us.threshold} -``` - -> DVC merges values from params files and `vars` when possible. 
For example, -> `{"grp": {"a": 1}}` merges with `{"grp": {"b": 2}}`, but not with -> `{"grp": {"a": 7}}`. - -> Note that values from `vars` are not tracked like parameters. - -To load additional params files, list them in the top `vars`, in the desired -order, e.g.: - -> Their paths will be evaluated based on -> [`wdir`](/doc/user-guide/dvc-files/dvc.yaml#accepted-fields), if one given. - -```yaml -vars: - - params.json - - myvar: 'value' - - config/myapp.yaml -``` - -(ℹ️) Note that the default `params.yaml` file is always loaded first, if -present. - -It's also possible to specify what to include from additional params files, with -a `:` colon: - -```yaml -vars: - - params.json:clean,feats - -stages: - featurize: - cmd: ${feats.exec} - deps: - - ${clean.filename} - outs: - - ${feats.dirname} -``` - -Stage-specific values are also supported, with inner `vars`. You may also load -additional params files locally. For example: - -```yaml -stages: - build-us: - vars: - - params.json:build - - model: - filename: 'model-us.hdf5' - cmd: python train.py ${build.epochs} --out ${model.filename} - outs: - - ${model.filename} -``` - -⚠️ Known limitations of local `vars`: - -- [`wdir`](/doc/user-guide/dvc-files/dvc.yaml#accepted-fields) cannot use values - from local `vars`, as DVC uses the working directory first (to load any values - from params files listed in `vars`). -- `foreach` is also incompatible with local `vars` at the moment. - -The substitution expression supports these forms: - -```yaml -${param} # Simple -${param.key} # Nested values through . (period) -${param.list[0]} # List elements via index in [] (square brackets) -``` - -> To use the expression literally in `dvc.yaml`, escape it with a backslash, -> e.g. `\${...`. - -## Generating multiple stages - -You can define more than one stage in a single `dvc.yaml` entry with the -following syntax. 
A `foreach` element accepts a list or dictionary with values -to iterate on, while `do` contains the regular stage fields (`cmd`, `outs`, -etc.). Here's a simple example: - -```yaml -stages: - cleanups: # Multi-stage - foreach: # List of simple values - - raw1 - - labels1 - - raw2 labels2 - do: - cmd: clean.py "${item}" -``` - -Upon `dvc repro`, each item in the list is expanded into its own stage by -substituting its value in expression `${item}`. The item's value is appended to -each stage name after a `@`. The final generated stages are saved to `dvc.lock`: - -```yaml -cleanups@raw2 labels2: - cmd: echo "raw2 labels2" -cleanups@raw1: - cmd: echo "raw1" -cleanups@labels1: - cmd: echo "labels1" -``` - -For lists containing complex values (e.g. dictionaries), the substitution -expression can use the `${item.key}` form. Stage names will be appended with a -zero-based index. For example: - -```yaml -stages: - train: - foreach: - - epochs: 3 - thresh: 10 - - epochs: 10 - thresh: 15 - do: - cmd: python train.py ${item.epochs} ${item.thresh} -``` - -```yaml -# dvc.lock -stages: - train@0: - cmd: python train.py 3 10 - train@1: - cmd: python train.py 10 15 -``` - -DVC can also iterate on a dictionary given directly to `foreach`, resulting in -two substitution expressions being available: `${key}` and `${item}`. The former -is used for the stage names: - -```yaml -stages: - build: - foreach: - uk: - epochs: 3 - thresh: 10 - us: - epochs: 10 - thresh: 15 - do: - cmd: python train.py '${key}' ${item.epochs} ${item.thresh} - outs: - - model-${key}.hdfs -``` - -```yaml -# dvc.lock -stages: - build@uk: - cmd: python train.py 'uk' 3 10 - outs: - - model-uk.hdfs - build@us: ... 
-``` - -Importantly, dictionaries [from parameters](#templating) files can be used in -`foreach` multi-stages as well: - -```yaml -stages: - mystages: - foreach: ${myobject} # From params.yaml - do: - cmd: ./script.py ${key} ${item.prop1} - outs: - - ${item.prop2} -``` diff --git a/content/docs/user-guide/dvc-files/dvc-yaml.md b/content/docs/user-guide/dvc-files/dvc-yaml.md deleted file mode 100644 index 772cefab47..0000000000 --- a/content/docs/user-guide/dvc-files/dvc-yaml.md +++ /dev/null @@ -1,158 +0,0 @@ -# `dvc.yaml` file - -`dvc.yaml` files describe data science or machine learning pipelines (similar to -how [Makefiles](https://www.gnu.org/software/make/manual/make.html#Introduction) -work for building software). Its YAML structure contains a list of stages which -can be written manually or generated by user code. - -> A helper command, `dvc run`, is also available to add or update stages in -> `dvc.yaml`. Additionally, a `dvc.lock` file is also created or updated by -> `dvc run` and `dvc repro`, to record the pipelines' state. - -Here's a comprehensive `dvc.yaml` example: - -```yaml -stages: - features: - cmd: jupyter nbconvert --execute featurize.ipynb - deps: - - data/clean - params: - - levels.no - outs: - - features - metrics: - - performance.json - training: - desc: Train model with Python - cmd: - - pip install -r requirements.txt - - python train.py --out ${model_file} - deps: - - requirements.txt - - train.py - - features - outs: - - ${model_file}: - desc: My model description - plots: - - logs.csv: - x: epoch - x_label: Epoch - meta: 'For deployment' - # User metadata and comments are supported. -``` - -💡 Keep in mind that there may be multiple `dvc.yaml` files in each DVC -project. All of them are checked for consistency during operations that -require rebuilding [DAGs](/doc/command-reference/dag) (like `dvc repro`). 
- -## Accepted fields - -`dvc.yaml` files consists of a set of `stages` with names provided by the user -(for example with the `--name` option of `dvc run`). Each stage entry can -contain the following fields: - -- `cmd` (always present): One or more commands executed by the stage (may - contain either a single value, or a list). Commands are executed sequentially - until all are finished or until one of them fails (see `dvc repro` for - details). -- `wdir`: Working directory for the stage command to run in (relative to the - file's location). Any paths in other fields are also based on this. It - defaults to `.` (the file's location). -- `deps`: List of dependency file or directory paths of this stage - (relative to `wdir`). -- `params`: List of parameter dependency keys (field names) to - track from `params.yaml` (in `wdir`). The list may also contain other YAML, - JSON, TOML, or Python file names, with a sub-list of the param names to track - in them. -- `outs`: List of output file or directory paths of this stage - (relative to `wdir`). See [Output entries](#output-entries) for more details. -- `metrics`: List of [metrics files](/doc/command-reference/metrics), and - optionally, whether or not this metrics file is cached (`true` by - default). See the `--metrics-no-cache` (`-M`) option of `dvc run`. -- `plots`: List of [plot metrics](/doc/command-reference/plots), and optionally, - their default configuration (subfields matching the options of - `dvc plots modify`), and whether or not this plots file is cached - ( `true` by default). See the `--plots-no-cache` option of `dvc run`. -- `frozen`: Whether or not this stage is frozen from reproduction -- `always_changed`: Whether or not this stage is considered as changed by - commands such as `dvc status` and `dvc repro`. `false` by default -- `meta` (optional): Arbitrary metadata can be added manually with this field. - Any YAML contents is supported. 
`meta` contents are ignored by DVC, but they - can be meaningful for user processes that read or write `.dvc` files directly. -- `desc` (optional): User description for this stage. This doesn't affect any - DVC operations. - -See [Advanced dvc.yaml Usage](/doc/user-guide/dvc-files/advanced-dvc.yaml) for -info on the `${}` syntax, as well as `foreach`/`do` fields. - -`dvc.yaml` files also support `# comments`. - -Note that we maintain a `dvc.yaml` -[schema](https://github.com/iterative/dvcyaml-schema) that can be used by -editors like [VSCode](/doc/install/plugins#visual-studio-code) or -[PyCharm](/doc/install/plugins#pycharmintellij) to enable automatic syntax -validation and auto-completion. - -### Output entries - -`outs` fields can contain these subfields: - -- `cache`: Whether or not this file or directory is cached (`true` - by default). See the `--no-commit` option of `dvc add`. -- `persist`: Whether the output file/dir should remain in place while - `dvc repro` runs (`false` by default: outputs are deleted when `dvc repro` - starts -- `desc` (optional): User description for this output. This doesn't affect any - DVC operations. - -## `dvc.lock` file - -For every `dvc.yaml` file, a matching `dvc.lock` (YAML) file usually exists. -It's created or updated by DVC commands such as `dvc run` and `dvc repro`. -`dvc.lock` describes the latest pipeline state. It has several purposes: - -- Tracking of intermediate and final outputs of a pipeline — - similar to [`.dvc` files](#dvc-files). -- Allow DVC to detect when stage definitions, or their dependencies have - changed. Such conditions invalidate stages, requiring their reproduction (see - `dvc status`, `dvc repro`). -- `dvc.lock` is needed internally for several DVC commands to operate, such as - `dvc checkout`, `dvc get`, and `dvc import`. 
- -Here's an example `dvc.lock` (based on the `dvc.yaml` example above): - -```yaml -stages: - features: - cmd: jupyter nbconvert --execute featurize.ipynb - deps: - - path: data/clean - md5: d8b874c5fa18c32b2d67f73606a1be60 - params: - params.yaml: - levels.no: 5 - outs: - - path: features - md5: 2119f7661d49546288b73b5730d76485 - - path: performance.json - md5: ea46c1139d771bfeba7942d1fbb5981e - - path: logs.csv - md5: f99aac37e383b422adc76f5f1fb45004 -``` - -Stage commands are listed again in `dvc.lock`, in order to know when their -definitions change in `dvc.yaml`. - -Regular dependencies and all kinds of outputs -(including [metrics](/doc/command-reference/metrics) and -[plots](/doc/command-reference/plots) files) are also listed (per stage) in -`dvc.lock`, but with an additional field with a hash of their last known -contents. Specifically: `md5`, `etag`, or `checksum` are used (same as in `deps` -and `outs` entries of `.dvc` files). - -Full parameter dependencies (key and value) are listed too (under -`params`), grouped by parameters file. And in the case of -[templated `dvc.yaml`](/doc/user-guide/dvc-files/advanced-dvc.yaml) files, their -actual values are substituted into the `dvc.lock` YAML structure. diff --git a/content/docs/user-guide/dvc-files/index.md b/content/docs/user-guide/dvc-files/index.md deleted file mode 100644 index 960747c823..0000000000 --- a/content/docs/user-guide/dvc-files/index.md +++ /dev/null @@ -1,24 +0,0 @@ -# DVC Files - -There are a few special DVC file formats that enable its features: - -- Files ending with the `.dvc` extension ("dot DVC files") are placeholders to - track data files and directories. A DVC project usually has one - `.dvc` file per large data file or directory being tracked. -- `dvc.yaml` files (or _pipelines files_) specify stages that form the - pipeline(s) of a project, and how they connect (_dependency graph_ or DAG). 
- - These normally have a matching `dvc.lock` file to record the pipeline state - and track its outputs. - -Both `.dvc` files and `dvc.yaml` use human-friendly YAML 1.2 schemas, described -below. We encourage you to get familiar with them so you may create, generate, -and edit them on your own. - -These metafiles should be versioned with Git (in Git-enabled -repositories). - -See [DVC Internals](/doc/user-guide/dvc-internals) for the contents of the -`.dvc/` directory. - -See also [`.dvcignore`](/doc/user-guide/dvcignore). diff --git a/content/docs/user-guide/project-structure/dvc-files.md b/content/docs/user-guide/project-structure/dvc-files.md new file mode 100644 index 0000000000..2683d512a3 --- /dev/null +++ b/content/docs/user-guide/project-structure/dvc-files.md @@ -0,0 +1,87 @@ +# `.dvc` Files + +You can use `dvc add` to track data files or directories located in your current +workspace, or in supported +[external locations](/doc/user-guide/managing-external-data). Additionally, +`dvc import` and `dvc import-url` let you bring data from external locations to +your project, and start tracking it locally. + +> See [Data Versioning](/doc/start/data-versioning) and +> [Data Access](/doc/start/data-access) for more info. + +Files ending with the `.dvc` extension ("dot DVC file") are created by these +commands as data placeholders that can be versioned with Git. They contain the +information needed to track the target data over time. Here's an example: + +```yaml +outs: + - md5: a304afb96060aad90176268345e10355 + path: data.xml + desc: Cats and dogs dataset + +# Comments and user metadata are supported. +meta: + name: 'Devee Bird' + email: devee@dvc.org +``` + +These files use the [YAML 1.2](https://yaml.org/) file format, and a +human-friendly schema described below. We encourage you to get familiar with it +so you may modify, write, or generate `.dvc` files on your own. + +> See also +> [How to Merge Conflicts](/doc/user-guide/how-to/merge-conflicts#dvc-files). 
+ +## Specification + +These are the fields that are accepted at the root level of the `.dvc` file +schema: + +| Field | Description | +| ------ | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `outs` | (Required) list of [output entries](#output-entries) (details below) that represent the files or directories tracked with DVC. Typically there is only one (but several can be added or combined manually). | +| `deps` | List of [dependency entries](#dependency-entries) (details below). Only present when `dvc import` or `dvc import-url` are used to generate this `.dvc` file. Typically there is only one (but several can be added manually). | +| `wdir` | Working directory for the `outs` and `deps` paths (relative to the `.dvc` file's location). It defaults to `.` (the file's location). | +| `md5` | (Only for imports) MD5 hash of the `.dvc` file itself. | +| `meta` | (Optional) arbitrary user metadata can be added manually with this field. Any YAML content is supported. `meta` contents are ignored by DVC. | + +Comments can be entered using the `# comment` format. + +> `meta` fields and `#` comments are preserved among executions of `dvc repro` +> and `dvc commit`, but not when the file is overwritten by `dvc add`, +> `dvc move`, `dvc import`, or `dvc import-url`. 
+ +## Output entries + +The following subfields may be present under `outs` entries: + +| Field | Description | +| ------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | (Required) Path to the file or directory (relative to `wdir`, which defaults to the file's location) | +| `md5`
`etag`
`checksum` | Hash value for the file or directory being tracked with DVC. MD5 is used for most locations (local file system and SSH); [ETag](https://en.wikipedia.org/wiki/HTTP_ETag#Strong_and_weak_validation) for HTTP, S3, or Azure [external outputs](/doc/user-guide/managing-external-data); and a special _checksum_ for HDFS and WebHDFS. | +| `size` | Size of the file or directory (sum of all files). | +| `nfiles` | If this output is a directory, the number of files inside (recursive). | +| `isexec` | Whether this is an executable file. DVC preserves execute permissions upon `dvc checkout` and `dvc pull`. This has no effect on directories, or in general on Windows. | +| `cache` | Whether or not this file or directory is cached (`true` by default). See the `--no-commit` option of `dvc add`. | +| `persist` | Whether the output file/dir should remain in place while `dvc repro` runs (`false` by default: outputs are deleted when `dvc repro` starts) | +| `desc` | (Optional) user description for this output (supported in metrics and plots too). This doesn't affect any DVC operations. | + +## Dependency entries + +The following subfields may be present under `deps` entries: + +| Field | Description | +| ------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `path` | (Required) Path to the dependency (relative to `wdir`, which defaults to the file's location) | +| `md5`
`etag`
`checksum` | Hash value for the file or directory being tracked with DVC. MD5 is used for most locations (local file system and SSH); [ETag](https://en.wikipedia.org/wiki/HTTP_ETag#Strong_and_weak_validation) for HTTP, S3, or Azure external dependencies; and a special _checksum_ for HDFS and WebHDFS. See `dvc import-url` for more information. | +| `size` | Size of the file or directory (sum of all files). | +| `nfiles` | If this dependency is a directory, the number of files inside (recursive). | +| `repo` | This entry is only for external dependencies created with `dvc import`, and can contain `url`, `rev`, and `rev_lock` (detailed below). | + +### Dependency `repo` subfields: + +| Field | Description | +| ---------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `url` | URL of Git repository with source DVC project | +| `rev` | Only present when the `--rev` option of `dvc import` is used. Specific commit hash, branch or tag name, etc. (a [Git revision](https://git-scm.com/docs/revisions)) used to import the dependency from. | +| `rev_lock` | Git commit hash of the external DVC repository at the time of importing or updating the dependency (with `dvc update`) | diff --git a/content/docs/user-guide/dvcignore.md b/content/docs/user-guide/project-structure/dvcignore-files.md similarity index 99% rename from content/docs/user-guide/dvcignore.md rename to content/docs/user-guide/project-structure/dvcignore-files.md index 0b5feb173b..0afd13ce75 100644 --- a/content/docs/user-guide/dvcignore.md +++ b/content/docs/user-guide/project-structure/dvcignore-files.md @@ -1,4 +1,4 @@ -# `.dvcignore` File +# `.dvcignore` Files Marks which files and/or directories should be excluded when traversing a DVC project. 
diff --git a/content/docs/user-guide/project-structure/index.md b/content/docs/user-guide/project-structure/index.md new file mode 100644 index 0000000000..e00b552e32 --- /dev/null +++ b/content/docs/user-guide/project-structure/index.md @@ -0,0 +1,24 @@ +# Project Structure + +Using `dvc init` in your workspace will start a DVC +project, including the internal `.dvc/` directory. From there on, you +will create and manage different DVC files and populate the cache +as you use DVC and work on your data science experiments. + +- `dvc.yaml` _pipelines files_ define stages that form the pipeline(s) of a + project. All stage-based features such as `dvc params`, `dvc metrics`, and + `dvc plots` are specified here. + +- `.dvc` files ("dot DVC files") are placeholders to track data files and + directories. + +- `.dvcignore` files (optional) contain a list of paths for DVC to ignore, which + can dramatically increase its operational performance. + +- Internal files and directories in + [`.dvc/`](/doc/user-guide/project-structure/internal-files) contain the local + [configuration](/doc/command-reference/config) file(s), default local cache + location, and other utilities that DVC needs to operate. + +These metafiles should be versioned with Git (in Git-enabled +repositories).
diff --git a/content/docs/user-guide/dvc-internals.md b/content/docs/user-guide/project-structure/internal-files.md similarity index 99% rename from content/docs/user-guide/dvc-internals.md rename to content/docs/user-guide/project-structure/internal-files.md index 71da70cc10..8f62aba764 100644 --- a/content/docs/user-guide/dvc-internals.md +++ b/content/docs/user-guide/project-structure/internal-files.md @@ -1,4 +1,4 @@ -# DVC Internal Directories and Files +# Internal Directories and Files Once initialized in a project, DVC populates its installation directory (`.dvc/`) with the internal directories and files needed for DVC diff --git a/content/docs/user-guide/project-structure/pipelines-files.md b/content/docs/user-guide/project-structure/pipelines-files.md new file mode 100644 index 0000000000..5f3fd42ee2 --- /dev/null +++ b/content/docs/user-guide/project-structure/pipelines-files.md @@ -0,0 +1,421 @@ +# Pipelines Files (`dvc.yaml`) + +You can construct data science or machine learning pipelines by defining +individual [stages](/doc/command-reference/run) in one or more `dvc.yaml` files +(or _pipelines files_). Stages form a pipeline when they connect with each other +(forming a _dependency graph_, see `dvc dag`). Refer to +[Data Pipelines](/doc/start/data-pipelines). + +> Note that a helper command, `dvc run`, is available to create (and execute) +> stages. + +`dvc.yaml` files can be versioned with Git. + +These files use the [YAML 1.2](https://yaml.org/) file format, and a +human-friendly schema explained below. We encourage you to get familiar with it +so you may modify, write, or generate stages and pipelines on your own. + +> Note that we use [GNU/Linux](https://www.gnu.org/software/software.html) in +> most of our examples. + +## Stages + +The `stages` list contains a list of user-defined stages. 
Here's a simple one +named `transpose`: + +```yaml +stages: + transpose: + cmd: ./trans.r rows.txt > columns.txt + deps: + - rows.txt + outs: + - columns.txt +``` + +The most important part of a stage is the terminal command(s) it executes +(`cmd` field). This is what DVC runs when the stage is reproduced (see +`dvc repro`). + +If a command reads input files, these (or their directory locations) can be +defined as dependencies (`deps`). DVC will check whether they have +changed to decide whether the stage requires re-execution (see `dvc status`). + +If it writes files or dirs, they can be defined as outputs +(`outs`). DVC will track them going forward (similar to using `dvc add`). + +### Parameter dependencies + +[Parameters](/doc/command-reference/params) are a special type of stage +dependency. They consist of a name/value pair to find in a YAML, JSON, TOML, or +Python parameters file (`params.yaml` by default). Example: + +```yaml +stages: + preprocess: + cmd: bin/cleanup raw.txt clean.txt + deps: + - raw.txt + params: + - threshold + - passes + outs: + - clean.txt +``` + +This allows several stages to depend on values of a shared structured file +(which can be versioned directly with Git). See also `dvc params diff`. + +### Metrics and Plots outputs + +Like [common outputs](#outputs), metrics and plots +files are produced by the stage `cmd`. However, their purpose is different. +Typically they contain metadata to evaluate pipeline processes. Example: + +```yaml +stages: + build: + cmd: python train.py + deps: + - features.csv + outs: + - model.pt + metrics: + - accuracy.txt: + cache: false + plots: + - auc.json: + cache: false +``` + +> `cache: false` is typical here, since they're small enough for Git to version +> directly. + +The commands in `dvc metrics` and `dvc plots` help you display and compare +metrics and plots.
+ +## Templating + +⚠️ This feature is only available in DVC 2.0 ⚠️ + +`dvc.yaml` supports a templating format to insert values from different sources +in the YAML structure itself. These sources can be +[parameters files](/doc/command-reference/params), or `vars` defined in +`dvc.yaml` instead. + +> Note that this parameterization feature is only supported via manual editing +> of `dvc.yaml` and is incompatible with `dvc run`. + +Let's say we have `params.yaml` (default params file) with the following +contents: + +```yaml +models: + us: + threshold: 10 + filename: 'model-us.hdf5' +``` + +Those values can be used anywhere in `dvc.yaml` with the `${}` _substitution +expression_: + + +```yaml +stages: + build-us: + cmd: >- + python train.py + --thresh ${models.us.threshold} + --out ${models.us.filename} + outs: + - ${models.us.filename}: + cache: true +``` + + +DVC will track simple param values (numbers, strings, etc.) used in `${}` (they +will be listed by `dvc params diff`). + +Alternatively, values for substitution can be listed as top-level `vars` like +this: + +```yaml +vars: + - models: + us: + threshold: 10 + - desc: 'Reusable description' + +stages: + build-us: + desc: ${desc} + cmd: python train.py --thresh ${models.us.threshold} +``` + +> Note that values from `vars` are not tracked like parameters. + +To load additional params files, list them in the top `vars`, in the desired +order, e.g.: + +> Params file paths will be evaluated based on +> [`wdir`](/doc/user-guide/project-structure/pipelines-files#specification), if +> one is given. + +```yaml +vars: + - params.json + - myvar: 'value' + - config/myapp.yaml +``` + +(ℹ️) Note that the default `params.yaml` file is always loaded first, if +present.
+ +It's also possible to specify what to include from additional params files, with +a `:` colon: + +```yaml +vars: + - params.json:clean,feats + +stages: + featurize: + cmd: ${feats.exec} + deps: + - ${clean.filename} + outs: + - ${feats.dirname} +``` + +Stage-specific values are also supported, with inner `vars`. You may also load +additional params files locally. For example: + +```yaml +stages: + build-us: + vars: + - params.json:build + - model: + filename: 'model-us.hdf5' + cmd: python train.py ${build.epochs} --out ${model.filename} + outs: + - ${model.filename} +``` + +DVC merges values from params files and `vars` in each scope when possible. For +example, `{"grp": {"a": 1}}` merges with `{"grp": {"b": 2}}`, but not with +`{"grp": {"a": 7}}`. + +⚠️ Known limitations of local `vars`: + +- [`wdir`](/doc/user-guide/project-structure/pipelines-files#specification) + cannot use values from local `vars`, as DVC uses the working directory first + (to load any values from params files listed in `vars`). +- `foreach` is also incompatible with local `vars` at the moment. + +The substitution expression supports these forms: + +```yaml +${param} # Simple +${param.key} # Nested values through . (period) +${param.list[0]} # List elements via index in [] (square brackets) +``` + +> To use the expression literally in `dvc.yaml` (so DVC does not replace it with +> a value), escape it with a backslash, e.g. `\${...`. + +## Generating multiple stages + +⚠️ This feature is only available in DVC 2.0 ⚠️ + +You can define more than one stage in a single `dvc.yaml` entry with the +following syntax. A `foreach` element accepts a list or dictionary with values +to iterate on, while `do` contains the regular stage fields (`cmd`, `outs`, +etc.).
Here's a simple example: + +```yaml +stages: + cleanups: # Multi-stage + foreach: # List of simple values + - raw1 + - labels1 + - raw2 + do: + cmd: clean.py "${item}" + outs: + - ${item}.cln +``` + +Upon `dvc repro`, each item in the list is expanded into its own stage by +substituting its value in expression `${item}`. The item's value is appended to +each stage name after a `@`. The final generated stages are saved to `dvc.lock`: + +```yaml +cleanups@raw2: + cmd: clean.py "raw2" +cleanups@raw1: + cmd: clean.py "raw1" +cleanups@labels1: + cmd: clean.py "labels1" +``` + +For lists containing complex values (e.g. dictionaries), the substitution +expression can use the `${item.key}` form. Stage names will be appended with a +zero-based index. For example: + +```yaml +stages: + train: + foreach: + - epochs: 3 + thresh: 10 + - epochs: 10 + thresh: 15 + do: + cmd: python train.py ${item.epochs} ${item.thresh} +``` + +```yaml +# dvc.lock +stages: + train@0: + cmd: python train.py 3 10 + train@1: + cmd: python train.py 10 15 +``` + +DVC can also iterate on a dictionary given directly to `foreach`, resulting in +two substitution expressions being available: `${key}` and `${item}`. The former +is used for the stage names: + +```yaml +stages: + build: + foreach: + uk: + epochs: 3 + thresh: 10 + us: + epochs: 10 + thresh: 15 + do: + cmd: python train.py '${key}' ${item.epochs} ${item.thresh} + outs: + - model-${key}.hdfs +``` + +```yaml +# dvc.lock +stages: + build@uk: + cmd: python train.py 'uk' 3 10 + outs: + - model-uk.hdfs + build@us: ... +``` + +Importantly, dictionaries from +[parameters files](/doc/command-reference/params#examples) can be used in +`foreach` multi-stages as well: + +```yaml +stages: + mystages: + foreach: ${myobject} # From params.yaml + do: + cmd: ./script.py ${key} ${item.prop1} + outs: + - ${item.prop2} +``` + +> Note that this feature is not compatible with [templating](#templating) at the +> moment.
+ +## Specification + +| Field | Description | +| ---------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `cmd` | (Required) One or more commands executed by the stage (may contain either a single value or a list). Commands are executed sequentially until all are finished or until one of them fails (see `dvc repro`). | +| `wdir` | Working directory for the stage command to run in (relative to the file's location). Any paths in other fields are also based on this. It defaults to `.` (the file's location). | +| `deps` | List of dependency paths of this stage (relative to `wdir`). | +| `outs` | List of output paths of this stage (relative to `wdir`). See [Output entries](#output-entries) for more details. | +| `params` | List of parameter dependency keys (field names) to track from `params.yaml` (in `wdir`). The list may also contain other parameters file names, with a sub-list of the param names to track in them. | +| `metrics` | List of [metrics files](/doc/command-reference/metrics), and optionally, whether or not this metrics file is cached (`true` by default). See the `--metrics-no-cache` (`-M`) option of `dvc run`. | +| `plots` | List of [plot metrics](/doc/command-reference/plots), and optionally, their default configuration (subfields matching the options of `dvc plots modify`), and whether or not this plots file is cached ( `true` by default). See the `--plots-no-cache` option of `dvc run`. | +| `frozen` | Whether or not this stage is frozen from reproduction | +| `always_changed` | Whether or not this stage is considered as changed by commands such as `dvc status` and `dvc repro`. `false` by default | +| `meta` | (Optional) arbitrary metadata can be added manually with this field. Any YAML content is supported. 
`meta` contents are ignored by DVC, but they can be meaningful for user processes that read or write `.dvc` files directly. | +| `desc` | (Optional) user description for this stage. This doesn't affect any DVC operations. | + +`dvc.yaml` files also support `# comments`. + +Note that we maintain a `dvc.yaml` +[schema](https://github.com/iterative/dvcyaml-schema) that can be used by +editors like [VSCode](/doc/install/plugins#visual-studio-code) or +[PyCharm](/doc/install/plugins#pycharmintellij) to enable automatic syntax +validation and auto-completion. + +> See also +> [How to Merge Conflicts](/doc/user-guide/how-to/merge-conflicts#dvcyaml). + +### Output entries + +| Field | Description | +| --------- | ------------------------------------------------------------------------------------------------------------------------------------------ | +| `cache` | Whether or not this file or directory is cached (`true` by default). See the `--no-commit` option of `dvc add`. | +| `persist` | Whether the output file/dir should remain in place while `dvc repro` runs (`false` by default: outputs are deleted when `dvc repro` starts) | +| `desc` | (Optional) user description for this output. This doesn't affect any DVC operations. | + +## dvc.lock file + +> ⚠️ Avoid editing these files. DVC will create and update them for you. + +To record the state of your pipeline(s) and help track its outputs, +DVC will maintain a `dvc.lock` file for each `dvc.yaml`. Their purposes include: + +- Allow DVC to detect when stage definitions, or their dependencies + have changed. Such conditions invalidate stages, requiring their reproduction + (see `dvc status`). +- Tracking of intermediate and final outputs of a pipeline — + similar to `.dvc` files. +- Needed for several DVC commands to operate, such as `dvc checkout` or + `dvc get`.
+ +Here's an example: + +```yaml +stages: + features: + cmd: jupyter nbconvert --execute featurize.ipynb + deps: + - path: data/clean + md5: d8b874c5fa18c32b2d67f73606a1be60 + params: + params.yaml: + levels.no: 5 + outs: + - path: features + md5: 2119f7661d49546288b73b5730d76485 + - path: performance.json + md5: ea46c1139d771bfeba7942d1fbb5981e + - path: logs.csv + md5: f99aac37e383b422adc76f5f1fb45004 +``` + +Stages are listed again in `dvc.lock`, in order to know if their definitions +change in `dvc.yaml`. + +Regular dependencies and all kinds of outputs +(including [metrics](/doc/command-reference/metrics) and +[plots](/doc/command-reference/plots) files) are also listed (per stage) in +`dvc.lock`, but with an additional field storing a hash of their last known +contents. Specifically: `md5`, `etag`, or `checksum` are used (same as in `deps` +and `outs` entries of `.dvc` files). + +Full parameter dependencies (key and value) are listed too (under +`params`), grouped by parameters file. + +Note that in the case of [templated](#templating) `dvc.yaml` files, the actual +values are substituted in `dvc.lock` (no `${}` expressions remain). And for +those with [multi-stages](#generating-multiple-stages), individual stages are +expanded (`foreach` structures are not preserved). 
diff --git a/content/linked-terms.js b/content/linked-terms.js index 3484cc63ef..b5266ea20c 100644 --- a/content/linked-terms.js +++ b/content/linked-terms.js @@ -1,14 +1,14 @@ module.exports = [ { matches: '.dvc', - url: '/doc/user-guide/dvc-files/.dvc' + url: '/doc/user-guide/tracking-existing-data#dot-dvc-files' }, { matches: 'dvc.yaml', - url: '/doc/user-guide/dvc-files/dvc-yaml' + url: '/doc/user-guide/creating-pipelines#dvc-yaml-files' }, { matches: 'dvc.lock', - url: '/doc/user-guide/dvc-files/dvc-yaml#dvclock-file' + url: '/doc/user-guide/creating-pipelines#dvclock-file' } ] diff --git a/redirects-list.json b/redirects-list.json index 9371e8dae4..d0bc854d32 100644 --- a/redirects-list.json +++ b/redirects-list.json @@ -28,11 +28,17 @@ "^/doc/tutorials(/.*)? /doc/start", "^/doc/use-cases/data-and-model-files-versioning/?$ /doc/use-cases/versioning-data-and-model-files", - "^/doc/user-guide/dvc-file-format$ /doc/user-guide/dvc-files-and-directories", - "^/doc/user-guide/dvc-files-and-directories$ /doc/user-guide/dvc-files", + "^/doc/user-guide/dvc-file-format$ /doc/user-guide/project-structure", + "^/doc/user-guide/dvc-files-and-directories$ /doc/user-guide/project-structure", + "^/doc/user-guide/dvc-files$ /doc/user-guide/project-structure", "^/doc/user-guide/updating-tracked-files$ /doc/user-guide/how-to/update-tracked-data", "^/doc/user-guide/how-to/update-tracked-files$ /doc/user-guide/how-to/update-tracked-data", "^/doc/user-guide/merge-conflicts$ /doc/user-guide/how-to/merge-conflicts", + "^/doc/user-guide/dvc-files/dvc-yaml$ /doc/user-guide/project-structure", + "^/doc/user-guide/dvc-files/advanced-dvc-yaml$ /doc/user-guide/project-structure/pipelines-files", + "^/doc/user-guide/dvc-files/.dvc$ /doc/user-guide/project-structure/dvc-files", + "^/doc/user-guide/dvc-internals(/.*)?$ /doc/user-guide/project-structure/internal-files$1", + "^/doc/user-guide/dvcignore$ /doc/user-guide/project-structure/dvcignore-files", "^/doc/understanding-dvc(/.*)?$ 
/doc/user-guide/what-is-dvc", "^/doc/commands-reference(/.*)?$ /doc/command-reference$1", "^/doc/command-reference/plot$ /doc/command-reference/plots",