From 667550a3973de04e13b0be42dddfc98c81abbc7b Mon Sep 17 00:00:00 2001 From: Jorge Orpinel Date: Sun, 3 Jan 2021 20:11:14 -0600 Subject: [PATCH] run: review -d usage and consistency t/o docs per #1700 --- .../docs/command-reference/params/index.md | 10 ++++----- content/docs/command-reference/run.md | 22 +++++++++---------- content/docs/start/data-pipelines.md | 2 +- .../use-cases/shared-development-server.md | 8 ++++--- .../how-to/add-deps-or-outs-to-a-stage.md | 8 +++---- 5 files changed, 26 insertions(+), 24 deletions(-) diff --git a/content/docs/command-reference/params/index.md b/content/docs/command-reference/params/index.md index 02a92c0e0f..b1e493e313 100644 --- a/content/docs/command-reference/params/index.md +++ b/content/docs/command-reference/params/index.md @@ -87,7 +87,7 @@ Define a [stage](/doc/command-reference/run) that depends on params `lr`, specify `layers` and `epochs` from the `train` group: ```dvc -$ dvc run -n train -d users.csv -o model.pkl \ +$ dvc run -n train -d train.py -d users.csv -o model.pkl \ -p lr,train.epochs,train.layers \ python train.py ``` @@ -130,7 +130,7 @@ Alternatively, the entire group of parameters `train` can be referenced, instead of specifying each of the group parameters separately: ```dvc -$ dvc run -n train -d users.csv -o model.pkl \ +$ dvc run -n train -d train.py -d users.csv -o model.pkl \ -p lr,train \ python train.py ``` @@ -139,7 +139,7 @@ In the examples above, the default parameters file name `params.yaml` was used. 
This file name can be redefined with a prefix in the `-p` argument: ```dvc -$ dvc run -n train -d logs/ -o users.csv \ +$ dvc run -n train -d train.py -d logs/ -o users.csv -f \ -p parse_params.yaml:threshold,classes_num \ python train.py ``` @@ -182,7 +182,7 @@ The following [stage](/doc/command-reference/run) depends on params `BOOL`, `INT`, as well as `TrainConfig`'s `EPOCHS` and `layers`: ```dvc -$ dvc run -n train -d users.csv -o model.pkl \ +$ dvc run -n train -d train.py -d users.csv -o model.pkl \ -p params.py:BOOL,INT,TrainConfig.EPOCHS,TrainConfig.layers \ python train.py ``` @@ -227,7 +227,7 @@ can be referenced supported), instead of the parameters in it: ```dvc -$ dvc run -n train -d users.csv -o model.pkl \ +$ dvc run -n train -d train.py -d users.csv -o model.pkl \ -p params.py:BOOL,INT,TestConfig \ python train.py ``` diff --git a/content/docs/command-reference/run.md b/content/docs/command-reference/run.md index a835e36b53..a9e68ae5a5 100644 --- a/content/docs/command-reference/run.md +++ b/content/docs/command-reference/run.md @@ -73,7 +73,7 @@ so on (see `dvc dag`). This graph can be restored by DVC later to modify or ```dvc $ dvc run -n printer -d write.sh -o pages ./write.sh -$ dvc run -n scanner -d read.sh -d pages -o signed.pdf ./read.sh +$ dvc run -n scanner -d read.sh -d pages -o signed.pdf ./read.sh pages ``` Stage dependencies can be any file or directory, either untracked, or more @@ -151,7 +151,7 @@ variables in it that should be evaluated dynamically. 
Examples: ```dvc $ dvc run -n my_stage "./my_script.sh > /dev/null 2>&1" -$ dvc run -n my_stage './my_script.sh $MYENVVAR' +$ dvc run -n my_stage -f './my_script.sh $MYENVVAR' ``` ## Options @@ -317,17 +317,17 @@ dataset (`20180226` is a seed value): ```dvc $ dvc run -n train \ - -d matrix-train.p -d train_model.py \ - -o model.p \ - python train_model.py matrix-train.p 20180226 model.p + -d train_model.py -d matrix-train.p -o model.p \ + python train_model.py 20180226 model.p ``` To update a stage that is already defined, the `-f` (`--force`) option is needed. Let's update the seed for the `train` stage: ```dvc -$ dvc run -n train -f -d matrix-train.p -d train_model.py -o model.p \ - python train_model.py matrix-train.p 18494003 model.p +$ dvc run -n train --force \ + -d train_model.py -d matrix-train.p -o model.p \ + python train_model.py 18494003 model.p ``` ## Example: Separate stages in a subdirectory @@ -341,7 +341,7 @@ $ cd more_stages/ $ dvc run -n process_data \ -d data.in \ -o result.out \ - ./my_script.sh data.in result.out + ./my_script.sh --in data.in --out result.out $ tree .. . 
├── dvc.yaml @@ -379,7 +379,7 @@ Execute an R script that parses the XML file: $ dvc run -n parse \ -d parsingxml.R -d data/Posts.xml \ -o data/Posts.csv \ - Rscript parsingxml.R data/Posts.xml data/Posts.csv + Rscript parsingxml.R --in data/Posts.xml --out data/Posts.csv ``` To visualize how these stages are connected into a pipeline (given their outputs @@ -421,9 +421,9 @@ Define a stage with both regular dependencies as well as parameter dependencies: ```dvc $ dvc run -n train \ - -d matrix-train.p -d train_model.py -o model.p \ - -p seed,train.lr,train.epochs - python train_model.py matrix-train.p model.p + -d train_model.py -d matrix-train.p -o model.p \ + -p seed,train.lr,train.epochs \ + python train_model.py 20200105 model.p ``` `train_model.py` will include some code to open and parse the parameters: diff --git a/content/docs/start/data-pipelines.md b/content/docs/start/data-pipelines.md index 076158130c..e3ee12a236 100644 --- a/content/docs/start/data-pipelines.md +++ b/content/docs/start/data-pipelines.md @@ -72,7 +72,7 @@ $ dvc run -n prepare \ ``` A `dvc.yaml` file is generated. It includes information about the command we ran -(`python src/prepare.py`), its dependencies, and +(`python src/prepare.py data/data.xml`), its dependencies, and outputs.
diff --git a/content/docs/use-cases/shared-development-server.md b/content/docs/use-cases/shared-development-server.md index 5874bdab7c..ecce0e815c 100644 --- a/content/docs/use-cases/shared-development-server.md +++ b/content/docs/use-cases/shared-development-server.md @@ -80,8 +80,9 @@ Let's say you are cleaning up raw data for later stages: ```dvc $ dvc add raw -$ dvc run -n clean_data -d raw -o clean ./cleanup.py raw clean - # The data is cached in the shared location. +$ dvc run -n clean_data -d cleanup.py -d raw -o clean \ + ./cleanup.py raw clean +# The data is cached in the shared location. $ git add raw.dvc dvc.yaml dvc.lock .gitignore $ git commit -m "cleanup raw data" $ git push @@ -97,7 +98,8 @@ manually. After this, they could decide to continue building this $ git pull $ dvc checkout A raw # Data is linked from cache to workspace. -$ dvc run -n process_clean_data -d clean -o processed ./process.py clean process +$ dvc run -n process_clean_data -d process.py -d clean -o processed \ + ./process.py clean processed $ git add dvc.yaml dvc.lock $ git commit -m "process clean data" $ git push diff --git a/content/docs/user-guide/how-to/add-deps-or-outs-to-a-stage.md b/content/docs/user-guide/how-to/add-deps-or-outs-to-a-stage.md index 159a760b88..950118bd2b 100644 --- a/content/docs/user-guide/how-to/add-deps-or-outs-to-a-stage.md +++ b/content/docs/user-guide/how-to/add-deps-or-outs-to-a-stage.md @@ -39,13 +39,13 @@ output. To add a missing dependency (`data/raw.csv`) as well as a missing output > dependency/output to the stage: > > ```dvc -> $ dvc run -f --no-exec \ -> -n prepare \ -> -d data/raw.csv \ +> $ dvc run -n prepare \ +> -f --no-exec \ > -d src/prepare.py \ +> -d data/raw.csv \ > -o data/train \ > -o data/validate \ -> python src/prepare.py +> python src/prepare.py data/raw.csv > ``` > > `-f` overwrites the stage in `dvc.yaml`, while `--no-exec` updates the stage