From c3b2c8535466ec5a8ec75ffb7814e3aa416b81af Mon Sep 17 00:00:00 2001 From: dberenbaum Date: Fri, 26 May 2023 17:14:38 -0400 Subject: [PATCH 1/4] guide: drop external outs and reword imports --- content/docs/sidebar.json | 8 +- .../importing-external-data.md | 263 ++++-------------- .../data-management/managing-external-data.md | 201 ------------- .../user-guide/pipelines/external-data.md | 251 +++++++++++++++++ 4 files changed, 307 insertions(+), 416 deletions(-) delete mode 100644 content/docs/user-guide/data-management/managing-external-data.md create mode 100644 content/docs/user-guide/pipelines/external-data.md diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 98ae40c110..2f16e63b98 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -167,14 +167,18 @@ "cloud-versioning", "discovering-and-accessing-data", "importing-external-data", - "managing-external-data", "large-dataset-optimization" ] }, { "slug": "pipelines", "source": "pipelines/index.md", - "children": ["defining-pipelines", "running-pipelines", "run-cache"] + "children": [ + "defining-pipelines", + "running-pipelines", + "run-cache", + "external-data" + ] }, { "label": "Experiment Management", diff --git a/content/docs/user-guide/data-management/importing-external-data.md b/content/docs/user-guide/data-management/importing-external-data.md index 3d680bf677..f7fd942225 100644 --- a/content/docs/user-guide/data-management/importing-external-data.md +++ b/content/docs/user-guide/data-management/importing-external-data.md @@ -1,200 +1,22 @@ # Importing External Data -There are cases when data is so large, or its processing is organized in such a -way, that its preferable to avoid moving it from its current external location. -For example data on a network attached storage (NAS), processing data on HDFS, -running [Dask](https://dask.org/) via SSH, or for a script that streams data -from S3 to process it. 
+To version data that lives outside of your local project, you can +import it. You can choose whether to download that data and whether to push +copies of it to your [DVC remote]. This makes importing the data useful even if +you want to track the data in-place at its original source location. -_External dependencies_ and -[external outputs](/doc/user-guide/data-management/managing-external-data) -provide ways to track and version data outside of the project. +## How importing external data works -## How external dependencies work - -External dependencies will be tracked by DVC, detecting when they -change (triggering stage executions on `dvc repro`, for example). - -To define files or directories in an external location as -[stage](/doc/command-reference/run) dependencies, specify their remote URLs or -external paths in `dvc.yaml` (`deps` field). Use the same format as the `url` of -certain `dvc remote` types. Currently, the following supported `dvc remote` -types/protocols: - -- Amazon S3 -- Microsoft Azure Blob Storage -- Google Cloud Storage -- SSH -- HDFS -- HTTP -- Local files and directories outside the workspace - - - -[Remote storage] is a different feature. - -[remote storage]: /doc/user-guide/data-management/remote-storage - - - -## Examples - -Let's take a look at defining and running a `download_file` stage that simply -downloads a file from an external location, on all the supported location types. - -> See the [Remote alias example](#example-using-dvc-remote-aliases) for info. on -> using remote locations that require manual authentication setup. - -
- -### Amazon S3 - -```cli -$ dvc stage add -n download_file \ - -d s3://mybucket/data.txt \ - -o data.txt \ - aws s3 cp s3://mybucket/data.txt data.txt -``` - -
- -
- -### Microsoft Azure Blob Storage - -```cli -$ dvc stage add -n download_file \ - -d azure://mycontainer/data.txt \ - -o data.txt \ - az storage copy \ - -d data.json \ - --source-account-name my-account \ - --source-container mycontainer \ - --source-blob data.txt -``` - -
- -
- -### Google Cloud Storage - -```cli -$ dvc stage add -n download_file \ - -d gs://mybucket/data.txt \ - -o data.txt \ - gsutil cp gs://mybucket/data.txt data.txt -``` - -
- -
- -### SSH - -```cli -$ dvc stage add -n download_file \ - -d ssh://user@example.com/path/to/data.txt \ - -o data.txt \ - scp user@example.com:/path/to/data.txt data.txt -``` - - - -DVC requires both SSH and SFTP access to work with SSH remote storage. Check -that you can connect both ways with tools like `ssh` and `sftp` (GNU/Linux). -Note that your server's SFTP root might differ from its physical root (`/`). - - - -
- -
- -### HDFS - -```cli -$ dvc stage add -n download_file \ - -d hdfs://user@example.com/data.txt \ - -o data.txt \ - hdfs fs -copyToLocal \ - hdfs://user@example.com/data.txt data.txt -``` - -
- -
- -### HTTP - -> Including HTTPs - -```cli -$ dvc stage add -n download_file \ - -d https://example.com/data.txt \ - -o data.txt \ - wget https://example.com/data.txt -O data.txt -``` - -
- -
- -### local file system paths - -```cli -$ dvc stage add -n download_file \ - -d /home/shared/data.txt \ - -o data.txt \ - cp /home/shared/data.txt data.txt -``` - -
- -## Example: Using DVC remote aliases - -You may want to encapsulate external locations as configurable entities that can -be managed independently. This is useful if the connection requires -authentication, if multiple dependencies (or stages) reuse the same location, or -if the URL is likely to change in the future. - -[DVC remotes][remote storage] can do just this. You may use `dvc remote add` to -define them, and then use a special URL with format -`remote://{remote_name}/{path}` (remote alias) to define the external -dependency. - -Let's see an example using SSH. First, register and configure the remote: - -```cli -$ dvc remote add myssh ssh://user@example.com -$ dvc remote modify --local myssh password 'mypassword' -``` - -> Refer to `dvc remote modify` for more details like setting up access -> credentials for the different remote types. - -Now, use an alias to this remote when defining the stage: - -```cli -$ dvc stage add -n download_file \ - -d remote://myssh/path/to/data.txt \ - -o data.txt \ - wget https://example.com/data.txt -O data.txt -``` - -## Example: `import-url` command - -In the previous examples, special downloading tools were used: `scp`, -`aws s3 cp`, etc. `dvc import-url` simplifies the downloading for all the -supported external path or URL types. +Import external data using `import-url`: ```cli $ dvc import-url https://data.dvc.org/get-started/data.xml Importing 'https://data.dvc.org/get-started/data.xml' -> 'data.xml' ``` -The command above creates the import `.dvc` file `data.xml.dvc`, that contains -an external dependency (in this case an HTTPs URL). +This downloads the file to `data.xml` (see +[Avoiding duplication](#avoiding-duplication) if you want to skip this step). It +also creates the `data.xml.dvc` file, which tracks the source data. 
@@ -219,39 +41,54 @@ determine whether the source has changed and we need to download the file again.
-## Example: Imports - -`dvc import` can download a file or directory from any DVC project, -or from a Git repository. It also creates an external dependency in its import -`.dvc` file. +To check the source location for updates, run `dvc update`: ```cli -$ dvc import git@github.com:iterative/example-get-started model.pkl -Importing 'model.pkl (git@github.com:iterative/example-get-started)' --> 'model.pkl' +$ dvc update data.xml.dvc +'data.xml.dvc' didn't change, skipping ``` -The command above creates `model.pkl.dvc`, where the external dependency is -specified (with the `repo` field). +During `dvc push`, DVC will upload the version of the data tracked by +`data.xml.dvc` to the [DVC remote] so that it is backed up in case you need to +recover it. -
+## Avoiding duplication -### Expand to see resulting `.dvc` file +Uploading and downloading copies of the external data may be unnecessary and +impractical in some cases, like if your data is large or static, or you stream +it directly from its source location, or you use cloud versioning to backup old +versions already. -```yaml -# ... -deps: - - path: model.pkl - repo: - url: git@github.com:iterative/example-get-started - rev_lock: 6c73875a5f5b522f90b5afa9ab12585f64327ca7 -outs: - - md5: 3863d0e317dee0a55c4e59d2ec0eef33 - path: model.pkl - cache: true +### Skipping downloads + +You can use `--no-download` to skip the download step when you import or update +the data: + +```cli +$ dvc import-url --no-download https://data.dvc.org/get-started/data.xml +Importing 'https://data.dvc.org/get-started/data.xml' -> 'data.xml' + +$ ls +data.xml.dvc ``` -The `url` and `rev_lock` subfields under `repo` are used to save the origin and -[version](https://git-scm.com/docs/revisions) of the dependency, respectively. +If you don't have time or space to download the data but still want to make a +backup of the data on your [DVC remote] to be able to recover a copy later, you +can instead use `--to-remote`, which will upload the data to remote storage +without saving a local copy. -
+### Skipping uploads + +You can also skip pushing the data to the [DVC remote], and DVC will try to +recover the data from its source location. However, if you don't push the data +and the source location has changed, you may be unable to recover the data. + +[Cloud versioning](/doc/user-guide/data-management/cloud-versioning) enables you +to recover data without having to push a copy to DVC remote storage. If you have +cloud versioning enabled for the source location, you can import it with +`--version-aware`. DVC will track the version ID of all imported files and be +able to recover them from source as long as those versions remain in the source +location. DVC will also know to skip uploading these files during `dvc push` +since it assumes they are available from the source location. + +[dvc remote]: /doc/user-guide/data-management/remote-storage diff --git a/content/docs/user-guide/data-management/managing-external-data.md b/content/docs/user-guide/data-management/managing-external-data.md deleted file mode 100644 index 666ce04fe3..0000000000 --- a/content/docs/user-guide/data-management/managing-external-data.md +++ /dev/null @@ -1,201 +0,0 @@ -# Managing External Data - -> ⚠️ This is an advanced feature for very specific situations and not -> recommended except if there's absolutely no other alternative. In most cases, -> alternatives like the [to-cache] or [to-remote] strategies of `dvc add` and -> `dvc import-url` are more convenient. **Note** that external outputs are not -> pushed or pulled from/to [remote storage]. - -[to-cache]: /doc/command-reference/add#example-transfer-to-an-external-cache -[to-remote]: /doc/command-reference/add#example-transfer-to-remote-storage -[remote storage]: /doc/user-guide/data-management/remote-storage - -There are cases when data is so large, or its processing is organized in such a -way, that its impossible to handle it in the local machine disk. 
For example -versioning existing data on a network attached storage (NAS), processing data on -HDFS, running [Dask](https://dask.org/) via SSH, or any code that generates -massive files directly to the cloud. - -_External outputs_ (and -[external dependencies](/doc/user-guide/data-management/importing-external-data)) -provide ways to track and version data outside of the project. - -## How external outputs work - -External outputs will be tracked by DVC for -[versioning](/doc/use-cases/versioning-data-and-models), detecting when they -change (reported by `dvc status`, for example). - -To use existing files or directories in an external location as outputs, give -their remote URLs or external paths to `dvc add`, or put them in `dvc.yaml` -(`deps` field). Use the same format as the `url` of the following supported -`dvc remote` types/protocols: - -- Amazon S3 -- SSH -- HDFS -- Local files and directories outside the workspace - -> Avoid using the same DVC remote used for `dvc push`, `dvc pull`, etc. as -> external cache, because it may cause data collisions: the hash of an external -> output could collide with that of a local file with different content. - -> Note that [remote storage] is a different feature. - -## Setting up an external cache - -DVC requires that the project's cache is configured in the same -external location as the data that will be tracked (external outputs). This -avoids transferring files to the local environment and enables [file links] -within the external storage. - -[file links]: - /doc/user-guide/data-management/large-dataset-optimization#file-link-types-for-the-dvc-cache - -As an example, let's create a directory external to the workspace and set it up -as cache: - -```cli -$ mkdir -p /home/shared/dvcstore -$ dvc cache dir /home/shared/dvcstore -``` - -> See `dvc cache dir` and `dvc config cache` for more information. - -💡 Note that in real-life scenarios, often the directory will be in a remote -location, e.g. 
`s3://mybucket/cache` or `ssh://user@example.com/cache` (see the -examples below). - -> ⚠️ An external cache could be -> [shared](/doc/user-guide/how-to/share-a-dvc-cache) among copies of a DVC -> project. **Do not** use external outputs in that scenario, as `dvc checkout` -> in any project would overwrite the working data for all projects. - -## Examples - -Let's take a look at the following operations on all the supported location -types: - -1. Configure an external cache directory (added as a - `dvc remote`\*) in the same location as the external data, using - `dvc config`. -2. Tracking existing data on the external location using `dvc add` (`--external` - option needed). This produces a `.dvc` file with an external URL or path in - its `outs` field. -3. Creating a simple stage with `dvc stage add` (`--external` option needed) - that moves a local file to the external location. This produces an external - output in `dvc.yaml`. - -> \* Note that for certain remote storage authentication methods, extra config -> steps are required (see `dvc remote modify` for details). Once access is -> setup, use the special `remote://` URL format in step 2. For example: -> `dvc add --external remote://myxcache/existing-data`. - -
- -### Amazon S3 - -```cli -$ dvc remote add s3cache s3://mybucket/cache -$ dvc config cache.s3 s3cache - -$ dvc add --external s3://mybucket/existing-data - -$ dvc stage add -d data.txt \ - --external \ - -o s3://mybucket/data.txt \ - aws s3 cp data.txt s3://mybucket/data.txt -``` - -
- -
- -### SSH - -```cli -$ dvc remote add sshcache ssh://user@example.com/cache -$ dvc config cache.ssh sshcache - -$ dvc add --external ssh://user@example.com/existing-data - -$ dvc stage add -d data.txt \ - --external \ - -o ssh://user@example.com/data.txt \ - scp data.txt user@example.com:/data.txt -``` - - - -DVC requires both SSH and SFTP access to work with SSH remote storage. Check -that you can connect both ways with tools like `ssh` and `sftp` (GNU/Linux). -Note that your server's SFTP root might differ from its physical root (`/`). - - - -
- -
- -### HDFS - -```cli -$ dvc remote add hdfscache hdfs://user@example.com/cache -$ dvc config cache.hdfs hdfscache - -$ dvc add --external hdfs://user@example.com/existing-data - -$ dvc stage add -d data.txt \ - --external \ - -o hdfs://user@example.com/data.txt \ - hdfs fs -copyFromLocal \ - data.txt \ - hdfs://user@example.com/data.txt -``` - -Note that as long as there is a `hdfs://...` URL for your data, DVC can handle -it. So systems like Hadoop, Hive, and HBase are supported! - -
- -
- -### WebHDFS - -```cli -$ dvc remote add webhdfscache webhdfs://user@example.com/cache -$ dvc config cache.webhdfs webhdfscache - -$ dvc add --external webhdfs://user@example.com/existing-data - -$ dvc stage add -d data.txt \ - --external \ - -o webhdfs://user@example.com/data.txt \ - curl --upload-file data.txt \ - "http://user@example.com:50075/webhdfs/v1/data.txt?op=CREATE" -``` - -
- -
- -### local file system paths - -The default cache is in `.dvc/cache`, so there is no need to set a -custom cache location for local paths outside of your project. - -> Exceptions to the above include: external data on different storage devices, -> and partitions mounted on the same file system (e.g. `/mnt/raid/data`). In -> such cases, set up an external cache in the same drive to enable [file links] -> and avoid copying data. - -```cli -$ dvc add --external /home/shared/existing-data - -$ dvc stage add -d data.txt \ - --external \ - -o /home/shared/data.txt \ - cp data.txt /home/shared/data.txt -``` - -
diff --git a/content/docs/user-guide/pipelines/external-data.md b/content/docs/user-guide/pipelines/external-data.md new file mode 100644 index 0000000000..9ba694f94a --- /dev/null +++ b/content/docs/user-guide/pipelines/external-data.md @@ -0,0 +1,251 @@ +# External Data + +Sometimes you need to stream your data dependencies directly from their source +locations outside your local project, or stream your data outputs +directly to some external location, like cloud storage or HDFS. + +## How external dependencies work + +External dependencies will be tracked by DVC, detecting when they +change (triggering stage executions on `dvc repro`, for example). + +To define files or directories in an external location as +[stage](/doc/command-reference/run) dependencies, specify their remote URLs or +external paths in `dvc.yaml` (`deps` field). Use the same format as the `url` of +certain `dvc remote` types. Currently, the following supported `dvc remote` +types/protocols: + +- Amazon S3 +- Microsoft Azure Blob Storage +- Google Cloud Storage +- SSH +- HDFS +- HTTP +- Local files and directories outside the workspace + + + +[Remote storage] is a different feature. + +[remote storage]: /doc/user-guide/data-management/remote-storage + + + +## Examples + +Let's take a look at defining and running a `download_file` stage that simply +downloads a file from an external location, on all the supported location types. + +> See the [Remote alias example](#example-using-dvc-remote-aliases) for info. on +> using remote locations that require manual authentication setup. + +
+ +### Amazon S3 + +```cli +$ dvc stage add -n download_file \ + -d s3://mybucket/data.txt \ + -o data.txt \ + aws s3 cp s3://mybucket/data.txt data.txt +``` + +
+ +
+ +### Microsoft Azure Blob Storage + +```cli +$ dvc stage add -n download_file \ + -d azure://mycontainer/data.txt \ + -o data.txt \ + az storage copy \ + -d data.txt \ + --source-account-name my-account \ + --source-container mycontainer \ + --source-blob data.txt +``` + +
+ +
+ +### Google Cloud Storage + +```cli +$ dvc stage add -n download_file \ + -d gs://mybucket/data.txt \ + -o data.txt \ + gsutil cp gs://mybucket/data.txt data.txt +``` + +
+ +
+ +### SSH + +```cli +$ dvc stage add -n download_file \ + -d ssh://user@example.com/path/to/data.txt \ + -o data.txt \ + scp user@example.com:/path/to/data.txt data.txt +``` + + + +DVC requires both SSH and SFTP access to work with SSH remote storage. Check +that you can connect both ways with tools like `ssh` and `sftp` (GNU/Linux). +Note that your server's SFTP root might differ from its physical root (`/`). + + + +
+ +
+ +### HDFS + +```cli +$ dvc stage add -n download_file \ + -d hdfs://user@example.com/data.txt \ + -o data.txt \ + hdfs fs -copyToLocal \ + hdfs://user@example.com/data.txt data.txt +``` + +
+ +
+ +### HTTP + +> Including HTTPs + +```cli +$ dvc stage add -n download_file \ + -d https://example.com/data.txt \ + -o data.txt \ + wget https://example.com/data.txt -O data.txt +``` + +
+ +
+ +### local file system paths + +```cli +$ dvc stage add -n download_file \ + -d /home/shared/data.txt \ + -o data.txt \ + cp /home/shared/data.txt data.txt +``` + +
+ +## Example: Using DVC remote aliases + +You may want to encapsulate external locations as configurable entities that can +be managed independently. This is useful if the connection requires +authentication, if multiple dependencies (or stages) reuse the same location, or +if the URL is likely to change in the future. + +[DVC remotes][remote storage] can do just this. You may use `dvc remote add` to +define them, and then use a special URL with format +`remote://{remote_name}/{path}` (remote alias) to define the external +dependency. + +Let's see an example using SSH. First, register and configure the remote: + +```cli +$ dvc remote add myssh ssh://user@example.com +$ dvc remote modify --local myssh password 'mypassword' +``` + +> Refer to `dvc remote modify` for more details like setting up access +> credentials for the different remote types. + +Now, use an alias to this remote when defining the stage: + +```cli +$ dvc stage add -n download_file \ + -d remote://myssh/path/to/data.txt \ + -o data.txt \ + wget https://example.com/data.txt -O data.txt +``` + +## Example: `import-url` command + +In the previous examples, special downloading tools were used: `scp`, +`aws s3 cp`, etc. `dvc import-url` simplifies the downloading for all the +supported external path or URL types. + +```cli +$ dvc import-url https://data.dvc.org/get-started/data.xml +Importing 'https://data.dvc.org/get-started/data.xml' -> 'data.xml' +``` + +The command above creates the import `.dvc` file `data.xml.dvc`, that contains +an external dependency (in this case an HTTPs URL). + +
+ +### Expand to see resulting `.dvc` file + +```yaml +# ... +deps: + - etag: '"f432e270cd634c51296ecd2bc2f5e752-5"' + path: https://data.dvc.org/get-started/data.xml +outs: + - md5: a304afb96060aad90176268345e10355 + path: data.xml + cache: true + persist: false +``` + +DVC checks the headers returned by the server, looking for an +[HTTP ETag](https://en.wikipedia.org/wiki/HTTP_ETag) or a +[Content-MD5](https://tools.ietf.org/html/rfc1864) header, and uses it to +determine whether the source has changed and we need to download the file again. + +
+ +## Example: Imports + +`dvc import` can download a file or directory from any DVC project, +or from a Git repository. It also creates an external dependency in its import +`.dvc` file. + +```cli +$ dvc import git@github.com:iterative/example-get-started model.pkl +Importing 'model.pkl (git@github.com:iterative/example-get-started)' +-> 'model.pkl' +``` + +The command above creates `model.pkl.dvc`, where the external dependency is +specified (with the `repo` field). + +
+ +### Expand to see resulting `.dvc` file + +```yaml +# ... +deps: + - path: model.pkl + repo: + url: git@github.com:iterative/example-get-started + rev_lock: 6c73875a5f5b522f90b5afa9ab12585f64327ca7 +outs: + - md5: 3863d0e317dee0a55c4e59d2ec0eef33 + path: model.pkl + cache: true +``` + +The `url` and `rev_lock` subfields under `repo` are used to save the origin and +[version](https://git-scm.com/docs/revisions) of the dependency, respectively. + +
From 7e923b2acf76cdf4c24f6a4866a0959f11b0f02d Mon Sep 17 00:00:00 2001 From: dberenbaum Date: Mon, 29 May 2023 07:47:04 -0400 Subject: [PATCH 2/4] guide: add external deps/outs guide --- .../user-guide/pipelines/external-data.md | 118 +++++------------- 1 file changed, 34 insertions(+), 84 deletions(-) diff --git a/content/docs/user-guide/pipelines/external-data.md b/content/docs/user-guide/pipelines/external-data.md index 9ba694f94a..cf619b801c 100644 --- a/content/docs/user-guide/pipelines/external-data.md +++ b/content/docs/user-guide/pipelines/external-data.md @@ -9,11 +9,10 @@ directly to some external location, like cloud storage or HDFS. External dependencies will be tracked by DVC, detecting when they change (triggering stage executions on `dvc repro`, for example). -To define files or directories in an external location as -[stage](/doc/command-reference/run) dependencies, specify their remote URLs or -external paths in `dvc.yaml` (`deps` field). Use the same format as the `url` of -certain `dvc remote` types. Currently, the following supported `dvc remote` -types/protocols: +To define files or directories in an external location as stage +dependencies, specify their remote URLs or external paths in `dvc.yaml` (`deps` +field). Use the same format as the `url` of the following supported +`dvc remote` types/protocols: - Amazon S3 - Microsoft Azure Blob Storage @@ -23,15 +22,7 @@ types/protocols: - HTTP - Local files and directories outside the workspace - - -[Remote storage] is a different feature. - -[remote storage]: /doc/user-guide/data-management/remote-storage - - - -## Examples +### Examples Let's take a look at defining and running a `download_file` stage that simply downloads a file from an external location, on all the supported location types. @@ -41,7 +32,7 @@ downloads a file from an external location, on all the supported location types. 
-### Amazon S3 +#### Amazon S3 ```cli $ dvc stage add -n download_file \ @@ -54,7 +45,7 @@ $ dvc stage add -n download_file \
-### Microsoft Azure Blob Storage +#### Microsoft Azure Blob Storage ```cli $ dvc stage add -n download_file \ @@ -71,7 +62,7 @@ $ dvc stage add -n download_file \
-### Google Cloud Storage +#### Google Cloud Storage ```cli $ dvc stage add -n download_file \ @@ -84,7 +75,7 @@ $ dvc stage add -n download_file \
-### SSH +#### SSH ```cli $ dvc stage add -n download_file \ @@ -105,7 +96,7 @@ Note that your server's SFTP root might differ from its physical root (`/`).
-### HDFS +#### HDFS ```cli $ dvc stage add -n download_file \ @@ -119,7 +110,7 @@ $ dvc stage add -n download_file \
-### HTTP +#### HTTP > Including HTTPs @@ -134,7 +125,7 @@ $ dvc stage add -n download_file \
-### local file system paths +#### local file system paths ```cli $ dvc stage add -n download_file \ @@ -145,7 +136,7 @@ $ dvc stage add -n download_file \
-## Example: Using DVC remote aliases +#### Example: Using DVC remote aliases You may want to encapsulate external locations as configurable entities that can be managed independently. This is useful if the connection requires @@ -176,76 +167,35 @@ $ dvc stage add -n download_file \ wget https://example.com/data.txt -O data.txt ``` -## Example: `import-url` command +## How external outputs work -In the previous examples, special downloading tools were used: `scp`, -`aws s3 cp`, etc. `dvc import-url` simplifies the downloading for all the -supported external path or URL types. +External outputs will be tracked by DVC, detecting when they +change, but not saved in the cache for +[versioning](/doc/use-cases/versioning-data-and-models). -```cli -$ dvc import-url https://data.dvc.org/get-started/data.xml -Importing 'https://data.dvc.org/get-started/data.xml' -> 'data.xml' -``` - -The command above creates the import `.dvc` file `data.xml.dvc`, that contains -an external dependency (in this case an HTTPs URL). - -
+ -### Expand to see resulting `.dvc` file +Saving external outputs to an external cache has been deprecated in DVC 3.0. -```yaml -# ... -deps: - - etag: '"f432e270cd634c51296ecd2bc2f5e752-5"' - path: https://data.dvc.org/get-started/data.xml -outs: - - md5: a304afb96060aad90176268345e10355 - path: data.xml - cache: true - persist: false -``` +Please bear with us as we work on versioning external outputs using +[cloud versioning](/doc/user-guide/data-management/cloud-versioning). -DVC checks the headers returned by the server, looking for an -[HTTP ETag](https://en.wikipedia.org/wiki/HTTP_ETag) or a -[Content-MD5](https://tools.ietf.org/html/rfc1864) header, and uses it to -determine whether the source has changed and we need to download the file again. + -
+To define files or directories in an external location as outputs, give +their remote URLs or external paths to `dvc stage add -O`, or put them in +`dvc.yaml` (`outs` field). For supported external output types and expected URL +formats, see the examples above for +[external dependencies](#how-external-dependencies-work). -## Example: Imports +### Example -`dvc import` can download a file or directory from any DVC project, -or from a Git repository. It also creates an external dependency in its import -`.dvc` file. +Let's take a look at defining and running an `upload_file` stage that simply +uploads a file to an external location. ```cli -$ dvc import git@github.com:iterative/example-get-started model.pkl -Importing 'model.pkl (git@github.com:iterative/example-get-started)' --> 'model.pkl' -``` - -The command above creates `model.pkl.dvc`, where the external dependency is -specified (with the `repo` field). - -
- -### Expand to see resulting `.dvc` file - -```yaml -# ... -deps: - - path: model.pkl - repo: - url: git@github.com:iterative/example-get-started - rev_lock: 6c73875a5f5b522f90b5afa9ab12585f64327ca7 -outs: - - md5: 3863d0e317dee0a55c4e59d2ec0eef33 - path: model.pkl - cache: true +$ dvc stage add -n upload_file \ + -d data.txt \ + -O s3://mybucket/data.txt \ + aws s3 cp data.txt s3://mybucket/data.txt ``` - -The `url` and `rev_lock` subfields under `repo` are used to save the origin and -[version](https://git-scm.com/docs/revisions) of the dependency, respectively. - -
From de0cafe52fdc01ed67ef6c95fbddff44f2b70656 Mon Sep 17 00:00:00 2001 From: dberenbaum Date: Mon, 29 May 2023 09:44:40 -0400 Subject: [PATCH 3/4] guide: refactor external data --- content/docs/sidebar.json | 4 +- .../importing-external-data.md | 72 ++++++++++++------- ...d => external-dependencies-and-outputs.md} | 20 +++--- 3 files changed, 61 insertions(+), 35 deletions(-) rename content/docs/user-guide/pipelines/{external-data.md => external-dependencies-and-outputs.md} (90%) diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 2f16e63b98..141b49912b 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -166,7 +166,7 @@ }, "cloud-versioning", "discovering-and-accessing-data", - "importing-external-data", + { "label": "External Data", "slug": "importing-external-data" }, "large-dataset-optimization" ] }, @@ -177,7 +177,7 @@ "defining-pipelines", "running-pipelines", "run-cache", - "external-data" + "external-dependencies-and-outputs" ] }, { diff --git a/content/docs/user-guide/data-management/importing-external-data.md b/content/docs/user-guide/data-management/importing-external-data.md index f7fd942225..6fb533622d 100644 --- a/content/docs/user-guide/data-management/importing-external-data.md +++ b/content/docs/user-guide/data-management/importing-external-data.md @@ -1,9 +1,9 @@ -# Importing External Data +# External Data To version data that lives outside of your local project, you can import it. You can choose whether to download that data and whether to push -copies of it to your [DVC remote]. This makes importing the data useful even if -you want to track the data in-place at its original source location. +copies to your [DVC remote]. This makes importing the data useful even if you +want to track the data in-place at its original source location. ## How importing external data works @@ -54,15 +54,14 @@ recover it. 
## Avoiding duplication -Uploading and downloading copies of the external data may be unnecessary and -impractical in some cases, like if your data is large or static, or you stream -it directly from its source location, or you use cloud versioning to backup old +Making copies of the external data may be unnecessary and impractical in some +cases, like if your data is too big to download locally, or you stream it +directly from its source location, or you use cloud versioning to back up old versions already. -### Skipping downloads - -You can use `--no-download` to skip the download step when you import or update -the data: +Use `--no-download` to skip the download step when you import or update the +data. DVC will save the metadata in `data.xml.dvc` but won't download `data.xml` +locally: ```cli $ dvc import-url --no-download https://data.dvc.org/get-started/data.xml @@ -72,23 +71,46 @@ $ ls data.xml.dvc ``` -If you don't have time or space to download the data but still want to make a -backup of the data on your [DVC remote] to be able to recover a copy later, you -can instead use `--to-remote`, which will upload the data to remote storage -without saving a local copy. +To recover this version of the data later, use `dvc pull`, and DVC will try to +download it from its original source location. However, if you have overwritten +the original source data, `dvc pull` may fail. To version the data so you can +recover any version, either push the data to the [DVC remote] or use [cloud +versioning]. + +### Example: Push to remote + +`dvc import-url --to-remote` will not download the data locally but will push +the data to the [DVC remote]: + +```cli +$ dvc import-url --to-remote https://data.dvc.org/get-started/data.xml + +$ ls +data.xml.dvc + +$ dvc push +Everything is up to date. 
+``` + +### Example: Cloud versioning + +If you are importing from a supported [cloud versioning] provider, +`dvc import-url --no-download --version-aware` will not download the data +locally but will track the cloud provider's version IDs for the data. `dvc pull` +will try to download those version IDs as long as they are available. `dvc push` +will not upload anything because DVC assumes the versions are available at the +source location: -### Skipping uploads +```cli +$ dvc import-url --no-download --version-aware s3://myversionedbucket/data.xml +Importing 's3://myversionedbucket/data.xml' -> 'data.xml' -You can also skip pushing the data to the [DVC remote], and DVC will try to -recover the data from its source location. However, if you don't push the data -and the source location has changed, you may be unable to recover the data. +$ ls +data.xml.dvc -[Cloud versioning](/doc/user-guide/data-management/cloud-versioning) enables you -to recover data without having to push a copy to DVC remote storage. If you have -cloud versioning enabled for the source location, you can import it with -`--version-aware`. DVC will track the version ID of all imported files and be -able to recover them from source as long as those versions remain in the source -location. DVC will also know to skip uploading these files during `dvc push` -since it assumes they are available from the source location. +$ dvc push +Everything is up to date. 
+``` [dvc remote]: /doc/user-guide/data-management/remote-storage +[cloud versioning]: /doc/user-guide/data-management/cloud-versioning diff --git a/content/docs/user-guide/pipelines/external-data.md b/content/docs/user-guide/pipelines/external-dependencies-and-outputs.md similarity index 90% rename from content/docs/user-guide/pipelines/external-data.md rename to content/docs/user-guide/pipelines/external-dependencies-and-outputs.md index cf619b801c..a60d1bb374 100644 --- a/content/docs/user-guide/pipelines/external-data.md +++ b/content/docs/user-guide/pipelines/external-dependencies-and-outputs.md @@ -1,4 +1,4 @@ -# External Data +# External Dependencies and Outputs Sometimes you need to stream your data dependencies directly from their source locations outside your local project, or stream your data outputs @@ -27,8 +27,8 @@ field). Use the same format as the `url` of of the following supported Let's take a look at defining and running a `download_file` stage that simply downloads a file from an external location, on all the supported location types. -> See the [Remote alias example](#example-using-dvc-remote-aliases) for info. on -> using remote locations that require manual authentication setup. +> See the [Remote alias example](#using-dvc-remote-aliases) for info. on using +> remote locations that require manual authentication setup.
@@ -136,16 +136,18 @@ $ dvc stage add -n download_file \
-#### Example: Using DVC remote aliases +
+ +#### Using DVC remote aliases You may want to encapsulate external locations as configurable entities that can be managed independently. This is useful if the connection requires authentication, if multiple dependencies (or stages) reuse the same location, or if the URL is likely to change in the future. -[DVC remotes][remote storage] can do just this. You may use `dvc remote add` to -define them, and then use a special URL with format -`remote://{remote_name}/{path}` (remote alias) to define the external +[DVC remotes](/doc/user-guide/data-management/remote-storage) can do just this. +You may use `dvc remote add` to define them, and then use a special URL with +format `remote://{remote_name}/{path}` (remote alias) to define the external dependency. Let's see an example using SSH. First, register and configure the remote: @@ -167,6 +169,8 @@ $ dvc stage add -n download_file \ wget https://example.com/data.txt -O data.txt ``` +
+ ## How external outputs work External outputs will be tracked by DVC, detecting when they @@ -177,7 +181,7 @@ change, but not saved in the cache for Saving external outputs to an external cache has been deprecated in DVC 3.0. -Please bear with us as we work on versioning external outputs using +Stay tuned as we work on versioning external outputs using [cloud versioning](/doc/user-guide/data-management/cloud-versioning). From 41b310fa0b9dbd746be09853ed6b8da48929bca8 Mon Sep 17 00:00:00 2001 From: dberenbaum Date: Wed, 7 Jun 2023 15:34:12 -0400 Subject: [PATCH 4/4] drop all external outputs --- content/docs/command-reference/add.md | 64 +++---------------- content/docs/command-reference/destroy.md | 64 +------------------ content/docs/command-reference/stage/add.md | 21 +++--- content/docs/command-reference/version.md | 2 - .../user-guide/how-to/share-a-dvc-cache.md | 5 +- .../pipelines/defining-pipelines.md | 19 ------ .../project-structure/configuration.md | 36 ----------- .../user-guide/project-structure/dvc-files.md | 10 +-- 8 files changed, 29 insertions(+), 192 deletions(-) diff --git a/content/docs/command-reference/add.md b/content/docs/command-reference/add.md index a751562f54..5d851216d9 100644 --- a/content/docs/command-reference/add.md +++ b/content/docs/command-reference/add.md @@ -6,7 +6,7 @@ file. ## Synopsis ```usage -usage: dvc add [-h] [-q | -v] [-R] [--no-commit] [--external] +usage: dvc add [-h] [-q | -v] [-R] [--no-commit] [--glob] [--file ] [-o ] [--to-remote] [-r ] [-j ] [-f] [--desc ] [--meta key=value] [--label ] @@ -16,6 +16,14 @@ positional arguments: targets Files or directories to add ``` +
+ +### Options deprecated in 3.0 + +- `--external` + +
+ ## Description The `dvc add` command is analogous to `git add`, in that it makes DVC aware of @@ -149,21 +157,9 @@ not. specified in `targets`. Shell style wildcards supported: `*`, `?`, `[seq]`, `[!seq]`, and `**` -- `--external` - allow tracking `targets` outside of the DVC repository - in-place. See [Managing External Data]. - - - - Note that this is an advanced feature for very specific situations and not - recommended except if there's absolutely no other alternative. Additionally, - this typically requires an external cache setup (see link above). - - - - `-o `, `--out ` - specify a `path` to the desired location in the workspace to place the `targets` (copying them from their current location). - This enables targeting data outside the project (see an - [example](#example-transfer-to-an-external-cache)). + This enables targeting data outside the project. - `--to-remote` - add a target that's outside the project, neither move it into the workspace, nor cache it. @@ -199,7 +195,6 @@ not. - `-v`, `--verbose` - displays detailed tracing information. [pattern]: https://docs.python.org/3/library/glob.html -[managing external data]: /doc/user-guide/data-management/managing-external-data ## Example: Single file @@ -360,45 +355,6 @@ $ tree .dvc/cache Only the hash values of the `dir/` directory (with `.dir` file extension) and `file2` have been cached. -## Example: Transfer to an external cache - -When you want to add a large dataset that is outside of your -project (e.g. online), you would normally need to download or copy -it into the workspace first. But you may not have enough local -storage space. - -You can however set up an [external cache] that can handle the data. To avoid -ever making a local copy, target the outside data with `dvc add` while -specifying an `--out` (`-o`) path inside of your project. This way the data will -be transferred to the cache directly, and then [linked] into your -workspace. 
- -Let's add a `data.xml` file via HTTP, putting it in `./data.xml`: - -```cli -$ dvc add https://data.dvc.org/get-started/data.xml -o data.xml -... -$ ls -data.xml data.xml.dvc -``` - -The resulting `.dvc` file will save the provided local `path` as if the data was -already in the workspace, while the `md5` hash points to the copy of the data -that has now been transferred to the cache. Let's check the -contents of `data.xml.dvc` in this case: - -```yaml -outs: - - md5: a304afb96060aad90176268345e10355 - nfiles: 1 - path: data.xml -``` - -[linked]: - /doc/user-guide/data-management/large-dataset-optimization#file-link-types-for-the-dvc-cache -[external cache]: - /doc/user-guide/data-management/managing-external-data#setting-up-an-external-cache - ## Example: Transfer to remote storage Sometimes there's not enough space in the local environment to import a large diff --git a/content/docs/command-reference/destroy.md b/content/docs/command-reference/destroy.md index 922425901b..7c2b44e576 100644 --- a/content/docs/command-reference/destroy.md +++ b/content/docs/command-reference/destroy.md @@ -15,18 +15,12 @@ usage: dvc destroy [-h] [-q | -v] [-f] `dvc destroy` removes `dvc.yaml`, `.dvc` files, and the internal `.dvc/` directory from the project. -Note that the cache directory will be removed as well, unless it's -set to an -[external location](/doc/user-guide/data-management/managing-external-data#setting-up-an-external-cache) -(by default a local cache is located in `.dvc/cache`). If you have setup -[symlinks](/doc/user-guide/data-management/large-dataset-optimization) (from -cache to workspace) in your project, DVC will replace them with the latest +Note that the cache directory will be removed as well. 
If you have +set up [symlinks](/doc/user-guide/data-management/large-dataset-optimization) +(from cache to workspace) in your project, DVC will replace them with the latest versions of the actual files and directories first, so that your data is intact after destruction. -[external cache]: - /doc/user-guide/data-management/managing-external-data#setting-up-an-external-cache - > Refer to [Project Structure](/doc/user-guide/project-structure) for more > details on the directories and files deleted by this command. @@ -60,55 +54,3 @@ $ ls -a .git code.py foo ``` - -## Example: Preserve an external cache directory - -By default, the cache location is `.dvc/cache`. Let's change its -location to `/mnt/cache` using `dvc cache dir`, add some data, and then try -`dvc destroy`: - -```cli -$ dvc cache dir /mnt/cache -$ echo foo > foo -$ dvc add foo -``` - -Contents of the workspace: - -```cli -$ ls -a -.dvc .git code.py foo foo.dvc -``` - -Contents of the (external) cache (`b1/946a...` contains `foo`): - -```cli -$ tree /mnt/cache -/mnt/cache/ -└── b1 - └── 946ac92492d2347c6235b4d2611184 -``` - -OK, let's destroy the DVC project: - -```cli -$ dvc destroy - -This will destroy all information about your pipelines, all data files... -Are you sure you want to continue? [y/n] -yes - -$ ls -a -.git code.py foo -``` - -`foo.dvc` and the internal `.dvc/` directory were removed (this would include -any cached data prior to changing the cache location). But the cache files in -`/mnt/cache` persist: - -```cli -$ tree /mnt/cache -/mnt/cache/ -└── b1 - └── 946ac92492d2347c6235b4d2611184 -``` diff --git a/content/docs/command-reference/stage/add.md b/content/docs/command-reference/stage/add.md index 4756d4af2f..b38118a0e1 100644 --- a/content/docs/command-reference/stage/add.md +++ b/content/docs/command-reference/stage/add.md @@ -8,7 +8,7 @@ Helper command to create or update stages in `dvc.yaml`. 
usage: dvc stage add [-h] [-q | -v] -n [-f] [-d ] [-p [:]] [-o ] [-O ] [-c ] - [--external] [--outs-persist ] + [--outs-persist ] [--outs-persist-no-cache ] [-m ] [-M ] [--plots ] [--plots-no-cache ] @@ -20,6 +20,14 @@ positional arguments: command Command to execute ``` +
+ +### Options deprecated in 3.0 + +- `--external` + +
+ ## Description Writes stage definitions to `dvc.yaml` (in the current working directory). To @@ -84,8 +92,8 @@ is reproduced (see also `dvc gc`). Relevant notes: which generates a single `.dir` entry in the cache (refer to [Structure of cache directory] for more info.) -- [external dependencies] and [external outputs] (outside of the - workspace) are also supported (except metrics and plots). +- [external dependencies and outputs] (outside of the workspace) + are also supported (except metrics and plots). - Since outputs are deleted from the workspace before executing stage commands, the underlying code should create any directory structures @@ -102,8 +110,8 @@ is reproduced (see also `dvc gc`). Relevant notes: /docs/user-guide/how-to/add-deps-or-outs-to-a-stage [structure of cache directory]: /doc/user-guide/project-structure/internal-files#structure-of-the-cache-directory -[external dependencies]: /doc/user-guide/external-dependencies -[external outputs]: /doc/user-guide/managing-external-data +[external dependencies and outputs]: + /doc/user-guide/pipelines/external-dependencies-and-outputs [manual process]: /doc/command-reference/move#renaming-stage-outputs ### For displaying and comparing data science experiments @@ -209,9 +217,6 @@ data science experiments. `always_changed` field in `dvc.yaml`). As a result DVC will always execute it when reproducing the pipeline. -- `--external` - allow writing outputs outside of the DVC repository. See - [Managing External Data](/doc/user-guide/data-management/managing-external-data). - - `--desc ` - user description of the stage (optional). This doesn't affect any DVC operations. 
diff --git a/content/docs/command-reference/version.md b/content/docs/command-reference/version.md index f0829a65c8..0978b8f3f4 100644 --- a/content/docs/command-reference/version.md +++ b/content/docs/command-reference/version.md @@ -19,7 +19,6 @@ usage: dvc version [-h] [-q | -v] | `Supports` | Types of [remote storage] supported by the current DVC setup (their required dependencies are installed) | | `Cache types` | [Types of links] supported (between workspace and cache) | | `Cache directory` | Filesystem type (e.g. ext4, FAT, etc.) and drive on which the cache directory is mounted | -| `Caches` | Cache [location types] configured in the repo (e.g. local, SSH, S3, etc.) | | `Remotes` | Remote [location types][remote storage] configured in the repo (e.g. SSH, S3, Google Drive, etc.) | | `Workspace directory` | Filesystem type (e.g. ext4, FAT, etc.) and drive on which the workspace is mounted | | `Repo` | Shows whether we are in a DVC repo and/or Git repo | @@ -27,7 +26,6 @@ usage: dvc version [-h] [-q | -v] [remote storage]: /doc/user-guide/data-management/remote-storage [types of links]: /doc/user-guide/data-management/large-dataset-optimization#file-link-types-for-the-dvc-cache -[location types]: /doc/user-guide/data-management/managing-external-data > No info about `Cache` or `Workspace directory` is printed if `dvc version` is > used outside a DVC project. diff --git a/content/docs/user-guide/how-to/share-a-dvc-cache.md b/content/docs/user-guide/how-to/share-a-dvc-cache.md index 2111eb5338..8caa279233 100644 --- a/content/docs/user-guide/how-to/share-a-dvc-cache.md +++ b/content/docs/user-guide/how-to/share-a-dvc-cache.md @@ -59,9 +59,8 @@ $ sudo chown -R myuser:ourgroup /home/shared/dvc-cache/ ## Configure the shared cache A cache directory outside the workspace is called an -[external cache](/doc/user-guide/data-management/managing-external-data#setting-up-an-external-cache). 
-Set it to the directory we created earlier with `dvc cache dir` and configure it -with `dvc config cache`: +external cache. Set it to the directory we created earlier with `dvc cache dir` +and configure it with `dvc config cache`: ```cli $ dvc cache dir /home/shared/dvc-cache diff --git a/content/docs/user-guide/pipelines/defining-pipelines.md b/content/docs/user-guide/pipelines/defining-pipelines.md index 7d77fbc211..f6bfdcbccc 100644 --- a/content/docs/user-guide/pipelines/defining-pipelines.md +++ b/content/docs/user-guide/pipelines/defining-pipelines.md @@ -166,25 +166,6 @@ File system-level dependencies are defined in the `deps` field of `dvc.yaml` stages; Alternatively, using the `--deps` (`-d`) option of `dvc stage add` (see the previous section's example). -
- -### External dependencies: click to learn more. - -A less common kind of dependency is a _URL dependency_. Instead of files in a -local disk, you can `dvc import` data from another DVC project (for -example hosted on GitHub). External dependencies establish relationships between -different projects or systems (see `dvc import-url`). -[Get all the details](/doc/user-guide/data-management/importing-external-data). - - - -DVC will use special methods to check whether the contents of an URL have -changed for the purpose of stage invalidation. - - - -
- ## Parameter dependencies A more granular type of dependency is the parameter (`params` field of diff --git a/content/docs/user-guide/project-structure/configuration.md b/content/docs/user-guide/project-structure/configuration.md index ce31891a09..ccf868c873 100644 --- a/content/docs/user-guide/project-structure/configuration.md +++ b/content/docs/user-guide/project-structure/configuration.md @@ -211,42 +211,6 @@ section): [sharing a cache]: /doc/user-guide/how-to/share-a-dvc-cache [`os.umask`]: https://docs.python.org/3/library/os.html#os.umask -The following parameters allow setting an [external cache] location. A -`dvc remote` name is used (instead of the URL) because often it's necessary to -configure authentication or other connection settings, and configuring a remote -is the way that can be done. - -[external cache]: - /doc/user-guide/data-management/managing-external-data#setting-up-an-external-cache - -- `cache.local` - name of a [local remote] to use as external cache. This will - overwrite the value in `cache.dir` (see `dvc cache dir`). - -- `cache.s3` - name of an Amazon S3 remote to use as external cache. - -- `cache.gs` - name of a Google Cloud Storage remote to use as external cache. - -- `cache.ssh` - name of an SSH remote to use as external cache. - -- `cache.hdfs` - name of an HDFS remote to use as external cache. - -- `cache.webhdfs` - name of an HDFS remote with WebHDFS enabled to use as - external cache. - - - - Avoid using the same [remote storage] used for `dvc push` and `dvc pull` as - external cache, because it may cause file hash overlaps: the hash of an - external output could collide with that of a local file with - different content. - - [remote storage]: /doc/user-guide/data-management/remote-storage - - - -[local remote]: - /doc/user-guide/data-management/remote-storage#file-systems-local-remotes -
diff --git a/content/docs/user-guide/project-structure/dvc-files.md b/content/docs/user-guide/project-structure/dvc-files.md index 9b712e5c20..7eb706cbfc 100644 --- a/content/docs/user-guide/project-structure/dvc-files.md +++ b/content/docs/user-guide/project-structure/dvc-files.md @@ -7,14 +7,6 @@ locally. See [Data Versioning] for more info. [data versioning]: /doc/start/data-management/data-versioning - - -\* Certain -[external locations](/doc/user-guide/data-management/managing-external-data) are -also supported. - - - Files ending with the `.dvc` extension ("dot DVC file") are created by these commands as data placeholders that can be versioned with Git. They contain the information needed to track the target data over time. Here's an example: @@ -80,7 +72,7 @@ The following subfields may be present under `outs` entries: | `push` | Whether or not this file or directory, when previously cached, is uploaded to remote storage by `dvc push` (`true` by default). | [etag]: https://en.wikipedia.org/wiki/HTTP_ETag#Strong_and_weak_validation -[external outputs]: /doc/user-guide/data-management/managing-external-data +[external outputs]: /doc/user-guide/pipelines/external-dependencies-and-outputs [cloud versioning]: /doc/user-guide/data-management/cloud-versioning ## Dependency entries