diff --git a/content/docs/sidebar.json b/content/docs/sidebar.json index 7663b39c47..ff700adeb2 100644 --- a/content/docs/sidebar.json +++ b/content/docs/sidebar.json @@ -40,52 +40,15 @@ "experiments" ] }, - { - "slug": "tutorials", - "source": "tutorials/index.md", - "children": [ - { - "slug": "interactive", - "label": "Interactive Tutorials" - }, - { - "slug": "versioning", - "label": "Data Versioning", - "tutorials": { - "katacoda": "https://katacoda.com/dvc/courses/tutorials/versioning" - } - }, - { - "slug": "pipelines", - "label": "Stages and Pipelines" - }, - { - "slug": "deep", - "label": "Deep Dive Tutorial", - "source": "deep/index.md", - "children": [ - "preparation", - { - "label": "Define ML Pipeline", - "slug": "define-ml-pipeline" - }, - "reproducibility", - "sharing-data" - ] - }, - { - "slug": "community", - "label": "Community Tutorials" - } - ] - }, { "slug": "use-cases", "source": "use-cases/index.md", "children": [ { "label": "Versioning Data & Model Files", - "slug": "versioning-data-and-model-files" + "slug": "versioning-data-and-model-files", + "source": "versioning-data-and-model-files/index.md", + "children": ["tutorial"] }, { "label": "Sharing Data & Model Files", diff --git a/content/docs/tutorials/community.md b/content/docs/tutorials/community.md deleted file mode 100644 index a63edca371..0000000000 --- a/content/docs/tutorials/community.md +++ /dev/null @@ -1,19 +0,0 @@ -# Community Tutorials - -Blog posts from the community which show how they use DVC in their ML scenarios: - -- [A walkthrough of DVC](https://blog.codecentric.de/en/2019/03/walkthrough-dvc/) - -- [DVC dependency management](https://blog.codecentric.de/en/2019/08/dvc-dependency-management/) - -- [Remote training with GitLab-CI and DVC](https://blog.codecentric.de/en/2020/01/remote-training-gitlab-ci-dvc/) - -- [Creating reproducible data science workflows with DVC](https://medium.com/y-data-stories/creating-reproducible-data-science-workflows-with-dvc-3bf058e9797b) - 
-- [Using DVC to create an efficient version control system for data projects](https://medium.com/qonto-engineering/using-dvc-to-create-an-efficient-version-control-system-for-data-projects-96efd94355fe) - -- [How to use data version control (DVC) in a machine learning project](https://towardsdatascience.com/how-to-use-data-version-control-dvc-in-a-machine-learning-project-a78245c0185) - -- [My first try at DVC](https://stdiff.net/MB2019051301.html) - -- [Effective Management of your Machine Learning Laboratory](https://www.linkedin.com/pulse/effective-management-your-machine-learning-laboratory-ulaganathan/) diff --git a/content/docs/tutorials/deep/define-ml-pipeline.md b/content/docs/tutorials/deep/define-ml-pipeline.md deleted file mode 100644 index 80eb2149fc..0000000000 --- a/content/docs/tutorials/deep/define-ml-pipeline.md +++ /dev/null @@ -1,389 +0,0 @@ -# Define ML Pipeline - -## Get data file - -To include a data file into your data science environment, you need to copy the -file into the repository. We'll create a `data/` directory for the data files -and download a 40MB data archive into this directory. - -```dvc -$ mkdir data -$ wget -P data https://data.dvc.org/tutorial/nlp/100K/Posts.xml.zip -$ du -sh data/* - 41M data/Posts.xml.zip -``` - -At this time, `data/Posts.xml.zip` is a regular (untracked) file. We can track -it with DVC using `dvc add` (see below). After executing the command you will -see a new file `data/Posts.xml.zip.dvc` and a change in `data/.gitignore`. Both -of these files have to be committed to the repository. - -```dvc -$ dvc add data/Posts.xml.zip -$ du -sh data/* - 41M data/Posts.xml.zip -4.0K data/Posts.xml.zip.dvc - -$ git status -s data/ -?? data/.gitignore -?? data/Posts.xml.zip.dvc - -$ git add . -$ git commit -m "add raw dataset" -``` - -You may have noticed that the actual data file was not committed to the Git -repo. 
The reason is that DVC included it in `data/.gitignore`, so that Git -ignores this data file from now on. - -> DVC will always exclude data files from the Git repository by listing them in -> `.gitignore`. - -Refer to -[Versioning Data and Model Files](/doc/use-cases/versioning-data-and-model-files), -`dvc add`, and `dvc run` for more information on storing and versioning data -files with DVC. - -Note that to modify or replace a data file tracked by DVC, you may need to run -`dvc unprotect` or `dvc remove` first (see the -[Update Tracked File](/doc/user-guide/updating-tracked-files) guide). To rename -or move it, you can use `dvc move`. - -## Data file internals - -If you take a look at the [DVC-file](/doc/user-guide/dvc-files-and-directories) -created by `dvc add`, you will see that outputs are tracked in the -`outs` field. In this file, only one output is specified. The output contains -the data file path in the repository and its MD5 hash. This hash value -determines the location of the actual content file in the -[cache directory](/doc/user-guide/dvc-files-and-directories#structure-of-cache-directory), -`.dvc/cache`. - -```dvc -$ cat data/Posts.xml.zip.dvc -md5: 7559eb45beb7e90f192e836be8032a64 -outs: -- cache: true - md5: ec1d2935f811b77cc49b031b999cbf17 - path: Posts.xml.zip - -$ du -sh .dvc/cache/ec/* - 41M .dvc/cache/ec/1d2935f811b77cc49b031b999cbf17 -``` - -> Outputs from DVC-files define the relationship between the data file path in a -> repository and the path in the cache directory. - -Keeping actual file contents in the cache, and a copy of the cached -file in the workspace during `$ git checkout` is a regular trick -that [Git-LFS](https://git-lfs.github.com/) (Git for Large File Storage) uses. -This trick works fine for tracking small files with source code. 
For large data -files, this might not be the best approach, because of _checkout_ operation for -a 10Gb data file might take several seconds and a 50GB file checkout (think -copy) might take a few minutes. - -DVC was designed with large data files in mind. This means gigabytes or even -hundreds of gigabytes in file size. Instead of copying files from cache to -workspace, DVC can create reflinks or other file link types. - -> When reflinks are not supported by the file system, DVC defaults to copying -> files, which doesn't optimize file storage. However, it's easy to enable other -> file link types on most systems. See -> [File link types](/doc/user-guide/large-dataset-optimization#file-link-types-for-the-dvc-cache) -> for more information. - -Creating file links is a quick file system operation. So, with DVC you can -easily checkout a few dozen files of any size. A file link prevents you from -using twice as much space in the hard drive. Even if each of the files contains -41MB of data, the overall size of the repository is still 41MB. Both of the -files correspond to the same `inode` (a file metadata record) in the file -system. Refer to -[Large Dataset Optimization](/doc/user-guide/large-dataset-optimization) for -more details. - -> Note that in systems supporting reflinks, use the `df` command to confirm that -> free space on the drive didn't decline by the file size that we are adding, so -> no duplication takes place. `du` may be inaccurate with reflinks. - -```dvc -$ ls -i data/Posts.xml.zip -78483929 data/Posts.xml.zip - -$ ls -i .dvc/cache/ec/ -78483929 88519f8465218abb23ce0e0e8b1384 - -$ du -sh . - 41M . -``` - -Note that `ls -i` prints the index number(78483929) of each file and inode for -`data/Posts.xml.zip` and `.dvc/cache/ec/88519f8465218abb23ce0e0e8b1384` remained -same. - -## Running commands - -Once the data files are in the workspace, you can start processing the data and -train ML models out of the data files. 
DVC helps you to define -[stages](/doc/command-reference/run) of your ML process and easily connect them -into a ML [pipeline](/doc/command-reference/pipeline). - -`dvc run` executes any command that you pass it as a list of parameters. -However, the command to run alone is not as interesting as its role within a -larger data pipeline, so we'll need to specify its dependencies and -outputs. We call all this a pipeline _stage_. Dependencies may -include input files or directories, and the actual command to run. Outputs are -files written to by the command, if any. - -- Option `-d in.tsv` specifies a dependency file or directory. The dependency - can be a regular file from a repository or a data file. - -- `-o out.dat` (lower case o) specifies an output data file. DVC will track this - data file by creating a corresponding - [DVC-file](/doc/user-guide/dvc-files-and-directories) (as if running - `dvc add out.dat` after `dvc run` instead). - -- `-O tmp.dat` (upper case O) specifies a simple output file (not to be added to - DVC). - -It's important to specify dependencies and outputs before the command to run -itself. - -Let's see how an extraction command `unzip` works under DVC, for example: - -```dvc -$ dvc run -d data/Posts.xml.zip -o data/Posts.xml \ - unzip data/Posts.xml.zip -d data/ - -Running command: - unzip data/Posts.xml.zip -d data/ -Archive: data/Posts.xml.zip - inflating: data/Posts.xml -Saving information to 'Posts.xml.dvc'. - -To track the changes with git run: - - git add data/.gitignore Posts.xml.dvc - -$ du -sh data/* - -145M data/Posts.xml -41M data/Posts.xml.zip -4.0K data/Posts.xml.zip.dvc -``` - -In these commands, option `-d` specifies an output directory for the tar -command. `-d data/Posts.xml.zip` defines the input file and `-o data/Posts.xml` -the resulting extracted data file. - -The `unzip` command extracts data file `data/Posts.xml.zip` to a regular file -`data/Posts.xml`. It knows nothing about data files or DVC. 
DVC executes the -command and does some additional work if the command was successful: - -1. DVC transforms all the outputs (`-o` option) into tracked data files (similar - to using `dvc add` for each of them). As a result, all the actual data - contents go to the cache directory `.dvc/cache`, and each of the - file names will be added to `.gitignore`. - -2. For reproducibility purposes, `dvc run` creates the `Posts.xml.dvc` stage - file in the project with information about this pipeline stage. - (See [DVC Files](/doc/user-guide/dvc-files-and-directories)). Note that the - name of this file could be specified by using the `-f` option, for example - `-f extract.dvc`. - -Let's take a look at the resulting stage file created by `dvc run` above: - -```dvc -$ cat Posts.xml.dvc - -cmd: ' unzip data/Posts.xml.zip -d data/' -deps: -- md5: ec1d2935f811b77cc49b031b999cbf17 - path: data/Posts.xml.zip -md5: 16129387a89cb5a329eb6a2aa985415e -outs: -- cache: true - md5: c1fa36d90caa8489a317eee917d8bf03 - path: data/Posts.xml -``` - -Parent fields in the file above include: - -- `cmd`: The command to run -- `deps`: Dependencies with MD5 hashes -- `outs`: Outputs with MD5 hashes - -And (as with the `dvc add` command) the `data/.gitignore` file was modified. Now -it includes the command output file, `Posts.xml`. - -```dvc -$ git status -s - M data/.gitignore -?? Posts.xml.dvc - -$ cat data/.gitignore -Posts.xml.zip -Posts.xml -``` - -The output file `Posts.xml` was transformed by DVC into a data file in -accordance with the `-o` option. You can find the corresponding cache file with -the hash value, as a path starting in `c1/fa36d`: - -```dvc -$ ls .dvc/cache/ -2f/ a8/ - -$ du -sh .dvc/cache/c1/* .dvc/cache/ec/* - 41M .dvc/cache/ec/1d2935f811b77cc49b031b999cbf17 -145M .dvc/cache/c1/fa36d90caa8489a317eee917d8bf03 - -$ du -sh . -186M . -``` - -Let's commit the result of the `unzip` command. This will be the first stage of -our ML pipeline. - -```dvc -$ git add . 
-$ git commit -m "extract data" -``` - -## Running in bulk - -A single [stage](/doc/command-reference/run) of our ML pipeline was created and -committed into repository. It isn't necessary to commit stages right after their -creation. You can create a few and commit them with Git together later. - -Let's create the following stages: converting an XML file to TSV, and then -separating training and testing datasets: - -```dvc -$ dvc run -d data/Posts.xml -d code/xml_to_tsv.py -d code/conf.py \ - -o data/Posts.tsv \ - python code/xml_to_tsv.py -Using 'Posts.tsv.dvc' as a stage file -Reproducing 'Posts.tsv.dvc': - python code/xml_to_tsv.py - -$ dvc run -d data/Posts.tsv -d code/split_train_test.py \ - -d code/conf.py \ - -o data/Posts-test.tsv -o data/Posts-train.tsv \ - python code/split_train_test.py 0.33 20180319 -Using 'Posts-test.tsv.dvc' as a stage file -Reproducing 'Posts-test.tsv.dvc': - python code/split_train_test.py 0.33 20180319 -Positive size 2049, negative size 97951 -``` - -The result of the commands above are two -[stage files](/doc/command-reference/run) corresponding to each of the commands, -`Posts-test.tsv.dvc` and `Posts.tsv.dvc`. Also, a `code/conf.pyc` file was -created by the command itself. There's no need track this output file with Git. -Let's manually include this type of file into `.gitignore`. - -```dvc -$ git status -s - M data/.gitignore -?? Posts-test.tsv.dvc -?? Posts.tsv.dvc -?? code/conf.pyc - -$ echo "*.pyc" >> .gitignore -``` - -As mentioned before, both of stage files can be committed to the repository -together: - -```dvc -$ git add . -$ git commit -m "Process to TSV and separate test and train" -``` - -Let's run and save the following commands for our pipeline. 
First, define the -feature extraction stage, that takes `train` and `test` TSV files and generates -corresponding matrix files: - -```dvc -$ dvc run -d code/featurization.py -d code/conf.py \ - -d data/Posts-train.tsv -d data/Posts-test.tsv \ - -o data/matrix-train.p -o data/matrix-test.p \ - python code/featurization.py -Using 'matrix-train.p.dvc' as a stage file -Reproducing 'matrix-train.p.dvc': - python code/featurization.py -The input data frame data/Posts-train.tsv size is (66999, 3) -The output matrix data/matrix-train.p size is (66999, 5002) and data type is float64 -The input data frame data/Posts-test.tsv size is (33001, 3) -The output matrix data/matrix-test.p size is (33001, 5002) and data type is float64 -``` - -Train a model using the train matrix file: - -```dvc -$ dvc run -d data/matrix-train.p -d code/train_model.py \ - -d code/conf.py -o data/model.p \ - python code/train_model.py 20180319 -Using 'model.p.dvc' as a stage file -Reproducing 'model.p.dvc': - python code/train_model.py 20180319 -Input matrix size (66999, 5002) -X matrix size (66999, 5000) -Y matrix size (66999,) -``` - -And evaluate the result of the trained model using the test feature matrix: - -```dvc -$ dvc run -d data/model.p -d data/matrix-test.p \ - -d code/evaluate.py -d code/conf.py -M data/eval.txt \ - -f Dvcfile \ - python code/evaluate.py -Reproducing 'Dvcfile': - python code/evaluate.py -``` - -The model evaluation stage is the last one for this tutorial. To help in the -pipeline's reproducibility, we use stage file name `Dvcfile`. (This will be -discussed in more detail in the next chapter.) - -Note that the output file `data/eval.txt` was transformed by DVC -into a [metric](/doc/command-reference/metrics) file in accordance with the `-M` -option. - -The result of the last three `dvc run` commands execution is three stage files -and a modified .gitignore file. Let's commit all the changes with Git: - -```dvc -$ git status -s - M data/.gitignore -?? Dvcfile -?? 
data/eval.txt -?? matrix-train.p.dvc -?? model.p.dvc - -$ git add . -$ git commit -m Evaluate -``` - -The output of the evaluation stage contains the target value in a simple text -form: - -```dvc -$ cat data/eval.txt -AUC: 0.624652 -``` - -You can also show the metrics using the `DVC metrics` command: - -```dvc -$ dvc metrics show -data/eval.txt:AUC: 0.624652 -``` - -> We get that this is probably not the best AUC that you have seen! In this -> document, our focus is DVC, not ML modeling, so we use a relatively small -> dataset without any advanced ML techniques. - -In the next chapter we will try to improve the metrics by changing our modeling -code and using reproducibility in our pipeline. diff --git a/content/docs/tutorials/deep/index.md b/content/docs/tutorials/deep/index.md deleted file mode 100644 index 136cba9b60..0000000000 --- a/content/docs/tutorials/deep/index.md +++ /dev/null @@ -1,31 +0,0 @@ -# Deep Dive Tutorial - -This tutorial shows you how to solve a text classification problem using the DVC -tool. - -Today the data science community is still lacking good practices for organizing -their projects and effectively collaborating. ML algorithms and methods are no -longer simple tribal knowledge but are still difficult to implement, manage and -reuse. - -> One of the biggest challenges in reusing, and hence the managing of ML -> projects, is its reproducibility. - -DVC has been built to address the reproducibility. - -![](/img/reproducibility.png) - -Repository branches can beautifully reflect the non-linear structure common to -the ML process: each hypothesis presented as a Git branch. However, the -inability to store data in a repository and the discrepancy between code and -data make it extremely difficult to manage a data science project with Git. - -DVC streamlines large data files and binary models into a single Git environment -and this approach will not require storing binary files in your Git repository. 
- -## DVC Workflow - -The diagram below describes all the DVC commands and relationships between a -local cache and remote storage. - -![](/img/flow-large.png) diff --git a/content/docs/tutorials/deep/preparation.md b/content/docs/tutorials/deep/preparation.md deleted file mode 100644 index 6f3b593da7..0000000000 --- a/content/docs/tutorials/deep/preparation.md +++ /dev/null @@ -1,94 +0,0 @@ -# Preparation - -In this document, we will be building an ML model to classify -[StackOverflow](https://stackoverflow.com) questions by two classes: with -`python` tag and without `python` tag. For training purposes, a small subset of -data will be used — only 180Mb xml files. - -Most of the code to solve this problem is ready to be downloaded. We will be -modifying some of the code during this tutorial to improve the model. - -> We have tested our tutorials and examples with Python 3. We don't recommend -> using earlier versions. - -You'll need [Git](https://git-scm.com/) to run the commands in this tutorial. -Also, if DVC is not installed, please follow these [instructions](/doc/install) -to do so. - -> If you're using Windows, please review -> [Running DVC on Windows](/doc/user-guide/running-dvc-on-windows) for important -> tips to improve your experience. - -## Getting the example code - -Take the following steps to initialize a new Git repository and get the example -code into it: - -
- -### Expand to learn how to download on Windows - -Windows doesn't include the `wget` utility by default, but you can use the -browser to download `code.zip`. (Right-click -[this link](https://code.dvc.org/tutorial/nlp/code.zip) and select -`Save Link As...` (Chrome). Save it into the project directory. - -> 💡 Please also review -> [Running DVC on Windows](/doc/user-guide/running-dvc-on-windows) for important -> tips to improve your experience using DVC on Windows. - -
- -```dvc -$ mkdir classify -$ cd classify -$ git init -$ wget https://code.dvc.org/tutorial/nlp/code.zip -$ unzip code.zip -d code && rm -f code.zip -$ git add code -$ git commit -m "download code" -``` - -Now let's install the requirements. But before we do that, we **strongly** -recommend creating a -[virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments): - -```dvc -$ virtualenv -p python3 .env -$ source .env/bin/activate -$ echo ".env/" >> .gitignore -$ pip install -r code/requirements.txt -``` - -## Initialize - -DVC works best inside Git repositories like the one we're in. Initialize DVC -with: - -At DVC initialization, a new `.dvc/` directory is created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories#internal-directories-and-files), -that are hidden from the user. This directory is automatically staged with -`git add`, so it can be easily committed with Git: - -```dvc -$ dvc init -... - -$ ls -a .dvc -. .. .gitignore config tmp - -$ git status -s -A .dvc/.gitignore -A .dvc/config -?? .gitignore - -$ git commit -am "init DVC" -``` - -The cache directory, one of the most important parts of any -DVC project, will store the content of all data files. (This is -explained in more detail in the next chapter.) Note that it won't be tracked by -Git — It's a local-only directory, and you cannot push it to a Git remote. - -For more information refer to `dvc init`. diff --git a/content/docs/tutorials/deep/reproducibility.md b/content/docs/tutorials/deep/reproducibility.md deleted file mode 100644 index 939d2df620..0000000000 --- a/content/docs/tutorials/deep/reproducibility.md +++ /dev/null @@ -1,312 +0,0 @@ -# Reproducibility - -## How does it work? - -The most exciting part of DVC is reproducibility. - -> Reproducibility is the time you are getting benefits out of DVC instead of -> spending time managing ML pipelines. - -DVC tracks all the dependencies. 
This helps you iterate on ML models faster -without thinking what was affected by your last change. - -In order to track all the dependencies, DVC finds and reads all the DVC-files in -a repository and builds a dependency graph -([pipeline](/doc/command-reference/pipeline)) based on these files. - -This is one of the differences between DVC reproducibility and software build -automation tools ([Make](https://www.gnu.org/software/make/), Maven, Ant, -Rakefile etc). It was designed in such a way to localize specification of the -graph nodes (pipeline [stages](/doc/command-reference/run)). - -If you run `repro` on any [DVC-file](/doc/user-guide/dvc-files-and-directories) -from our repository, nothing happens because nothing was changed in the pipeline -defined in the project: There's nothing to reproduce. - -```dvc -$ dvc repro model.p.dvc -``` - -> By default, `dvc repro` tries to read all stages available. - -```dvc -$ dvc repro -``` - -Tries to reproduce the same pipeline, but there is still nothing to reproduce. - -## Adding bigrams - -Our NLP model was based on [unigrams](https://en.wikipedia.org/wiki/N-gram) -only. Let's improve the model by adding bigrams. The bigrams model will extract -signals not only from separate words but also from two-word combinations. This -eventually increases the number of features for the model and hopefully improves -the target [metric](/doc/command-reference/metrics). - -Before editing the `code/featurization.py` file, please create and checkout a -new branch `bigrams`. 
- -```dvc -$ git checkout -b bigrams -# Please use your favorite text editor: -$ vi code/featurization.py -``` - -Specify `ngram` parameter in `CountVectorizer` (lines 50–53) and increase the -number of features to 6000: - -```python -bag_of_words = CountVectorizer(stop_words='english', - max_features=6000, - ngram_range=(1, 2)) -``` - -Reproduce our changed pipeline: - -```dvc -$ dvc repro - -Reproducing 'matrix-train.p.dvc': - python code/featurization.py -The input data frame data/Posts-train.tsv size is (66999, 3) -The output matrix data/matrix-train.p size is (66999, 6002) and data type is float64 -The input data frame data/Posts-test.tsv size is (33001, 3) -The output matrix data/matrix-test.p size is (33001, 6002) and data type is float64 - -Reproducing 'model.p.dvc': - python code/train_model.py 20180319 -Input matrix size (66999, 6002) -X matrix size (66999, 6000) -Y matrix size (66999,) - -Reproducing 'Dvcfile': - python code/evaluate.py -``` - -The process started with the feature creation stage because one of its -parameters was changed — the edited source code file `code/featurization.py`. -All dependent stages were executed as well. - -Let's take a look at the metrics change. The improvement is close to zero -(+0.0075% to be precise): - -```dvc -$ cat data/eval.txt -AUC: 0.624727 -``` - -This is not a great result but it gives us some information about the model. - -To compare it with the previous AUC, you can use the `dvc metrics` command: - -```dvc -$ dvc metrics show -a - -bigrams: - data/eval.txt: AUC: 0.624727 - -master: - data/eval.txt: AUC: 0.624652 -``` - -> It's convenient to keep track of information even for failed experiments. -> Sometimes a failed hypothesis gives more information than a successful one. - -Let's keep the result in the repository. Later we can find out why bigrams don't -add value to the current model and change that. - -Many DVC-files were changed. This happened due to file hash changes. 
- -```dvc -$ git status -s -M Dvcfile -M code/featurization.py -M matrix-train.p.dvc -M model.p.dvc -``` - -Now we can commit the changes: - -```dvc -$ git add . -$ git commit -m Bigrams -``` - -## Checkout code and data files - -The previous experiment was done in the 'featurization' stage and provided no -improvements. This might be caused by not having perfect model hyperparameters. -Let's try to improve the model by changing the hyperparameters. - -There is no good reason to improve the last bigrams model. Let's checkout the -original model from the master branch. - -> Note that after checking out code and DVC-files from Git, data files have to -> be checked out as well using the `dvc checkout` command. - -```dvc -$ git checkout master -$ dvc checkout -$ dvc repro -Data and pipelines are up to date. -``` - -After proper checkout, there is nothing to reproduce because all the correct -files were checked out by Git, and all data files by DVC. - -In more detail — `git checkout master` checked out the code and DVC-files. The -DVC-files from the master branch point to old (unigram based) dependencies and -outputs. `dvc checkout` command found all the DVC-files and -restored the data files based on them. - -## Tune the model - -Let's create a new branch for this new experiment. It will help you to organize -all the experiments in a repository and checkout them when needed. - -```dvc -$ git checkout -b tuning -# Please use your favorite text editor: -$ vi code/train_model.py -``` - -Increase the number of trees in the forest to 700 by changing the `n_estimators` -parameter and the number of jobs in the `RandomForestClassifier` class (line -27): - -```python -clf = RandomForestClassifier(n_estimators=700, - n_jobs=6, random_state=seed) -``` - -Only the modeling and the evaluation stage need to be reproduced. 
Just run: - -```dvc -$ dvc repro - -Reproducing 'model.p.dvc': - python code/train_model.py 20180319 -Input matrix size (66999, 5002) -X matrix size (66999, 5000) -Y matrix size (66999,) -Reproducing 'Dvcfile': - python code/evaluate.py -``` - -Validate the [metric](/doc/command-reference/metrics) and commit all the -changes. - -```dvc -$ cat data/eval.txt -AUC: 0.637561 -``` - -This seems like a good model improvement (+1.28%). Please commit all the -changes: - -```dvc -$ git add . -$ git commit -m '700 trees in the forest' -``` - -## Merge the model to master - -Now we can revisit the failing hypothesis with bigrams, which didn't provide any -model improvement even with one thousand more features. The current model with -700 trees in the forest is stronger and we might be able to get more information -using bigrams. So, let's incorporate the bigrams changes into the current model -using a regular Git merge command. - -> Git merge logic works for data files and respectively for DVC models. - -But first, let's create a branch as usual. - -```dvc -$ git checkout -b train_bigrams -$ git merge bigrams -Auto-merging model.p.dvc -CONFLICT (content): Merge conflict in model.p.dvc -Auto-merging Dvcfile -CONFLICT (content): Merge conflict in Dvcfile -Automatic merge failed; fix conflicts and then commit the result. -``` - -The merge has a few conflicts. All of the conflicts are related to file hash -mismatches in the branches. You can properly merge conflicts by prioritizing the -file hashes from the bigrams branch: that is, by removing all hashes of the -other branch. -[Here](https://help.github.com/en/articles/resolving-a-merge-conflict-using-the-command-line) -you can find a tutorial that clarifies how to do that. It is also important to -remove all automatically generated -[conflict markers](https://git-scm.com/book/en/v2/Git-Tools-Advanced-Merging#_checking_out_conflicts) -(<<<<<<<, -=======, ->>>>>>>) from `model.p.dvc` and `Dvcfile`. 
- -Another way to solve git merge conflicts is to simply replace all file hashes -with empty strings ''. The only disadvantage of this trick is that DVC will need -to recompute the output hashes. - -After resolving the conflicts you need to checkout a proper version of the data -files: - -```dvc -# Replace conflicting hashes with empty string '' -$ vi model.p.dvc -$ vi Dvcfile -$ dvc checkout -``` - -And reproduce the result: - -```dvc -$ dvc repro - -Reproducing 'model.p.dvc': - python code/train_model.py 20180319 -Input matrix size (66999, 6002) -X matrix size (66999, 6000) -Y matrix size (66999,) -Reproducing 'Dvcfile': - python code/evaluate.py -``` - -Check the target [metric](/doc/command-reference/metrics): - -```dvc -$ cat data/eval.txt -AUC: 0.640389 -``` - -The bigrams increased the target metric by 0.28% and the last change looks like -a reasonable improvement to the ML model. So let's commit the result: - -```dvc -$ git add . -$ git commit -m 'Merge bigrams into the tuned model' -``` - -Now our current branch contains the best model and it can be merged into master. - -```dvc -$ git checkout master -$ dvc checkout -$ git merge train_bigrams -Updating f5ff48c..4bd09da -Fast-forward - Dvcfile | 6 +++--- - code/featurization.py | 3 ++- - code/train_model.py | 2 +- - matrix-train.p.dvc | 6 +++--- - model.p.dvc | 6 +++--- - 5 files changed, 12 insertions(+), 11 deletions(-) -``` - -Fast-forward strategy was applied to this merge. It means that we have all the -changes in the right place and reproduction is not needed. - -```dvc -$ dvc checkout -$ dvc repro -Data and pipelines are up to date. 
-``` diff --git a/content/docs/tutorials/deep/sharing-data.md b/content/docs/tutorials/deep/sharing-data.md deleted file mode 100644 index ed2fe9a1ed..0000000000 --- a/content/docs/tutorials/deep/sharing-data.md +++ /dev/null @@ -1,66 +0,0 @@ -# Sharing Data - -## Pushing data to the cloud - -We've gone over how source code and -[DVC-files](/doc/user-guide/dvc-files-and-directories) can be shared using a Git -repository. These DVC repositories will contain all the information -needed for reproducibility, so it might be a good idea to share them with your -team using Git hosting services (such as [GitHub](https://github.com/)). - -DVC is able to push the cache to cloud storage. - -> Using shared cloud storage, a colleague can reuse ML models that were trained -> on your machine. - -First, you need to setup the remote storage for this project, that -will be stored in the project's -[config file](/doc/user-guide/dvc-files-and-directories#internal-directories-and-files). -This can be done using the CLI as shown below. - -> Note that we are using the `dvc-public` S3 bucket as an example and you don't -> have write access to it, so in order to follow the tutorial you will need to -> either create your own S3 bucket or use other types of -> [remote storage](/doc/command-reference/remote). E.g. you can set up a local -> remote as we did in the [Configure](/doc/tutorials/get-started#configure) -> chapter of _Get Started_. - -```dvc -$ dvc remote add -d upstream s3://dvc-public/remote/tutorial/nlp -$ git status -s - M .dvc/config -``` - -Then, a simple command pushes files from your cache to the cloud: - -```dvc -$ dvc push -``` - -The command does not push all cached files, but only the ones currently -referenced in the workspace. - -For example, in this tutorial 16 data files were created and only 9 will be -pushed because the rest of the data files belong to different branches like -`bigrams`. 
- -## Pulling data from the cloud - -In order to reuse your data files, a colleague can pull data the same way from -the master branch: - -```dvc -$ git clone https://github.com/iterative/example-get-started.git -$ cd example-get-started -$ dvc pull data/data.xml.dvc prepare.dvc -``` - -After running `dvc pull` above, all the data files related to the -['prepare' stage](https://github.com/iterative/example-get-started/blob/master/dvc.yaml) -in that repo should be in the right place. You can confirm this by trying to -reproduce the default goal: - -```dvc -$ dvc repro prepare -Data and pipelines are up to date. -``` diff --git a/content/docs/tutorials/index.md b/content/docs/tutorials/index.md deleted file mode 100644 index f0b649e52d..0000000000 --- a/content/docs/tutorials/index.md +++ /dev/null @@ -1,15 +0,0 @@ -# Tutorials - -Our tutorials designed to help new and intermediate users learn about DVC -interactively. - -- [Get Started](/doc/tutorials/get-started) is a step-by-step introduction into - basic DVC features. It doesn't go into much detail, but it provides links and - expandable sections to learn more. - -- When you're done, feel free to check out the other official and community - tutorials that follow. They provide similarly reproducible walkthroughs and - in-depth explanations on more advanced topics. - -Please choose from the navigation sidebar to the left, or click the `Next` -button below ↘ diff --git a/content/docs/tutorials/interactive.md b/content/docs/tutorials/interactive.md deleted file mode 100644 index 81107743da..0000000000 --- a/content/docs/tutorials/interactive.md +++ /dev/null @@ -1,14 +0,0 @@ -# Interactive Tutorials - -Interactive lessons and tutorials on [Katacoda](https://katacoda.com/dvc) that -explain the basic concepts of DVC and show how to use it in simple ML scenarios. -Try and learn DVC without having to install it locally! - -- [Get Started](https://katacoda.com/dvc/courses/get-started)
An - interactive version of the [Get Started](/doc/tutorials/get-started). - -- [Data Versioning](https://katacoda.com/dvc/courses/tutorials/versioning)
- Using DVC commands to work with multiple versions of datasets and ML models. - -- [MNIST](https://katacoda.com/dvc/courses/tutorials/mnist)
Creating a - model to classify images of hand-written digits using MNIST as the data-set. diff --git a/content/docs/tutorials/pipelines.md b/content/docs/tutorials/pipelines.md deleted file mode 100644 index 7babe6da2d..0000000000 --- a/content/docs/tutorials/pipelines.md +++ /dev/null @@ -1,399 +0,0 @@ -# Tutorial: Pipelines - -To show DVC in action, let's play with an actual machine learning scenario. -Let's explore the natural language processing -([NLP](https://en.wikipedia.org/wiki/Natural_language_processing)) problem of -predicting tags for a given StackOverflow question. For example, we want a -classifier that can predict posts about the Python language by tagging them -`python`. (This is a short version of the -[Deep Dive Tutorial](/doc/tutorials/deep).) - -In this example, we will focus on building a simple ML -[pipeline](/doc/command-reference/pipeline) that takes an archive with -StackOverflow posts and trains the prediction model and saves it as an -output. See [Get Started](/doc/tutorials/get-started) to see links -to other examples, tutorials, use cases if you want to cover other aspects of -the DVC. The pipeline itself is a sequence of transformation we apply to the -data file: - -![](/img/example-flow-2x.png) - -DVC helps to describe these transformations and capture actual data involved - -input dataset we are processing, intermediate results (useful if some -transformations take a lot of time to run), output models. This way we can -capture what data and code were used to produce a specific model in a sharable -and reproducible way. - -## Preparation - -> We have tested our tutorials and examples with Python 3. We don't recommend -> using earlier versions. - -You'll need [Git](https://git-scm.com/) to run the commands in this tutorial. -Also, if DVC is not installed, please follow these [instructions](/doc/install) -to do so. 
- -> If you're using Windows, please review -> [Running DVC on Windows](/doc/user-guide/running-dvc-on-windows) for important -> tips to improve your experience. - -Okay! Let's first download the code and set up a Git repository: - -```dvc -$ mkdir example-pipeline && cd example-pipeline -$ git init -$ dvc get https://github.com/iterative/dataset-registry \ - tutorial/nlp/pipeline.zip -$ unzip pipeline.zip -$ rm -f pipeline.zip -$ git add code/ -$ git commit -m "Download and add code to new Git repo" -``` - -> `dvc get` can download any data artifact tracked in a DVC -> repository, using the appropriate -> [remote storage](/doc/command-reference/remote). It's like `wget`, but for DVC -> or Git repos. In this case we use our -> [dataset registry](https://github.com/iterative/dataset-registry) repo as the -> data source (refer to [Data Registries](/doc/use-cases/data-registries) for -> more info.) - -Now let's install the requirements. But before we do that, we **strongly** -recommend creating a -[virtual environment](https://packaging.python.org/tutorials/installing-packages/#creating-virtual-environments): - -```dvc -$ virtualenv -p python3 .env -$ echo ".env/" >> .gitignore -$ source .env/bin/activate -$ pip install -r code/requirements.txt -``` - -Next, we will create a [pipeline](/doc/command-reference/pipeline) step-by-step, -utilizing the same set of commands that are described in the -[Data Pipelines](/doc/tutorials/get-started/data-pipelines) page of the _Get -Started_. - -> Note that its possible to define more than one pipeline in each DVC project. -> This will be determined by the interdependencies between DVC-files, mentioned -> below. - -Initialize DVC repository (run it inside your Git repo): - -```dvc -$ dvc init -$ git add . 
-$ git commit -m "Initialize DVC project" -``` - -Download an input dataset to the `data/` directory and take it under DVC -control: - -```dvc -$ mkdir data -$ dvc get https://github.com/iterative/dataset-registry \ - tutorial/nlp/Posts.xml.zip -o data/Posts.xml.zip -$ dvc add data/Posts.xml.zip -``` - -When we run `dvc add` `Posts.xml.zip`, DVC creates a -[DVC-file](/doc/user-guide/dvc-files-and-directories). - -
- -### Expand to learn about DVC internals - -At DVC initialization, a new `.dvc/` directory is created for internal -configuration and cache -[files and directories](/doc/user-guide/dvc-files-and-directories#internal-directories-and-files) -that are hidden from the user. This directory is automatically staged with -`git add`, so it can be easily committed with Git. - -Note that the DVC-file created by `dvc add` has no dependencies, a.k.a. an -_orphan stage_ (see `dvc add`): - -```yaml -md5: c183f094869ef359e87e68d2264b6cdd -wdir: .. -outs: - - md5: ce68b98d82545628782c66192c96f2d2 - path: data/Posts.xml.zip - cache: true - metric: false - persist: false -``` - -This file can be committed with Git instead of the data file itself. - -The data file `Posts.xml.zip` is linked (or copied) from -`.dvc/cache/ce/68b98d82545628782c66192c96f2d2`, and added to `.gitignore`. Even -if you remove it from the workspace, or `git checkout` a different -commit, the data is not lost if a corresponding DVC-file is committed. It's -enough to run `dvc checkout` or `dvc pull` to restore data files. - -
- -Commit the changes with Git: - -```dvc -$ git add data/.gitignore data/Posts.xml.zip.dvc -$ git commit -m "Add dataset archive to project" -``` - -## Define stages - -Each [stage](/doc/command-reference/run) – the parts of a -[pipeline](/doc/command-reference/pipeline) – is described by providing a -command to run, input data it takes and a list of outputs. DVC is -not Python or any other language specific and can wrap any command runnable via -CLI. - -The first stage is to extract XML from the archive. Note that we don't need to -run `dvc add` on `Posts.xml` below, `dvc run` saves the data automatically -(commits into the cache, tracks the file with DVC): - -```dvc -$ dvc run -d data/Posts.xml.zip \ - -o data/Posts.xml \ - -f extract.dvc \ - unzip data/Posts.xml.zip -d data -``` - -Similarly to `dvc add`, `dvc run` creates a stage file (a DVC-file with -dependencies). - -
- -### Expand to learn more about DVC internals - -Here's what the `extract.dvc` stage file looks like: - -```yaml -md5: c4280355ffe277571c1b7aa8a43d8107 -cmd: unzip data/Posts.xml.zip -d data -wdir: . -deps: - - md5: ce68b98d82545628782c66192c96f2d2 - path: data/Posts.xml.zip -outs: - - md5: a304afb96060aad90176268345e10355 - path: data/Posts.xml - cache: true - metric: false - persist: false -``` - -Just like the DVC-file we created earlier with `dvc add`, this stage file uses -`md5` hashes (that point to the cache) to describe and version -control dependencies and outputs. Output `data/Posts.xml` file is saved as -`.dvc/cache/a3/04afb96060aad90176268345e10355` and linked (or copied) to the -workspace, as well as added to `.gitignore`. - -Two things are worth noticing here. First, by analyzing dependencies and outputs -that DVC-files describe, we can restore the full series of commands (pipeline -stages) we need to apply. This is important when you run `dvc repro` to -regenerate the final or intermediate result. - -Second, hopefully it's clear by now that the actual data is stored in the -`.dvc/cache` directory, each file having a name based on an `md5` hash. This -cache is similar to Git's -[objects database](https://git-scm.com/book/en/v2/Git-Internals-Git-Objects), -but made specifically to handle large data files. - -> Note that for performance with large datasets, DVC can use file links from the -> cache to the workspace. This avoids copying actual file contents. Refer to -> [File link types](/doc/user-guide/large-dataset-optimization#file-link-types-for-the-dvc-cache) -> to learn which alternatives exist and how to enable them. - -
- -Let's convert XML into TSV to make feature extraction easier: - -```dvc -$ dvc run -d code/xml_to_tsv.py -d data/Posts.xml -o data/Posts.tsv \ - -f prepare.dvc \ - python code/xml_to_tsv.py data/Posts.xml data/Posts.tsv -``` - -Next, split training and test datasets. Here `0.2` is a test dataset split -ratio, `20170426` is a seed for randomization. There are two output files: - -```dvc -$ dvc run -d code/split_train_test.py -d data/Posts.tsv \ - -o data/Posts-train.tsv -o data/Posts-test.tsv \ - -f split.dvc \ - python code/split_train_test.py \ - data/Posts.tsv 0.2 20170426 \ - data/Posts-train.tsv data/Posts-test.tsv -``` - -Now, extract features and labels from the data. Two TSV as inputs with two -pickle matrices as outputs: - -```dvc -$ dvc run -d code/featurization.py \ - -d data/Posts-train.tsv -d data/Posts-test.tsv \ - -o data/matrix-train.pkl -o data/matrix-test.pkl \ - -f featurize.dvc \ - python code/featurization.py \ - data/Posts-train.tsv data/Posts-test.tsv \ - data/matrix-train.pkl data/matrix-test.pkl -``` - -Then, train a ML model on the training dataset. 20170426 is a seed value here: - -```dvc -$ dvc run -d code/train_model.py -d data/matrix-train.pkl \ - -o data/model.pkl \ - -f train.dvc \ - python code/train_model.py data/matrix-train.pkl \ - 20170426 data/model.pkl -``` - -Finally, evaluate the model on the test dataset and get the -[metric](/doc/command-reference/metrics) file: - -```dvc -$ dvc run -d code/evaluate.py -d data/model.pkl \ - -d data/matrix-test.pkl -M auc.metric \ - -f evaluate.dvc \ - python code/evaluate.py data/model.pkl \ - data/matrix-test.pkl auc.metric -``` - -
- -### Expand to learn more about DVC internals - -By analyzing dependencies and outputs in DVC-files, we can generate a dependency -graph: a series of commands DVC needs to execute. `dvc repro` does this in order -to restore a pipeline and reproduce its intermediate or final results. - -`dvc pipeline show` helps to visualize pipelines (run it with the `-c` option to -see actual commands instead of DVC-files): - -```dvc -$ dvc pipeline show --ascii evaluate.dvc - +------------------------+ - | data/Posts.xml.zip.dvc | - +------------------------+ - * - * - * - +-------------+ - | extract.dvc | - +-------------+ - * - * - * - +-------------+ - | prepare.dvc | - +-------------+ - * - * - * - +-----------+ - | split.dvc | - +-----------+ - * - * - * - +---------------+ - | featurize.dvc | - +---------------+ - ** *** - ** ** - ** ** -+-----------+ ** -| train.dvc | ** -+-----------+ ** - ** *** - ** ** - ** ** - +--------------+ - | evaluate.dvc | - +--------------+ -``` - -
- -## Check results - -An easy way to see metrics across different branches: - -```dvc -$ dvc metrics show - auc.metric: 0.620091 -``` - -> Since the dataset for this example is extremely simplified to make it faster -> to run this pipeline, the exact metric number may vary. - -It's time to save our [pipeline](/doc/command-reference/pipeline). You can -confirm that we do not tack files or raw datasets with Git, by using the -`git status` command. We are just saving a snapshot of the DVC-files that -describe data, transformations (stages), and relationships between them. - -```dvc -$ git add *.dvc auc.metric data/.gitignore -$ git commit -m "Add tag prediction pipeline (6 stages)" -``` - -## Reproduce - -All stages could be automatically and efficiently reproduced even if any source -code files have been modified. For example, let's improve the feature extraction -algorithm by making some modification to the `code/featurization.py`. Please -open it with a text editor and specify `ngram` parameter in `CountVectorizer` -(lines 72–73): - -```python -bag_of_words = CountVectorizer(stop_words='english', - max_features=5000, - ngram_range=(1, 2)) -``` - -Reproduce all required stages to get to the target -[metric](/doc/command-reference/metrics) file: - -```dvc -$ dvc repro evaluate.dvc -WARNING: Dependency 'code/featurization.py' of 'featurize.dvc' changed because it is 'modified'. -WARNING: Stage 'featurize.dvc' changed. -Running command: - python code/featurization.py ... -``` - -Once that's done, check the AUC metric again for an improvement: - -```dvc -$ dvc metrics show -a -workspace: - auc.metric: AUC: 0.648462 -master: - auc.metric: AUC: 0.587951 -``` - -> Since the dataset for this example is extremely simplified to make it faster -> to run this pipeline, the exact metric numbers may vary. - -The `-a` option above tells `dvc metrics show` to show the metrics value for all -Git branches. - -Feel free to commit the remaining changes with Git. 
- -## Conclusion - -By wrapping your commands with `dvc run`, it's easy to integrate DVC into a -machine learning or data processing [pipeline](/doc/command-reference/pipeline) -or other data science processes without any significant effort to rewrite your -code. - -The key detail to notice is that DVC automatically derives the dependencies -between pipeline [stages](/doc/command-reference/run) by building dependency -graphs that represent data pipelines. - -DVC streamlines all of your experiments into a single, reproducible -project, and it makes it easy to share it with Git, including -dependencies. This collaboration feature provides the ability to review data -science research. diff --git a/content/docs/use-cases/versioning-data-and-model-files.md b/content/docs/use-cases/versioning-data-and-model-files/index.md similarity index 100% rename from content/docs/use-cases/versioning-data-and-model-files.md rename to content/docs/use-cases/versioning-data-and-model-files/index.md diff --git a/content/docs/tutorials/versioning.md b/content/docs/use-cases/versioning-data-and-model-files/tutorial.md similarity index 100% rename from content/docs/tutorials/versioning.md rename to content/docs/use-cases/versioning-data-and-model-files/tutorial.md diff --git a/content/docs/user-guide/what-is-dvc/index.md b/content/docs/user-guide/what-is-dvc/index.md index 0e958b2074..6755f75520 100644 --- a/content/docs/user-guide/what-is-dvc/index.md +++ b/content/docs/user-guide/what-is-dvc/index.md @@ -1,20 +1,25 @@ # What Is DVC? -Data Version Control, or DVC, is **a new type of experiment management -software** built on top of the existing engineering toolset that you're already -used to, and particularly on a source code management (Git). DVC reduces the gap -between existing tools and data science needs, allowing users to take advantage -of experiment management while reusing existing skills and intuition. 
+Today the data science community is still lacking good practices for organizing
+their projects and effectively collaborating. ML algorithms and methods are no
+longer simple tribal knowledge but are still difficult to implement, manage and
+reuse.
+
+> One of the biggest challenges in reusing, and hence managing, ML projects is
+> their reproducibility.
+
+Data Version Control, or DVC, is a new type of experiment management software
+built on top of Git. DVC reduces the gap between existing tools and data science
+needs, allowing users to take advantage of experiment management while reusing
+existing skills and intuition.
+
+![](/img/reproducibility.png)_DVC codifies data and ML experiments_
 
 Leveraging an underlying source code management system eliminates the need to
 use 3rd-party services. Data science experiment sharing and collaboration can be
 done through regular Git features (commit messages, merges, pull requests, etc)
 the same way it works for software engineers.
 
-DVC implements a **Git experimentation methodology** where each experiment
-exists with its code as well as data, and can be represented as a separate Git
-branch or commit.
-
 DVC uses a few core concepts:
 
 - **Experiment**: Equivalent to a
@@ -54,3 +59,10 @@ DVC uses a few core concepts:
 - **Cloud storage** support: available complement to the core DVC features.
   This is how a data scientist transfers large data files or shares a
   GPU-trained model with those without GPUs available.
+
+DVC streamlines large data files and binary models into a single Git environment
+and this approach will not require storing binary files in your Git repository.
+The diagram below describes all the DVC commands and relationships between a +local cache and remote storage: + +![](/img/flow-large.png)_DVC data management_ diff --git a/redirects-list.json b/redirects-list.json index 49c3fa0a16..62d18e3827 100644 --- a/redirects-list.json +++ b/redirects-list.json @@ -19,10 +19,14 @@ "^/((?:deb|rpm)/.+) https://s3-us-east-2.amazonaws.com/dvc-s3-repo/$1 303", "^/(?:help|chat)/?$ https://discordapp.com/invite/dvwXA2N 303", "^/(?:docs|documentation)(/.*)?$ /doc$1", + "^/doc/get-started(/.*)?$ /doc/start", - "^/doc//tutorials/get-started(/.*)?$ /doc/start$1", - "^/doc/tutorial/?$ /doc/tutorials", - "^/doc/tutorial/(.*)? /doc/tutorials/deep/$1", + "^/doc/tutorial(/.*)? /doc/start", + "^/doc/tutorials/get-started(/.*)?$ /doc/start", + "^/doc/tutorials/deep(/.*)?$ /doc/start", + "^/doc/tutorials/pipelines(/.*)?$ /doc/start", + "^/doc/tutorials/versioning(/.*)?$ /doc/use-cases/versioning-data-and-model-files/tutorial", + "^/doc/use-cases/data-and-model-files-versioning/?$ /doc/use-cases/versioning-data-and-model-files", "^/doc/user-guide/dvc-file-format$ /doc/user-guide/dvc-files-and-directories", "^/doc/understanding-dvc(/.*)?$ /doc/user-guide/what-is-dvc",