diff --git a/.eslintrc.json b/.eslintrc.json index 0ef2f1a7f43..d52043ecbdb 100644 --- a/.eslintrc.json +++ b/.eslintrc.json @@ -23,17 +23,14 @@ } }, "rules": { - "max-len": [ - "error", - { "code": 80, "ignoreTemplateLiterals": true, "ignoreUrls": true } - ], "@typescript-eslint/interface-name-prefix": [ "error", { "prefixWithI": "always" } ], "no-tabs": "error", + "react/prop-types": "off", // Props should be described as TS types + "@typescript-eslint/explicit-function-return-type": "off", // TODO: remove after rewriting code to TS - "@typescript-eslint/explicit-function-return-type": "warn", "@typescript-eslint/no-var-requires": "warn" } } diff --git a/.gitignore b/.gitignore index 1fd10aa23e7..25529227d7c 100644 --- a/.gitignore +++ b/.gitignore @@ -54,9 +54,10 @@ typings/ # dotenv environment variables file .env -# Next.js build output -.idea +# Editors & logs *.log +.idea +.vscode # Mac finder artifacts .DS_Store @@ -64,3 +65,4 @@ typings/ # Gatsby cache .cache public +tmp diff --git a/.prettierignore b/.prettierignore new file mode 100644 index 00000000000..6f4cc146993 --- /dev/null +++ b/.prettierignore @@ -0,0 +1,2 @@ +.cache/ +public/ diff --git a/.prettierrc b/.prettierrc index 02b225aad6c..ec859218529 100644 --- a/.prettierrc +++ b/.prettierrc @@ -1,7 +1,9 @@ -semi: false -singleQuote: true -trailingComma: none -printWidth: 80 -tabWidth: 2 -useTabs: false -proseWrap: "always" +{ + "semi": false, + "singleQuote": true, + "trailingComma": "none", + "printWidth": 80, + "tabWidth": 2, + "useTabs": false, + "proseWrap": "always" +} diff --git a/.restyled.yaml b/.restyled.yaml index dc85f8a6830..3e22acf7e85 100644 --- a/.restyled.yaml +++ b/.restyled.yaml @@ -4,5 +4,8 @@ restylers: include: - './*.{js,md}' - 'pages/**/*.js' - - 'content/docs/**/*.{js,md}' + - 'content/**/*.{js,md}' - 'src/**/*.js' + - name: stylelint + include: + - 'src/**/*.css' diff --git a/.stylelintrc b/.stylelintrc new file mode 100644 index 00000000000..f76a199cefb --- /dev/null +++ 
b/.stylelintrc @@ -0,0 +1,18 @@ +{ + "extends": "stylelint-config-standard", + "rules": { + "no-descending-specificity": null, + "no-duplicate-selectors": null, + "declaration-colon-newline-after": null, + "value-list-comma-newline-after": null, + "property-no-unknown": [ true, { + "ignoreProperties": ["composes"] + } ], + "selector-pseudo-class-no-unknown": [ true, { + "ignorePseudoClasses": ["global"] + } ], + "at-rule-no-unknown": [true, { + "ignoreAtRules": ["mixin"] + }] + }, +} diff --git a/config/postcss/media.js b/config/postcss/media.js new file mode 100644 index 00000000000..3d3484bb8e6 --- /dev/null +++ b/config/postcss/media.js @@ -0,0 +1,20 @@ +/* tslint:disable object-literal-sort-keys */ + +const screens = { + giant: 1200, + desktop: 1005, + tablet: 768, + phablet: 572, + phone: 376 +} + +module.exports = { + customMedia: { + '--xxs-scr': `(max-width: ${screens.phone}px)`, + '--xs-scr': `(max-width: ${screens.phablet}px)`, + '--sm-scr': `(max-width: ${screens.tablet}px)`, + '--md-scr': `(max-width: ${screens.desktop - 1}px)`, + '--lg-scr': `(min-width: ${screens.desktop}px)`, + '--xl-scr': `(min-width: ${screens.giant}px)` + } +} diff --git a/config/postcss/mixins.js b/config/postcss/mixins.js new file mode 100644 index 00000000000..4201d75dcdf --- /dev/null +++ b/config/postcss/mixins.js @@ -0,0 +1,91 @@ +/* tslint:disable object-literal-sort-keys */ + +const focus = { + '&:focus': { + color: 'var(--color-orange)', + outline: 'none' + } +} + +const active = { + '&:active': { + position: 'relative', + top: '1px', + left: '1px' + } +} + +const hover = { + '&:hover': { + opacity: 0.7 + } +} + +module.exports = { + mixins: { + 'h1-desktop': { + 'font-weight': '500', + 'font-size': '40px', + 'line-height': '60px' + }, + 'h1-mobile': { + 'font-weight': '500', + 'font-size': '30px', + 'line-height': '40px' + }, + 'h2-desktop': { + 'font-weight': '500', + 'font-size': '30px', + 'line-height': '40px' + }, + 'h2-mobile': { + 'font-weight': '500', + 
'font-size': '25px', + 'line-height': '35px' + }, + 'h3-desktop': { + 'font-weight': '500', + 'font-size': '24px', + 'line-height': '34px' + }, + 'h3-mobile': { + 'font-weight': '500', + 'font-size': '20px', + 'line-height': '30px' + }, + 'text-desktop': { + 'font-size': '24px', + 'line-height': '34px' + }, + 'text-mobile': { + 'font-size': '20px', + 'line-height': '30px' + }, + 'text-diminished': { + 'font-size': '20px', + 'line-height': '30px' + }, + 'text-secondary': { + 'font-size': '16px', + 'line-height': '24px' + }, + 'button-big': { + 'font-size': '20px', + 'line-height': '30px' + }, + 'button-small': { + 'font-size': '16px', + 'line-height': '25px' + }, + link: { + 'text-decoration': 'none', + color: 'var(--color-blue)', + ...hover, + ...focus, + ...active + }, + hover, + focus, + active + } +} diff --git a/content/authors/dmitry_petrov.md b/content/authors/dmitry_petrov.md new file mode 100644 index 00000000000..d3dfb7607b2 --- /dev/null +++ b/content/authors/dmitry_petrov.md @@ -0,0 +1,9 @@ +--- +path: ../authors/dmitry_petrov.md +name: Dmitry Petrov +avatar: ../../static/uploads/avatars/dmitry_petrov.png +--- + +Creator of [http://dvc.org](http://dvc.org) — Git for ML. Ex-Data Scientist +[@Microsoft](http://twitter.com/Microsoft). PhD in CS. Making jokes with a +serious face. 
diff --git a/content/authors/elle_obrien.md b/content/authors/elle_obrien.md new file mode 100644 index 00000000000..dcbd6a05a94 --- /dev/null +++ b/content/authors/elle_obrien.md @@ -0,0 +1,7 @@ +--- +path: ../authors/elle_obrien.md +name: Elle O'Brien +avatar: ../../static/uploads/avatars/elle_obrien.jpg +--- + +Data scientist at [http://dvc.org](http://dvc.org) diff --git a/content/authors/george_vyshnya.md b/content/authors/george_vyshnya.md new file mode 100644 index 00000000000..5c15245eab6 --- /dev/null +++ b/content/authors/george_vyshnya.md @@ -0,0 +1,9 @@ +--- +path: ../authors/george_vyshnya.md +name: George Vyshnya +avatar: ../../static/uploads/avatars/george_vyshnya.jpeg +--- + +Seasoned Data Scientist / Software Developer with blended experience in software +development, IT, DevOps, PM and C-level roles. CTO at +[http://sbc-group.pl](http://sbc-group.pl) diff --git a/content/authors/marija_ilic.md b/content/authors/marija_ilic.md new file mode 100644 index 00000000000..3e7c360c265 --- /dev/null +++ b/content/authors/marija_ilic.md @@ -0,0 +1,7 @@ +--- +path: ../authors/marija_ilic.md +name: Marija Ilić +avatar: ../../static/uploads/avatars/marija_ilic.png +--- + +Data scientist at Njuškalo, Croatia. 
diff --git a/content/authors/svetlana_grinchenko.md b/content/authors/svetlana_grinchenko.md new file mode 100644 index 00000000000..70d3146d269 --- /dev/null +++ b/content/authors/svetlana_grinchenko.md @@ -0,0 +1,7 @@ +--- +path: ../authors/svetlana_grinchenko.md +name: Svetlana Grinchenko +avatar: ../../static/uploads/avatars/svetlana_grinchenko.jpeg +--- + +Head of developer relations at [http://dvc.org](http://dvc.org) diff --git a/content/blog/2017-05-15-how-a-data-scientist-can-improve-his-productivity.md b/content/blog/2017-05-15-how-a-data-scientist-can-improve-his-productivity.md new file mode 100644 index 00000000000..02a461f37a6 --- /dev/null +++ b/content/blog/2017-05-15-how-a-data-scientist-can-improve-his-productivity.md @@ -0,0 +1,169 @@ +--- +title: How A Data Scientist Can Improve His Productivity +date: 2017-05-15 +description: | + Data science and machine learning are iterative processes. It is never + possible to successfully complete a data science project in a single pass. +descriptionLong: | + The iteration time is a critical parameter in data science process. The + quicker you iterate, the more you can check ideas and build a better model. + The productivity of data scientists can be improved by speeding up iteration + processes and the DVC tool takes care of this. +picture: ../../static/uploads/images/2017-05-15/post-image.jpg +author: ../authors/dmitry_petrov.md +commentsUrl: https://discuss.dvc.org/t/how-a-data-scientist-can-improve-their-productivity/301 +tags: + - Productivity + - Python + - Tutorial +--- + +Data science and machine learning are iterative processes. It is never possible +to successfully complete a data science project in a single pass. A data +scientist constantly tries new ideas and changes steps of his pipeline: + +1. extract new features and accidentally find noise in the data; + +2. clean up the noise, find one more promising feature; + +3. extract the new feature; + +4. 
rebuild and validate the model, realize that the learning algorithm + parameters are not perfect for the new feature set; + +5. change machine learning algorithm parameters and retrain the model; + +6. find the ineffective feature subset and remove it from the feature set; + +7. try a few more new features; + +8. try another ML algorithm. And then a data format change is required. + +This is only a small episode in a data scientist’s daily life and it is what +makes our job different from a regular engineering job. + +Business context, ML algorithm knowledge and intuition all help you to find a +good model faster. But you never know for sure what ideas will bring you the +best value. + +This is why the iteration time is a critical parameter in data science process. +The quicker you iterate, the more you can check ideas and build a better model. + +> “A well-engineered pipeline gets data scientists iterating much faster, which +> can be a big competitive edge” From +> [Engineering Practices in Data Science](http://blog.untrod.com/2012/10/engineering-practices-in-data-science.html) +> By Chris Clark. + +## A data science iteration tool + +To speed up the iterations in data science projects we have created an open +source tool [data version control](http://dvc.org) or [DVC.org](http://dvc.org). + +DVC takes care of dependencies between commands that you run, generated data +files, and code files and allows you to easily reproduce any steps of your +research with regards to files changes. + +You can think about DVC as a Makefile for a data science project even though you +do not create a file explicitly. DVC tracks dependencies in your data science +projects when you run data processing or modeling code through a special +command: + +```dvc +$ dvc run python code/xml_to_tsv.py \ + data/Posts.xml data/Posts.tsv +``` + +`dvc run` works as a proxy for your commands. 
This allows DVC to track input and +output files, construct the dependency graph +([DAG](https://en.wikipedia.org/wiki/Directed_acyclic_graph)), and store the +command and parameters for a future command reproduction. + +The previous command will be automatically piped with the next command because +of the file `data/Posts.tsv` is an output for the previous command and the input +for the next one: + +```dvc +# Split training and testing dataset. Two output files. +# 0.33 is the test dataset splitting ratio. +# 20170426 is a seed for randomization. +$ dvc run python code/split_train_test.py \ + data/Posts.tsv 0.33 20170426 \ + data/Posts-train.tsv data/Posts-test.tsv +``` + +DVC derives the dependencies automatically by looking to the list of the +parameters (even if your code ignores the parameters) and noting the file +changes before and after running the command. + +If you change one of your dependencies (data or code) then all the affected +steps of the pipeline will be reproduced: + +```dvc +# Change the data preparation code. +$ vi code/xml_to_tsv.py + +# Reproduce. +$ dvc repro data/Posts-train.tsv +Reproducing run command for data item data/Posts.tsv. +Reproducing run command for data item data/Posts-train.tsv. +``` + +This is a powerful way of quickly iterating through your pipeline. + +The pipeline might have a lot of steps and forms of acyclic dependencies between +the steps. Below is an example of a canonical machine learning pipeline (more +details in [the DVC tutorials](https://dvc.org/doc/tutorials): + +`gist:dmpetrov/7704a5156bdc32c7379580a61e2fe3b6#dvc_pipeline.sh` + +## Why are regular pipeline tools not enough? + +> “Workflows are expected to be mostly static or slowly changing.” (See +> [Airflow](https://airflow.incubator.apache.org/).) + +Regular pipeline tools like [Airflow](http://airflow.incubator.apache.org) and +[Luigi](https://github.com/spotify/luigi) are good for representing static and +fault tolerant workflows. 
A huge portion of their functionality is created for +monitoring, optimization and fault tolerance. These are very important and +business critical problems. However, these problems are irrelevant to data +scientists’ daily lives. + +Data scientists need a lightweight, dynamic workflow management system. In +contrast to the traditional airflow-like system, DVC reflects the process of +researching and looking for a great model (and pipeline), not optimizing and +monitoring an existing one. This is why DVC is a good fit for iterative machine +learning processes. When a good model was discovered with DVC, the result could +be incorporated into a data engineering pipeline (Luigi or Airflow). + +## Pipelines and data sharing + +In addition to pipeline description, data reproduction and dynamic nature, DVC +has one more important feature. It was designed in accordance with the best +software engineering practices. DVC is based on Git. It keeps code, and stores +DAG in the Git repository which allows you to share your research results. But +it moves the actual file content outside the Git repository (in `.cache` +directory which DVC includes in `.gitignore`) since Git is not designed to +accommodate large data files. + +The data files can be shared between data scientists through cloud storages +using a simple command: + +```dvc +# Data scientists 1 syncs data to the cloud. +$ dvc sync data/ +``` + +![](/uploads/images/2017-05-15/git-server-or-github.jpeg) + +Currently, AWS S3 and GCP storage are supported by DVC. + +## Conclusion + +The productivity of data scientists can be improved by speeding up iteration +processes and the DVC tool takes care of this. + +We are very interested in your opinion and feedback. Please post your comments +here or contact us on Twitter — [FullStackML](https://twitter.com/FullStackML). + +If you found this tool useful, **please “star” the +[DVC Github repository](https://github.com/iterative/dvc)**. 
diff --git a/content/blog/2017-07-24-r-code-and-reproducible-model-development-with-dvc.md b/content/blog/2017-07-24-r-code-and-reproducible-model-development-with-dvc.md new file mode 100644 index 00000000000..5aee201f208 --- /dev/null +++ b/content/blog/2017-07-24-r-code-and-reproducible-model-development-with-dvc.md @@ -0,0 +1,224 @@ +--- +title: R code and reproducible model development with DVC +date: 2017-07-24 +description: | + There are a lot of examples on how to use Data Version Control (DVC) with a + Python project. In this document I would like to see how it can be used with a + project in R. +descriptionLong: | + In this document we will briefly explore possibilities of a new open source + tool that could help with achieving code simplicity, readability and faster + model development. + + There are a lot of examples on how to use Data Version Control (DVC) with a + Python project. In this document I would like to see how it can be used with a + project in R. +picture: ../../static/uploads/images/2017-07-24/post-image.png +pictureComment: DAG on R example +author: ../authors/marija_ilic.md +commentsUrl: https://discuss.dvc.org/t/r-code-and-reproducible-model-development-with-dvc/298 +tags: + - RStats + - R +--- + +[DVC](https://dvc.org) or Data Version Control tool — its idea is to track +files/data dependencies during model development in order to facilitate +reproducibility and track data files versioning. Most of the +[DVC tutorials](https://dvc.org/doc/tutorials) provide good examples of using +DVC with Python language. However, I realized that DVC is a +[language agnostic](https://en.wikipedia.org/wiki/Language-agnostic) tool and +can be used with any programming language. In this blog post, we will see how to +use DVC in R projects. 
+ +## R coding — keep it simple and readable + +Each development is always a combination of following steps presented below: + +![Model development process](/uploads/images/2017-07-24/development-steps.png) +_Model development process_ + +Because of the specificity of the process — iterative development, it is very +important to improve some coding and organizational skills. For example, instead +of having one big R file with code it is better to split code in several logical +files — each responsible for one small piece of work. It is smart to track +history development with +[git](https://git-scm.com/book/en/v2/Getting-Started-About-Version-Control) +tool. Writing “_reusable code”_ is nice skill to have. Put comments in a code +can make our life easier. + +Beside git, next step in further improvements is to try out and work with DVC. +Every time when a change/commit in some of the codes and data sets is made, DVC +will reproduce new results with just one bash command on a linux (or Win +environment). It memorizes dependencies among files and codes so it can easily +repeat all necessary steps/codes instead of us worrying about the order. + +## R example — data and code clarification + +We’ll take an Python example from +[DVC tutorial](https://dvc.org/doc/tutorials/deep) (written by Dmitry Petrov) +and rewrite that code in R. With an example we’ll show how can DVC help during +development and what are its possibilities. + +Firstly, let’s initialize git and dvc on mentioned example and run our codes for +the first time. After that we will simulate some changes in the codes and see +how DVC works on reproducibility. + +R codes can be downloaded from the +[Github repository](https://github.com/Zoldin/R_AND_DVC). A brief explanation of +the codes is presented below: + +**parsingxml.R** — it takes xml that we downloaded from the web and creates +appropriate csv file. 
+ +`gist:Zoldin/47536af63182a0e8daf37a7b989e2e8d#parsingxml.R` + +**train_test_spliting.R** — stratified sampling by target variable (here we are +creating test and train data set) + +`gist:Zoldin/7591c47ce5988cbe087e0038c9a850b9#train_test_splitting.R` + +**featurization.R** — text mining and tf-idf matrix creation. In this part we +are creating predictive variables. + +`gist:Zoldin/9e79c047fd8ad7aa6596b0682aca83c6#featurization.R` + +**train_model.R** — with created variables we are building logistic regression +(LASSO). + +`gist:Zoldin/1617b39f2acbde3cd486616ac442e7cf#train_model.R` + +**evaluate.R** — with trained model we are predicting target on test data set. +AUC is final output which is used as evaluation metric. + +`gist:Zoldin/bfc2d4ee449098a9ff64b99c3326e61d#evaluate.r` + +Firstly, codes from above we will download into the new folder (clone the +repository): + +```dvc +$ mkdir R_DVC_GITHUB_CODE +$ cd R_DVC_GITHUB_CODE + +$ git clone https://github.com/Zoldin/R_AND_DVC +``` + +## DVC installation and initialization + +On the first site it seemed that DVC will not be compatible to work with R +because of the fact that DVC is written in Python and as that needs/requires +Python packages and pip package manager. Nevertheless, the tool can be used with +any programming language, it is language agnostic and as such is excellent for +working with R. + +Dvc installation: + +```dvc +$ pip3 install dvc +$ dvc init +``` + +With code below 5 R scripts with `dvc run` are executed. Each script is started +with some arguments — input and output file names and other parameters (seed, +splitting ratio etc). It is important to use `dvc run` — with this command R +script are entering pipeline (DAG graph). + +```dvc +$ dvc import https://s3-us-west-2.amazonaws.com/dvc-share/so/25K/Posts.xml.tgz \ + data/ + +# Extract XML from the archive. +$ dvc run tar zxf data/Posts.xml.tgz -C data/ + +# Prepare data. 
+$ dvc run Rscript code/parsingxml.R \ + data/Posts.xml \ + data/Posts.csv + +# Split training and testing dataset. Two output files. +# 0.33 is the test dataset splitting ratio. +# 20170426 is a seed for randomization. +$ dvc run Rscript code/train_test_spliting.R \ + data/Posts.csv 0.33 20170426 \ + data/train_post.csv \ + data/test_post.csv + +# Extract features from text data. +# Two TSV inputs and two pickle matrixes outputs. +$ dvc run Rscript code/featurization.R \ + data/train_post.csv \ + data/test_post.csv \ + data/matrix_train.txt \ + data/matrix_test.txt + +# Train ML model out of the training dataset. +# 20170426 is another seed value. +$ dvc run Rscript code/train_model.R \ + data/matrix_train.txt 20170426 \ + data/glmnet.Rdata + +# Evaluate the model by the testing dataset. +$ dvc run Rscript code/evaluate.R \ + data/glmnet.Rdata \ + data/matrix_test.txt \ + data/evaluation.txt + +# The result. +$ cat data/evaluation.txt +``` + +## Dependency flow graph on R example + +Dependency graph is shown on picture below: + +![Dependency graph](/uploads/images/2017-07-24/dependency-graph.png)_Dependency +graph_ + +DVC memorizes this dependencies and helps us in each moment to reproduce +results. + +For example, lets say that we are changing our training model — using ridge +penalty instead of lasso penalty (changing alpha parameter to `0`). In that case +will change/modify `train_model.R` job and if we want to repeat model +development with this algorithm we don’t need to repeat all steps from above, +only steps marked red on a picture below: + +![](/uploads/images/2017-07-24/marked-steps.png) + +DVC knows based on DAG graph that changed `train_model.R` file will only change +following files: `Glmnet.RData` and `Evaluation.txt`. If we want to see our new +results we need to execute only `train_model.R` and `evaluate.R job`. It is cool +that we don’t have to think all the time what we need to repeat (which steps). 
+`dvc repro` command will do that instead of us. Here is a code example : + +```dvc +$ vi train_model.R +$ git commit -am "Ridge penalty instead of lasso" +$ dvc repro data/evaluation.txt + +Reproducing run command for data item data/glmnet.Rdata. Args: Rscript code/train_model.R data/matrix_train.txt 20170426 data/glmnet.Rdata +Reproducing run command for data item data/evaluation.txt. Args: Rscript code/evaluate.R data/glmnet.Rdata data/matrix_test.txt data/evaluation.txt + +$ cat data/evaluation.txt +"AUC for the test file is : 0.947697381983095" +``` + +`dvc repro` always re executes steps which are affected with the latest +developer changes. It knows what needs to be reproduced. + +DVC can also work in an _"multi-user environment”_ . Pipelines (dependency +graphs) are visible to others colleagues if we are working in a team and using +git as our version control tool. Data files can be shared if we set up a cloud +and with _dvc sync_ we specify which data can be shared and used for other +users. In that case other users can see the shared data and reproduce results +with those data and their code changes. + +## Summary + +DVC tool improves and accelerates iterative development and helps to keep track +of ML processes and file dependencies in the simple form. On the R example we +saw how DVC memorizes dependency graph and based on that graph re executes only +jobs that are related to the latest changes. It can also work in multi-user +environment where dependency graphs, codes and data can be shared among multiple +users. Because it is language agnostic, DVC allows us to work with multiple +programming languages within a single data science project. 
diff --git a/content/blog/2017-07-27-data-version-control-in-analytics-devops-paradigm.md b/content/blog/2017-07-27-data-version-control-in-analytics-devops-paradigm.md new file mode 100644 index 00000000000..55e4510fd84 --- /dev/null +++ b/content/blog/2017-07-27-data-version-control-in-analytics-devops-paradigm.md @@ -0,0 +1,190 @@ +--- +title: Data Version Control in Analytics DevOps Paradigm +date: 2017-07-27 +description: | + Why DevOps matters in data science, what specific challenges data scientists + face in the day to day work, and how do we setup a better environment for the + team. +descriptionLong: | + The eternal dream of almost every Data Scientist today is to spend all the + time exploring new datasets, engineering new features, inventing and + validating cool new algorithms and strategies. However, daily routines of a + Data Scientist include raw data pre-processing, dealing with infrastructure, + bringing models to production. That's where good DevOps practices and skills + are essential and will certainly be beneficial for industrial Data Scientists + as they can address the above-mentioned challenges in a self-service manner. +picture: ../../static/uploads/images/2017-07-27/post-image.jpeg +author: ../authors/george_vyshnya.md +commentsUrl: https://discuss.dvc.org/t/data-version-control-in-analytics-devops-paradigm/297 +tags: + - DevOps +--- + +## Data Science and DevOps Convergence + +The primary mission of DevOps is to help the teams to resolve various Tech Ops +infrastructure, tools and pipeline issues. + +At the other hand, as mentioned in the conceptual review by +[Forbes](https://www.forbes.com/sites/teradata/2016/11/14/devops-for-data-science-why-analytics-ops-is-key-to-value/) +in November 2016, the industrial analytics is no more going to be driven by data +scientists alone. It requires an investment in DevOps skills, practices and +supporting technology to move analytics out of the lab and into the business. 
+There are even +[voices](https://www.computing.co.uk/ctg/news/2433095/a-lot-of-companies-will-stop-hiring-data-scientists-when-they-realise-that-the-majority-bring-no-value-says-data-scientist) +calling Data Scientists to concentrate on agile methodology and DevOps if they +like to retain their jobs in business in the long run. + +## Why DevOps Matters + +The eternal dream of almost every Data Scientist today is to spend all (well, +almost all) the time in the office exploring new datasets, engineering decisive +new features, inventing and validating cool new algorithms and strategies. +However, reality is often different. One of the unfortunate daily routines of a +Data Scientist work is to do raw data pre-processing. It usually translates to +the challenges to + +1. **Pull all kinds of necessary data from a variety of sources** + + - Internal data sources like ERP, CRM, POS systems, or data from online + e-commerce platforms + + - External data, like weather, public holidays, Google trends etc. + +2. **Extract, transform, and load the data** + + - Relate and join the data sources + + - Aggregate and transform the data + +3. **Avoid technical and performance drawbacks** when everything ends up in + “one big table” at the end + +4. **Facilitate continuous machine learning and decision-making in a + business-ready framework** + + - Utilize historic data to train the machine learning models and algorithms + + - Use the current, up-to-date data for decision-making + + - Export back the resulting decisions/recommendations to review by business + stakeholders, either back into the ERP system or some other data warehouse + +Another big challenge is to organize **collaboration and data/model sharing** +inside and across the boundaries of teams of Data Scientists and Software +Engineers. 
+ +DevOps skills as well as effective instruments will certainly be beneficial for +industrial Data Scientists as they can address the above-mentioned challenges in +a self-service manner. + +## Can DVC Be a Solution? + +[Data Version Control](https://dvc.org) or simply DVC comes to the scene +whenever you start looking for effective DevOps-for-Analytics instruments. + +DVC is an open source tool for data science projects. It makes your data science +projects reproducible by automatically building data dependency graph (DAG). +Your code and the dependencies could be easily shared by Git, and data — through +cloud storage (AWS S3, GCP) in a single DVC environment. + +> Although DVC was created for machine learning developers and data scientists +> [originally](https://dvc.org/doc/understanding-dvc/what-is-dvc), it appeared +> to be useful beyond it. Since it brings proven engineering practices to not +> well defined ML process, I discovered it to have enormous potential as an +> Analytical DevOps instrument. + +It clearly helps to manage a big fraction of DevOps issues in daily Data +Scientist routines + +1. **Pull all kinds of necessary data from a variety of sources**. Once you + configure and script your data extraction jobs with DVC, it will be + persistent and operable across your data and service infrastructure + +2. **Extract, transform, and load the data**. ETL is going to be easy and + repeatable once you configure it with DVC scripting. It will become a solid + pipeline to operate without major supportive effort. Moreover, it will track + all changes and trigger an alert for updates in the pipeline steps via DAG. + +3. **Facilitate continuous machine learning and decision-making.** The part of + the pipeline facilitated through DVC scripting can be jobs to upload data + back to any transactional system (like ERP, ERM, CRM etc.), warehouse or data + mart. It will then be exposed to business stakeholders to make intelligent + data-driven decisions. + +4. 
**Share your algorithms and data**. Machine Learning modeling is an iterative + process and it is extremely important to keep track of your steps, + dependencies between the steps, dependencies between your code and data files + and all code running arguments. This becomes even more important and + complicated in a team environment where data scientists’ collaboration takes + a serious amount of the team’s effort. DVC will be the arm to help you with + it. + +One of the ‘juicy’ features of DVC is ability to support multiple technology +stacks. Whether you prefer R or use promising Python-based implementations for +your industrial data products, DVC will be able to support your pipeline +properly. You can see it in action for both +[Python-based](https://blog.dvc.org/how-a-data-scientist-can-improve-his-productivity) +and +[R-based](https://blog.dvc.org/r-code-and-reproducible-model-development-with-dvc) +technical stacks. + +As such, DVC is going to be one of the tools you would enjoy to use if/when you +embark on building continual analytical environment for your system or across +your organization. + +## Continual Analytical Environment and DevOps + +Building a production pipeline is quite different from building a +machine-learning prototype on a local laptop. Many teams and companies face the +challenges there. + +At the bare minimum, the following requirements shall be met when you move your +solution into production + +1. Periodic re-training of the models/algorithms + +2. Ease of re-deployment and configuration changes in the system + +3. Efficiency and high performance of real-time scoring the new out-of-sample + observations + +4. Availability of the monitor model performance over time + +5. Adaptive ETL and ability to manage new data feeds and transactional systems + as data sources for AI and machine learning tools + +6. Scaling to really big data operations + +7. Security and Authorized access levels to different areas of the analytical + systems + +8. 
Solid backup and recovery processes/tools + +This goes into the territory traditionally inhabited by DevOps. Data Scientists +should ideally learn to handle the part of those requirements themselves or at +least be informative consultants to classical DevOps gurus. + +DVC can help in many aspects of the production scenario above as it can +orchestrate relevant tools and instruments through its scripting. In such a +setup, DVC scripts will be sharable manifestation (and implementation) of your +production pipeline where each step can be transparently reviewed, easily +maintained, and changed as needed over time. + +## Will DevOps Be Captivating? + +If you are further interested in understanding the ever-proliferating role of +DevOps in the modern Data Science and predictive analytics in business, there +are good resources for your review below + +1. [DevOps For Data Science: Why Analytics Ops Is Key To Value](https://www.forbes.com/sites/teradata/2016/11/14/devops-for-data-science-why-analytics-ops-is-key-to-value/) + (Forbes, Nov 14, 2016) + +2. [Bridging the Gap Between Data Science and DevOps](https://www.packtpub.com/books/content/bridging-gap-between-data-science-and-devops) + +3. [Is DevOps Making Life Better for Data Scientists?](https://devops.com/devops-life-better-data-scientists/) + +By any mean, DVC is going to be a useful instrument to fill the multiple gaps +between the classical in-lab old-school data science practices and growing +demands of business to build solid DevOps processes and workflows to streamline +mature and persistent data analytics. 
diff --git a/content/blog/2017-08-23-ml-model-ensembling-with-fast-iterations.md b/content/blog/2017-08-23-ml-model-ensembling-with-fast-iterations.md new file mode 100644 index 00000000000..5ca9e3cb8e6 --- /dev/null +++ b/content/blog/2017-08-23-ml-model-ensembling-with-fast-iterations.md @@ -0,0 +1,239 @@ +--- +title: ML Model Ensembling with Fast Iterations +date: 2017-08-23 +description: | + Here we'll talk about tools that help tackling common technical challenges of + building pipelines for the ensemble learning. +descriptionLong: | + In many real-world Machine Learning projects, there is a need to ensemble + complex models as well as maintain pipelines. As we will demonstrate, DVC is a + good tool that helps tackling common technical challenges of building + pipelines for the ensemble learning. +picture: ../../static/uploads/images/2017-08-23/post-image.png +author: ../authors/george_vyshnya.md +commentsUrl: https://discuss.dvc.org/t/ml-model-ensembling-with-fast-iterations/296 +tags: + - Best Practices + - Model Ensembling + - R +--- + +In a model ensembling setup, the final prediction is a composite of predictions +from individual machine learning algorithms. To make the best model composite, +you have to try dozens of combinations of weights for the model set. It takes a +lot of time to come up with the best one. That is why the iteration speed is +crucial in the ML model ensembling. We are going to make our research +reproducible by using [Data Version Control](http://dvc.org) tool - +([DVC](http://dvc.org)). It provides the ability to quickly re-run and replicate +the ML prediction result by executing just a single command `dvc repro`. + +As we will demonstrate, DVC is a good tool that helps tackling common technical +challenges of building pipelines for the ensemble learning. 
+ +## Project Overview + +In this case, we will build an R-based solution to attack the +supervised-learning regression problem to predict win sales per +[Predict Wine Sales](https://inclass.kaggle.com/c/pred-411-2016-04-u3-wine/) +Kaggle competition. + +An ensemble prediction methodology will be used in the project. The weighted +ensemble of three models will be implemented, trained, and predicted from +(namely, these are Linear Regression, `GBM`, and `XGBoost`). + +![](/uploads/images/2017-08-23/ensemble-prediction-methodology.png) + +If properly designed and used, ensemble prediction can perform much better then +predictions of individual machine learning models composing the ensemble. + +Prediction results will be delivered in a format of output CSV file that is +specified in the requirements to the +[Predict Wine Sales](https://inclass.kaggle.com/c/pred-411-2016-04-u3-wine/) +Kaggle competition (so called Kaggle submission file). + +## Important Pre-Requisites + +In order to try the materials of this +[repository](https://github.com/gvyshnya/DVC_R_Ensemble) in your environment, +the following software should be installed on your machine + +- **_Python 3_** runtime environment for your OS (it is required to run DVC + commands in the batch files) + +- **_DVC_** itself (you can install it as a python package by simply doing the + standard command in your command line prompt: `pip install dvc`) + +- **_R_** **_3.4.x_** runtime environment for your OS + +- **_git_** command-line client application for your OS + +## Technical Challenges + +The technical challenges of building the ML pipeline for this project were to +meet business requirements below + +- Ability to conditionally trigger execution of 3 different ML prediction models + +- Ability to conditionally trigger model ensemble prediction based on + predictions of those 3 individual models + +- Ability to specify weights of each of the individual model predictions in the + ensemble + +- Quick and fast 
redeployment and re-run of the ML pipeline upon frequent + reconfiguration and model tweaks + +- Reproducibility of the pipeline and forecasting results across the multiple + machines and team members + +The next sections below will explain how these challenges are addressed in the +design of ML pipeline for this project. + +## ML Pipeline + +The ML pipeline for this project is presented in the diagram below + +![](/uploads/images/2017-08-23/ml-pipeline.png) + +As you can see, the essential implementation of the solution is as follows + +- [`preprocessing.R`](https://gist.github.com/gvyshnya/443424775b0150baac774cc6cf3cb1cc) + handles all aspects of data manipulations and pre-processing (reading training + and testing data sets, removing outliers, imputing NAs etc.) as well as stores + refined training and testing set data as new files to reuse by model scripts + +- 3 model scripts implement training and forecasting algorithms for each of the + models selected for this project + ([`LR.R`](https://gist.github.com/gvyshnya/7ec76316c24bc1b4f595ef1256f52d3a), + [`GBM.R`](https://gist.github.com/gvyshnya/50e5ea3efa9771d2e7cc121c2f1a04e4), + [`xgboost.R`](https://gist.github.com/gvyshnya/2e5799863f02fec652c194020da82dd3)) + +- [`ensemble.R`](https://gist.github.com/gvyshnya/84379d6a68fd085fe3a26aabad453e55) + is responsible for the weighted ensemble prediction and the final output of + the Kaggle submission file + +- `config.R` is responsible for all of the conditional logic switches needed in + the pipeline (it is included as a source to all of modeling and ensemble + prediction scripts, to get this done) + +There is a special note about lack of feature engineering for this project. It +was an intended specification related to the specifics of the dataset. The +existing features were quite instrumental to predict the target values ‘as is’. 
+Therefore it had been decided to follow the well-known +[Pareto principle](https://en.wikipedia.org/wiki/Pareto_principle) (interpreted +as “**_20% of efforts address 80% of issues_**”, in this case) and not to spend +more time on it. + +**_Note_**: all `R` and batch files mentioned throughout this blog post are +available online in a separate GitHub +[repository](https://github.com/gvyshnya/DVC_R_Ensemble). You will be also able +to review more details on the implementation of each of the machine learning +prediction models there. + +### Pipeline Configuration Management + +All of the essential tweaks to conditional machine learning pipeline for this +project is managed by a configuration file. For ease of its use across solution, +it was implemented as an R code module (`config.R`), to be included to all model +training and forecasting. Thus the respective parameters (assigned as R +variables) will be retrieved by the runnable scripts, and the conditional logic +there will be triggered respectively. + +This file is not intended to run from a command line (unlike the rest of the R +scripts in the project). + +`gist:gvyshnya/918e94b06ebf222f6bb56ed26a5f44ee#config.R` + +### Why Do We Need DVC? + +As we all know, there is no way to build the ideal ML model with sound +prediction accuracy from the very beginning. You will have to continuously +adjust your algorithm/model implementations based on the cross-validation +appraisal until you yield the blooming results. This is especially true in the +ensemble learning where you have to constantly tweak not only parameters of the +individual prediction models but also the settings of the ensemble itself + +- changing ensemble composition — adding or removing individual prediction + models + +- changing model prediction weights in the resulting ensemble prediction + +Under such a condition, DVC will help you to manage your ensemble ML pipeline in +a really solid manner. 
Let’s consider the following real-world scenario + +- Your team member changes the settings of `GBM` model and resubmit its + implementation to (this is emulated by the commit + [#8604103f0](https://github.com/gvyshnya/DVC_R_Ensemble/commit/27825d0732f72f07e7e4e48548ddb8a8604103f0), + check sum `27825d0`) + +- You rerun the entire ML pipeline on your computer, to get the newest + predictions from `GBM` as well as the updated final ensemble prediction + +- The results of the prediction appeared to be still not optimal thus someone + changes the weights of individual models in the ensemble, assigning `GBM` + higher weight vs. `xgboost` and `LR` + +- After the ensemble setup changes committed (and updated `config.R` appeared in + the repository, as emulated by the commit + [#eb97612ce](https://github.com/gvyshnya/DVC_R_Ensemble/commit/5bcbe115afcb24886abb4734ff2da42eb97612ce), + check sum `5bcbe11`), you re-run the model predictions and the final ensemble + prediction on your machine once again + +All that you need to do to handle the changes above is simply to keep running +your **DVC** commands per the script developed (see the section below). You do +not have to remember or know explicitly the changes being made into the project +codebase or its pipeline configuration. **DVC** will automatically check out +latest changes from the repo as well as make sure it runs only those steps in +the pipeline that were affected by the recent changes in the code modules. + +### Orchestrating the Pipeline : DVC Command File + +After we developed individual R scripts needed by different steps of our Machine +Learning pipeline, we orchestrate it together using DVC. 
+ +Below is a batch file illustrating how DVC manages steps of the machine learning +process for this project + +`gist:gvyshnya/7f1b8262e3eb7a8b3c16dbfd8cf98644#dvc.bat` + +If you then further edit ensemble configuration setup in `code/config.R`, you +can simply leverage the power of DVC as for automatic dependencies resolving and +tracking to rebuild the new ensemble prediction as follows + +`gist:gvyshnya/9d80e51ba3d7aa5bd37d100ed82376ee` + +## Summary + +In this blog post, we worked through the process of building an ensemble +prediction pipeline using DVC. The essential key features of that pipeline were +as follows + +- **_reproducibility_** — everybody on a team can run it on his/her premise + +- **_separation of data and code_** — this ensured everyone always runs the + latest versions of the pipeline jobs with the most up-to-date ‘golden copy’ of + training and testing data sets + +The helpful side effect of using DVC was you stop keeping in mind what was +changed on every step of modifying your project scripts or in the pipeline +configuration. Due to it maintaining the dependencies graph (DAG) automatically, +it automatically triggered the only steps that were affected by the particular +changes, within the pipeline job setup. It, in turn, provides the capability to +quickly iterate through the entire ML pipeline. + +> As DVC brings proven engineering practices to often suboptimal and messy ML +> processes as well as helps a typical Data Science project team to eliminate a +> big chunk of common +> [DevOps overheads](https://blog.dataversioncontrol.com/data-version-control-in-analytics-devops-paradigm-35a880e99133), +> I found it extremely useful to leverage DVC on the industrial data science and +> predictive analytics projects. + +## Further Reading + +1. [Ensemble Learning and Prediction Introduction](https://en.wikipedia.org/wiki/Ensemble_learning) + +2. 
[Using DVC in Machine Learning projects in Python](https://blog.dataversioncontrol.com/data-version-control-beta-release-iterative-machine-learning-a7faf7c8be67) + +3. [Using DVC in Machine Learning projects in R](https://blog.dataversioncontrol.com/r-code-and-reproducible-model-development-with-dvc-1507a0e3687b) + +4. [Kaggle Ensembling Guide](https://mlwave.com/kaggle-ensembling-guide/) diff --git a/content/blog/2017-09-26-best-practices-of-orchestrating-python-and-r-code-in-ml-projects.md b/content/blog/2017-09-26-best-practices-of-orchestrating-python-and-r-code-in-ml-projects.md new file mode 100644 index 00000000000..9bdaffbaf5b --- /dev/null +++ b/content/blog/2017-09-26-best-practices-of-orchestrating-python-and-r-code-in-ml-projects.md @@ -0,0 +1,262 @@ +--- +title: Best practices of orchestrating Python and R code in ML projects +date: 2017-09-26 +description: | + What is the best way to integrate R and Python languages in one data science + project? What are the best practices? +descriptionLong: | + Today, data scientists are generally divided among two languages — some prefer + R, some prefer Python. I will try to find an answer to a question: “What is + the best way to integrate both languages in one data science project? What are + the best practices?” +picture: ../../static/uploads/images/2017-09-26/post-image.jpg +pictureComment: | + Image was taken from + [this](http://intersog.com/blog/r-and-python-for-data-science-worthy-opponents/) + page +author: ../authors/marija_ilic.md +commentsUrl: https://discuss.dvc.org/t/best-practices-of-orchestrating-python-and-r-code-in-ml-projects/295 +tags: + - R + - Python + - Tutorial + - Best Practices +--- + +Beside Git and shell scripting additional tools are developed to facilitate the +development of predictive model in a multi-language environments. For fast data +exchange between R and Python let’s use binary data file format +[Feather](https://blog.rstudio.com/2016/03/29/feather/). 
Another language +agnostic tool [DVC](http://dvc.org) can make the research reproducible — let’s +use DVC to orchestrate R and Python code instead of a regular shell scripts. + +## Machine learning with R and Python + +Both R and Python are having powerful libraries/packages used for predictive +modeling. Usually algorithms used for classification or regression are +implemented in both languages and some scientist are using R while some of them +preferring Python. In an example that was explained in previous +[tutorial](https://blog.dataversioncontrol.com/r-code-and-reproducible-model-development-with-dvc-1507a0e3687b) +target variable was binary output and logistic regression was used as a training +algorithm. One of the algorithms that could also be used for prediction is a +popular [Random Forest algorithm](https://en.wikipedia.org/wiki/Random_forest) +which is implemented in both programming languages. Because of performances it +was decided that Random Forest classifier should be implemented in Python (it +shows better performances than random forest package in R). + +## R example used for DVC demo + +We will use the same example from previous blog +[story](https://blog.dataversioncontrol.com/r-code-and-reproducible-model-development-with-dvc-1507a0e3687b), +add some Python codes and explain how Feather and DVC can simplify the +development process in this combined environment. + +Let’s recall briefly the R codes from previous tutorial: + +![R Jobs](/uploads/images/2017-09-26/r-jobs.png)_R Jobs_ + +Input data are StackOverflow posts — an XML file. Predictive variables are +created from text posts — relative importance +[tf-idf](https://en.wikipedia.org/wiki/Tf%E2%80%93idf) of words among all +available posts is calculated. With tf-idf matrices target is predicted and +lasso logistic regression for predicting binary output is used. AUC is +calculated on the test set and AUC metric is used on evaluation. 
+ +Instead of using logistic regression in R we will write Python jobs in which we +will try to use random forest as training model. Train_model.R and evaluate.R +will be replaced with appropriate Python jobs. + +R codes can be seen +[here](https://blog.dataversioncontrol.com/r-code-and-reproducible-model-development-with-dvc-1507a0e3687b). + +Code for `train_model_Python.py` is presented below: + +`gist:Zoldin/b312897cc492608feef1eaeae7f6eabc#train_model_Python.py` + +Also here we are adding code for `evaluation_python_model.py`: + +`gist:Zoldin/9eef13632d0a9039fe9b0dba376516a4#evaluation_python_model.py` + +Let’s download necessary R and Python codes from above (clone the +[Github](https://github.com/Zoldin/R_AND_DVC) repository): + +```dvc +$ mkdir R_DVC_GITHUB_CODE +$ cd R_DVC_GITHUB_CODE + +$ git clone https://github.com/Zoldin/R_AND_DVC +``` + +Our dependency graph of this data science project look like this: + +![R (marked red) and Python (marked pink) jobs in one project](/uploads/images/2017-09-26/our-dependency-graph.png)_R +(marked red) and Python (marked pink) jobs in one project_ + +Now lets see how it is possible to speed up and simplify process flow with +Feather API and data version control reproducibility. + +## Feather API + +Feather API is designed to improve meta data and data interchange between R and +Python. It provides fast import/export of data frames among both environments +and keeps meta data information which is an improvement over data exchange via +csv/txt file format. In our example Python job will read an input binary file +that was produced in R with Feather api. + +Let’s install Feather library in both environments. + +For Python 3 on linux environment you can use cmd and pip3: + +```dvc +$ sudo pip3 install feather-format +``` + +For R it is necessary to install feather package: + +```R +install.packages(feather) +``` + +After successful installation we can use Feather for data exchange. 
+ +Below is an R syntax for data frame export with Feather (featurization.R): + +```R +library(feather) + +write_feather(dtm_train_tfidf,args[3]) +write_feather(dtm_test_tfidf,args[4]) +print("Two data frame were created with Feather - one for train and one for test data set") +``` + +Python syntax for reading feather input binary files (train_model_python.py): + +```python +import feather as ft + +input = sys.argv[1] +df = ft.read_dataframe(input) +``` + +## Dependency graph with R and Python combined + +The next question what we are asking ourselves is why do we need DVC, why not +just use shell scripting? DVC automatically derives the dependencies between the +steps and builds +[the dependency graph (DAG)](https://en.wikipedia.org/wiki/Directed_acyclic_graph) +transparently to the user. Graph is used for reproducing parts/codes of your +pipeline which were affected by recent changes and we don’t have to think all +the time what we need to repeat (which steps) with the latest changes. + +Firstly, with `dvc run` command we will execute all jobs that are related to our +model development. In that phase DVC creates dependencies that will be used in +the reproducibility phase: + +```dvc +$ dvc import https://s3-us-west-2.amazonaws.com/dvc-share/so/25K/Posts.xml.tgz \ + data/ + +$ dvc run tar zxf data/Posts.xml.tgz -C data/ + +$ dvc run Rscript code/parsingxml.R \ + data/Posts.xml data/Posts.csv + +$ dvc run Rscript code/train_test_spliting.R \ + data/Posts.csv 0.33 20170426 \ + data/train_post.csv data/test_post.csv + +$ dvc run Rscript code/featurization.R \ + data/train_post.csv \ + data/test_post.csv data/matrix_train.feather \ + data/matrix_test.feather + +$ dvc run python3 code/train_model_python.py \ + data/matrix_train.feather \ + 20170426 data/model.p + +$ dvc run python3 code/evaluate_python_mdl.py \ + data/model.p data/matrix_test.feather \ + data/evaluation_python.txt +``` + +After this commands jobs are executed and included in DAG graph. 
Result (AUC +metrics) is written in evaluation_python.txt file: + +```dvc +$ cat data/evaluation_python.txt +AUC: 0.741432 +``` + +It is possible to improve our result with random forest algorithm. + +We can increase number of trees in the random forest classifier — from 100 to +500: + +```python +clf = RandomForestClassifier(n_estimators=500, + n_jobs=2, + random_state=seed) +clf.fit(x, labels) +``` + +After commited changes (in `train_model_python.py`) with `dvc repro` command all +necessary jobs for `evaluation_python.txt` reproduction will be re-executed. We +don’t need to worry which jobs to run and in which order. + +```dvc +$ git add . +$ git commit +[master a65f346] Random forest classifier — more trees added + 1 file changed, 1 insertion(+), 1 deletion(-) + +$ dvc repro data/evaluation_python.txt + +Reproducing run command for data item data/model.p. Args: python3 code/train_model_python.py data/matrix_train.txt 20170426 data/model.p +Reproducing run command for data item data/evaluation_python.txt. Args: python3 code/evaluate_python_mdl.py data/model.p data/matrix_test.txt data/evaluation_python.txt +Data item “data/evaluation_python.txt” was reproduced. +``` + +Beside code versioning, DVC also cares about data versioning. For example, if we +change data sets `train_post.csv` and `test_post.csv` (use different splitting +ratio) DVC will know that data sets are changed and `dvc repro` will re-execute +all necessary jobs for evaluation_python.txt. + +```dvc +$ dvc run Rscript code/train_test_spliting.R \ + data/Posts.csv 0.15 20170426 \ + data/train_post.csv \ + data/test_post.csv +``` + +Re-executed jobs are marked with red color: + +![](/uploads/images/2017-09-26/re-executed-jobs.png) + +```dvc +$ dvc run Rscript code/train_test_spliting.R \ + data/Posts.csv 0.15 20170426 \ + data/train_post.csv \ + data/test_post.csv + +$ dvc repro data/evaluation_python.txt + +Reproducing run command for data item data/matrix_train.txt. 
Args: Rscript — vanilla code/featurization.R data/train_post.csv data/test_post.csv data/matrix_train.txt data/matrix_test.txt +Reproducing run command for data item data/model.p. Args: python3 code/train_model_python.py data/matrix_train.txt 20170426 data/model.p +Reproducing run command for data item data/evaluation_python.txt. Args: python3 code/evaluate_python_mdl.py data/model.p data/matrix_test.txt data/evaluation_python.txt + +Data item “data/evaluation_python.txt” was reproduced. + +$ cat data/evaluation_python.txt +AUC: 0.793145 +``` + +New AUC result is 0.793145 which shows an improvement compared to previous +iteration. + +## Summary + +In data science projects it is often used R/Python combined programming. +Additional tools beside git and shell scripting are developed to facilitate the +development of predictive model in a multi-language environments. Using data +version control system for reproducibility and Feather for data interoperability +helps you orchestrate R and Python code in a single environment. diff --git a/content/blog/2018-10-18-ml-best-practices-in-pytorch-dev-conf-2018.md b/content/blog/2018-10-18-ml-best-practices-in-pytorch-dev-conf-2018.md new file mode 100644 index 00000000000..e68a5a5f0f8 --- /dev/null +++ b/content/blog/2018-10-18-ml-best-practices-in-pytorch-dev-conf-2018.md @@ -0,0 +1,156 @@ +--- +title: ML best practices in PyTorch dev conf 2018 +date: 2018-10-18 +description: | + In the Machine Learning (ML) field tools and techniques for best practices are + just starting to be developed. +descriptionLong: | + In the Machine Learning (ML) field tools and techniques for best practices are + just starting to be developed. At the PyTorch developer conference (PTDC-18), + several speakers including **Jerome Pesenti, VP of AI from Facebook** and + **Andrej Karpathy, Director of Tesla AI** spoke about best practices for + machine learning development. 
+picture: ../../static/uploads/images/2018-10-18/post-image.jpeg +pictureComment: | + The image source: + [link](https://blog.hubspot.com/customers/bid/109553/5-Homepage-Design-Best-Practices) +author: ../authors/dmitry_petrov.md +commentsUrl: https://discuss.dvc.org/t/ml-best-practices-in-pytorch-dev-conf-2018/294 +tags: + - Machine Learning + - Best Practices + - PyTorch + - PTDC-18 +--- + +The issues discussed included applying traditional software development +techniques like unit testing, CI/CD systems, automated deployment, version +control, and more to the ML field. In this blog post, we will go over the best +practices ideas from PTDC-18 and the future of ML tool developments. + +## 1. Engineering practices from PyTorch developers + +In the PTDC-18 +[keynote speech](https://www.facebook.com/pytorch/videos/482401942168584/), +**Jerome Pesenti** described the motivation and goals of PyTorch project and +what the future of machine learning looks like. + +### 1.1. ML tooling future + +Regarding the future of ML, Jerome envisioned a “streamlined development, more +accessible tools, breakthrough hardware, and more”. Talking about the gap huge +gap between software engineering and ML engineering, Presenti said: + +> Machine learning engineering is where we were in Software Engineering 20 years +> ago. A lot of things still need to be invented. We need to figure out what +> testing means, what CD (continuous delivery) means, we need to develop tools +> and environments that people can develop **robust ML that does not have too +> many biases** and does not overfit. + +In that gap lives many opportunities to develop new tools and services. We in +the ML ecosystem are called upon to implement the future of machine learning +tools. Traditional software engineering has many useful tools and techniques +which can either be repurposed for Machine Learning development or used as a +source for ideas in developing new tools. + +### 1.2. 
PyTorch motivation + +PyTorch 1.0 implements one important engineering principle — “a seamless +transition from AI research to production”. It helps to move AI technology from +research into production as quickly as possible. In order to do that a few +challenges were solved: + +1. **Write code once** — not have to rewrite or re-optimize code to go from + research to prod. + +1. **Performance** — training model on large datasets. + +1. **Other languages** — not only Python which is great for prototyping but also + C++ and other languages. + +1. **Scaling** — deploy PyTorch at scale more easily. + +## 2. Engineering practices for software 2.0 + +### 2.1. Melting of software 2.0 and software 1.0 + +**Andrej Karpathy** from Tesla AI had a +[dedicated talk](https://www.facebook.com/pytorch/videos/169366590639145/) about +best engineering practices in ML. He drew a contrast between traditional +software development (software 1.0) with software utilizing Machine Learning +techniques (software 2.0), saying that + +> “software 2.0 code also has new feature demands, contains bugs, and requires +> iterations.” + +Meaning that ML development has a lifecycle similar to traditional software: + +> “When you are working with these [neural] networks **in production** you are +> doing much more than that [training and measuring models]. You maintaining the +> codebase and that codebase is alive is just like 1.0 code.” + +Machine Learning models need to grow and develop feature-by-feature, bugs need +to be found and fixed, and repeatable processes are a must, as in earlier non-ML +software development practices. + +### 2.2. Software 2.0 best practices + +Karpathy went on to describe how software 1.0 best practices can be used in +software 2.0 (ML modeling): + +1. **Test-driven development** — test/train dataset separation is not enough + since it describes only expected performance. Edge cases have to be tested to + ensure the model performs as required. 
That requires incorporating more + examples in datasets, or changing model architecture, or changing + optimization functions. + +1. **Continues Integration and Continues Delivery** (CI/CD) — Intelligently used + of CI/CD can propel a team into rapid agile development of software systems. + The phases of CI/CD jobs include: 1) ML model auto re-training when code or + dataset changes; 2) running unit-tests; 3) easy access to the last model; 4) + Auto-deployment to test and/or production systems. + +1. **Version Control** — track all the changes in datasets (labels), not only + code. + +1. Train a **single model** from scratch every time without using other + pre-trained models. (External pre-trained models don’t count as far as I + understand.) A chain of fine-tuning models very quickly disintegrates + codebase. In software 1.0 a single **monorepo** is an analog of a single + model which also helps to avoid disintegration. + +This list of best practices shows how serious Tesla AI is about robust software +which is not surprising for self-driving car area. Any company needs these +practices in order to organize a manageable ML development process. + +## 3. Data file-centric tools + +Frameworks and libraries like PyTorch make a significant step in machine +learning tooling and bringing the best practices. However, frameworks and +libraries might be not enough for many of the ML best practices. For example, +dataset versioning, ML model versioning, continuous integration (CI) and +continuous delivery (CD) requires manipulation and transferring data files. +These can be done in a **more efficient and natural way by data management +tools** and storage systems rather than libraries. + +The need for a machine learning artifact manipulation tool with **data +file-centric philosophy** was the major motivation behind open source project +that we created — Data Version Control (DVC) or [DVC.org](http://dvc.org). 
+ +DVC connects Git with data files and machine learning pipelines which helps keep +version control on machine learning models and datasets using familiar Git +semantics coupled with the power of cloud storage systems such as Amazon’s S3, +Google’s GCS, Microsoft’s Azure or bare-metal servers accessed by SSH. + +If PyTorch helps in organizing code inside an ML project then data-centric tools +like DVC help organized different pieces of ML projects into a single workflow. +The machine learning future requires both types of tools — code level and data +file level. + +## Conclusion + +Thus far only the first steps have been taken toward using machine learning +tooling and the best machine learning practices. Mostly large companies are +using these practices because they faced the problems a while ago. Best +practices should be embraced by the entire industry which will help to bring +machine learning to a higher new level. diff --git a/content/blog/2019-03-05-march-19-dvc-heartbeat.md b/content/blog/2019-03-05-march-19-dvc-heartbeat.md new file mode 100644 index 00000000000..d93c656fc4b --- /dev/null +++ b/content/blog/2019-03-05-march-19-dvc-heartbeat.md @@ -0,0 +1,164 @@ +--- +title: March ’19 DVC❤️Heartbeat +date: 2019-03-05 +description: | + The very first issue of the DVC Heartbeat! News, links, Discord discussions + from the community. +descriptionLong: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: ../../static/uploads/images/2019-03-05/post-image.jpeg +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/march-19-dvc-heartbeat/293 +tags: + - Heartbeat + - Discord Gems +--- + +This is the very first issue of the DVC❤️Heartbeat. 
Every month we will be +sharing our news, findings, interesting reads, community takeaways, and +everything along the way. + +Some of those are related to our brainchild [DVC](https://dvc.org) and its +journey. The others are a collection of exciting stories and ideas centered +around ML best practices and workflow. + +## News and links + +We read a ton of articles and posts every day and here are a few that caught our +eye. Well-written, offering a different perspective and definitely worth +checking. + +- **[Data science is different now](https://veekaybee.github.io/2019/02/13/data-science-is-different/) + by [Vicki Boykis](https://veekaybee.github.io/)** + + + +> What is becoming clear is that, in the late stage of the hype cycle, data +> science is asymptotically moving closer to engineering, and the +> [skills that data scientists need](https://www.youtube.com/watch?v=frQeK8xo9Ls) +> moving forward are less visualization and statistics-based, and +> [more in line with traditional computer science curricula](https://tech.trivago.com/2018/12/03/teardown-rebuild-migrating-from-hive-to-pyspark/). + +- **[Data Versioning](https://emilygorcenski.com/post/data-versioning/) by + [Emily F. Gorcenski](https://emilygorcenski.com/)** + + + +> I want to explore how the degrees of freedom in versioning machine learning +> systems poses a unique challenge. I’ll identify four key axes on which machine +> learning systems have a notion of version, along with some brief +> recommendations for how to simplify this a bit. + +- **[Reproducibility in Machine Learning](https://blog.mi.hdm-stuttgart.de/index.php/2019/02/26/reproducibility-in-ml/) + by [Pascal Fecht](https://blog.mi.hdm-stuttgart.de/index.php/author/pf023/)** + + + +> ...the objective of this post is not to philosophize about the dangers and +> dark sides of AI. 
In fact, this post aims to work out common challenges in +> reproducibility for machine learning and shows programming differences to +> other areas of Computer Science. Secondly, we will see practices and workflows +> to create a higher grade of reproducibility in machine learning algorithms. + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We will be sifting through the issues and discussions and share the most +interesting takeaways. + +### Q: [Edit and define DVC files manually, in a Makefile style](https://discordapp.com/channels/485586884165107732/485586884165107734/541622187296161816) + +There is no separate guide for that, but it is very straight forward. See +[DVC file format](https://dvc.org/doc/user-guide/dvc-file-format) description +for how DVC file looks inside in general. All `dvc add` or `dvc run` does is +just computing `md5` fields in it, that is all. You could write your DVC-file +and then run `dvc repro` that will run a command(if any) and compute all needed +checksums,[read more](https://discordapp.com/channels/485586884165107732/485586884165107734/541622187296161816). + +### Q: [Best practices to define the code dependencies](https://discordapp.com/channels/485586884165107732/485586884165107734/547424240677158915) + +There’s a ton of code in that project, and it’s very non-trivial to define the +code dependencies for my training stage — there are a lot of imports going on, +the training code is distributed across many modules, +[read more](https://discordapp.com/channels/485586884165107732/485586884165107734/547424240677158915) + +### Q: [Azure data lake support](https://discordapp.com/channels/485586884165107732/485586884165107734/548495589428428801) + +DVC officially only supports regular Azure blob storage. Gen1 Data Lake should +be accessible by the same interface, so configuring a regular azure remote for +DVC should work. Seems like Gen2 Data Lake +[has disable](https://discordapp.com/channels/485586884165107732/485586884165107734/550546413197590539) +blob API. 
If you know more details about the difference between Gen1 and Gen2, +feel free to join [our community](https://dvc.org/chat) and share this +knowledge. + +### Q: [What licence DVC is released under](https://discordapp.com/channels/485586884165107732/485596304961962003/542390986299539459) + +Apache 2.0. One of the [most common](https://opensource.org/licenses) and +permissible OSS licences. + +### Q: Setting up S3 compatible remote + +([Localstack](https://discordapp.com/channels/485586884165107732/485596304961962003/543445798868746278), +[wasabi](https://discordapp.com/channels/485586884165107732/485596304961962003/541466951474479115)) + +```dvc +$ dvc remote add upstream s3://my-bucket +$ dvc remote modify upstream region REGION_NAME +$ dvc remote modify upstream endpointurl +``` + +Find and click the `S3 API compatible storage` on +[this page](https://dvc.org/doc/commands-reference/remote-add) + +### Q: [Why DVC creates and updates `.gitignore` file?](https://discordapp.com/channels/485586884165107732/485596304961962003/543914550173368332) + +It adds your data files there, that are tracked by DVC, so that you don’t +accidentally add them to git as well you can open it with file editor of your +liking and see your data files listed there. + +### Q: [Managing data and pipelines with DVC on HDFS](https://discordapp.com/channels/485586884165107732/485596304961962003/545562334983356426) + +With DVC, you could connect your data sources from HDFS with your pipeline in +your local project, by simply specifying it as an external dependency. For +example let’s say your script `process.cmd` works on an input file on HDFS and +then downloads a result to your local workspace, then with DVC it could look +something like: + +```dvc +$ dvc run -d hdfs://example.com/home/shared/input \ + -d process.cmd \ + -o output process.cmd +``` + +[read more](https://discordapp.com/channels/485586884165107732/485596304961962003/545562334983356426). + +
+ +If you have any questions, concerns or ideas, let us know +[here](https://dvc.org/support) and our stellar team will get back to you in no +time. diff --git a/content/blog/2019-04-18-april-19-dvc-heartbeat.md b/content/blog/2019-04-18-april-19-dvc-heartbeat.md new file mode 100644 index 00000000000..5f98edd1da2 --- /dev/null +++ b/content/blog/2019-04-18-april-19-dvc-heartbeat.md @@ -0,0 +1,264 @@ +--- +title: April ’19 DVC❤️Heartbeat +date: 2019-04-18 +description: | + DVC creator Dmitry Petrov is giving a talk on PyCon 2019 🎤, new DVC logo + design, new Discord discussions, interesting reads that caught our eye, and + everything along the way. +descriptionLong: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: ../../static/uploads/images/2019-04-18/post-image.jpeg +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/april-19-dvc-heartbeat/292 +tags: + - Heartbeat + - Discord Gems + - PyCon +--- + +## News and links + +We have some exciting news to share this month! + +DVC is going to [PyCon 2019](https://us.pycon.org/2019/)! It is the first +conference that we attend as a team. When we say ‘team’ — we mean it. Our +engineers are flying from all over the globe to get together offline and catch +up with fellow Pythonistas. + +The [speaker pipeline](https://us.pycon.org/2019/schedule/talks/list/) is +amazing! DVC creator Dmitry Petrov is giving a talk on +[Machine learning model and dataset versioning practices](https://us.pycon.org/2019/schedule/presentation/176/). + +Stop by our booth at the Startup Row on Saturday, May 4, reach out and let us +know that you are willing to chat, or simply find a person with a huge DVC owl +on their shirt! 
+ +Speaking of the owls — DVC has done some rebranding recently and we love our new +logo. Special thanks to [99designs.com](https://99designs.com/) for building a +great platform for finding trusted designers. + +![](/uploads/images/2019-04-18/trusted-designers.png) + +DVC is moving fast (almost as fast as my two-year-old). We do our best to keep +up and totally love all the buzz in our community channels lately! + +Here is a number of interesting reads that caught our eye: + +- **[A walkthrough of DVC](https://blog.codecentric.de/en/2019/03/walkthrough-dvc/) + by [Bert Besser](https://www.linkedin.com/in/bert-besser-284564182/)** + + + +A great article about using DVC with a quite advanced scenario and docker. If +you haven’t had a chance to try [DVC.org](http://dvc.org/) yet — this is a great +comprehensive read on why you should do so right away. + +- **[The state of machine learning operations](https://github.com/EthicalML/state-of-mlops-2019) + by [Alejandro Saucedo](https://www.linkedin.com/in/axsaucedo/)** + + + +A short (only 8 minutes!) and inspiring talk by Alejandro Saucedo at FOSDEM. +Alejandro covers the key trends in machine learning operations, as well as most +recent open source tools and frameworks. Focused on reproducibility, monitoring +and explainability, this lightning talk is a great snapshot of the current state +of ML operations. + +- **[Interview with Kaggle Grandmaster, Senior Computer Vision Engineer at Lyft: Dr. Vladimir I. Iglovikov](https://hackernoon.com/interview-with-kaggle-grandmaster-senior-cv-engineer-at-lyft-dr-vladimir-i-iglovikov-9938e1fc7c) + by [Sanyam Bhutani](https://twitter.com/bhutanisanyam1)** + + + +> There is no way you will become Kaggle Master and not learn how to approach +> anew, the unknown problem in a fast hacking way with a very high number of +> iterations per unit of time. This skill in the world of competitive learning +> is the question of survival + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: [What are the system requirements to install DVC (type of operating system, dependencies of another application (as GIT), memory, cpu, etc).](https://discordapp.com/channels/485586884165107732/485596304961962003/552098155861114891) + +- It supports Windows, Mac, Linux. Python 2 and 3. + +- No specific CPU or RAM requirements — it’s a lightweight command line tool and + should be able run pretty much everywhere you can run Python. + +- It depends on a few Python libraries that it installs as dependencies (they + are specified in the + [`requirements.txt`](https://github.com/iterative/dvc/blob/master/requirements.txt)). + +- It does not depend on Git and theoretically could be run without any SCM. + Running it on top of a Git repository however is recommended and gives you an + ability to actually save history of datasets, models, etc (even though it does + not put them into Git directly). + +### Q: [Do I have to buy a server license to run DVC, do you have this?](https://discordapp.com/channels/485586884165107732/485596304961962003/560212552638791706) + +No server licenses for DVC. It is 100% free and open source. + +### Q: [What is the storage limit when using DVC?](https://discordapp.com/channels/485586884165107732/485596304961962003/560154903331340289) + +I am trying to version control datasets and models with >10 GB (Potentially even +bigger). Can DVC handle this? + +There is no limit. None enforced by DVC itself. It depends on the size of your +local or [remote storages](https://dvc.org/doc/commands-reference/remote). 
You +need to have some space available on S3, your SSH server or other storage you +are using to keep these data files, models and their version, which you would +like to store. + +### Q: [How does DVC know the sequence of stages to run](https://discordapp.com/channels/485586884165107732/485596304961962003/553731815228178433)? + +How does it connect them? Does it see that there is a dependency which is +outputted from the first run? + +DVC figures out the pipeline by looking at the dependencies and outputs of the +stages. For example, having the following: + +`gist:SvetaGr/a2a28fbc9db0a675422785bc5f925e14#heartbeat-dvc-run-2019-04.sh` + +you will end up with two stages: `download.dvc` and `duplicate.dvc`. The +download one will have `joke.txt` as an output . The duplicate one defined +`joke.txt` as a dependency, as it is the same file. DVC detects that and creates +a pipeline by joining those stages. + +You can inspect the content of each stage file +[here](https://dvc.org/doc/user-guide/dvc-file-format) (they are human +readable). + +### Q: [Is it possible to use the same data of a remote in two different repositories?](https://discordapp.com/channels/485586884165107732/485596304961962003/560022999848321026) + +(e.g. in one repo `run dvc pull -r my_remote` to pull some data and running the +same command in a different git repo should also pull the same) + +Yes! It’s a frequent scenario for multiple repos to share remotes and even local +cache. DVC file serves as a link to the actual data. If you add the same DVC +file (e.g. `data.dvc`) to the new repo and do `dvc pull -r remotename data.dvc`- +it will fetch data. You have to use `dvc remote add` first to specify the +coordinates of the remote storage you would like to share in every project. +Alternatively (check out the question below), you could use `--global` to +specify a single default remote (and/or cache dir) per machine. 
+ +### Q: [Could I set a global remote server, instead of config in each project?](https://discordapp.com/channels/485586884165107732/485586884165107734/559653121228275727) + +Use `--global` when you specify the remote settings. Then remote will be visible +for all projects on the same machine. `--global` — saves remote configuration to +the global config (e.g. `~/.config/dvc/config`) instead of a per project one — +`.dvc/config`. See more details +[here](https://dvc.org/doc/commands-reference/remote-add). + +### Q: [How do I version a large dataset in S3 or any other storage?](https://discordapp.com/channels/485586884165107732/485596304961962003/554679392823934977) + +We would recommend to skim through our +[get started](https://dvc.org/doc/get-started) tutorial, to summarize the data +versioning process of DVC: + +- You create stage (aka DVC) files by adding, importing files (`dvc add` / + `dvc import`) , or run a command to generate files: + +```dvc +$ dvc run --out file.csv "wget https://example.com/file.csv" +``` + +- This stage files are tracked by `git` + +- You use git to retrieve previous stage files (e.g. `git checkout v1.0`) + +- Then use `dvc checkout` to retrieve all the files related by those stage files + +All your files (with each different version) are stored in a `.dvc/cache` +directory, that you sync with a remote file storage (for example, S3) using the +`dvc push` or `dvc pull` commands (analogous to a `git push` / `git pull`, but +instead of syncing your `.git`, you are syncing your `.dvc` directory) on a +remote repository (let’s say an S3 bucket). + +### Q: [How do I move/rename a DVC-file?](https://discordapp.com/channels/485586884165107732/485596304961962003/558216007684980736) + +If you need to move your dvc file somewhere, it is pretty easy, even if done +manually: + +`gist:SvetaGr/b25a5b45773bf94d36e60d48462502f4#heartbeat-dvc-rename.sh` + +### Q: [I performed `dvc push` of a file to a remote. 
On the remote there is created a directory called `8f` with a file inside called `2ec34faf91ff15ef64abf3fbffa7ee`. The original CSV file doesn’t appear on the remote. Is that expected behaviour?](https://discordapp.com/channels/485586884165107732/485596304961962003/555431645402890255) + +This is an expected behavior. DVC saves files under the name created from their +checksum in order to prevent duplication. If you delete “pushed” file in your +project directory and perform `dvc pull`, DVC will take care of pulling the file +and renaming it to “original” name. + +Below are some details about how DVC cache works, just to illustrate the logic. +When you add a data source: + +`gist:SvetaGr/b69fa8ce36bcce00ecd69e7f2d7ccd2e#heartbeat-remote-file-naming.sh` + +It computes the (md5) checksum of the file and generates a DVC file with related +information: + +`gist:SvetaGr/110ae76df929654ec573ea9e4b1e1980#heartbeat-dvc-file-2019-04.yaml` + +The original file is moved to the cache and a link or copy (depending on your +filesystem) is created to replace it on your working space: + +`gist:SvetaGr/133cb93e5a21c6f21a86f8709ed39ea9#heartbeat-cache-structure-2019-04.sh` + +### Q: [Is it possible to integrate dvc with our in-house tools developed in Python?](https://discordapp.com/channels/485586884165107732/485586884165107734/553570391000481802) + +Absolutely! There are three ways you could interact with DVC: + +1. Use [subprocess](https://docs.python.org/3/library/subprocess.html) to launch + DVC + +2. Use `from dvc.main import main` and use it with regular CLI logic like + `ret = main(‘add’, ‘foo’)` + +3. Use our internal API (see `dvc/repo` and `dvc/command` in our source to get a + grasp of it). It is not officially public yet, and we don’t have any special + docs for it, but it is fairly stable and could definitely be used for a POC. + We’ll add docs and all the official stuff for it in the not-so-distant + future. 
+ +### Q: [Can I still track the linkage between data and model without using `dvc run`](https://discordapp.com/channels/485586884165107732/485586884165107734/555750217522216990) and a graph of tasks? Basically what would like extremely minimal DVC invasion into my GIT repo for an existing machine learning application? + +There are two options: + +1. Use `dvc add` to track models and/or input datasets. It should be enough if + you use `git commit` on DVC files produced by `dvc add`. This is the very + minimum you can get with DVC and it does not require using DVC run. Check the + first part (up to the Pipelines/Add transformations section) of the DVC + [get started](https://dvc.org/doc/get-started). + +2. You could use `--no-exec` in `dvc run` and then just `dvc commit` and + `git commit` the results. That way you’ll get your DVC files with all the + linkages, without having to actually run your commands through DVC. + +If you have any questions, concerns or ideas, let us know +[here](https://dvc.org/support) and our stellar team will get back to you in no +time. diff --git a/content/blog/2019-04-23-dvc-project-ideas-for-google-summer-of-docs-2019.md b/content/blog/2019-04-23-dvc-project-ideas-for-google-summer-of-docs-2019.md new file mode 100644 index 00000000000..62985559a99 --- /dev/null +++ b/content/blog/2019-04-23-dvc-project-ideas-for-google-summer-of-docs-2019.md @@ -0,0 +1,226 @@ +--- +title: DVC project ideas for Google Season of Docs 2019 +date: 2019-04-23 +description: | + DVC.org is applying for Google Season of Docs — a new and unique program + sponsored by Google that pairs technical writers with open source projects to + collaborate on the open source project documentation. 
+descriptionLong: | + [DVC.org](https://dvc.org) is applying for + [Google Season of Docs](https://developers.google.com/season-of-docs/) — a new + and unique program sponsored by Google that pairs technical writers with open source projects to + collaborate on the open source project documentation. + + It’s happening for the first time in 2019 and we are excited about the + opportunity to be a part of it! +picture: ../../static/uploads/images/2019-04-23/post-image.png +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/dvc-project-ideas-for-google-season-of-docs-2019/291 +tags: + - Google Season of Docs + - Python + - Documentation +--- + +We strongly believe that well-shaped documentation is key for making the product +truly open. We have been investing lots of time and energy in improving our docs +lately. Being a team of 90% engineers we are eager to welcome the writers into +our team and our community. We are happy to share our experience, introduce them +to the world of open source and machine learning best practices, guide through +the OS contribution process and work together on improving our documentation. + +DVC was started in late 2017 by a data scientist and an engineer. It is now +growing pretty fast and though our in-house team is quite small, we have to +thank our contributors (more than 80 in both code and docs) for developing DVC +with us. When working with DVC the technical writer will not only get lots of +hands-on experience in writing technical docs, but will also immerse into DVC +community — a warm and welcoming gathering of ML and DS enthusiasts and an +invaluable source of inspiration and expertise in ML engineering. + +### About DVC + +DVC is a brainchild of a data scientist and an engineer, that was created to +fill in the gaps in the ML processes tooling and evolved into a successful open +source project. + +ML brings changes in development and research processes. 
These ML processes +require new tools for data versioning, ML pipeline versioning, resource +management for model training and others that haven’t been formalized. The +traditional software development tools do not fully cover ML team’s needs but +there are no good alternatives. It makes engineers to custom develop a new +toolset to manage data files, keep track of ML experiments and connect data and +source code together. The ML process becomes very fragile and requires tons of +tribal knowledge. + +We have been working on [DVC](http://DVC.org) by adopting best ML practices and +turning them into Git-like command line tool. DVC versions multi-gigabyte +datasets and ML models, make them shareable and reproducible. The tool helps to +organize a more rigorous process around datasets and the data derivatives. Your +favorite cloud storage (S3, GCS, or bare metal SSH server) could be used with +DVC as a data file backend. + +If you are interested in learning a little bit more about DVC and its journey, +here is a great interview with DVC creator in the Episode 206 of +Podcast.**init**. Listen to it +[HERE ](https://www.pythonpodcast.com/data-version-control-episode-206/)or read +the transcript +[HERE.](https://towardsdatascience.com/data-version-control-with-dvc-what-do-the-authors-have-to-say-3c3b10f27ee) + +### The state of DVC documentation + +DVC is a pretty young project, developed and maintained solely by engineers. As +many OS projects we started from the bottom and for a long time our +[documentation](https://dvc.org/doc) was a bunch of bits and pieces. Nowadays +improving documentation is one of our top priorities. We moved to the new +in-house built documentation engine and started working with several technical +writers. Certain parts have been tremendously improved recently, e.g. +[Get Started](https://dvc.org/doc/get-started) and +[certain parts of Commands Reference](https://dvc.org/doc/commands-reference/fetch) +. 
So far most of our documentation has been written majorly by the engineering +team and there is need for improving the overall structure and making some parts +more friendly from a new user perspective. We have mostly complete +[reference documentation](https://dvc.org/doc/commands-reference) for each +command, although some functions are missing good actionable examples. We also +have a [User Guide](https://dvc.org/doc/user-guide/dvc-files-and-directories), +however it is not in very good shape. We strive for making our documentation +clear and comprehensive for users of various backgrounds and proficiency levels +and this is where we do need some fresh perspective. + +### How DVC documentation is built + +We have an open Github Apache-2 licensed repository for the +[DVC website](https://github.com/iterative/dvc.org), the documentation engine +and the [documentation files](https://github.com/iterative/dvc.org). The website +is built with Node.js + React, including the documentation engine (built +in-house). + +Each documentation page is a static Markdown file in the repository, e.g. +[example here](https://github.com/iterative/dvc.org/blob/master/static/docs/get-started/example-versioning.md.). +It is rendered dynamically in the browser, no preprocessing is required. It +means that tech writers or contributors need to write/edit a Markdown file, +create a pull request and merge it into the master branch of the +[repository.](https://github.com/iterative/dvc.org) The complete +[documentation contributing guide](https://github.com/iterative/dvc.org/blob/master/README.md#contributing) +describes the directory structure and locations for the different documentation +parts. + +### DVC’s approach to documentation work + +Documentation tasks and issues are maintained on our doc’s GitHub +[issue tracker](https://github.com/iterative/dvc.org/issues). 
Changes to the +documentation are made via pull requests on GitHub, and go through our standard +review process which is the same for documentation and code. A technical writer +would be trained in working with our current development process. It generally +means that tech writers or contributors need to write/edit a Markdown file, use +git and Github to create a pull request and publish it. The documentation +[contributing guide](https://github.com/iterative/dvc.org/blob/master/README.md#contributing) +includes style conventions and other details. Documentation is considered of the +same importance as code. Engineering team has a policy to write or update the +relevant sections if something new is released. If it’s something too involved +engineers may create a ticket and ask for help. There is one maintainer who is +responsible for doing final reviews and merging the changes. In this sense, our +documentation is very similar to any other open source project. + +## Project ideas for GSoD’19 + +We identified a number of ideas to work on and there are two major topics these +ideas fall into. Both topics are pretty broad and we don’t expect we can +completely cover them during this GSoD but hopefully we can make certain +progress. + +First of all, we want to bring more structure and logic to our documentation to +improve user onboarding experience. The goal is for a new user to have a clear +path they can follow and understand what takeaways each part of the +documentation provides. In particular, improving how +[Get Started](https://dvc.org/doc/get-started), +[Tutorials](https://dvc.org/doc/tutorial) and +[Examples](https://dvc.org/doc/get-started/example-versioning) relate to each +other, restructuring the existing [User Guide](https://dvc.org/doc/user-guide) +to explain basic concepts, and writing more use cases that resonate with ML +engineers and data scientists. 
+ +The other issue we would like to tackle is improving and expanding the existing +reference docs — commands descriptions, examples, etc. It involves filling in +the gaps and developing new sections, similar to +[this one](https://dvc.org/doc/commands-reference/fetch). We would also love to +see more illustrative materials. + +### Project 1: Improving and expanding User Guide + +**Description and details:** Reviewing, restructuring and filling major gaps in +the User Guide (introductory parts of the basic concepts of DVC), e.g. have a +look at [this ticket](https://github.com/iterative/dvc.org/issues/144) or +[this one](https://github.com/iterative/dvc.org/issues/53). + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +### Project 2: Expanding and developing new tutorials and use cases. + +**Description and details:** We already have some requests for more tutorials, +e.g. [this ticket](https://github.com/iterative/dvc.org/issues/96). Here is +another good [use case request](https://github.com/iterative/dvc.org/issues/194) +. If you are going to work on this project you would need some domain knowledge, +preferably some basic ML or data science experience. + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +### Project 3: Improving new user onboarding + +**Description and details:** Analyze and restructure user walkthrough across +[Get started](https://dvc.org/doc/get-started), +[Tutorials](https://dvc.org/doc/tutorial) and +[Examples](https://dvc.org/doc/get-started/example-versioning). These three have +one thing in common — hands-on experience with DVC. 
If you choose this project, +we will work together to come up with a better location for the Examples (to +move them out of the Get Started shadow), and a better location for the +Tutorials (to reference external tutorials that were developed by our community +members and published on different platforms). + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +### Project 4: Improving commands reference + +**Description and details:** We will work on improving our +[Commands reference](https://dvc.org/doc/commands-reference) section. This +includes expanding and filling in the gaps. One of the biggest pain points right +now are Examples. Users want them to be +[easy to run and try](https://github.com/iterative/dvc.org/issues/198) and here +is a lot to be done in terms of improvement. We have a good example of how is +should be done [here](https://dvc.org/doc/commands-reference/fetch). + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +### Project 5: Describe and integrate “DVC packages” + +**Description and details:** Describe the brand new feature “DVC packages” and +integrate it with the rest of the documentation. We have been working hard to +release a few new commands to help with datasets management (have a look at +[this ticket](https://github.com/iterative/dvc/issues/1487)). It’s a major +feature that deserves its place in the Get Started, Use cases, Commands +Reference, etc. + +**Mentors**: [@shcheklein](https://github.com/shcheklein) and +[@dmpetrov](https://github.com/dmpetrov) + +The ideas we outline above are just an example of what we can work on. We are +open for any other suggestions and would like to work together with the +technical writer to make the contribution experience both useful and enjoyable +for all parties involved. 
If you have any suggestions or questions we would love +to hear from you => DVC.org/support and our DMs on +[Twitter](https://twitter.com/DVCorg) are always open! + +
+ +Special thanks to the [NumFOCUS](https://numfocus.org/) for the ideas list +inspiration. + +If you are a tech writer — check the +[Technical writer guide](https://developers.google.com/season-of-docs/docs/tech-writer-guide). +From April 30, 2019 you can see the list of participating open source +organizations on the [Season of Docs website](https://g.co/seasonofdocs). The +application period for technical writers opens on **May 29, 2019** and ends on +June 28, 2019. diff --git a/content/blog/2019-05-21-may-19-dvc-heartbeat.md b/content/blog/2019-05-21-may-19-dvc-heartbeat.md new file mode 100644 index 00000000000..f3bc2762587 --- /dev/null +++ b/content/blog/2019-05-21-may-19-dvc-heartbeat.md @@ -0,0 +1,300 @@ +--- +title: May ’19 DVC❤️Heartbeat +date: 2019-05-21 +description: | + DVC accepted into Google Season of Docs 🎉, Dmitry's talk at the O’Reilly AI + Conference, new portion of Discord gems, and articles either created or + brought to us by our community. +descriptionLong: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: ../../static/uploads/images/2019-05-21/post-image.jpeg +pictureComment: | + Kudos to [StickerMule.com](https://www.stickermule.com) for our amazing + stickers (and great customer service)! +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/may-19-dvc-heartbeat/290 +tags: + - Heartbeat + - Discord Gems + - Google Season of Docs +--- + +## News and links + +This section of DVC Heartbeat is growing with every new Issue and this is +already quite a good piece of news! + +One of the most exciting things we want to share this month is acceptance of DVC +into the [Google Season of Docs](https://developers.google.com/season-of-docs/). 
+It is a new and unique program sponsored by Google that pairs technical writers +with open source projects to collaborate and improve the open source project +documentation. You can find the outline of DVC vision and project ideas in +[this dedicated blogpost](https://blog.dataversioncontrol.com/dvc-project-ideas-for-google-summer-of-docs-2019-defe3a73b248) +and check the +[full list of participating open source organizations](https://developers.google.com/season-of-docs/docs/participants/). +Technically the +[program is starting in a few months](https://developers.google.com/season-of-docs/docs/timeline), +but there is already a fantastic increase in the amount of commits and +contributors, and we absolutely love it! + +The other important milestone for us was the first offline meeting with our +distributed remote team. Working side by side and having non-Zoom meetings with +the team was amazing. Joining our forces to prepare for the upcoming conferences +turned out to be the most valuable, educating and uniting experience for the +whole team. + +It’s a shame that our tech lead was unable to join us it due to another visa +denial. We do hope he will finally make it to the USA for the next big +conference. + +![](/uploads/images/2019-05-21/the-world-is-changing.png) + +While we were busy finalizing all the PyCon 2019 prep, our own +[Dmitry Petrov](https://twitter.com/FullStackML) flew to New York to speak at +the +[O’Reilly AI Conference](https://conferences.oreilly.com/artificial-intelligence/ai-ny) +about the +[Open Source tools for Machine Learning Models and Datasets versioning](https://www.oreilly.com/library/view/artificial-intelligence-conference/9781492050544/video324691.html). +Unfortunately the video is available for the registered users only (with a free +trial option) but you can have a look at Dmitry’s slides +[here](https://www.slideshare.net/DmitryPetrov15/dvc-oreilly-artificial-intelligence-conference-2019-new-york). 
+ +![](/uploads/images/2019-05-21/iterative-ai-twitter.png) + +We renamed our Twitter! Our old handle was a bit misleading and we moved from +@Iterativeai to [@DVCorg](https://twitter.com/DVCorg) (yet keep the old one for +future projects). + +Our team is so happy every time we discover an article featuring DVC or +addressing one of the burning ML issues we are trying to solve. Here are some of +our favorite links from the past month: + +- **[Version Control For Your Machine Learning Projects — Episode 206](https://www.pythonpodcast.com/data-version-control-episode-206/)** + by **[Tobias Macey](https://www.linkedin.com/in/tmacey/)** + + + +> Version control has become table stakes for any software team, but for machine +> learning projects there has been no good answer for tracking all of the data +> that goes into building and training models, and the output of the models +> themselves. To address that need Dmitry Petrov built the Data Version Control +> project known as DVC. In this episode he explains how it simplifies +> communication between data scientists, reduces duplicated effort, and +> simplifies concerns around reproducing and rebuilding models at different +> stages of the projects lifecycle. + +- **Here is an + [article](https://towardsdatascience.com/data-version-control-with-dvc-what-do-the-authors-have-to-say-3c3b10f27ee) + by [Favio Vázquez](https://medium.com/@faviovazquez) with a transcript of this + podcast episode.** + + + +- **[Why Git and Git-LFS is not enough to solve the Machine Learning Reproducibility crisis](https://towardsdatascience.com/why-git-and-git-lfs-is-not-enough-to-solve-the-machine-learning-reproducibility-crisis-f733b49e96e8)** + + + +> With Git-LFS your team has better control over the data, because it is now +> version controlled. Does that mean the problem is solved? Earlier we said the +> “_key issue is the training data_”, but that was a lie. Sort of. Yes keeping +> the data under version control is a big improvement. 
But is the lack of +> version control of the data files the entire problem? No. + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: This might be [a favourite gem of ours ](https://discordapp.com/channels/485586884165107732/485598848111083531/572960640122224640) — our engineers are so fast that someone assumed they were bots. + +We feared that too until we met them in person. They appeared to be real (unless +bots also love Ramen now)! + +![](/uploads/images/2019-05-21/bots-also-love-ramen-now.png) + +### Q: [Is this the best way to track data with DVC when code and data are separate?](https://discordapp.com/channels/485586884165107732/485596304961962003/572974117351849997) Having being burned by this a couple of times, i.e accidentally pushing large files to GitHub, I now keep my code and data separate. + +Every time you run `dvc add` to start tracking some data artifact, its path is +automatically added to the `.gitignore` file, as a result it is hard to commit +it to git by mistake — you would need to explicitly modify the `.gitignore` +first. The feature to track some external data is called +[external outputs](https://dvc.org/doc/user-guide/external-outputs) (if all you +need is to track some data artifacts). Usually it is used when you have some +data on S3 or SSH and don’t want to pull it into your working space, but it’s +working even when your data is located on the same machine outside of the +repository. + +### Q: [How do I wrap a step that downloads a file/directory into a DVC stage?](https://discordapp.com/channels/485586884165107732/485596304961962003/571342592508428289) I want to ensure that it runs only if file has no been downloaded yet + +Use `dvc import` to track and download the remote data first time and next time +when you do dvc repro if data has changed remotely. 
If you don’t want to track
+remote changes (lock the data after it was downloaded), use `dvc run` with a
+dummy dependency (any text file that you do not touch will do) that runs an
+actual wget/curl to get the data.
+
+### Q: [How do I show a pipeline that does not have a default Dvcfile?](https://discordapp.com/channels/485586884165107732/485596304961962003/570943786151313408) (e.g. I assigned all file names manually with `-f` in the `dvc run` command and I just don’t have `Dvcfile` anymore)
+
+Almost any command in DVC that deals with pipelines (set of DVC-files) accepts a
+single stage as a target, for example:
+
+```dvc
+$ dvc pipeline show --ascii model.dvc
+```
+
+### Q: [DVC hangs or I’m getting `database is locked` issue](https://discordapp.com/channels/485586884165107732/485596304961962003/570843482218823682)
+
+It’s a well-known problem with NFS, CIFS (Azure) — they do not support file
+locks properly, which is required by the SQLite engine to operate. The easiest
+workaround — don’t create a DVC project on a network-attached partition. In
+certain cases a fix can be made by changing mounting options, check
+[this discussion](https://discordapp.com/channels/485586884165107732/485596304961962003/570276668694855690)
+for the Azure ML Service.
+
+### Q: [How do I use DVC if I use a separate drive to store the data and a small/fast SSD to run computations?](https://discordapp.com/channels/485586884165107732/485596304961962003/570091809594671126) I don’t have enough space to bring data to my working space.
+
+An excellent question! The short answer is:
+
+```dvc
+# To move your data cache to a big partition
+$ dvc cache dir --local /path/to/an/external/partition
+
+# To enable symlinks/hardlinks to avoid actual copying
+$ dvc config cache.type reflink, hardlink, symlink, copy
+
+# To protect the cache
+$ dvc config cache.protected true
+```
+
+The last one is highly recommended to make links in your working space read-only
+to avoid corrupting the cache.
Read more about different link types +[here](https://dvc.org/doc/user-guide/large-dataset-optimization). + +To add your data first time to the DVC cache, do a clone of the repository on a +big partition and run `dvc add` to add your data. Then you can do `git pull`, +`dvc pull` on a small partition and DVC will create all the necessary links. + +### Q: [Why I’m getting `Paths for outs overlap` error when I run `dvc add` or `dvc run`?](https://discordapp.com/channels/485586884165107732/485596304961962003/571335064374345749) + +Usually it means that a parent directory of one of the arguments for `dvc add` / +`dvc run` is already tracked. For example, you’ve added the whole datasets +directory already. And now you are trying to add a subdirectory, which is +already tracked as a part of the datasets one. No need to do that. You could +`dvc add datasets` or `dvc repro datasets.dvc` to save changes. + +### Q: [I’m getting `ascii codec can’t encode character` error on DVC commands when I deal with unicode file names](https://discordapp.com/channels/485586884165107732/485596304961962003/567310354766495747) + +[Check the locale settings you have](https://perlgeek.de/en/article/set-up-a-clean-utf8-environment) +(`locale` command in Linux). Python expects a locale that can handle unicode +printing. Usually it’s solved with these commands: `export LC_ALL=en_US.UTF-8` +and `export LANG=en_US.UTF-8`. You can place those exports into `.bashrc` or +other file that defines your environment. + +### Q: [Does DVC use the same logins `aws-cli` has when using an S3 bucket as its repo/remote storage](https://discordapp.com/channels/485586884165107732/485596304961962003/563149775340568576)? + +In short — yes, but it can be also configured. DVC is going to use either your +default profile (from `~/.aws/*`) or your env vars by default. If you need more +flexibility (e.g. 
you need to use different credentials for different projects, +etc) check out +[this guide](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-profiles.html) +to configure custom aws profiles and then you could use them with DVC using +these +[remote options](https://dvc.org/doc/commands-reference/remote-add#options). + +### Q: [How can I output multiple metrics from a single file?](https://discordapp.com/channels/485586884165107732/485596304961962003/566000729505136661) + +Let’s say I have the following in a file: + +```json +{ + “AUC_RATIO”: + { + “train”: 0.8922748258797667, + “valid”: 0.8561602726251776, + “xval”: 0.8843431199314923 + } +} +``` + +How can I show both `train` and `valid` without `xval`? + +You can use `dvc metrics show` command `--xpath` option and provide multiple +attribute names to it: + +```dvc +$ dvc metrics show metrics.json \ + --type json \ + --xpath AUC_RATIO[train,valid] + metrics.json: + 0.89227482588 + 0.856160272625 +``` + +### Q: [What is the quickest way to add a new dependency to a DVC-file?](https://discordapp.com/channels/485586884165107732/485596304961962003/566314479499870211) + +There are a few options to add a new dependency: + +- simply opening a file with your favorite editor and adding a dependency there + without md5. DVC will understand that that stage is changed and will re-run + and re-calculate md5 checksums during the next DVC repro; + +- use `dvc run --no-exec` is another option. It will rewrite the existing file + for you with new parameters. + +### Q: [Is there a way to add a dependency to a python package, so it runs a stage again if it imported the updated library?](https://discordapp.com/channels/485586884165107732/485596304961962003/566315265646788628) + +The only recommended way so far would be to somehow make DVC know about your +package’s version. 
One way to do that would be to create a separate stage that
+would dynamically print the version of that specific package into a file that
+your stage would depend on:
+
+```dvc
+$ dvc run -o mypkgver 'pip show mypkg > mypkgver'
+$ dvc run -d mypkgver -d ... -o .. mycmd
+```
+
+### Q: [Is there any way to forcibly recompute the hashes of dependencies in a pipeline DVC-file?](https://discordapp.com/channels/485586884165107732/485596304961962003/564807276146458624)
+
+E.g. I made some whitespace/comment changes in my code and I want to tell DVC
+“it’s ok, you don’t have to recompute everything”.
+
+Yes, you could `dvc commit -f`. It will save all current checksums without
+re-running your commands.
+
+### Q: [I have projects that use data that’s stored in S3. I never have data locally to use `dvc push`, but I would like to have this data version controlled.](https://discordapp.com/channels/485586884165107732/485596304961962003/563352000281182218) Is there a way to use the features of DVC in this use case?
+
+Yes! This DVC feature is called
+[external outputs](https://dvc.org/doc/user-guide/external-outputs) and
+[external dependencies](https://dvc.org/doc/user-guide/external-dependencies).
+You can use one of them or both to track, process, and version your data on
+cloud storage without downloading it locally.
+ +If you have any questions, concerns or ideas, let us know +[here](https://dvc.org/support) and our stellar team will get back to you in no +time! diff --git a/content/blog/2019-06-26-june-19-dvc-heartbeat.md b/content/blog/2019-06-26-june-19-dvc-heartbeat.md new file mode 100644 index 00000000000..f602b772029 --- /dev/null +++ b/content/blog/2019-06-26-june-19-dvc-heartbeat.md @@ -0,0 +1,233 @@ +--- +title: June ’19 DVC❤️Heartbeat +date: 2019-06-26 +description: | + First DVC user survey, sharing our PyCon experience, new portion of Discord + discussions, and articles either created or brought to us by our community. +descriptionLong: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: ../../static/uploads/images/2019-06-26/post-image.png +pictureComment: | + Thanks to the amazing [Signaturit Tech](https://twitter.com/SignaturitTech) + team for this + [photo](https://twitter.com/SignaturitTech/status/1127927520140120065?s=20)! +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/june-19-dvc-heartbeat/289 +tags: + - PyCon + - Heartbeat + - Discord Gems +--- + +## News and links + +We want to start by saying to our users, contributors, and community members how +grateful we are for the fantastic work you are doing contributing to DVC, giving +talks about DVC, sharing your feedback, use cases and your concerns. A huge +thank you to each of you from the DVC team! + +We would love to give back and support any positive initiative around DVC — just +let us know [here](https://dvc.org/support) and we will send you a bunch of cool +swag, connect to a tech expert or find another way to support your project. 
Our
+[DMs on Twitter](https://twitter.com/DVCorg) are open, too.
+
+**And if you have 4 minutes to spare, we are conducting our first
+[DVC user survey](https://docs.google.com/forms/d/1tmn8YHLUkeSi5AIq4DGJi28iZy9HTazl6DWKe3Hxpnc/edit?ts=5cfc47c2)
+and would love to hear from you!**
+
+Aside from admiring great DVC-related content from our users, we have one more
+reason to particularly enjoy the past month — the DVC team went to Cleveland to
+attend [PyCon 2019](https://us.pycon.org/2019/about/) and it was a blast!
+
+![](/uploads/images/2019-06-26/cleveland-to-attend-pycon-2019.jpeg) _Amazing
+[Jennifer](https://github.com/sureL) and her artwork for our
+[SupportOpenSource](https://twitter.com/hashtag/SupportOpenSource) contest_
+
+We had it all. Running our first ever conference booth, leading an impromptu
+unconference discussion and arranging some cool
+[#SupportOpenSource](https://twitter.com/hashtag/SupportOpenSource?src=hashtag_click)
+activities was great! Last-minute accommodation cancellations, booth equipment
+delivery issues, and being late for our very own talk was not so great. Will be
+sharing more about it in a separate blogpost soon.
+
+https://youtu.be/jkfh2PM5Sz8
+
+Here is [Dmitry Petrov](https://twitter.com/FullStackML)’s PyCon
+[talk](https://www.youtube.com/watch?v=jkfh2PM5Sz8) and
+[slides](https://docs.google.com/presentation/d/1CYt0w8WoZAXiQEtVDVDsTnQumzdZx91v32MwEK20R-E/edit)
+on Machine learning model and dataset versioning practices.
+
+We absolutely loved being at PyCon and can’t wait for our next conference!
+ +Our team is so happy every time we discover an article featuring DVC or +addressing one of the burning ML issues we are trying to solve. Here are some of +the links that caught our eye past month: + +- **[The Rise of DataOps (from the ashes of Data Governance)](https://towardsdatascience.com/the-rise-of-dataops-from-the-ashes-of-data-governance-da3e0c3ac2c4) + by [Ryan Gross](https://towardsdatascience.com/@ryanwgross).** + +A brilliant comprehensive read on the current data management issues. It might +be the best article we have ever read on this subject. Every word strongly +resonates with our vision and ideas behind DVC. Highly recommended by DVC team! + + + +> Legacy Data Governance is broken in the ML era. Let’s rebuild it as an +> engineering discipline. At the end of the transformation, data governance will +> look a lot more like DevOps, with data stewards, scientists, and engineers +> working closely together to codify the governance policies. + +- **[First Impressions of Data Science Version Control (DVC)](https://medium.com/@christopher.samiullah/first-impressions-of-data-science-version-control-dvc-fe96ab29cdda) + by [Christopher Samiullah](https://christophergs.github.io/)** + + + +> In 2019, we tend to find organizations using a mix of git, Makefiles, ad hoc +> scripts and reference files to try and achieve reproducibility. DVC enters +> this mix offering a cleaner solution, specifically targeting Data Science +> challenges. 
+ +- **[Versioning and Reproducibility with MLV-tools and DVC](https://github.com/peopledoc/mlv-tools-tutorial): + [Talk](https://peopledoc.github.io/mlv-tools-tutorial/talks/pyData/presentation.html#/) + and + [Tutorial](https://peopledoc.github.io/mlv-tools-tutorial/talks/workshop/presentation.html#/) + by [Stéphanie Bracaloni](https://github.com/sbracaloni) and + [Sarah Diot-Girard](https://github.com/SdgJlbl).** + +![](/uploads/images/2019-06-26/versioning-and-reproducibility-with-mlv-tools.png) + +- **[Becoming a machine learning company means investing in foundational technologies](https://www.oreilly.com/ideas/becoming-a-machine-learning-company-means-investing-in-foundational-technologies) + by [Ben Lorica](https://www.oreilly.com/people/4e7ad-ben-lorica)** + + + +> With an eye toward the growing importance of machine learning, we recently +> completed +> [a data infrastructure survey](https://www.oreilly.com/data/free/evolving-data-infrastructure.csp) +> that drew more than 3,200 respondents. + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: [Does DVC support Azure Data Lake Gen1?](https://discordapp.com/channels/485586884165107732/563406153334128681/575655655629651968) + +Azure data lake is HDFS compatible. And DVC supports HDFS remotes. Give it a try +and let us know if you hit any problems [here](https://dvc.org/chat). + +### Q: [An excellent discussion on versioning tabular (SQL) data.](https://discordapp.com/channels/485586884165107732/563406153334128681/575681811401801748) Do you know of any tools that deal better with SQL-specific versioning? + +It’s a wide topic. The actual solution might depend on a specific scenario and +what exactly needs to be versioned. DVC does not provide any special +functionality on top of databases to version their content. + +Depending on your use case, our recommendation would be to run SQL and pull the +result file (CSV/TSV file?) that then can be used to do analysis. This file can +be taken under DVC control. Alternatively, in certain cases source files (that +are used to populate the databases) can be taken under control and we can keep +versions of them, or track incoming updates. + +Read the +[discussion](https://discordapp.com/channels/485586884165107732/563406153334128681/575681811401801748) +to learn more. + +### Q: [How does DVC do the versioning between binary files?](https://discordapp.com/channels/485586884165107732/563406153334128681/575686711821205504) Is there a binary diff, similar to git? Or is every version stored distinctly in full? + +DVC is just saving every file as is, we don’t use binary diffs right now. 
There
+won’t be a full directory (if you added just a few files to a 10M files
+directory) duplication, though, since we treat every file inside as a separate
+entity.
+
+### Q: [Is there a way to pass parameters from e.g. `dvc repro` to stages?](https://discordapp.com/channels/485586884165107732/563406153334128681/576160840701575169)
+
+The simplest option is to create a config file — json or whatnot — that your
+scripts would read and your stages depend on.
+
+### Q: [What is the best way to get cached output files from different branches simultaneously?](https://discordapp.com/channels/485586884165107732/563406153334128681/577852740034625576) For example, cached tensorboard files from different branches to compare experiments.
+
+There is a way to do that through our (still not officially released) API pretty
+easily. Here is an
+[example script](https://cdn.discordapp.com/attachments/563406153334128681/577894682722304030/dvc_get_output_files.py)
+showing how it could be done.
+
+### Q: [Docker and DVC.](https://discordapp.com/channels/485586884165107732/563406153334128681/583949033685516299) To be able to push/pull data we need to run a git clone to get DVC-files and remote definitions — but we worry that would make the container quite heavy (since it contains our entire project history).
+
+You can do `git clone --depth 1`, which will not download any history except the
+latest commits.
+
+### Q: [After DVC pushing the same file, it creates multiple copies of the same file. Is that how it’s supposed to work?](https://discordapp.com/channels/485586884165107732/485596304961962003/574133734136086559)
+
+If you are pushing the same file, there are no copies pushed or saved in the
+cache. DVC is using checksums to identify files, so if you add the same file
+once again, it will detect that the cache for it is already in the local cache
+and won’t copy it again to the cache.
Same with dvc push, if it sees that you already +have cache file with that checksum on your remote, it won’t upload it again. + +### Q: [How do I uninstall DVC on Mac (installed via `pkg` installer)?](https://discordapp.com/channels/485586884165107732/485596304961962003/574941227624169492) + +Something like this should work: + +```dvc +$ which dvc +/usr/local/bin/dvc -> /usr/local/lib/dvc/dvc + +$ ls -la /usr/local/bin/dvc +/usr/local/bin/dvc -> /usr/local/lib/dvc/dvc + +$ sudo rm -f /usr/local/bin/dvc +$ sudo rm -rf /usr/local/lib/dvc +$ sudo pkgutil --forget com.iterative.dvc +``` + +### Q: [How do I pull from a public S3 bucket (that contains DVC remote)?](https://discordapp.com/channels/485586884165107732/485596304961962003/575236576309674024) + +Just add public URL of the bucket as an HTTP endpoint. See +[here](https://github.com/iterative/example-get-started/blob/master/.dvc/config) +for an example. +[https://remote.dvc.org/get-started](https://remote.dvc.org/get-started) is made +to redirect to the S3 bucket anyone can read from. + +### Q: [I’m getting the same error over and over about locking:](https://discordapp.com/channels/485586884165107732/485596304961962003/575535709490905101) `ERROR: failed to lock before running a command — cannot perform the cmd since DVC is busy and locked. Please retry the command later.` + +Most likely it happens due to an attempt to run DVC on NFS that has some +configuration problems. There is a +[well known problem with DVC on NFS](https://github.com/iterative/dvc/issues/1918) +— sometimes it hangs on trying to lock a file. The usual workaround for this +problem is to allocate DVC cache on NFS, but run the project (git clone, DVC +metafiles, etc) on the local file system. Read +[this answer](https://discuss.dvc.org/t/share-nas-data-in-server/180/4?u=shcheklein) +to see how it can be setup. + +
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are open, too. diff --git a/content/blog/2019-08-01-july-19-dvc-heartbeat.md b/content/blog/2019-08-01-july-19-dvc-heartbeat.md new file mode 100644 index 00000000000..8df4729ee1a --- /dev/null +++ b/content/blog/2019-08-01-july-19-dvc-heartbeat.md @@ -0,0 +1,212 @@ +--- +title: July ’19 DVC❤️Heartbeat +date: 2019-08-01 +description: | + As we continue to grow DVC together with our fantastic contributors, we enjoy + more and more insights, discussions, and articles either created or brought to + us by our community. +descriptionLong: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: ../../static/uploads/images/2019-08-01/post-image.png +pictureComment: | + Special edition + [DVC shirt](https://twitter.com/rkuprieiev/status/1144298339200098306?s=20). + We made this one for [Ruslan](https://github.com/efiop) — DVC maintainer and + the best tech lead. +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/july-19-dvc-heartbeat/288 +tags: + - Heartbeat + - Open Source Summit + - Discord Gems +--- + +## News and links + +As we continue to grow DVC together with our fantastic contributors, we enjoy +more and more insights, discussions, and articles either created or brought to +us by our community. We feel it is the right time to start sharing more of your +news, your stories and your discoveries. New Heartbeat is here! 
+
+Speaking of our own news — next month the DVC team is going to the
+[Open Source Summit North America](https://events.linuxfoundation.org/events/open-source-summit-north-america-2019/).
+It is taking place in San Diego on August 21–23.
+[Dmitry](https://ossna19.sched.com/speaker/dmitry35) and
+[Sveta](https://ossna19.sched.com/speaker/svetlanagrinchenko) will be giving
+talks and we will run a booth. So looking forward to it! Stop by for a chat and
+some cool swag. And if you are in San Diego on those days and want to catch up —
+please let us know [here](http://dvc.org/support) or on Twitter!
+
+
+
+
+
+Every month our team is excited to discover new great pieces of content
+addressing some of the burning ML issues. Here are some of the links that caught
+our eye in June:
+
+- **[Principled Machine Learning: Practices and Tools for Efficient Collaboration](https://dev.to/robogeek/principled-machine-learning-4eho)
+  by [David Herron](https://medium.com/@7genblogger)**
+
+
+
+> As we’ve seen in this article some tools and practices can be borrowed from
+> regular software engineering. However, the needs of machine learning projects
+> dictate tools that better fit the purpose.
+
+- **First
+  [ML-REPA](http://ml-repa.ru/)
+  [Meetup: Reproducible ML experiments](http://ml-repa.ru/page6697700.html)
+  hosted by [Raiffeisen DGTL](https://www.raiffeisen-digital.ru/?utm_referrer=)
+  — check out the video and slide decks.**
+
+
+
+[ML-REPA](http://ml-repa.ru/) is a fantastic new resource for
+Russian-speaking folks interested in Reproducibility, Experiments and Pipelines
+Automation. Curated by [Mikhail Rozhkov](https://twitter.com/mnrozhkov) and
+highly recommended by our team.
+
+### [How do you manage your machine learning experiments?](https://www.reddit.com/r/MachineLearning/comments/bx0apm/d_how_do_you_manage_your_machine_learning/) discussion on Reddit is full of insights.
+
[D] How do you manage your machine learning experiments? from r/MachineLearning
+ +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: I have within one git repository different folders with very different content (basically different projects, or content I want to have different permissions to), and I thought about using different buckets in AWS as remotes. [I’m not sure if it’s possible with DVC to store some files in some remote, and some other files in some other remote, is it?](https://discordapp.com/channels/485586884165107732/485596304961962003/575718048330416158) + +You can definitely add more than one remote (see +[dvc remote add](https://dvc.org/doc/commands-reference/remote-add)) and then +[dvc push](https://dvc.org/doc/commands-reference/push) has a `-R` option to +pick which one to send the cached data files (deps, outs, etc) to. We would not +recommend doing this though. It complicates the commands you have to run — you +will need to remember to specify a remote name for every command that deals with +data — `push`, `pull`, `gc`, `fetch`, `status`, etc. Please, leave a comment in +the relevant issue [here](https://github.com/iterative/dvc/issues/2095) if this +case is important for you. + +### Q: [Is that possible with DVC to have multiple (few) metric files and compare them all at once?](https://discordapp.com/channels/485586884165107732/485596304961962003/578532350221352987) For example, we’d like to consider as metrics the loss of a neural network training process (loss as a `-M` output of a training stage), and also apart knowing the accuracy of the NN on a test set (another `-M` output of eval stage). + +Yes, it is totally fine to use `-M` in different stages. `dvc metrics show` will +just show both metrics. 
+ +### Q: [I have a scenario where an artifacts (data) folder is created by the dvc run command via the `-o` flag. I have manually added another file into or modified the artifacts folder but when I do `dvc push` nothing happens, is there anyway around this?](https://discordapp.com/channels/485586884165107732/485596304961962003/577362750443880449) + +Let’s first do a quick recap on how DVC handles data files (you can definitely +find more information on the [DVC documentation site](http://dvc.org/docs)). + +- When you do `dvc add`, `dvc run` or `dvc import` DVC puts artifacts (in case + of `dvc run` artifacts == outputs produced by the command) into `.dvc/cache` + directory (default cache location). You don’t see this happening because + [DVC keeps links](https://dvc.org/doc/user-guide/large-dataset-optimization) + (or in certain cases creates a copy) to these files/directories. + +- `dvc push` does not move files from the workspace (that what you see) to the + remote storage, it always moves files/directories that are already in cache + (default is .dvc/cache). + +- So, now you’ve added a file manually, or made some other modifications. But + these files are not in cache yet. The analogy would be `git commit`. You + change the file, you do `git commit`, only after that you can push something + to Git server (Github/Gitlab, etc). The difference is that DVC is doing commit + (moves files to cache) automatically in certain cases — `dvc add`, `dvc run`, + etc. + +There is an explicit command — `dvc commit` - that you should run if you want to +enforce the change to the output produced by `dvc run`. This command will update +the corresponding DVC- files (.dvc extension) and will move data to cache. After +that you should be able to run `dvc push` to save your data on the external +storage. + +Note, when you do an explicit commit like this you are potentially “breaking” +the reproducibility. 
In a sense that there is no guarantee now that your
+directory can be produced by `dvc run`/`dvc repro` — since you changed it
+manually.
+
+### Q: [I’d like to transform my dataset in-place to avoid copying it, but I can’t use `dvc run` to do this because it doesn’t allow the same directory as an output and a dependency.](https://discordapp.com/channels/485586884165107732/485596304961962003/578898899469729796)
+
+You could do this in one step (one stage), so that getting your data and
+modifying it is one stage. So you don’t depend on the data folder. You just
+could depend on your download + modifying script.
+
+### Q: [Can anyone tell me what this error message is about?](https://discordapp.com/channels/485586884165107732/485596304961962003/579283950778712076) “To avoid unpredictable behavior, rerun command with non overlapping outs paths.”
+
+Most likely it means that there is a DVC-file that has the same output twice,
+or there are two DVC-files that share the same output file.
+
+### Q: [I’m getting “No such file or directory” error when I do `dvc run` or `dvc repro`](https://discordapp.com/channels/485586884165107732/485596304961962003/580176327701823498). The command runs fine if I don’t use DVC.
+
+That happens because `dvc run` is trying to ensure that your command is the one
+creating your output and removes existing outputs before executing the command.
+So that when you run `dvc repro` later, it will be able to fully reproduce the
+output. So you need to make the script create the directory or file.
+
+### Q: [I’m implementing a CI/CD and I would like to simplify my CI/CD or even my training code (keeping them cloud agnostic) by using `dvc pull` inside my Docker container when initializing a training job.](https://discordapp.com/channels/485586884165107732/485596304961962003/581256265234251776) Can DVC be used in this way?
+
+Yes, it’s definitely a valid case for DVC.
There are different ways of +organizing the storage that training machines are using to access data. From the +very simple — using local storage volume and pulling data from the remote +storage every time — to using NAS or EFS to store a shared DVC cache. + +### Q: [I was able to follow the getting started examples, however now I am trying to push my data to Github, I keep getting the following error: “ERROR: failed to push data to the cloud — upload is not supported by https remote”.](https://discordapp.com/channels/485586884165107732/563406153334128681/598866528984891403) + +HTTP remotes do not support upload yet. Example Get Started repository is using +HTTP to keep it read-only and abstract the actual storage provider we are using +internally. If you actually check the remote URL, you should see that it is an +S3 bucket and AWS provides an HTTP end-point to read data from it. + +### Q: I’m looking to configure AWS S3 as a storage for DVC. I’ve set up the remotes and initialized dvc in the git repository. I tried testing it by pushing a dataset in the form of an excel file. The command completed without any issues but this is what I’m seeing in S3. [DVC seems to have created a subdirectory in the intended directory called “35” where it placed this file with a strange name.](https://discordapp.com/channels/485586884165107732/485596304961962003/585967551708921856) + +This is not an issue, it is an implementation detail. There’s no current way to +upload the files with the original filename (In this case, the S3 bucket will +have the file `data.csv` but with another name `20/893143…`). The reason behind +this decision is because we want to store a file only once no matter how many +dataset versions it’s used in. Also, it’s a reliable way to uniquely identify +the file. You don’t have to be afraid that someone decided to create a file with +the same name (path) but a different content. 
+ +### Q: [Is it possible to only have a shared ‘local’ cache and no remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/587730054893666326) I’m trying to figure out how to use this in a 40 node cluster which already has very fast NFS storage across all the nodes. Not storing everything twice seems desirable. Esp. for the multi-TB input data + +Yes and it’s one of the very common use cases, actually. All you need to do is to +use the `dvc cache dir` command to setup an external cache. There are a few caveats +though. Please, read +[this link](https://discuss.dvc.org/t/share-nas-data-in-server/180/4?u=shcheklein) +for an example of the workflow. + +
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too. diff --git a/content/blog/2019-09-26-september-19-dvc-heartbeat.md b/content/blog/2019-09-26-september-19-dvc-heartbeat.md new file mode 100644 index 00000000000..f9e1c8c5df0 --- /dev/null +++ b/content/blog/2019-09-26-september-19-dvc-heartbeat.md @@ -0,0 +1,355 @@ +--- +title: September ’19 DVC❤️Heartbeat +date: 2019-09-26 +description: | + Announcing our first meetup in San Francisco, kicking off Google Season of + Docs program, sharing Open Source Summit experience, and more news, links, and + gems. +descriptionLong: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: ../../static/uploads/images/2019-09-26/post-image.jpeg +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/september-19-dvc-heartbeat/287 +tags: + - Discord Gems + - Heartbeat + - Meetup + - Open Source Summit +--- + +## News and links + +We are super excited to co-host our very first +**[meetup in San Francisco on October 10](https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/)**! +We will gather at the brand new Dropbox HQ office at 6:30 pm to discuss +open-source tools to version control ML models and experiments. +[Dmitry Petrov](https://twitter.com/FullStackML) is teaming up with +[Daniel Fischetti](https://www.linkedin.com/in/daniel-fischetti-4a6592bb/) from +[Standard Cognition](https://standard.ai/) to discuss best ML practices. 
Join us +and save your spot now: + + + +If you are not in SF on this date and happen to be in Europe — don’t miss the +PyCon DE & PyData Berlin 2019 joint event on October 9–11. We cannot make it to +Berlin this year, but we were thrilled to discover 2 independent talks featuring +DVC by +[Alessia Marcolini](https://de.pycon.org/program/pydata-ppgwxl-version-control-for-data-science-alessia-marcolini/) +and +[Katharina Rasch](https://de.pycon.org/program/pydata-cwmae7-tools-that-help-you-get-your-experiments-under-control-katharina-rasch/). + +Some other highlights of the end of summer: + +- Our users and contributors keep creating fantastic pieces of content around + DVC (sharing some links below, but it’s only a fraction of what we have in + stock — can’t be more happy and humbled about it!). + +- We’ve reached 79 contributors to + [DVC core project](https://github.com/iterative/dvc) and 74 contributors to + [DVC documentation](https://github.com/iterative/dvc.org) (and have something + special in mind to celebrate our 100th contributors). + +- we enjoyed working with all the talented + [Google Season of docs](https://developers.google.com/season-of-docs/) + applicants and now moving to the next stage with our chosen tech writer + [Dashamir Hoxha](http://dashohoxha.fs.al/). + +- We’ve crossed the 3,000 stars mark on Github + ([over 3,500 now](https://github.com/iterative/dvc)). Thank you for your + support! + + https://twitter.com/DVCorg/status/1147220439472545793 + +- We’ve had great time at the + [Open Source Summit](https://events.linuxfoundation.org/events/open-source-summit-north-america-2019/program/) + by Linux foundation in San Diego — speaking on stage, running a booth and + chatting with all the amazing open-source crowd out there. + + https://twitter.com/a142hr/status/1164256520235675648 + +![](/uploads/images/2019-09-26/open-source-summit-by-linux-foundation.jpeg) + +
+ +Here are some of the great pieces of content around DVC and ML ops that we +discovered in July and August: + +- ** Great insightful discussion on Twitter about versioning ML projects started + by [Nathan Benaich](https://medium.com/@NathanBenaich).** + + https://twitter.com/NathanBenaich/status/1151815916512010242 + +- **[Our Machine Learning Workflow: DVC, MLFlow and Training in Docker Containers](https://medium.com/ixorthink/our-machine-learning-workflow-dvc-mlflow-and-training-in-docker-containers-5b9c80cdf804) + by [Ward Van Laer](https://medium.com/@ward.vanlaer).** + +> It is possible to manage your work flow using open-source and free tools. + + + +- **[Using DVC to create an efficient version control system for data projects](https://medium.com/qonto-engineering/using-dvc-to-create-an-efficient-version-control-system-for-data-projects-96efd94355fe) + by [Basile Guerrapin](https://medium.com/@basile_16101).** + +> DVC brought versioning for inputs, intermediate files and algorithm models to +> the VAT auto-detection project and this drastically increased our +> **productivity**. + + + +- **[Managing versioned machine learning datasets in DVC, and easily share ML projects with colleagues](https://techsparx.com/software-development/ai/dvc/versioning-example.html) + by [David Herron](https://twitter.com/7genblogger).** + +> In this tutorial we will go over a simple image classifier. We will learn how +> DVC works in a machine learning project, how it optimizes reproducing results +> when the project is changed, and how to share the project with colleagues. 
+ + + +- **[How to use data version control (dvc) in a machine learning project](https://towardsdatascience.com/how-to-use-data-version-control-dvc-in-a-machine-learning-project-a78245c0185) + by [Matthias Bitzer](https://towardsdatascience.com/@matthiasbitzer94).** + +> To illustrate the use of dvc in a machine learning context, we assume that our +> data is divided into train, test and validation folders by default, with the +> amount of data increasing over time either through an active learning cycle or +> by manually adding new data. + + + +- **[Version Control ML Model](https://towardsdatascience.com/version-control-ml-model-4adb2db5f87c) + by [Tianchen Wu](https://towardsdatascience.com/@TianchenW)** + +> This post presents a solution to version control machine learning models with +> git and dvc ([Data Version Control](https://dvc.org/doc/tutorial)). + + + +- **[Reflinks vs symlinks vs hard links, and how they can help machine learning projects](https://dev.to/robogeek/reflinks-vs-symlinks-vs-hard-links-and-how-they-can-help-machine-learning-projects-1cj4) + by [David Herron](https://medium.com/@7genblogger)** + +> In this blog post we’ll go over the details of using links, some cool new +> stuff in modern file systems (reflinks), and an example of how DVC (Data +> Version Control, [https://dvc.org/](https://dvc.org/)) leverages this. + + + +- **[DVC dependency management — a guide](https://blog.codecentric.de/en/2019/08/dvc-dependency-management/) + by [Bert Besser](https://blog.codecentric.de/en/author/bert-besser/) and + [Veronika Schwan](https://blog.codecentric.de/en/author/veronika-schindler/).** + +> This post is a follow-up to +> [A walkthrough of DVC](https://blog.codecentric.de/en/2019/03/walkthrough-dvc/) +> that deals with managing dependencies between DVC projects. In particular, +> this follow-up is about importing specific versions of an artifact (e.g. a +> trained model or a dataset) from one DVC project into another. 
+ + + +- **[Effective ML Teams — Lessons Learne](https://medium.com/@czeslaw.szubert/effective-ml-teams-lessons-learned-6a6e761bc283) + by [Czeslaw Szubert](https://medium.com/@czeslaw.szubert)** + +> In this post I’ll present lessons learned on how to setup successful ML teams +> and what you need to devise an effective enterprise ML strategy. + + + +- **[Lessons learned from training a German Speech Recognition model](https://www.esentri.com/lessons-learned-from-training-a-german-speech-recognition-model/) + by [David Schönleber](https://www.linkedin.com/in/dschoenleber/).** + +> Setting up a documentation-by-design workflow and using appropriate tools +> where needed, e.g. _MLFlow_ and _dvc,_ can be a real deal-breaker. + + + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: I’m getting an error message while trying to use AWS S3 storage: `ERROR: failed to push data to the cloud — Unable to locate credentials.` [Any ideas what’s happening?](https://discordapp.com/channels/485586884165107732/563406153334128681/587792932061577218) + +Most likely you haven’t configured your S3 credentials/AWS account yet. Please, +read the full documentation on the AWS website. The short version of what should +be done is the following: + +- [Create your AWS account.](https://portal.aws.amazon.com/gp/aws/developer/registration/index.html) + +- Log in to your AWS Management Console. + +- Click on your user name at the top right of the page. + +- Click on the Security Credentials link from the drop-down menu. + +- Find the Access Credentials section, and copy the latest `Access Key ID`. + +- Click on the Show link in the same row, and copy the `Secret Access Key`. + +Follow +[this link](https://docs.aws.amazon.com/cli/latest/userguide/cli-chap-configure.html) +to setup your environment. + +### Q: I added data with `dvc add` or `dvc run` and see that it takes twice what it was before (with `du` command). [Does it mean that DVC copies data that is added under its control? How do I prevent this from happening?](https://discordapp.com/channels/485586884165107732/563406153334128681/595402051203235861) + +To give a short summary — by default, DVC copies the files from your working +directory to the cache (this is for safety reasons, it is better to duplicate +the data). If you have reflinks (copy-on-write) enabled on your file system, DVC +will use that method — which is as safe as copying. 
You can also configure DVC +to use hardlinks/symlinks to save some space and time, but it will require +enabling the protected mode (making data files in workspace read-only). Read +more details [here](https://dvc.org/doc/user-guide/large-dataset-optimization). + +### Q: [How concurrent-friendly is the cache? And different remotes? Is it safe to have several containers/nodes fill the same cache at the same time?](https://discordapp.com/channels/485586884165107732/563406153334128681/599345778703597568) + +It is safe and a very common use case for DVC to have a shared cache. Please, +check [this thread](https://discuss.dvc.org/t/share-nas-data-in-server/180/12), +for example. + +### Q:[What is the proper way to exit the ASCII visualization?](https://discordapp.com/channels/485586884165107732/563406153334128681/603890677176336394) (when you run `dvc pipeline show` command). + +See this +[document](https://dvc.org/doc/commands-reference/pipeline/show#options). To +navigate, use arrows or W, A, S, D keys. To exit, press Q. + +### Q: [Is there an issue if I set my `cache.s3` external cache to my default remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/606197026488844338) I don’t quite understand what an external cache is for other than I have to have it for external outputs. + +Short answer is that we would suggest keeping them separately to avoid possible +checksum overlaps. Checksum on S3 might theoretically overlap with our checksums +(with the content of the file being different), so it could be dangerous. The +chances of losing data are pretty slim, but we would not risk it. Right now, we +are working on making sure there are no possible overlapping. + +### Q: [What’s the right procedure to move a step .dvc file around the project?](https://discordapp.com/channels/485586884165107732/563406153334128681/606425815139221504) + +Assuming the file was created with `dvc run`. There are few possible ways. 
+Obvious one is to delete the file and create a new one with +`dvc run --no-exec -f file/path/and/name.dvc`. Another possibility is to +rename/move and then edit manually. See +[this document](https://dvc.org/doc/user-guide/dvc-file-format) that describes +how DVC-files are organized. No matter what method you use, you can run +`dvc commit file.dvc` to save changes without running the command again. + +### Q: [`dvc status` doesn’t seem to report things that need to be dvc pushed, is that by design?](https://discordapp.com/channels/485586884165107732/563406153334128681/606917839688957952) + +You should try with `dvc status --cloud` or `dvc status --remote <name>` +to compare your local cache with a remote one, by default it only compares the +“working directory” with your local cache (to check whether something should be +reproduced and saved or not). + +### Q: [What kind of files can you put into `dvc metrics`?](https://discordapp.com/channels/485586884165107732/563406153334128681/608701494035873792) + +The file could be in any format, `dvc metrics show` will try to interpret the +format and output it in the best possible way. Also, if you are using `csv` or +`json`, you can use the `--xpath` flag to query specific measurements. **In +general, you can make any file a metric file and put any content into it, DVC is +not opinionated about it.** Usually though these are files that measure the +performance/accuracy of your model and capture configuration of experiments. +The idea is to use `dvc metrics show` to display all your metrics across +experiments so you can make decisions of which combination (of features, +parameters, algorithms, architecture, etc.) works the best. 
+ +### Q: [Does DVC take into account the timestamp of a file or is the MD5 only depends on the files actual/bits content?](https://discordapp.com/channels/485586884165107732/563406153334128681/613639458000207902) + +DVC takes into account only content (bits) of a file to calculate hashes that +are saved into DVC-files. + +### Q: [Similar to `dvc gc` is there a command to garbage collect from the remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/616421757808541721) + +`dvc gc --remote NAME` is doing this, but you should be extra careful, because +it will remove everything that is not currently “in use” (by the working +directory). Also, please check this +[issue](https://github.com/iterative/dvc/issues/2325) — semantics of this +command might have changed by the time you read this. + +### Q: [How do I use and configure remote storage on IBM Cloud Object Storage?](https://discordapp.com/channels/485586884165107732/485596304961962003/591237578209099786) + +Since it’s S3 compatible, specifying `endpointurl` (exact URL depends on the +[region](https://cloud.ibm.com/docs/services/cloud-object-storage?topic=cloud-object-storage-endpoints)) +is the way to go: + +```dvc +$ dvc remote add -d mybucket s3://path/to/dir +$ dvc remote modify mybucket \ + endpointurl \ + https://s3.eu.cloud-object-storage.appdomain.cloud +``` + +### Q: [How can I push data from client to google cloud bucket using DVC?](https://discordapp.com/channels/485586884165107732/485596304961962003/592958360903483403). Just want to know how can i set the credentials. + +You can do it by setting environment variable pointing to yours credentials +path, like: + +```dvc +$ export GOOGLE_APPLICATION_CREDENTIALS=path/to/credentials +``` + +It is also possible to set this variable via `dvc config`: + +```dvc +$ dvc remote modify myremote credentialpath /path/to/my/creds +``` + +where `myremote` is your remote name. + +
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too. diff --git a/content/blog/2019-10-08-dvc-org-for-hacktoberfest-2019.md b/content/blog/2019-10-08-dvc-org-for-hacktoberfest-2019.md new file mode 100644 index 00000000000..b9e38784c55 --- /dev/null +++ b/content/blog/2019-10-08-dvc-org-for-hacktoberfest-2019.md @@ -0,0 +1,113 @@ +--- +title: DVC.org for Hacktoberfest 2019 +date: 2019-10-08 +description: | + Our favorite month of the year Hacktoberfest is already in full swing and we + at DVC.org are so excited to be a part of it! +descriptionLong: | + Our favorite month of the year + [Hacktoberfest](https://hacktoberfest.digitalocean.com/) is already in full + swing and we at [DVC.org](https://dvc.org) are so excited to be a part of it! +picture: ../../static/uploads/images/2019-10-08/post-image.png +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/dvc-org-for-hacktoberfest-2019/286 +tags: + - Hacktoberfest +--- + +[Hacktoberfest](https://hacktoberfest.digitalocean.com/) is a monthly-long +program that celebrates open source and encourages you to contribute to open +source projects (and rewards you with stickers and a cool T-shirt!). Whether +you’re a seasoned contributor or looking for projects to contribute to for the +first time, you’re welcome to participate! + +It is the 6th season of Hacktoberfest and the 2d year of participating for +DVC.org team. We really enjoyed it in 2018 and this year we are upping the game +with our own cool stickers, special edition T-shirts and a +[collection of carefully picked tickets](https://github.com/iterative/dvc/labels/hacktoberfest). + +### How to participate? + +If you haven’t started your Hacktoberfest challenge yet, it is just the right +time, you have 3 weeks left to submit PRs and get your swag! 
Here are some +important details: + +- Hacktoberfest is open to everyone in the global community. + +- You can sign up anytime between October 1 and October 31. Make sure to sign up + on the + [official Hacktoberfest website](https://hacktoberfest.digitalocean.com/) for + your PRs to count. + +- To get a shirt, you must make 4 legit pull requests (PRs) between October 1–31 + in any time zone. + +- Pull requests can be made in any public GitHub-hosted repositories/projects, + not just the ones highlighted. + +And the special addition from DVC.org team: + +- Look through the list of + [DVC Hacktoberfest tickets](https://github.com/iterative/dvc/labels/hacktoberfest) + or the list of + [good DVC first issues](https://github.com/iterative/dvc/labels/good%20first%20issue). + +- Make a PR to DVC and get our stickers. + +- Close three issues for DVC and get a special DVC T-shirt. + +### Why contribute to DVC? + +[DVC](http://dvc.org) (Data Version Control) is a relatively young open source +project. It was started in late 2017 by a data scientist and an engineer to fill +in the gaps in the ML processes tooling. Nowadays DVC is growing pretty fast and +though our in-house team is quite small, we have to thank our contributors (more +than 100 in both code and docs) for developing DVC with us. + +DVC is participating in Hacktoberfest for 2 years in a row to bring more people +into open source, to learn from them and to give back by sharing our own +experience. This year we decided to focus on a single important topic for us — +improving UI/UX. + +As our contributors and maintainers were sifting through the feature requests, +bugs, and improvements to create a good +[list of Hacktoberfest tickets](https://github.com/iterative/dvc/labels/hacktoberfest), +we noticed that UI/UX label on Github is popping up again and again. 
DVC is a +command line tool, and improving UI/UX in our case means making decisions on how +to name command options, where and when to use +[confirmation prompts](https://github.com/iterative/dvc/issues/2498) and/or +where abort execution, what exactly user would expect to see in the output, how +to test it later, etc. + +Why improving UI/UX appears to be so important for DVC at this stage? Perhaps +because the project is more mature now and we are ready to spend more time on +polishing it. Or maybe because it is still too-engineering focused and we used +to disregard/de-prioritize all this ‘fancy’ stuff. Or it is because we just lack +experience in creating good CLI UI/UX! + +One or another, those are great reasons to focus on improving UI (in a broader +sense than just GUI), improving docs, creating powerful consistent experience +for our users and increasing accessibility of DVC. + +That’s how +[Heroku’s CLI style guide](https://devcenter.heroku.com/articles/cli-style-guide) +starts: + +> Heroku CLI plugins should provide a clear user experience, targeted primarily +> for human readability and usability, which delights the user, while at the +> same time supporting advanced users and output formats. This article provides +> a clear direction for designing delightful CLI plugins. + +At DVC we are building user experience in line with these principles too, but we +also have our own challenges. And here we turn for help to the global open +source community and all the contributors out there. + +For all of us who have a heart for open source — let’s discuss, contribute, +learn, take the technologies forward and build something great together! + +Happy hacking! + +
+ +We are happy to hear from you [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too! diff --git a/content/blog/2019-11-05-october-19-dvc-heartbeat.md b/content/blog/2019-11-05-october-19-dvc-heartbeat.md new file mode 100644 index 00000000000..92b76deced5 --- /dev/null +++ b/content/blog/2019-11-05-october-19-dvc-heartbeat.md @@ -0,0 +1,269 @@ +--- +title: October ’19 DVC❤️Heartbeat +date: 2019-11-05 +description: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. +descriptionLong: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: ../../static/uploads/images/2019-11-05/post-image.png +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/october-19-dvc-heartbeat/285 +tags: + - Meetup + - Heartbeat + - Hacktoberfest +--- + +## News and links + +Autumn is a great season for new beginnings and there is so much we love about +it this year. Here are some of the highlights: + +- Co-hosting our + [first ever meetup](https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/)! + Our [Dmitry Petrov](https://twitter.com/FullStackML) partnering with + [Dan Fischetti](https://www.linkedin.com/in/daniel-fischetti-4a6592bb/) from + [Standard Cognition](https://twitter.com/standardAI) to discuss Open-source + tools to version control Machine Learning models and experiments. The + recording is available now here. 
+ + https://youtu.be/RHQXK7EC0jI + +- [Getting ready for the Hacktoberfest](https://blog.dataversioncontrol.com/dvc-org-for-hacktoberfest-2019-ce5320151a0c) + and having the whole team get together to pick up and label nice issues and be + ready to support the contributors. + +- Discovering some really cool blogposts, talks and tutorials from our users all + over the world: check + [this blogpost in French](https://blog.octo.com/mise-en-application-de-dvc-sur-un-projet-de-machine-learning/) + or + [this tutorial in German](https://jupyter-tutorial.readthedocs.io/de/latest/reproduce/dvc/init.html)! + +- Having great time working with a [tech writer](https://github.com/dashohoxha) + brought to us by the + [Google Season of Docs](https://developers.google.com/season-of-docs) program. + Check out these + [interactive tutorials](https://dvc.org/doc/tutorials/interactive) we’ve + created together. + +- Having hot internal discussion about Discord vs Slack support/community + channels. If you are on the edge like us, have a look at + [this discussion](https://internals.rust-lang.org/t/exploring-new-communication-channels/7859) + in the Rust community, so helpful. + +- Seeing [Dmitry Petrov](https://twitter.com/FullStackML) being really happy one + day: + + https://twitter.com/FullStackML/status/1169403554290814976 + +
+ +We at [DVC.org](https://dvc.org) are so happy every time we discover an article +featuring DVC or addressing one of the burning ML issues we are trying to solve. +Here are some of the links that caught our eye past month: + +- **Continuous Delivery for Machine Learning by + [Danilo Sato](https://twitter.com/dtsato), + [Arif Wider](https://twitter.com/arifwider), + [Christoph Windheuser](https://twitter.com/intellification) and curated by + [Martin Fowler](https://martinfowler.com/).** + +> As Machine Learning techniques continue to evolve and perform more complex +> tasks, so is evolving our knowledge of how to manage and deliver such +> applications to production. By bringing and extending the principles and +> practices from Continuous Delivery, we can better manage the risks of +> releasing changes to Machine Learning applications in a safe and reliable way. + + + +- **[The Path to Identity Validation](https://medium.com/signaturit-tech-blog/the-path-to-identity-validation-2-3-4f698b2ffae9) + by [Víctor Segura](https://medium.com/@victor.segura).** + +> So, the first question is clear: how to choose the optimal hardware for neural +> networks? Secondly, assuming that we have the appropriate infrastructure, how +> to build the machine learning ecosystem to train our models efficiently and +> not die trying? At **Signaturit**, we have the solution ;) + + + +- **Talk: + [Managing Big Data in Machine Learning projects](https://pretalx.com/pyconuk-2019/talk/GCLBFH/) + by [V Vishnu Anirudh](https://twitter.com/vvasworld) at the + [Pycon UK 2019.](https://2019.pyconuk.org/)** + +> My talk will focus on Version Control Systems (VCS) for big-data projects. +> With the advent of Machine Learning (ML) , the development teams find it +> increasingly difficult to manage and collaborate on projects that deal with +> huge amounts of data and ML models apart from just source code. 
+ +https://youtu.be/4XpHk85_x0E + +- **Podcast: TWIML Talk #295 + [Managing Deep Learning Experiments](https://twimlai.com/twiml-talk-295-managing-deep-learning-experiments-with-lukas-biewald/) + with [Lukas Biewald](https://twitter.com/l2k)** + +> Seeing a need for reproducibility in deep learning experiments, Lukas founded +> Weights & Biases. In this episode we discuss his experiment tracking tool, how +> it works, the components that make it unique in the ML marketplace and the +> open, collaborative culture that Lukas promotes. Listen to Lukas delve into +> how he got his start in deep learning experiments, what his experiment +> tracking used to look like, the current Weights & Biases business success +> strategy, and what his team is working on today. + + + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: I’ve just run a `dvc run` step, and realised I forgot to declare an output file. [Is there a way to add an output file without rerunning the (computationally expensive) step/stage?](https://discordapp.com/channels/485586884165107732/485596304961962003/593743448020877323) + +If you’ve already ran it, you could just open created DVC-file with an editor +and add an entry to the outs field. After that, just run `dvc commit my.dvc` and +it will save the checksums and data without re-running your command. +`dvc run --no-exec` would also work with commit instead of modifying the +DVC-file by hand. + +### Q: [For metric files do I have to use dvc run to set a metric or can I do it some other way?](https://discordapp.com/channels/485586884165107732/485596304961962003/593869598651318282) Can I use metrics functionality without the need to setup and manage DVC cache and remote storage? + +Any file that is under DVC control (e.g. added with `dvc add` or an output in +`dvc run -o`) can be made a metric file with dvc metrics add file. Alternatively +a command `dvc run -M` file makes file a metric without caching it. It means dvc +metrics show can be used while file is still versioned by Git. + +### Q: [Is there a way not to add the full (Azure) connection string to the .dvc/config file that is being checked into Git for using dvc remotes](https://discordapp.com/channels/485586884165107732/485596304961962003/595586670498283520)? I think it’s quite unhealthy to have secrets checked in SCM. 
+ +There are two options — use `AZURE_STORAGE_CONNECTION_STRING` environment +variable or use `--local` flag that will put it into the `.dvc/config.local` +that is added to the `.gitignore`, so you don’t track it with it and so won’t +expose secrets. + +### Q: [I would like to know if it is possible to manage files under DVC whilst keeping them in their original locations (e.g. on a network drive in a given folder structure)](https://discordapp.com/channels/485586884165107732/485596304961962003/601068667131920385)? [If I want to add a large file to be tracked by DVC, and it is in a bucket on S3 or GCS, can I do that without downloading it locally?](https://discordapp.com/channels/485586884165107732/485596304961962003/615278138896941101) + +Yes, you are probably looking for external dependencies and outputs. This is the +[link](https://dvc.org/doc/user-guide/managing-external-data) to the +documentation to start. + +### Q: [How do I setup DVC so that NAS (e.g. Synology) acts as a shared DVC cache?](https://discordapp.com/channels/485586884165107732/485596304961962003/606388040377565215) + +Using NAS (e.g. NFS) is a very common scenario for DVC. In short you use +`dvc cache dir` to setup a cache externally. Set cache type to use symlinks and +enable protected mode. We are preparing a +[document](https://github.com/iterative/dvc.org/blob/31c5d424c6530bb793af69c2af578d2b8a374d02/static/docs/use-cases/shared-storage-on-nfs.md) +how to setup the NFS as a shared cache, but I think it can be applied to any +NAS. + +### Q: So I have some data that is in the hundreds of gigs. [If I enable symlink, hardlink strategy and cache protecting, will DVC automatically choose this strategy over copying when trying to use dvc add](https://discordapp.com/channels/485586884165107732/485596304961962003/608013531010301952)? + +Yes, it will! Here is some clarification. 
So when you set those settings like +that, `dvc add` data will move data to your cache and then will create a +hardlink from your cache to your workspace. + +Unless your cache directory and your workspace are on different file systems, +move should be instant. Please, find more information +[here](https://dvc.org/doc/user-guide/large-dataset-optimization). + +### Q: My repo’s DVC is “busy and locked” and I’m not sure how it got that way and how to remove/diagnose the lock. [Any suggestions?](https://discordapp.com/channels/485586884165107732/485596304961962003/608392956679815168) + +DVC uses a lock file to prevent running two commands at the same time. The lock +[file](https://dvc.org/doc/user-guide/dvc-files-and-directories#dvc-files-and-directories) +is under the `.dvc` directory. If no DVC commands running and you are still +getting this error it’s safe to remove this file manually to resolve the issue. + +### Q: [I’m trying to understand how does DVC remote add work in case of a local folder and what is the best workflow when data is outside of your project root?](https://discordapp.com/channels/485586884165107732/485596304961962003/611209851757920266) + +When using DVC, in most cases we assume that your data will be somewhere under +project root. There is an option to use so called +[external dependencies](https://dvc.org/doc/user-guide/managing-external-data), +which is data that is usually too big to be stored under your project root, but +if you operate on data that is of some reasonable size, I would recommend +starting with putting data somewhere under project root. Remotes are usually +places where you store your data, but it is DVC task to move your data around. +But if you want to keep your current setup where you will have data in different +place than your project, you will need to refer to data with full paths. So, for +example: + +1. You are in `/home/gabriel/myproject` and you have initialized dvc and git + repository + +2. 
You have `featurize.py` in your project dir, and want to use data to produce + some features and than `train.py` to train a model. + +3. Run the command: + +```dvc +$ dvc run -d /research_data/myproject/videos \ + -o /research_data/myproject/features \ + python featurize.py +``` + +to tell DVC, that you use `/research_data/myproject/videos` to featurize, and +produce output to your features dir. Note that your code should be aware of +those paths, they can be hardcoded inside `featurize.py`, but point of `dvc run` +is just to tell DVC what artifacts belong to currently defined step of ML +pipeline. + +### Q: When I run `du` command to check how much space DVC project consumes I see that it duplicates/copies data. [It’s very space and time consuming to copy large data files, is there a way to avoid that?](https://discordapp.com/channels/485586884165107732/485596304961962003/613935477896249364) It takes too long to add large files to DVC. + +Yes! You don’t have to copy files with DVC. First of all, there are two reasons +when du can show that it takes double the space to store data under DVC control. +du can be inaccurate when the underlying file system supports reflinks (XFS on +Linux, APFS on Mac, etc). This is actually the best scenario since no copying is +happening and no changes are required to any DVC settings. Second, case means +that copy semantics is used by default. It can be turned off by providing cache +type `symlinks`, `hardlinks`. Please, read more on this +[here](https://dvc.org/doc/user-guide/large-dataset-optimization#file-link-types-for-the-dvc-cache). + +### Q: [How can I detach a file from DVC control?](https://discordapp.com/channels/485586884165107732/485596304961962003/615479227189559323) + +Just removing the corresponding DVC-file and running `dvc gc` after that should +be enough. It’ll stop tracking the data file and clean the local cache that +might still contain it. Note! 
Don’t forget to run `dvc unprotect` if you use an
advanced [DVC setup with symlinks and hardlinks](https://dvc.org/doc/user-guide/large-dataset-optimization)
(`cache.type` config option is not default). If `dvc gc` behavior is not
granular enough, you can manually find the file by its checksum (taken from the
DVC-file) in `.dvc/cache` and remote storage.
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too. diff --git a/content/blog/2019-12-14-november-19-dvc-heartbeat.md b/content/blog/2019-12-14-november-19-dvc-heartbeat.md new file mode 100644 index 00000000000..17038ea4489 --- /dev/null +++ b/content/blog/2019-12-14-november-19-dvc-heartbeat.md @@ -0,0 +1,278 @@ +--- +title: November ’19 DVC❤️Heartbeat +date: 2019-12-14 +description: | + Co-hosting our first ever meetup, sharing our Hacktoberfest experience, 4K ⭐, + fresh Discord gems and other + news. +descriptionLong: | + Every month we are sharing here our news, findings, interesting reads, + community takeaways, and everything along the way. + + Some of those are related to our brainchild [DVC](https://dvc.org) and its + journey. The others are a collection of exciting stories and ideas centered + around ML best practices and workflow. +picture: ../../static/uploads/images/2019-12-14/post-image.jpeg +pictureComment: + How cool is this handmade swag from our community? We were in tears! +author: ../authors/svetlana_grinchenko.md +commentsUrl: https://discuss.dvc.org/t/november-19-dvc-heartbeat/284 +tags: + - Meetup + - Heartbeat + - Hacktoberfest +--- + +The past few months have been so busy and full of great events! We love how +involved our community is and can’t wait to share more with you: + +- We have organized our very first + [meetup](https://www.meetup.com/San-Francisco-Machine-Learning-Meetup/events/264846847/)! + So many great conversations, new use cases and insights! Many thanks to + [Dan Fischetti](https://www.linkedin.com/in/daniel-fischetti-4a6592bb/) from + [Standard Cognition](https://standard.ai/), who joined our Dmitry Petrov on + stage. Watch the recording here. 
+ + https://youtu.be/RHQXK7EC0jI + +- [Hacktoberfest](https://blog.dataversioncontrol.com/dvc-org-for-hacktoberfest-2019-ce5320151a0c) + was a great exercise for DVC team on many levels and we really enjoyed + supporting new contributors. Kudos to + [Nabanita Dash](https://twitter.com/explorer_07) for organizing a cool + DVC-themed hackathon! + + https://twitter.com/psociiit/status/1185150096792535040 + +- We’ve crossed 4k stars mark on [Github](https://github.com/iterative/dvc)! + +- DVC was participating in the + [Devsprints](https://twitter.com/FossMec/status/1192866498324254720) (Thank + you [Kurian Benoy](https://twitter.com/kurianbenoy2) for the intro!) and we + were happy to jump in and help with some mentoring. + + https://twitter.com/FossMec/status/1192866498324254720 + +![](/uploads/images/2019-12-14/devsprints.png)_Devsprints participants on our +[Discord](http://dvc.org/chat) channel_ + +- DVC became part of the default + [Homebrew formulae](https://formulae.brew.sh/formula/dvc)! So now you can + install it as easy as `brew install dvc`! + +- We helped 2 aspiring speakers deliver their very first conference talks. + [Kurian Benoy](https://twitter.com/kurianbenoy2/status/1183427495342694401?s=20) + was speaking at [PyconIndia](https://in.pycon.org/2019/) and + [Aman Sharma](https://www.linkedin.com/in/aman-sharma606/) was speaking at + [SciPyIndia](https://scipy.in/2019#speakers). 
**Supporting speakers is + something we are passionate about and if you ever wanted to give a talk on a + DVC-related topic — we are here to help, just + [let us know](https://dvc.org/support)!** + + https://youtu.be/Ipzf6oQqQpo + +- Our own [Dmitry Petrov](https://twitter.com/FullStackML) went to Europe to + speak at the + [Open Source Summit Europe](https://osseu19.sched.com/speaker/dmitry35) in + Lyon, [Highload++](https://www.highload.ru/moscow/2019/abstracts/6032) in + Moscow and made a stop in in Berlin to co-host a + [meetup](https://www.meetup.com/codecentric-Berlin/events/265555810/) with our + favourite AI folks from [Codecentric](https://www.codecentric.de/)! + +
+ +Here are some of the great pieces of content around DVC and ML ops that we +discovered in October and November: + +- **[Deploy Machine Learning Models with Django](https://www.deploymachinelearning.com/) + by Piotr Płoński.** + +> …building your ML system has a great advantage — it is tailored to your needs. +> It has all features that are needed in your ML system and can be as complex as +> you wish. This tutorial is for readers who are familiar with ML and would like +> to learn how to build ML web services. + + + +- **[How to Manage Your Machine Learning Workflow with DVC, Weights & Biases, and Docker](https://towardsdatascience.com/how-to-manage-your-machine-learning-workflow-with-dvc-weights-biases-and-docker-5529ea4e59e0) + by [James Le](https://towardsdatascience.com/@james_aka_yale).** + +> In this article, I want to show 3 powerful tools to simplify and scale up +> machine learning development within an organization by making it easy to +> track, reproduce, manage, and deploy models. + + + +- **[Creating a solid Data Science development environment](https://towardsdatascience.com/creating-a-solid-data-science-development-environment-60df14ce3a34) + by + [Gabriel dos Santos Goncalves](https://towardsdatascience.com/@gabrielsgoncalves)** + +> We do believe that Data Science is a field that can become even more mature by +> using best practices in project development and that Conda, Git, DVC, and +> JupyterLab are key components of this new approach + + + +- **[Creating reproducible data science workflows with DVC](https://medium.com/y-data-stories/creating-reproducible-data-science-workflows-with-dvc-3bf058e9797b) + by [Gleb Ivashkevich](https://medium.com/@glib.ivashkevych).** + +> DVC is a powerful tool and we covered only the fundamentals of it. + + + +
+ +## Discord gems + +There are lots of hidden gems in our Discord community discussions. Sometimes +they are scattered all over the channels and hard to track down. + +We are sifting through the issues and discussions and share with you the most +interesting takeaways. + +### Q: When you do a `dvc import` you get the state of the data in the original repo at that moment in time from that repo, right? [The overall state of that repo (e.g. Git `commit id` (hash)) is not preserved upon import, right?](https://discordapp.com/channels/485586884165107732/563406153334128681/618744949277458462) + +On the contrary, DVC relies on Git `commit id` (hash) to determine the state of +the data as well as code. Git `commit id` (hash) is saved in DVC file upon +import, data itself is copied/downloaded into DVC repo cache but would not be +pushed to the remote — DVC does not create duplicates. There is a command to +advance/update it when it’s needed — `dvc update`. Git commit hash saved to +provide reproducibility. Even if the source repo `HEAD` has changed your import +stays the same until you run `dvc update` or redo `dvc import`. + +### Q: I’m trying to understand if DVC is an appropriate solution for storing data under GDPR requirements. [That means that permanent deletion of files with sensitive data needs to be fully supported.](https://discordapp.com/channels/485586884165107732/485596304961962003/621057268145848340) + +Yes, in this sense DVC is not very different from using bare S3, SSH or any +other storage where you can go and just delete data. DVC can give a bit of +overhead to locate a specific file to delete, but otherwise it’s all the same +you will be able to delete any file you want. Read more details in +[this discussion](https://discordapp.com/channels/485586884165107732/485596304961962003/621062105524862987). 
+ +### Q: [Is there anyway to get the remote url for specific DVC-files?](https://discordapp.com/channels/485586884165107732/485596304961962003/621591769766821888) Say, I have a DVC-file `foo.png.dvc` — is there a command that will show the remote url, something like `dvc get-remote-url foo.png.dvc` which will return e.g. the Azure url to download. + +There is no special command for that, but if you are using Python, you could use +our API specifically designed for that: + +```python +from dvc.api import get_url + +url = get_url(path, + repo="https://github.com/user/proj", + rev="mybranch") +``` + +so, you could as well use this from CLI as a wrapper command. + +### Q: [Can DVC be integrated with MS Active Directory (AD) authentication for controlling access?](https://discordapp.com/channels/485586884165107732/563406153334128681/619244714071425035) The GDPR requirements would force me to use such a system to manage access. + +Short answer: no (as of the date of publishing this Heartbeat issue) Good news — +it should be very easy to add, so we would welcome a contribution :) Azure has a +connection argument for AD — quick googling shows this +[library](https://github.com/AzureAD/azure-activedirectory-library-for-python), +which is what probably needed. + +### Q: [How do I uninstall DVC from Mac installed as a package?](https://discordapp.com/channels/485586884165107732/485596304961962003/625124341201502209) + +When installing using `plain.pkg` it is a bit tricky to uninstall, so we usually +recommend using things like brew cask instead if you really need the binary +package. Try to run these commands: + +```dvc +$ sudo rm -rf /usr/local/bin/dvc +$ sudo rm -rf /usr/local/lib/dvc +$ sudo pkgutil --forget com.iterative.dvc +``` + +to uninstall the package. 
+ +### Q: We are using SSH remote to store data, but the problem is that everyone within the project has different username on the remote machine and thus we cannot set it in the config file (that is committed to Git). [Is there a way to add just host and path, without the username?](https://discordapp.com/channels/485586884165107732/563406153334128681/619420070111608848) + +Yes, you should use `--local` or `--global` config options to set user per +project or per use machine without sharing (committing) them to Git: + +```dvc +$ dvc remote modify myremote —local user myuser +``` + +or + +```dvc +$ dvc remote modify myremote —global user myuser +``` + +### Q: [I still get the `SSL ERROR` when I try to perform a dvc push with or without `use_ssl = false`](https://discordapp.com/channels/485586884165107732/485596304961962003/628227197592797191)? + +A simple environment variable like this: + +```dvc +$ export AWS_CA_BUNDLE=/path/to/cert/cert.crt dvc push +``` + +should do the trick for now, we plan to fix the ca_bundle option soon. + +### Q: I have just finished a lengthy `dvc repro` and I’m happy with the result. However, I realized that I didn’t specify a dependency which I needed (and obviously is used in the computation). [Can I somehow fix it?](https://discordapp.com/channels/485586884165107732/563406153334128681/620572187841265675) + +Add the dependency to the stage file without rerunning/reproducing the stage. +This is not needed as this additional dependency hasn’t changed. + +You would need to edit the DVC-file. In the deps section add: + +```yaml +-path: not/included/file/path +``` + +and run `dvc commit file.dvc` to save changes w/o running the pipeline again. +See an example +[here](https://discordapp.com/channels/485586884165107732/563406153334128681/620641530075414570). 
+ +### Q: For some reason [we need to always specify the remote name when doing a `dvc push`](https://discordapp.com/channels/485586884165107732/485596304961962003/629704961868955648) e.g., `dvc push -r upstream` as opposed to `dvc push` (mind no additional arguments). + +You can mark a “default” remote: + +```dvc +$ dvc remote add -d remote /path/to/my/main/remote +``` + +then, `dvc push` (and other commands like `dvc pull`) will know to push to the +default + +### Q: [If I want stage B to run after stage A, but the stage A has no output, can I specify A’s DVC-file as B’s dependency?](https://discordapp.com/channels/485586884165107732/563406153334128681/620715145374466048) + +No, at least at the time of publishing this. You could use a phony output +though. E.g. make the stage A output some dummy file and make B depend on it. +Please, consider creating or upvoting a relevant issue on our Github if you’d +this to be implemented. + +### Q: I’m just getting started with DVC, but I’d like to use it for multiple developers to access the data and share models and code. [I do own the server, but I’m not sure how to use DVC with SSH remote?](https://discordapp.com/channels/485586884165107732/563406153334128681/598867829785362452) + +Please, refer to +[this answer](https://discuss.dvc.org/t/how-do-i-use-dvc-with-ssh-remote/279/2) +on the DVC forum and check the documentation for the +[`dvc remote add`](https://dvc.org/doc/command-reference/remote/add) and +[`dvc remote modify`](https://dvc.org/doc/command-reference/remote/modify) +commands to see more options and details. + +
+ +If you have any questions, concerns or ideas, let us know in the comments below +or connect with DVC team [here](https://dvc.org/support). Our +[DMs on Twitter](https://twitter.com/DVCorg) are always open, too. diff --git a/content/blog/2020-01-17-january-20-dvc-heartbeat.md b/content/blog/2020-01-17-january-20-dvc-heartbeat.md new file mode 100644 index 00000000000..88459d325ab --- /dev/null +++ b/content/blog/2020-01-17-january-20-dvc-heartbeat.md @@ -0,0 +1,145 @@ +--- +title: January '20 DVC❤️Heartbeat +date: 2020-01-17 +description: | + Reaching 100 contributors, PyData LA, and more news from the DVC community. +descriptionLong: | + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. Some of those are related to our brainchild + [DVC](https://dvc.org) and its journey. The others are a collection of + exciting stories and ideas centered around ML best practices and workflow. +picture: ../../static/uploads/images/2020-01-17/DVC_chalk_donuts.png +pictureComment: We spread the joys of version control and donuts at PyData LA. +author: ../authors/elle_obrien.md +commentsUrl: https://discuss.dvc.org/t/january-20-dvc-heartbeat/314 +tags: + - Heartbeat + - PyData +--- + +Welcome to the New Year! Time for a recap of the last few weeks of activity in +the DVC community. + +## News + +We were honored to be named a [Project of the Year](https://ods.ai/awards/2019/) +by Open Data Science, Russia's largest community of data scientists and machine +learning practitioners. Check out our ⭐️incredibly shiny trophy⭐️! + +https://twitter.com/DVCorg/status/1209544709930016768 + +DVC hit **100 individual contributors** on Github! To celebrate our +100th contributor, [Vera Sativa](https://github.com/verasativa/), we +sent her \$500 to use on any educational opportunity and her own DeeVee (that's +our rainbow owl). 
We also awarded educational mini-grants to two of DVC's +biggest contributors, [Vít Novotný](https://twitter.com/tweetiko), and +[David Příhoda](https://twitter.com/david_prihoda). + +![](/uploads/images/2020-01-17/odd_with_deevee.png)_Vera (center, flashing a +peace sign) thanked us with this lovely picture of DeeVee and her team, +[Odd Industries](https://odd.co/en/). They are making some extremely neat tools +for construction teams using computer vision._ + +**We were at PyData LA!** Our fearless leader +[Dmitry gave a talk](https://www.youtube.com/watch?v=7Wsd6V0k4Oc) and we set up +a busy booth to meet with the Pythonistas of Los Angeles. It was a cold and +blustery day, but visitors kept showing up to our semi-outdoor booth. We're sure +they came for the open source version control and not the donuts. + +![](/uploads/images/2020-01-17/py_data1.jpeg) +![](/uploads/images/2020-01-17/py_data2.jpeg) _The DVC team and PyData +volunteers who heroically staffed our booth in the rain._ + +Our engineer and technical writer Jorge reported: + +> We were super happy to meet all kinds of data professionals and enthusiasts in +> several fields who are learning and adopting DVC with their teams – including +> several working with privacy-sensitive medical records, very cool! + +
+ +## From the community + +Here are some rumblings from the machine learning (ML) and data science +community that got us talking. + +**A machine learning software wishlist.** Computer scientist and writer +[Chip Huyen](https://twitter.com/chipro) tweeted about her ML software wishlist +and kicked off a big community discussion. + +https://twitter.com/chipro/status/1202815757593108480 + +Her tweet resonated with a lot of practitioners, who were eager to discuss the +solutions they'd tried. Among the many thoughtful replies and recommendations, +we were thrilled to see DVC mentioned. + +https://twitter.com/kristijan_ivanc/status/1202879739716870144 + +If you haven't already, definitely check out Chip's +[thread](https://twitter.com/chipro/status/1202815757593108480), and follow her +on Twitter for more excllent, accessible content about ML engineering. We're +thinking hard about these ideas and hope the discussion continues on- and +offline. + +**A gentle intro to DVC for data scientists.** Scientist +[Elle O'Brien](https://twitter.com/andronovhopf) published a code walkthrough +about using DVC to make an image classification project more reproducible. +Specifically, the blog is a case study about version control when a dataset +grows over time. If you're looking for a DVC tutorial geared for data +scientists, this might be up your alley. + + + +**Ideas for data scientists to level up their code** Machine learning engineer +Andrew Greatorex posted a blog called “Down with technical debt! Clean Python +for data scientists.” Andrew highlights something we can easily relate to: the +“science” part of data science, which encourages experimentation and +flexibility, sometimes means less emphasis on readable, shareable code. 
Andrew +writes: + +> "I’m hoping to shed light on some of the ways that more fledgling data +> scientists can write cleaner Python code and better structure small scale +> projects, with the important side effect of reducing the amount of technical +> debt you inadvertently burden on yourself and your team.” + +In this blog, DVC gets a shout-out as Andrew’s preferred data versioning tool, +used in conjunction with Git for versioning Python code. Thanks! + + + +**An introduction to MLOps** Engineer +[Sharif Elfouly](https://twitter.com/elfouly_sharif) wrote an approachable guide +to thinking about MLOps, the growing field around making ML projects run +efficiently from experimentation to production. He summarises why managing ML +projects can be fundamentally different than traditional software development: + +> “The main difference between traditional software and ML is that you don’t +> only have the code. You also have data, models, and experiments. Writing +> traditional software is relatively straightforward but in ML you need to try +> out a lot of different things to find the best and fastest model for your +> use-case. You have a lot of different model types to choose from and every +> single one of them has its specific hyperparameters. Even if you work alone +> this can get out of hand pretty quickly.” + +Sharif gives some recommendations for tools that work especially well for ML, +and he writes that DVC is the “perfect combination for versioning your code and +data.” Thanks, Sharif! We think you’re perfect, too. + + + +That's a wrap for January. We'll see you next month with more updates! 
diff --git a/content/blog/2020-01-20-january-20-community-gems.md b/content/blog/2020-01-20-january-20-community-gems.md new file mode 100644 index 00000000000..1b61d145875 --- /dev/null +++ b/content/blog/2020-01-20-january-20-community-gems.md @@ -0,0 +1,150 @@ +--- +title: January '20 Community Gems +date: 2020-01-20 +description: | + Great discussions and technical Q&A's from our users. +descriptionLong: | + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. Some of those are related to our brainchild + [DVC](https://dvc.org) and its journey. The others are a collection of + exciting stories and ideas centered around ML best practices and workflow. +picture: ../../static/uploads/images/2020-01-20/Community_Gems.png +pictureComment: +author: ../authors/elle_obrien.md +commentsUrl: https://discuss.dvc.org/t/january-20-community-gems/315 +tags: + - Discord + - Gems +--- + +## Discord gems + +There's a lot of action in our Discord channel these days. Ruslan, DVC's core +maintainer, said it best with a gif. + +https://twitter.com/rkuprieiev/status/1144008869414342658?ref_src=twsrc%5Etfw + +It's a lot to keep up with, so here are some highlights. We think these are +useful, good-to-know, and interesting conversations between DVC developers and +users. + +### Q: [What pros does DVC have compared to Git LFS?](https://discordapp.com/channels/485586884165107732/563406153334128681/657590900754612284) + +For an in-depth answer, check out this +[Stack Overflow discussion](https://stackoverflow.com/questions/58541260/difference-between-git-lfs-and-dvc). +But in brief, with DVC you don't need a special server, and you can use nearly +any kind of storage (S3, Google Cloud Storage, Azure Blobs, your own server, +etc.) without a fuss. There are also no limits on the size of the data that you +can store, unlike with GitHub. With Git LFS, there are some general LFS server +limits, too. 
DVC has additional features for sharing your data (e.g., +`dvc import`) and has pipeline support, so it does much more than LFS. Plus, we +have flexible and quick checkouts, as we utilize different link types (reflinks, +symlinks, and hardlinks). We think there are lots of advantages; of course, the +usefulness will depend on your particular needs. + +### Q: [How do I use DVC with SSH remote storage?](https://discordapp.com/channels/485586884165107732/563406153334128681/656016145119182849) I usually connect with a .pem key file. How do I do the same with DVC? + +DVC is built to work with the SSH protocol to access remote storage (we provide +some +[examples in our official documentation](https://dvc.org/doc/user-guide/external-dependencies#ssh)). +When SSH requires a key file, try this: + +```dvc +$ dvc remote modify myremote keyfile +``` + +### Q: [If you train a TensorFlow model that creates multiple checkpoint files, how do you establish them as dependencies in the DVC pipeline?](https://discordapp.com/channels/485586884165107732/563406153334128681/651098762466426891) + +You can specify a directory as a dependency/output in your DVC pipeline, and +store checkpointed models in that directory. It might look like this: + +```dvc +$ dvc run \ + -f train.dvc \ + -d data \ + -d train.py \ + -o models python code/train.py +``` + +where `models` is a directory created for checkpoint files. If you would like to +preserve your models in the data directory, though, then you would need to +specify them one by one. You can do this with bash: + +```dvc +$ dvc run $(for file in data/*.gz; do echo -n -d $file; done) +``` + +Be careful, though: if you declare checkpoint files to be an output of the DVC +pipeline, you won’t be able to re-run the pipeline using those checkpoint files +to initialize weights for model training. This would introduce circularity, as +your output would become your input. 
+ +Also keep in mind that whenever you re-run a pipeline with `dvc repro`, outputs +are deleted and then regenerated. If you don't wish to automatically delete +outputs, there is a `--persist` flag (see discussion +[here](https://github.com/iterative/dvc/issues/1214) and +[here](https://github.com/iterative/dvc/issues/1884)), although we don't +currently provide technical support for it. + +Finally, remember that setting something as a dependency (`-d`) doesn't mean it +is automatically tracked by DVC. So remember to `dvc add` data files in the +beginning! + +### Q: [Is it possible to use the same cache directory for multiple DVC repos that are used in parallel?](https://discordapp.com/channels/485586884165107732/485596304961962003/655012135973158942) Or do I need external software to prevent potential race conditions? + +This is absolutely possible, and you don't need any external software to safely +use multiple DVC repos in parallel. With DVC, cache operations are atomic. The +only exception is cleaning the cache with `dvc gc`, which you should only run +when no one else is working on a shared project that is referenced in your cache +(and also, be sure to use the `--projects` flag +[as described in our docs](https://dvc.org/doc/command-reference/gc)). For more +about using multiple DVC repos in parallel, check out some discussions +[here](https://discuss.dvc.org/t/setup-dvc-to-work-with-shared-data-on-nas-server/180) +and [here](https://dvc.org/doc/use-cases/shared-development-server). + +### Q: [What are some strategies for reproducibility if parts of our model training pipeline are run on our organizations's HPC?](https://discordapp.com/channels/485586884165107732/485596304961962003/652380507832844328) + +Using DVC for version control is entirely compatible with using remote computing +resources, like high performance computing (HPC), in your model training +pipeline. 
We think a great example of using DVC with parallel computing is +provided by [Peter Fogh](http://www.peterfogh.dk/) Take a +[look at his repo](https://github.com/PeterFogh/dvc_dask_use_case) for a +detailed use case. Please keep us posted about how HPC works in your pipeline, +as we'll be eager to pass on any insights to the community. + +### Q: Say I have a Git repository with multiple projets inside (one classification, one object detection, etc.). [Is it possible to tell DVC to just pull data for one particular project?](https://discordapp.com/channels/485586884165107732/563406153334128681/646760832616890408) + +Absolutely, DVC supports pulling data from different DVC-files. An example would +be having two project subdirectories in your Git repo, `classification` and +`detection`. You could use `dvc pull -R classification` to only pull files in +that project to your workspace. + +If you prefer to be even more granular, you can `dvc add` files individually. +Then you can use `dvc pull .dvc` to retrieve the outputs specified +only by that file. + +### Q: [Is it possible to set an S3 remote without the use of AWS credentials with DVC?](https://discordapp.com/channels/485586884165107732/563406153334128681/623234659098296348) I want to publicly host a dataset so that everybody who clones my code repo can just run `dvc pull` to fetch the dataset. + +Yes, and we love the idea of publicly hosting a dataset. There are a few ways to +do it with DVC. We use one method in our own DVC project repository on Github. +If you run `git clone https://github.com/iterative/dvc` and then `dvc pull`, +you’ll see that DVC is downloading data from an HTTP repository, which is +actually just an S3 repository that we've granted public HTTP read-access to. + +So you would need to configure two remotes in your config file, each pointing to +the same S3 bucket through different protocols. 
Like this: + +```dvc +$ dvc remote add -d --local myremote s3://bucket/path +$ dvc remote add -d mypublicemote http://s3-external-1.amazonaws.com/bucket/path +``` + +Here's why this works: the `-d` flag sets the default remote, and the `--local` +flag creates a set of configuration preferences that will override the global +settings when DVC commands are run locally and won't be shared through Git (you +can read more about this +[in our docs](https://dvc.org/doc/command-reference/remote/add#remote-add)). + +This means that even though you and users from the public are accessing the +stored dataset by different protocols (S3 and HTTPS), you'll all run the same +command: `dvc pull`. diff --git a/content/blog/2020-02-04-gsoc-ideas-2020.md b/content/blog/2020-02-04-gsoc-ideas-2020.md new file mode 100644 index 00000000000..92b87279a36 --- /dev/null +++ b/content/blog/2020-02-04-gsoc-ideas-2020.md @@ -0,0 +1,129 @@ +--- +title: Join DVC for Google Summer of Code 2020 +date: 2020-02-04 +description: | + A call for student applications for Google Summer of Code 2020. +descriptionLong: | + DVC is looking for students to take part in + [Google Summer of Code 2020](https://summerofcode.withgoogle.com/). +picture: ../../static/uploads/images/2020-02-04/Summer_of_Code_small.png +pictureComment: +author: ../authors/elle_obrien.md +commentsUrl: https://discuss.dvc.org/t/join-dvc-for-google-summer-of-code/317 +tags: + - Google Summer of Code + - Students + - Mentoring +--- + +Announcement, announcement! After a successful experience with +[Google Season of Docs](https://developers.google.com/season-of-docs) in 2019, +we're putting out a call for students to apply to work with DVC as part of +[Google Summer of Code](https://summerofcode.withgoogle.com/). If you want to +make a dent in open source software development with mentorship from our team, +read on. 
+ +## Prerequisites to apply + +Besides the general requirements to apply to Google Summer of Code, there are a +few skills we look for in applicants. + +1. **Python experience.** All of our core development is done in Python, so we + prefer candidates that are experienced in Python. However, we will consider + applicants who are very strong in another language and familiar with Python + basics. +2. **Git experience.** Git is also a key part of DVC development, as DVC is + built around Git; that said, for certain projects (rated as “Beginner”) a + surface-level knowledge of Git will be sufficient. +3. **People skills.** Beyond technical fundamentals, we put a high value on + communication skills: the ability to report and document your experiments and + findings, to work kindly with teammates, and explain your goals and work + clearly. + +If you like our mission but aren't sure if you're sufficiently prepared, please +be in touch anyway. We'd love to hear from you. + +## Project ideas + +Below are several project ideas that are an immediate priority for the core DVC +team. Of course,we welcome students to create their own proposals, even if they +differ from our ideas. Projets will be primarily mentored by co-founders +[Dmitry Petrov](https://github.com/dmpetrov) and +[Ivan Shcheklein](https://github.com/shcheklein). + +1. **Migrate to the latest v3 API to improve Google Drive support.** Our + organization is a co-maintainer of the PyDrive library in collaboration with + a team at Google. The PyDrive library is now several years old and still + relies on the v2 protocol. We would like to migrate to v3, which we expect + will boost performance for many DVC use cases (e.g. the ability to filter + fields being retrieved from our API, etc). For this project, we’re looking + for a student to work with us to prepare the next major version of the + PyDrive library, as well as making important changes to the core DVC code to + support it. 
Because PyDrive is broadly used outside of DVC, this project is a + chance to work on a library of widespread interest to the Python community. +

_Skills required:_ Python, Git, experience with APIs
+ _Difficulty rating:_ Beginner-Medium
+ +2. **Introducing parallelism to DVC.** One of DVC’s features is the ability to + create pipelines, linking data repositories with code to process data, train + models, and evaluate model metrics. Once a DVC pipeline is created, the + pipeline can be shared and re-run in a systematic and entirely reproducible + way. Currently, DVC executes pipelines sequentially, even though some steps + may be run in parallel (such as data preprocessing). We would like to support + parallelization for pipeline steps specified by the user. Furthermore, we’ll + need to support building flags into DVC commands that specify the level of + parallelization (CPU, GPU or memory).

_Skills required:_ + Python, Git. Some experience with parallelization and/or scientific computing + would be helpful but not required.
_Difficulty rating:_ Advanced +
+ +3. **Developing use cases for data registries and ML model zoos.** A new DVC + functionality that we’re particularly excited about is `summon`, a method + that can turn remotely-hosted machine learning artifacts such as datasets, + trained models, and more into objects in the user’s local environment (such + as a Jupyter notebook). This is a foundation for creating data catalogs of + data-frames and machine learning model zoos on top of Git repositories and + cloud storages (like GCS or S3). We need to identify and implement model zoos + (think PyTorch Hub, the Caffe Model Zoo, or the TensorFlow DeepLab Model Zoo) + and data registries for types that are not supported by DVC yet. Currently, + we’ve tested `summon` with PyTorch image segmentation models and Pandas + dataframes. We’re looking for students to explore other possible use cases. +

_Skills required:_ Python, Git, and some machine learning or + data science experience
_Difficulty rating:_ Beginner-Medium
+ +4. **Continuous delivery for JetBrains TeamCity.** Continuous integration and + continuous delivery (CI/CD) for ML projects is an area where we see + [DVC make a big impact](https://martinfowler.com/articles/cd4ml.html)- + specifically, by delivering datasets and ML models into CI/CD pipelines. + While there are many cases when DVC is used inside GitHub Actions and GitLab + CI, you will be transferring this experience to another type of CI/CD system, + [JetBrains TeamCity](https://www.jetbrains.com/teamcity/). We're working to + integrate DVC's model and dataset versioning into TeamCity's CI/CD toolkit. + This project would be ideal for a student looking to explore the growing + field of MLOps, an offshoot of DevOps with the specifics of ML projects at + the center.

_Skills required:_ Python, Git, bash scripting. It + would be nice, but not necessary, to have some experience with CI/CD tools + and developer workflow automation.
_Difficulty rating:_ + Medium-Advanced
5. **DVC performance testing framework.** Performance is a core value of DVC. We
   will be creating a performance monitoring and testing framework where new
   scenarios (e.g., unit testing) can be populated.
+
_Skills required:_ Python, Git, bash scripting.
_Difficulty + rating:_ Medium-Advanced
+ +## If you'd like to apply + +Please refer to the +[Google Summer of Code](https://summerofcode.withgoogle.com/) application guides +for specifics of the program. Students looking to know more about DVC, and our +worldwide community of contributors, will learn most by visiting our +[Discord channel](https://dvc.org/chat), +[GitHub repository](https://github.com/iterative/dvc), and +[Forum](https://discuss.dvc.org/). We are available to discuss project proposals +from interested students and can be reached by [email](mailto:support@dvc.org) +or on our Discord channel. diff --git a/content/blog/2020-02-10-february-20-dvc-heartbeat.md b/content/blog/2020-02-10-february-20-dvc-heartbeat.md new file mode 100644 index 00000000000..900165890da --- /dev/null +++ b/content/blog/2020-02-10-february-20-dvc-heartbeat.md @@ -0,0 +1,149 @@ +--- +title: February '20 DVC❤️Heartbeat +date: 2020-02-10 +description: | + DVC talks around the world, + new team members, and full-stack machine learning. +descriptionLong: | + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. + + Look here for updates about [DVC](https://dvc.org), our journey as a startup, + projects by our users and big ideas about best practices in ML and data + science. +picture: ../../static/uploads/images/2020-02-10/heartbeat_black.png +pictureComment: + Just in time for Valentine's day, here's a seasonally-relevant DVC pipeline. +author: ../authors/elle_obrien.md +commentsUrl: https://discuss.dvc.org/t/dvc-heartbeat-feburary-20/318 +tags: + - Heartbeat + - Continuous Integration +--- + +Welcome to the February Heartbeat! This month's featured image is a DVC pipeline +[created by one of our users](https://medium.com/nlp-trend-and-review-en/use-dvc-to-version-control-ml-dl-models-bef61dbfe477), +which _we_ think resembles a valentine. 
Here are some more highlights from our +team and our community: + +## News + +**Our team is growing!** In early January, DVC gained two new folks: engineer +[Saugat Pachhai](https://github.com/skshetry) and data scientist +[Elle O'Brien](https://twitter.com/andronovhopf). Saugat, based in Nepal, will +be contributing to core DVC. Elle (that's me!), currently in San Francisco, will +be leading data science projects and outreach with DVC. + +We're **gearing up for a spring full of talks** about DVC projects, including +new up-and-coming features for data cataloging and continuous integration. Here +are just a few events that have been added to our schedule: + + + + + + + +-Elle O'Brien was recently accepted to give a keynote at +[Women in Data Science](https://www.widsconference.org/) San Diego on May 9. The +talk is called "Packaging data and machine learning models for sharing." + +-Elle will also be speaking at [Div Ops](https://divops.org/), a new online +conference about (you guessed it) DevOps, on March 27. + +Look out for more conference announcements soon- in our **brand new community +page!** We've [just launched a new hub](https://dvc.org/community) for sharing +events, goings-ons, and ways to contribute to DVC. + +## From the community + +Our users continue to put awesome things on the internet. Like this AI blogger +who isn't afraid to wear his heart on his sleeve. + + + +Musa Atlihan writes: + +> From my experience, whether it is a real-world data science project or it is a +> data science competition, there are two major key components for success. +> Those components are API simplicity and reproducible pipelines. Since data +> science means experimenting a lot in a limited time frame, first, we need +> machine learning tools with simplicity and second, we need +> reliable/reproducible machine learning pipelines. Thanks to tools like Keras, +> LightGBM, and fastai we already have simple yet powerful tools for rapid model +> development. 
And thanks to DVC, we are building large projects with +> reproducible pipelines very easily. + +It's cool how Musa puts DVC in context with libraries for model building. In a +way, the libraries that have made it easier than ever to iterate through +different model architectures have increased the need for reproducibility in +proportion. + +Meanwhile in Germany, superusers Marcel Mikl and Bert Besser wrote +[another](https://blog.codecentric.de/en/2019/03/walkthrough-dvc/) seriously +comprehensive article about DVC for Codecentric. Marcel and Bert walk readers +through the steps to **build a custom machine learning training pipeline with +remote computing resources** like GCP and AWS. It's an excellent guide to +configuring model training with attention to _automation_ and _collaboration_. +We give them 🦉🦉🦉🦉🦉 out of 5. + + + +Here are a few more stories on our radar: + +- **AI Singapore shares their method for AI development and deployment.** This + .. + [blog about how Agile informs their processes](https://makerspace.aisingapore.org/2020/01/agile-ai-engineering-in-aisg/) + for continuous integration and delivery includes data versioning. + +- **Toucan AI dispenses advice for ML engineers.** This .. + [blog for practitioners](https://toucanai.com/blog/post/building-production-ml/) + discusses questions like, "When to work on ML vs. the processes that surround + ML". It covers how DVC is used for model versioning in the exploration stage + of ML. + +- **DVC at the University.** A recent .. + [pre-print from natural language processing researchers at Université Laval](https://arxiv.org/pdf/1912.01706.pdf) + explains how DVC facilitated dataset access for collaborators. + + > "In our case, the original dataset takes up to 6 Gigabytes. The previous way + > of retrieving the dataset over the network with a standard 20 Mbits/sec + > internet connexion took up to an hour to complete (including uncompressing + > the data). 
Using DVC reduced the retrieval time of the dataset to 3 minutes + > over the network with the same internet connexion." + + Thanks for sharing- this is a lovely result. Oh, and last... + +- **DVC is a job requirement**! We celebrated a small milestone when we stumbled + .. across a listing for a data engineer to support R&D at + [Elvie](https://www.elvie.com/en-us/), a maker of tech for women's health + (pretty neat mission). The decorations on the job posting are ours 😎 + +![](/uploads/images/2020-02-10/elvie.png)_A +[job advertisement](https://www.jobstoday.co.uk/job/40530810/data-engineer/?TrackID=8) +featuring DVC._ diff --git a/content/blog/2020-02-17-a-public-reddit-dataset.md b/content/blog/2020-02-17-a-public-reddit-dataset.md new file mode 100644 index 00000000000..7a7590167e0 --- /dev/null +++ b/content/blog/2020-02-17-a-public-reddit-dataset.md @@ -0,0 +1,326 @@ +--- +title: + AITA for making this? A public dataset of Reddit posts about moral dilemmas +date: 2020-02-17 +description: | + Releasing an open natural language dataset based on r/AmItheAsshole. +descriptionLong: | + Delve into an open natural language dataset of posts about moral dilemmas from + [r/AmItheAsshole](https://www.reddit.com/r/AmItheAsshole/). Use this dataset + for whatever you want- here's how to get it and start playing. +picture: ../../static/uploads/images/2020-02-17/post_header_gmoji.png +author: ../authors/elle_obrien.md +commentsUrl: https://discuss.dvc.org/t/aita-for-making-this-a-public-dataset-of-reddit-posts-about-moral-dilemmas/323 +tags: + - Project + - Data + - Reddit +--- + +In data science, we frequently deal with classification problems like, _is this +[Yelp reviewer unhappy](https://www.ics.uci.edu/~vpsaini/) with their brunch? Is +[this email](https://archive.ics.uci.edu/ml/datasets/spambase) begging me to +claim my long-lost inheritance spam? 
Does this +[movie critic](http://ai.stanford.edu/~amaas/data/sentiment/) have a positive +opinion of Cats?_ + +Perhaps we should also consider the fundamental introspective matter of, _am I +maybe being a bit of an asshole?_ + +I want to share a dataset of collected moral dilemmas shared on Reddit, as well +as the judgments handed down by a jury of Redditors. The wellspring of this data +is the [r/AmItheAsshole](https://www.reddit.com/r/AmItheAsshole/) subreddit, one +of the natural wonders of the digital world. In this article, I'll show you +what's in the dataset, how to get it, and some things you can do to move the +frontiers of Asshole research forward. + +## What makes an Asshole? + +r/AmItheAsshole is a semi-structured online forum that’s the internet’s closest +approximation of a judicial system. In this corner of the web, citizens post +situations from their lives and Redditors vote to decide if the writer has acted +as The Asshole or not. For example: + +![](/uploads/images/2020-02-17/aita_sample.png) + +Without bringing any code into the picture, it’s intuitive to think of each new +post as a classification task for the subreddit. Formally, we could think of the +subreddit as executing a function _f_ such that + +![](/uploads/images/2020-02-17/aita_formula.png '=500') + +Of course, finding f won’t be trivial. To be frank, I’m not positive how well we +could hope to forecast the rulings of the subreddit. A lot of posts are not easy +for me to decide- like, + +![](/uploads/images/2020-02-17/aita_llama.png) + +There are also many times I find myself disagreeing with the subreddit’s +verdict. All this is to say, I don’t think it’s obvious how well a given human +would do on the task of predicting whether Redditors find someone an Asshole. +Nor is it clear how well we could ever hope for a machine to do approximating +their judgment. + +It seems fun to try, though. 
It helps that the data is plentiful: because the +subreddit is popular and well-moderated, there’s an especially strong volume of +high-quality content (re: on-topic and appropriately formatted) being posted +daily. + +## Building the dataset + +I pulled content from r/AmITheAsshole dating from the first post in 2012 to +January 1, 2020 using the [pushshift.io](https://pushshift.io/) API to get post +ids and +[scores](https://www.reddit.com/wiki/faq#wiki_how_is_a_submission.27s_score_determined.3F), +followed by Reddit’s API ([praw](https://praw.readthedocs.io/en/latest/)) to get +post content and meta-data. Using a +[similar standard as OpenAI](https://openai.com/blog/better-language-models/) +for trawling Reddit, I collected text from posts with scores of 3 or more only +for quality control. This cut the number of posts from ~355K to ~111K. Each data +point contains an official id code, timestamp, post title, post text, verdict, +score, and comment count; usernames are not included. The scraping and cleaning +code is available +[in the project GitHub repo](https://github.com/iterative/aita_dataset). For +simplicity on the first iteration of this problem, I didn’t scrape post +comments, which can number in the thousands for popular posts. But, should +sufficient interest arise, I’d consider adding them to the dataset in some form. + +To focus on the task of classifying posts, I did some light cleaning: I removed +posts in which the body of the text was redacted (surprisingly common) or blank, +and attempted to remove edits where the author had clearly given away the +verdict (e.g., an edit that says, “Update: You’re right, I was the asshole.”). +There were also verdicts that only occurred once (“cheap asshole”, “Crouching +Liar; hidden asshole”, “the pizza is the asshole”), so I restricted the dataset +to posts with standard verdicts. This left ~63K points. 
Below is a sample of the +resulting dataframe: + +![](/uploads/images/2020-02-17/df_sample.png)_Click to enlarge._ + +The dataset is a snapshot of the subreddit in its current state, but the +subreddit is certain to change over time as new content gets added. In the +interest of having the most comprehensive dataset about being an asshole ever +collected, _I’m planning to update this dataset monthly with new posts._ + +## How to get the dataset + +Since this dataset will be updated regularly, we’re using git and DVC to +package, version, and release it. The data itself is stored in an S3 bucket, and +you can use DVC to import the data to your workspace. If you haven't already +you'll need to [install DVC](https://dvc.org/doc/install); one of the simplest +ways is `pip install dvc`. + +Say you have a directory on your local machine where you plan to build some +analysis scripts. Simply run + +```dvc +$ dvc get https://github.com/iterative/aita_dataset \ + aita_clean.csv +``` + +This will download a .csv dataset into your local directory, corresponding to +the cleaned version. If you wanted the raw dataset, you would substitute +`aita_raw.csv` for `aita_clean.csv`. + +Because the dataset is >100 MB, I’ve created a git branch (called “lightweight”) +with 10,000 randomly sampled (cleaned) data points for quick-and-dirty +experimentation that won’t occupy all your laptop’s memory. To download only +this smaller dataset, run + +```dvc +$ dvc get --rev lightweight \ + https://github.com/iterative/aita_dataset \ + aita_clean.csv +``` + +## A quick look at the data + +Let’s take a flyover look at the dataset so far. The code to make the following +visuals and results is +[available on GitHub](https://github.com/andronovhopf/aita_viz_and_classify). +First, here’s a frequency plot for how common different verdicts are on the +subreddit. In addition to “Asshole” and “Not the Asshole”, there are two +additional rulings: “Everybody Sucks” and “No Assholes Here”. 
+ +![](/uploads/images/2020-02-17/freq_plot.svg) + +In general agreement with an +[analysis by Nathan Cunn](http://www.nathancunn.com/2019-04-04-am-i-the-asshole/), +the majority of posts are deemed “Not the Asshole” or “No Assholes Here”. If you +are posting on r/AmITheAsshole, you are probably not the asshole. + +Next, I attempted a very basic classifier, logistic regression using 1-gram +frequencies (i.e., the frequency of word occurences in post titles and bodies) +as features. This is intended to give a baseline for what kind of performance +any future modeling efforts should beat. Because of the strong class imbalance, +I used +[SMOTE to oversample](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html) +Asshole posts. And, for simplicity, I binarized the category labels: + +| Verdict | Label | +| :--------------: | :---: | +| Asshole | 1 | +| Everyone Sucks | 1 | +| Not the Asshole | 0 | +| No Assholes Here | 0 | + +With 5-fold cross-validation, this classifier performed above-chance but +modestly: accuracy was 62.0% +/- 0.005 (95% confidence interval). Curiously, the +only other classifier attempt I could find online +[reported 61% accuracy on held-out data](https://github.com/amr-amr/am-i-the-asshole) +using the much more powerful BERT architecture. Considering that logistic +regression has zero hidden layers, and our features discard sequential +information entirely, we’re doing quite well! Although I can’t be certain, I’m +curious how much the discrepancy comes down to dataset size: the previous effort +with BERT appears to be trained on ~30K posts. + +Seeing that logistic regression on word counts doesn’t produce total garbage, I +looked at which words were predictive of class using the +[chi-squared test](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.chi2.html). +The top five informative words were mom, wife, mother, edit, and dad (looks like +Assholes go back to edit their posts). 
Since familial relationships featured +prominently, I +[estimated the log odds ratio](https://www.tidytextmining.com/twitter.html#comparing-word-usage) +of being voted Asshole (versus Not the Asshole) if your post mentions a mom, +dad, girlfriend/wife or boyfriend/husband. Roughly, the log odds ratio +represents the difference in probability of a keyword occurring in Asshole posts +compared to Not-Asshole posts. + +![](/uploads/images/2020-02-17/svg_kw2.svg) + +For reference, the log odd ratios are computed with base 2; a score of 1 means +that Asshole posts are twice as likely to contain the keyword as Not the Asshole +posts. So keep in mind that the effect sizes we’re detecting, although almost +certainly non-zero, are still fairly small. + +There seems to be a slight anti-parent trend, with Redditors being more likely +to absolve authors who mention a mom or dad. Only mentioning a female romantic +partner (wife/girlfriend) was associated with a greater likelihood of being +voted the Asshole. This surprised me. My unsubstantiated guess about the gender +difference in mentioning romantic partners is that women may be particularly +likely to question themselves when they act assertively in a relationship. If +this were the case, we might find an especially high proportion of +uncontroversial “Not the Asshole” posts from heterosexual women asking about +situations with their male partners. + +## How to get more data + +As I said earlier, the plan is to grow the dataset over time. I’ve just run a +new scrape for posts from January 1-31, 2020 and am adding them to the public +dataset now. To check for a new release, you can re-run the `dvc get` command +you used to grab the dataset. + +If you’re serious about taking on a project such as, say, building a classifier +that beats our state of the art, word-count-based, logistic regression model, +I’d like to recommend a better way to integrate the dataset into your workflow: +`dvc import`. 
`dvc import` is like `dvc get`, but it preserves a link to the +hosted data set. This is desirable if you might iterate through several +experiments in the search for the right architecture, for example, or think +you’ll want to re-train a model . To get the dataset the first time, you’ll run: + +```dvc +$ git init +$ dvc init +$ dvc import https://github.com/iterative/aita_dataset \ + aita_clean.csv +``` + +Then, because the dataset in your workspace is linked to our dataset repository, +you can update it by simply running: + +```dvc +$ dvc update aita_clean.csv +``` + +An additional benefit of codifying the link between your copy of the dataset and +ours is that you can track the form of the dataset you used at different points +in your project development. You can jump back and forth through the project +history then, not only to previous versions of code but also to versions of +(specifically, links to) data. For example, you could roll back the state of the +project to before you updated the dataset and re-run your classifier: + +```dvc +$ git log --oneline +58e28a5 retrain logistic reg +6a44161 update aita dataset +0de4fc3 try logistic regression classifier +a266f15 get aita dataset +55031b0 first commit + +$ git checkout 0de4fc3 +$ dvc checkout +$ python train_classifier.py +``` + +Oh, and one more note: you can always use `dvc get` and `dvc import` to grab an +older version of the dataset using the tags associated with each release. The +current release is v.20.1 and the original release is v.20.0- the numeric codes +correspond to the year and month. + +```dvc +$ dvc get --rev v.20.0 \ + https://github.com/iterative/aita_dataset aita_clean.csv +``` + +## What’s next + +I hope that sharing this evolving dataset invites some curiosity, because a lot +of questions come to mind: + +1. Can you beat our classifier that predicts how the subreddit will rule? +2. Is verdict even the most interesting outcome to predict? 
For example, + developer Scott Ratigan + [created a tool to estimate weighted scores](https://github.com/scotteratigan/amitheahole) + for each post based on the comments (e.g., 75% Asshole, 25% Not the Asshole). + What metrics might invite deeper questions? +3. Can you identify sentences or phrases that are most informative about the + verdict Redditors reach? +4. Do voting patterns systematically differ by topic of discussion? +5. How reliable are verdicts? When a very similar situation is posted multiple + times, do Redditors usually vote the same way? +6. Is the subreddit’s posting and voting behavior changing over time? +7. Can you formulate any testable hypotheses based on + [this survey of the subreddit’s demographics](https://www.reddit.com/r/AmItheAsshole/comments/dcae07/2019_subscriber_survey_data_dump/?) +8. How often do non-Redditors agree with the subreddit? Under what circumstances + might they tend to disagree? + +I expect that leaning into the particulars of the dataset- thinking about how +the format influences the content, and how a subreddit might select for +participants that don’t fully represent the population at large- will lead to +more interesting questions than, say, aiming to forecast something about +morality in general. To put it another way, the data’s not unbiased- so maybe +try to learn something about those biases. + +If you make something with this dataset, please share- perhaps we can form an +international Asshole research collective, or at least keep each other appraised +of findings. And of course, reach out if you encounter any difficulties or +probable errors (you can file issues +[on the GitHub repo](https://github.com/iterative/aita_dataset))! + +Lastly, please stay tuned for more releases- there are hundreds of new posts +every day. The biggest asshole may still be out there. + +
+ +### More resources + +You may want to check out a few more efforts to get at r/AmItheAsshole from a +data-scientific perspective, including +[topic modeling](https://medium.com/@tom.gonda/what-does-reddit-argue-about-28432b11ea26), +[visualizing voting patterns](http://www.nathancunn.com/2019-04-04-am-i-the-asshole/) +and +[growth of the subreddit](https://twitter.com/felipehoffa/status/1223278090958209025), +and +[classification](https://www.informatik.hu-berlin.de/de/forschung/gebiete/wbi/teaching/studienDiplomArbeiten/finished/2019/expose_fletcher.pdf) +with [deep learning](https://github.com/amr-amr/am-i-the-asshole). With a +dataset this rich, there’s much more to be investigated, including continuing to +refine these existing methods. And there’s almost certainly room to push the +state of the art in asshole detection! + +If you're interested in learning more about using Reddit data, check out +[pushshift.io](https://pushshift.io/), a database that contains basically all of +Reddit's content (so why make this dataset? I wanted to remove some of the +barriers to analyzing text from r/AmItheAsshole by providing an +already-processed and cleaned version of the data that can be downloaded with a +line of code; pushshift takes some work). You might use pushshift's API and/or +praw to augment this dataset in some way- perhaps to compare activity in this +subreddit with another, or broader patterns on Reddit. diff --git a/content/blog/2020-02-19-february-20-community-gems.md b/content/blog/2020-02-19-february-20-community-gems.md new file mode 100644 index 00000000000..abbc241846d --- /dev/null +++ b/content/blog/2020-02-19-february-20-community-gems.md @@ -0,0 +1,153 @@ +--- +title: February '20 Community Gems +date: 2020-02-19 +description: | + Great discussions and technical Q&A's from our users. +descriptionLong: | + Look here every month for great discussions and technical Q&A's from our users + and core development team. 
+picture: ../../static/uploads/images/2020-02-19/feb20_gems_header_gr.png +author: ../authors/elle_obrien.md +commentsUrl: https://discuss.dvc.org/t/feb-20-community-gems/330 +tags: + - Discord + - Google Drive + - Azure + - Gems + - Homebrew +--- + +## Discord gems + +Welcome to the Februrary roundup of useful, intriguing, and good-to-know +discussions going on with DVC users and developers. Let's dive right in with +some questions from our Discord channel. + +### Q: [If I have multiple outputs from a DVC pipeline and only want to checkout one, what command would I run?](https://discordapp.com/channels/485586884165107732/563406153334128681/670233820326264843) + +By defult, `dvc checkout` is written for a +[Git-like experience](https://dvc.org/doc/command-reference/checkout), meaning +that it will sync your local workspace with all the model files, dependencies, +and outputs specified by a project's `.dvc` files. If you only want to access +one artifact from the project, you can do this with +`dvc checkout `. This will deliver the specified file to your +workspace. + +If you're interested in sharing specific artifacts (like data files or model +binaries) with other users, you might also consider `dvc get` and `dvc import`. +These functions are ideal for downloading a single file (or a few files) to the +local workspace, instead of the whole project. + +### Q: [I have a complicated use case.](https://discordapp.com/channels/485586884165107732/563406153334128681/668773484549242890) We're trying to set up a system where users act as data scientists. They'd select data, which would be cleaned/transformed in the backend, and experiment with model hyperparameters until they're happy with the model result. Then they can "save" the model, including artifacts like the input data used, metrics, and binary model file, placing the experiment under version control. 
Later they can "load" the model again and select new input data from our database, change parameters, and "update it". There might be hundreds of separate models. Can DVC do this? + +Most of this functionality is supported by DVC already. We recommend +`dvc import` as a method for giving users access to data in a repository (and +also check out our +[tutorial on data registries](https://dvc.org/doc/use-cases/data-registries)). +For pre-processing data, +[DVC pipelines](https://dvc.org/doc/get-started/pipeline) can automate a +procedure for transforming and cleaning inputs (i.e., you can use bash scripts +to `dvc run` the pipeline whenever a user selects a dataset). Saving the +workspace after experimentation, including model files, metrics, and outputs, is +a core function of DVC (see `dvc add` and `dvc push` functions). We also have a +[Python API](https://dvc.org/doc/use-cases/data-registries#programatic-reusability-of-dvc-data) +so users can load artifacts like datasets and model files into their local +Python session. When they're done experimenting, they can `dvc add` and +`dvc push` their progress. Users can later "pull" a saved workspace and all +associated files using `dvc checkout`. + +As for how to organize hundreds of separate experiments, we're still evolving +our strategy and best-practice recommendations. It's conceivable that each +experiment could be carried out and saved on a separate branch of a project +repository. Our thoughts about structuring version control around architecture +search and hyperparameter tuning could fill up a whole blog (and probably will +in the not-so-distant future); check out one of our +[recent conversation threads](https://github.com/iterative/dvc/issues/2799) if +you'd like to see where we're currently at. And please let us know how your use +case goes—at this stage, we'd love to hear what works for you. 
+ +### Q: [What's the difference](https://discordapp.com/channels/485586884165107732/563406153334128681/666708671333400599) between `config` and `config.local` files? Is it safe to do git commit without including my config file? + +There are indeed two kinds of config files you might come across in your project +directory's `.dvc` folder and `.gitignore` file. The key difference is that +`config` is intended to be committed to Git, while `config.local` is not. You'd +use `config.local` to store sensitive information (like personal credentials for +SSH or another kind of authenticated storage) or settings specific to your local +environment—things you wouldn't want to push to a GitHub repo. DVC only modifies +`config.local` when you explicitly use the `--local` flag in the `dvc config` or +`dvc remote *` commands, so outside of these cases you shouldn't have to worry +about it. + +As for using `git commit` without the `config` file, it is safe. _But_ you +should check if there are any settings in `config.local` that you actually want +to save to `config`. This would be rare, since as we mentioned, you'd only have +settings in `config.local` if you expressly called for them with the `--local` +flag. + +### Q: I have an Azure storage account container, and the only link I can see in my Azure portal for the container is an `http://` link. But the tutorial on DVC shows Azure storage accessed with the `azure://` protocol. [Which is right?](https://discordapp.com/channels/485586884165107732/563406153334128681/675087897661276169) + +What you're describing is exactly as it should be. `azure://` is an internal URL +protocol that tells DVC which API to use to connect to your remote storage, not +the exact address of your Blob. You can use the format +`azure://<container-name>/<path>`. For more details, you can refer to +our documentation about +[supported storage types](https://dvc.org/doc/command-reference/remote/add#supported-storage-types). 
+ +### Q: [I'm using DVC to version my data with Google Drive storage.](https://discordapp.com/channels/485586884165107732/563406153334128681/667198775361536019) If I want a developer to be able to download the data, can I give them my `gdrive_client_id` and `gdrive_client_secret`, or maybe give them permission to access my Google Drive folder? + +For Google Drive, `gdrive_client_id` and `gdrive_client_secret` aren't used to +access a specific user's Google Drive disk; they're predominantly used by +Google's API to +[track usage and set appropriate rate limits](https://rclone.org/drive/#making-your-own-client-id). +So the risk in sharing them is not that your personal files will be vulnerable, +but that your API usage limits could be negatively affected if others are using +it with your credentials. Whether this risk is acceptable is up to you. It's not +unusual for teams and organizations to share a set of credentials, so a +reasonable level of security may mean ensuring that the `config` file for your +project (which typically contains Google Drive credentials) is only visible to +team members. + +Please check out our +[docs about Google Drive](https://dvc.org/doc/user-guide/setup-google-drive-remote), +too, for more about how DVC uses the Google Drive API. + +### Q: I just tried to upgrade DVC via `homebrew` and got a "SHA256 mismatch" error. [What's going on](https://discordapp.com/channels/485586884165107732/485596304961962003/672930535261339669)? + +What most likely happened is that you first installed DVC via +`brew install iterative/homebrew-dvc/dvc`, which is no longer supported—because +DVC is now a core Homebrew formula! Please uninstall and reinstall using +`brew install dvc` for uninterrupted upgrades in the future. 
+ +### Q: [I still can't convince myself to version-control the data rather than meta-data.](https://www.reddit.com/r/datascience/comments/aqkg59/does_anyone_use_data_version_control_dvc_thoughts/eq62lkt?utm_source=share&utm_medium=web2x) Can anyone give me a strong argument against version controlling data file paths in config files instead of using DVC? + +_This question is from a [Reddit discussion.](https://bit.ly/38HOEcj)_ + +Versioning the meta-data associated with your dataset is certainly a workable +strategy. You can use prefixes and suffixes to distinguish models trained on +different versions of data, and keep your data files in one `.gitignored` +directory. That may be enough for some projects. In our experience, though, +we've found this comes with a host of complications that don't scale well: + +1. You'll have to write custom code to support this configuration, specifying + filepaths to your dataset with hardcoded links. +2. For files that are outputs of your analysis pipeline, you'll need to agree on + conventions for suffixes/prefixes for naming to specify which version of the + dataset was used. +3. Depending on the meta-data you use to version data files, you may not detect + changes made by users. Even if you can tell a change has occurred, you may + not be able to track _who_ did it _when_. + +We designed DVC to optimize data management from the user's perspective: users +can change the dataset version without changing their code, so organizations +don't have to adhere to explicit filenaming conventions and hardcoded links that +are prone to human error. Furthermore, versioning data similar to how Git +versions code provides a largely immutable record of every change that has +occurred. We think this is important as teams and projects grow in complexity. +And from a systems-level perspective, DVC does more than track data: it +deduplicates files behind the scenes, provides simple interfaces for sharing +datasets (and models!) 
with collaborators and users, and connects specific model +files with the dataset versions they were trained on. + +To summarize, DVC is not the only way to version your data. But we think it's +one way to reduce the overhead of managing data infrastructure when your project +involves experimentation or collaboration. diff --git a/content/blog/2020-03-11-march-20-dvc-heartbeat.md b/content/blog/2020-03-11-march-20-dvc-heartbeat.md new file mode 100644 index 00000000000..91013775a8f --- /dev/null +++ b/content/blog/2020-03-11-march-20-dvc-heartbeat.md @@ -0,0 +1,139 @@ +--- +title: March '20 DVC❤️Heartbeat +date: 2020-03-11 +description: | + DVC discussions around the web, + our growing team, and recommended reading from the open source community. +descriptionLong: | + Every month we share news, findings, interesting reads, community takeaways, + and everything else along the way. + + Look here for updates about [DVC](https://dvc.org), our journey as a startup, + projects by our users and big ideas about best practices in ML and data + science. +picture: ../../static/uploads/images/2020-03-11/March_20_HB_header.png +pictureComment: +author: ../authors/elle_obrien.md +commentsUrl: https://discuss.dvc.org/t/march-20-heartbeat/335 +tags: + - Heartbeat + - CI/CD + - Book + - Monorepo + - New feature +--- + +Welcome to the March Heartbeat! Here are some highlights from our team and +community this past month: + +## News + +**DVC is STILL growing!** In February, Senior Software Engineer +[Guro Bokum](https://www.linkedin.com/in/jiojiajiu/) joined DVC. He's previously +contributed to the core DVC code base and brings several years of full-stack +engineering expertise to the team. Welcome, Guro! + +![](/uploads/images/2020-03-11/hi_guro.png 'Img=500x667')_Welcome, Guro!_ + +**New feature alert.** We've received many requests for +[monorepo](https://en.wikipedia.org/wiki/Monorepo) support in DVC. 
As of DVC +[release 0.87.0](https://github.com/iterative/dvc/releases), users can version +data science projects within a monorepo! The new `dvc init --subdir` +functionality is designed to allow multiple DVC repositories within a single Git +repository. Don't forget to upgrade and +[check out the latest docs](https://dvc.org/doc/command-reference/init). + +## From the community + +First, there's an intriguing +[discussion evolving in the DVC repo](https://github.com/iterative/dvc/issues/3393) +about how machine learning hyperparameters (such as learning rate, number of +layers in a deep neural network, etc.) can be tracked. Right now, +hyperparameters are tracked as source code (i.e., with Git). Could we use some +kind of abstraction to separate hyperparameters from source code in a +DVC-managed project? Read on and feel free to jump into this discussion, largely +helmed by software developer and DVC contributor +[Helge Munk Jacobsen](http://elgehelge.github.io/). + +Another discussion we appreciated happened on Twitter: + + + +Thanks, [@cyberomin](https://twitter.com/cyberomin)! + +Elsewhere on the internet, DVC made the cut in a much-shared blog, +[Five Interesting Data Engineering Projects](https://medium.com/@squarecog/five-interesting-data-engineering-projects-48ffb9c9c501) +by [Dmitry Ryaboy](https://twitter.com/squarecog) (VP of Engineering at biotech +startup Zymergen, and formerly Twitter). Dmitry wrote: + +> To be honest, I’m a bit of a skeptic on “git for data” and various automated +> data / workflow versioning schemes: various approaches I’ve seen in the past +> were either too partial to be useful, or required too drastic a change in how +> data scientists worked to get a realistic chance at adoption. So I ignored, or +> even explicitly avoided, checking DVC out as the buzz grew. I’ve finally +> checked it out and… it looks like maybe this has legs? Metrics tied to +> branches / versions are a great feature. 
Tying the idea of git-like braches to +> training multiple models makes the value prop clear. The implementation, using +> Git for code and datafile index storage, while leveraging scalable data stores +> for data, and trying to reduce overall storage cost by being clever about +> reuse, looks sane. A lot of what they have to say in +> https://dvc.org/doc/understanding-dvc rings true. + +Check out the full blog here: + + + +One of the areas that DVC is growing into is continuous integration and +continuous deployment (CI/CD), a part of the nascent field of MLOps. Naturally, +we were thrilled to discover that CI/CD with DVC is taught in a new Packt book, +["Learn Python by Building Data Science Applications"](https://www.packtpub.com/programming/learn-python-by-building-data-science-applications) +by David Katz and Philipp Kats. + +In the authors' words, the goal of this book is to teach data scientists and +engineers "not only how to implement Python in data science projects, but also +how to maintain and design them to meet high programming standards." Needless to +say, we are considering starting a book club. Grab a copy here: + + + +Last year in Mexico, DVC contributor [Ramón Valles](https://github.com/mroutis) +gave a talk about reproducible machine learning workflows at Data Day +Monterrey—and +[a video of his presentation](https://www.youtube.com/watch?v=tAxG-n20Di4) is +now online! In this Spanish-language talk, Ramón gives a thorough look at DVC, +particularly building pipelines for reproducible ML. + + + +Finally, DVC data scientist Elle (that's me!) released a new public dataset of +posts from the Reddit forum +[r/AmItheAsshole](https://reddit.com/r/amitheasshole), and reported some +preliminary analyses. We're inviting anyone and everyone to play with the data, +make some hypotheses and share their findings. 
Check it out here: + + + +That's all for now—thanks for reading, and be in touch on our +[GitHub](https://github.com/iterative/dvc), +[Twitter](https://twitter.com/dvcorg), and +[Discord channel](https://dvc.org/chat). diff --git a/gatsby-config.js b/gatsby-config.js index 9dec5408ad2..5a19916087c 100644 --- a/gatsby-config.js +++ b/gatsby-config.js @@ -7,6 +7,7 @@ require('./config/prismjs/usage') const apiMiddleware = require('./middleware/api') const redirectsMiddleware = require('./middleware/redirects') +const { BLOG } = require('./src/consts') const title = 'Data Version Control · DVC' const description = @@ -28,19 +29,37 @@ const plugins = [ allExtensions: true } }, + 'gatsby-plugin-postcss', + 'gatsby-plugin-styled-components', + 'gatsby-plugin-react-helmet', + 'gatsby-plugin-sitemap', + 'gatsby-plugin-twitter', { resolve: 'gatsby-source-filesystem', options: { - name: 'blog', - path: path.join(__dirname, 'content', 'docs') + name: 'content', + path: path.join(__dirname, 'content') } }, + { + options: { + name: 'images', + path: path.join(__dirname, 'static', 'uploads') + }, + resolve: 'gatsby-source-filesystem' + }, { resolve: 'gatsby-transformer-remark', options: { plugins: [ + 'gatsby-remark-embedder', 'gatsby-remark-dvc-linker', - 'gatsby-remark-prismjs', + { + options: { + noInlineHighlight: true + }, + resolve: 'gatsby-remark-prismjs' + }, 'gatsby-remark-copy-linked-files', { resolve: 'gatsby-remark-smartypants', @@ -48,6 +67,13 @@ const plugins = [ quotes: false } }, + { + resolve: 'gatsby-remark-embed-gist', + options: { + includeDefaultCss: true + } + }, + 'gatsby-remark-relative-images', { resolve: 'gatsby-remark-external-links' }, @@ -57,10 +83,28 @@ const plugins = [ enableCustomId: true, isIconAfterHeader: true } - } + }, + { + resolve: 'gatsby-remark-images', + options: { + maxWidth: BLOG.imageMaxWidth, + withWebp: true + } + }, + 'gatsby-remark-responsive-iframe', + 'resize-image-plugin', + 'external-link-plugin' ] } }, + { + resolve: 
'gatsby-plugin-svgr', + options: { + ref: true + } + }, + 'gatsby-transformer-sharp', + 'gatsby-plugin-sharp', 'gatsby-plugin-catch-links', { resolve: 'gatsby-plugin-manifest', @@ -76,9 +120,64 @@ const plugins = [ /* eslint-enable @typescript-eslint/camelcase */ } }, - 'gatsby-plugin-react-helmet', - 'gatsby-plugin-styled-components', - 'gatsby-plugin-sitemap', + { + options: { + feeds: [ + { + description, + output: '/rss.xml', + query: ` + { + allMarkdownRemark( + sort: { fields: [frontmatter___date], order: DESC } + filter: { fileAbsolutePath: { regex: "/content/blog/" } } + ) { + edges { + node { + html + fields { + slug + } + frontmatter { + title + date + description + } + } + } + } + } + `, + serialize: ({ query: { site, allMarkdownRemark } }) => { + return allMarkdownRemark.edges.map(edge => { + return Object.assign({}, edge.node.frontmatter, { + /* eslint-disable-next-line @typescript-eslint/camelcase */ + custom_elements: [{ 'content:encoded': edge.node.html }], + date: edge.node.frontmatter.date, + description: edge.node.description, + guid: site.siteMetadata.siteUrl + edge.node.fields.slug, + url: site.siteMetadata.siteUrl + edge.node.fields.slug + }) + }) + }, + title + } + ], + query: ` + { + site { + siteMetadata { + title + description + siteUrl + site_url: siteUrl + } + } + } + ` + }, + resolve: `gatsby-plugin-feed` + }, { resolve: 'gatsby-plugin-sentry', options: { diff --git a/gatsby-node.js b/gatsby-node.js index d80f1638853..65d49362d31 100644 --- a/gatsby-node.js +++ b/gatsby-node.js @@ -1,10 +1,19 @@ /* eslint-env node */ +const fs = require('fs') const path = require('path') const GithubSlugger = require('github-slugger') +const { createFilePath } = require('gatsby-source-filesystem') +const tagToSlug = require('./src/utils/tagToSlug') +const pagesGenerator = require('./src/components/Paginator/pagesGenerator') +const { siteMetadata } = require('./gatsby-config') const { getItemBySource } = require('./src/utils/sidebar') +const remark = 
require('remark') +const remarkHTML = require('remark-html') + +const markdownToHtml = remark().use(remarkHTML).processSync const slugger = new GithubSlugger() // Generate hedings data from markdown @@ -40,15 +49,41 @@ const parseHeadings = text => { return matches } -exports.onCreateNode = ({ node, actions }) => { +exports.onCreateNode = ({ node, actions, getNode }) => { const { createNodeField } = actions if (node.internal.type === 'MarkdownRemark') { - const docsPath = path.join(__dirname, 'content') + const contentPath = path.join(__dirname, 'content') + const source = node.fileAbsolutePath.replace(contentPath, '') + let value + + if (source.startsWith('/blog')) { + value = createFilePath({ + getNode, + node, + trailingSlash: false + }).replace(/^\/blog\/[0-9\-]*/, '/blog/') + + // Convert fields in frontmatter from markdown to html + const { + frontmatter: { descriptionLong, pictureComment } + } = node - const source = node.fileAbsolutePath.replace(docsPath, '') + if (descriptionLong) { + node.frontmatter.descriptionLong = markdownToHtml( + descriptionLong + ).contents + } - const { path: value } = getItemBySource(source) + if (pictureComment) { + node.frontmatter.pictureComment = markdownToHtml( + pictureComment + ).contents + } + // end Convert fields + } else { + value = getItemBySource(source).path + } createNodeField({ name: 'slug', @@ -59,11 +94,8 @@ exports.onCreateNode = ({ node, actions }) => { } exports.createPages = async ({ graphql, actions }) => { - const { createPage } = actions - - const docPage = path.resolve('./src/templates/doc.js') - - const result = await graphql( + // DOCS + const docsResponse = await graphql( ` { docs: allMarkdownRemark( @@ -83,43 +115,146 @@ exports.createPages = async ({ graphql, actions }) => { ` ) - if (result.errors) { - throw result.errors + if (docsResponse.errors) { + throw docsResponse.errors } - const docs = result.data.docs.edges + const docComponent = path.resolve('./src/templates/doc.js') - docs.forEach(doc 
=> { + docsResponse.data.docs.edges.forEach(doc => { const headings = parseHeadings(doc.node.rawMarkdownBody) if (doc.node.fields.slug) { - createPage({ - component: docPage, + actions.createPage({ + component: docComponent, path: doc.node.fields.slug, context: { + isDocs: true, slug: doc.node.fields.slug, headings } }) } }) + + // Blog + const blogResponse = await graphql( + ` + { + allMarkdownRemark( + sort: { fields: [frontmatter___date], order: DESC } + filter: { fileAbsolutePath: { regex: "/content/blog/" } } + limit: 9999 + ) { + edges { + node { + fields { + slug + } + frontmatter { + title + } + } + } + } + home: allMarkdownRemark( + sort: { fields: [frontmatter___date], order: DESC } + filter: { fileAbsolutePath: { regex: "/content/blog/" } } + limit: 9999 + ) { + pageInfo { + itemCount + } + } + tags: allMarkdownRemark(limit: 9999) { + group(field: frontmatter___tags) { + fieldValue + pageInfo { + itemCount + } + } + } + } + ` + ) + + if (blogResponse.errors) { + throw blogResponse.errors + } + + // Create home blog pages (with pagination) + const blogHomeTemplate = path.resolve('./src/templates/blog-home.tsx') + + for (const page of pagesGenerator({ + basePath: '/blog', + hasHeroItem: true, + itemCount: blogResponse.data.home.pageInfo.itemCount + })) { + actions.createPage({ + component: blogHomeTemplate, + path: page.path, + context: { + isBlog: true, + ...page.context + } + }) + } + + // Create blog posts pages + const blogPostTemplate = path.resolve('./src/templates/blog-post.tsx') + const posts = blogResponse.data.allMarkdownRemark.edges + + posts.forEach((post, index) => { + const previous = index === posts.length - 1 ? null : posts[index + 1].node + const next = index === 0 ? 
null : posts[index - 1].node + + actions.createPage({ + component: blogPostTemplate, + context: { + isBlog: true, + currentPage: index + 1, + next, + previous, + slug: post.node.fields.slug + }, + path: post.node.fields.slug + }) + }) + + // Create tags pages (with pagination) + const blogTagsTemplate = path.resolve('./src/templates/blog-tags.tsx') + + blogResponse.data.tags.group.forEach( + ({ fieldValue: tag, pageInfo: { itemCount } }) => { + const basePath = `/tags/${tagToSlug(tag)}` + + for (const page of pagesGenerator({ basePath, itemCount })) { + actions.createPage({ + component: blogTagsTemplate, + path: page.path, + context: { tag, ...page.context } + }) + } + } + ) } -const notFoundRegexp = /^\/404/ +const is404Regexp = /^\/404/ const trailingSlashRegexp = /\/$/ exports.onCreatePage = ({ page, actions }) => { - let newPage = page - - if (notFoundRegexp.test(newPage.path)) { - newPage = { ...newPage, context: { ...newPage.context, is404: true } } + // Set necessary flags for pageContext + const newPage = { + ...page, + context: { + ...page.context, + is404: is404Regexp.test(page.path) + } } + // Remove trailing slash if (page.path !== '/' && trailingSlashRegexp.test(newPage.path)) { - newPage = { - ...newPage, - path: newPage.path.replace(trailingSlashRegexp, '') - } + newPage.path = newPage.path.replace(trailingSlashRegexp, '') } if (newPage !== page) { @@ -127,3 +262,100 @@ exports.onCreatePage = ({ page, actions }) => { actions.createPage(newPage) } } + +// Create json to use on https://dvc.org/community +exports.onPreBuild = async function({ graphql }) { + const result = await graphql(` + { + allMarkdownRemark( + sort: { fields: [frontmatter___date], order: DESC } + filter: { fileAbsolutePath: { regex: "/content/blog/" } } + limit: 3 + ) { + edges { + node { + fields { + slug + } + frontmatter { + title + date + commentsUrl + picture { + childImageSharp { + resize( + width: 160 + height: 160 + fit: COVER + cropFocus: CENTER + ) { + src + } + } + } + } 
+ } + } + } + } + `) + + if (result.errors) { + throw new Error(result.errors) + } + + const posts = result.data.allMarkdownRemark.edges.map( + ({ + node: { + fields: { slug }, + frontmatter: { title, date, commentsUrl, picture } + } + }) => { + const url = `${siteMetadata.siteUrl}/${slug}` + let pictureUrl = null + + if (picture) { + const { + childImageSharp: { + resize: { src } + } + } = picture + + pictureUrl = `${siteMetadata.siteUrl}${src}` + } + + return { + commentsUrl, + date, + pictureUrl, + title, + url + } + } + ) + + const dir = path.join(__dirname, '/public/api') + const filepath = path.join(dir, 'posts.json') + + // Write json file to the public dir, + // it will be used community page later + if (!fs.existsSync(dir)) { + fs.mkdirSync(dir) + } + fs.writeFileSync(filepath, JSON.stringify({ posts })) +} + +// Ignore warnings about CSS inclusion order, because we use CSS modules. +// https://spectrum.chat/gatsby-js/general/having-issue-related-to-chunk-commons-mini-css-extract-plugin~0ee9c456-a37e-472a-a1a0-cc36f8ae6033?m=MTU3MjYyNDQ5OTAyNQ== +exports.onCreateWebpackConfig = ({ stage, actions, getConfig }) => { + if (stage === 'build-javascript') { + const config = getConfig() + const miniCssExtractPlugin = config.plugins.find( + plugin => plugin.constructor.name === 'MiniCssExtractPlugin' + ) + if (miniCssExtractPlugin) { + miniCssExtractPlugin.options.ignoreOrder = true + } + actions.replaceWebpackConfig(config) + } +} diff --git a/gatsby-ssr.js b/gatsby-ssr.js index 8b0fea8d968..78f5804cc5a 100644 --- a/gatsby-ssr.js +++ b/gatsby-ssr.js @@ -1,5 +1,7 @@ /* eslint-env node */ +const React = require('react') + const PageWrapper = require('./src/components/PageWrapper').default exports.wrapPageElement = PageWrapper diff --git a/package.json b/package.json index ce836d5b77b..983c9c76a5a 100644 --- a/package.json +++ b/package.json @@ -29,9 +29,12 @@ "homepage": "https://github.com/iterative/dvc.org#readme", "dependencies": { "@octokit/graphql": "^4.3.1", 
+ "@reach/portal": "^0.9.0", "@reach/router": "^1.3.1", + "@reach/tooltip": "^0.9.1", "@sentry/browser": "^5.12.1", "@types/styled-components": "^5.0.1", + "classnames": "^2.2.6", "color": "^3.1.2", "compression": "^1.7.4", "date-fns": "^2.8.1", @@ -39,9 +42,11 @@ "dom-scroll-into-view": "^2.0.1", "express": "^4.17.1", "gatsby": "^2.20.2", + "gatsby-image": "^2.3.0", "gatsby-link": "^2.3.0", "gatsby-plugin-typescript": "^2.2.5", "gatsby-plugin-webpack-bundle-analyzer": "^1.0.5", + "gatsby-plugin-feed": "^2.4.0", "github-markdown-css": "^3.0.1", "isomorphic-fetch": "^2.2.1", "lodash.fill": "^3.4.0", @@ -66,7 +71,7 @@ "react-popover": "^0.5.10", "react-scroll": "^1.7.13", "react-slick": "^0.25.2", - "react-use": "^13.24.0", + "react-use": "^13.27.0", "rehype-react": "^4.0.1", "request": "^2.88.0", "serve-handler": "^6.1.2", @@ -77,9 +82,13 @@ }, "devDependencies": { "@babel/core": "^7.7.7", + "@svgr/webpack": "^5.2.0", + "@types/classnames": "^2.2.10", + "@types/react-helmet": "^5.0.15", "@types/vfile-message": "^1.0.1", "@typescript-eslint/eslint-plugin": "^2.24.0", "@typescript-eslint/parser": "^2.24.0", + "autoprefixer": "^9.7.4", "babel-eslint": "^10.0.3", "babel-jest": "^24.9.0", "babel-plugin-styled-components": "^1.10.7", @@ -94,22 +103,45 @@ "gatsby-plugin-catch-links": "^2.1.26", "gatsby-plugin-google-analytics": "^2.1.36", "gatsby-plugin-manifest": "^2.2.42", + "gatsby-plugin-postcss": "^2.2.0", "gatsby-plugin-react-helmet": "^3.1.22", "gatsby-plugin-sentry": "^1.0.1", + "gatsby-plugin-sharp": "^2.5.2", "gatsby-plugin-sitemap": "^2.2.27", "gatsby-plugin-styled-components": "^3.1.19", + "gatsby-plugin-svgr": "^2.0.2", + "gatsby-plugin-twitter": "^2.2.0", + "gatsby-plugin-typescript": "^2.2.5", + "gatsby-plugin-webpack-bundle-analyzer": "^1.0.5", "gatsby-remark-autolink-headers": "^2.1.24", "gatsby-remark-copy-linked-files": "^2.1.37", + "gatsby-remark-embed-gist": "^1.1.9", + "gatsby-remark-embedder": "^1.14.0", "gatsby-remark-external-links": "^0.0.4", + 
"gatsby-remark-images": "^3.2.0", "gatsby-remark-prismjs": "^3.3.31", + "gatsby-remark-relative-images": "^0.3.0", + "gatsby-remark-responsive-iframe": "^2.3.0", "gatsby-remark-smartypants": "^2.1.21", "gatsby-source-filesystem": "^2.1.48", "gatsby-transformer-remark": "^2.6.59", + "gatsby-transformer-sharp": "^2.4.0", + "hast-util-select": "^4.0.0", "husky": "^4.0.10", "jest": "^24.9.0", "lint-staged": "^10.0.0", + "postcss-custom-media": "^7.0.8", + "postcss-custom-properties": "^9.1.1", + "postcss-mixins": "^6.2.3", + "postcss-nested": "^4.2.1", "prettier": "^1.19.1", "pretty-quick": "^2.0.1", + "rehype-parse": "^6.0.2", + "rehype-stringify": "^6.0.1", + "remark": "^11.0.2", + "remark-html": "^10.0.0", + "stylelint": "^13.2.1", + "stylelint-config-standard": "^20.0.0", "typescript": "^3.8.3" }, "resolutions": { diff --git a/plugins/external-link-plugin/index.js b/plugins/external-link-plugin/index.js new file mode 100644 index 00000000000..730810eaba0 --- /dev/null +++ b/plugins/external-link-plugin/index.js @@ -0,0 +1,64 @@ +const visit = require('unist-util-visit') +const { selectAll } = require('hast-util-select') +const escape = require('escape-html') + +const { convertHastToHtml, convertHtmlToHast } = require('../utils/convertHast') + +const requiredExternalLinkAttrs = ['href', 'title', 'description', 'link'] + +function isCorrectExternalLinkAttr(attrsKeyTagArray) { + return requiredExternalLinkAttrs.every(attr => + attrsKeyTagArray.includes(attr) + ) +} + +function renderTag(attrs) { + return ` +
+ +
+

${escape(attrs.title)}

+
${escape(attrs.description)}
+ +
+ ${ + attrs.image + ? `
+ ${escape(attrs.title)} +
` + : `` + } +
+
+ ` +} + +module.exports = ({ markdownAST }) => { + visit(markdownAST, 'html', node => { + const hast = convertHtmlToHast(node.value) + const externalLinkNodeList = selectAll('external-link', hast) + + if (!externalLinkNodeList.length) { + return + } + + externalLinkNodeList.forEach(externalLinkNode => { + const { properties } = externalLinkNode + if (isCorrectExternalLinkAttr(Object.keys(properties))) { + const externalLinkHtml = renderTag(properties) + const externalLinkHast = convertHtmlToHast(externalLinkHtml) + + externalLinkNode.type = externalLinkHast.type + externalLinkNode.tagName = externalLinkHast.tagName + externalLinkNode.properties = externalLinkHast.properties + externalLinkNode.children = externalLinkHast.children + } else { + throw new Error( + `No correct tag or not all nested tags in ${node.value}` + ) + } + }) + + node.value = convertHastToHtml(hast) + }) +} diff --git a/plugins/external-link-plugin/package.json b/plugins/external-link-plugin/package.json new file mode 100644 index 00000000000..39300d0e85e --- /dev/null +++ b/plugins/external-link-plugin/package.json @@ -0,0 +1,7 @@ +{ + "name": "external-link-plugin", + "version": "1.0.0", + "main": "index.js", + "author": "Ilja Bykovskij", + "license": "Apache-2.0" +} diff --git a/plugins/gatsby-remark-dvc-linker/index.js b/plugins/gatsby-remark-dvc-linker/index.js index 9a6f13a937e..8f2a58f3628 100644 --- a/plugins/gatsby-remark-dvc-linker/index.js +++ b/plugins/gatsby-remark-dvc-linker/index.js @@ -10,7 +10,7 @@ const COMMAND_ROOT = '/doc/command-reference/' module.exports = ({ markdownAST }) => { visit(markdownAST, 'inlineCode', function(node, index, parent) { if (parent.type !== 'link' && DVC_REGEXP.test(node.value)) { - let parts = node.value.split(/\s+/) + const parts = node.value.split(/\s+/) let url const hasThirdSegment = parts[2] && COMMAND_REGEXP.test(parts[2]) diff --git a/plugins/resize-image-plugin/constants.js b/plugins/resize-image-plugin/constants.js new file mode 100644 index 
00000000000..3eed531de3a --- /dev/null +++ b/plugins/resize-image-plugin/constants.js @@ -0,0 +1,5 @@ +module.exports = { + imageWrapClassPrefix: 'image-wrap-', + imageWrapStopClass: 'image-wrap-stop', + stopWrapTag: 'stop-wrap' +} diff --git a/plugins/resize-image-plugin/index.js b/plugins/resize-image-plugin/index.js new file mode 100644 index 00000000000..747b005e493 --- /dev/null +++ b/plugins/resize-image-plugin/index.js @@ -0,0 +1,125 @@ +/* + Support for resize image inline on markdown + Syntax "=WIDTH", ":wrap-left" and ":wrap-right" + + Examples + ![](/relative-path-image "=500") + ![](/relative-path-image "=500 Some Title") + ![](/relative-path-image "Some Title =500") + ![](/relative-path-image "Some Title :wrap-left =500") + ![](/relative-path-image ":wrap-left =500 Some Title") +*/ + +const visit = require('unist-util-visit') +const { selectAll, select } = require('hast-util-select') +const { + imageClass, + imageWrapperClass +} = require('gatsby-remark-images/constants') + +const { BLOG } = require('../../src/consts') + +const { + imageWrapClassPrefix, + imageWrapStopClass, + stopWrapTag +} = require('./constants') + +const { convertHtmlToHast, convertHastToHtml } = require('../utils/convertHast') + +const extractInstructions = titleString => { + const regexResize = /=\d{2,4}/g + const regexWrap = /:wrap-(left|right)/ + + const title = titleString + .replace(regexResize, '') + .replace(regexWrap, '') + .trim() + const resize = titleString.match(regexResize) + const wrap = titleString.match(regexWrap) + + return { + resize: resize ? Number(resize[0].replace('=', '')) : null, + title, + wrap: wrap ? 
wrap[1] : null + } +} + +module.exports = ({ markdownAST }) => { + visit(markdownAST, 'html', node => { + const regexMaxWidth = /max-width: \d{1,5}px/g + const hast = convertHtmlToHast(node.value) + const wrapperImageList = selectAll(`.${imageWrapperClass}`, hast) + + if (!wrapperImageList.length) { + return + } + + /* + Image related HTML produced by Gatsby looks like: + + + + + + + + ... + ... + */ + wrapperImageList.forEach(wrapperImage => { + const source = select(`picture > source:first-child`, wrapperImage) + const image = select(`.${imageClass}`, wrapperImage) + const { resize, title, wrap } = extractInstructions( + image.properties.title + ) + + if (resize || wrap) { + // by default Gatsby populates title value with alt, + // restoring it here if needed + image.properties.title = title ? title : image.properties.alt + } + + const originalSize = source.properties.srcSet[ + source.properties.srcSet.length - 1 + ] + .split(' ')[1] + .replace('w', '') + + const maxWidth = wrapperImage.properties.style + .match(regexMaxWidth)[0] + .replace(/\D/g, '') + + if (wrap) { + const { className, style } = wrapperImage.properties + wrapperImage.properties.className = `${className || + ''} ${imageWrapClassPrefix}${wrap}` + + // Prevent us from using an !important in the CSS + wrapperImage.properties.style = style.replace( + /margin-(left|right):\s+auto/g, + '' + ) + } + + if (resize || BLOG.imageMaxWidth * 2 > originalSize) { + wrapperImage.properties.style = wrapperImage.properties.style.replace( + regexMaxWidth, + `max-width: ${ + resize ? 
Math.min(resize, maxWidth) : originalSize / 2 + }px` + ) + } + }) + + const stopWrapTagList = selectAll(stopWrapTag, hast) + stopWrapTagList.forEach(stopWrap => { + stopWrap.tagName = 'div' + stopWrap.properties.className = imageWrapStopClass + }) + + node.value = convertHastToHtml(hast) + }) +} + +module.exports.extractInstructions = extractInstructions diff --git a/plugins/resize-image-plugin/index.test.js b/plugins/resize-image-plugin/index.test.js new file mode 100644 index 00000000000..57a1c886c7b --- /dev/null +++ b/plugins/resize-image-plugin/index.test.js @@ -0,0 +1,40 @@ +const { extractInstructions } = require('.') + +describe('extractInstructions', () => { + it('extracts the title if no instructions are found', () => { + expect(extractInstructions('I am a title')).toEqual({ + resize: null, + title: 'I am a title', + wrap: null + }) + }) + + it('extracts a resize instruction when it finds =NNN', () => { + expect(extractInstructions('=42 title')).toEqual({ + resize: 42, + title: 'title', + wrap: null + }) + }) + + it('extracts a wrap instruction when it finds :wrap-left or :wrap-right', () => { + expect(extractInstructions(':wrap-left title')).toEqual({ + resize: null, + title: 'title', + wrap: 'left' + }) + expect(extractInstructions(':wrap-right title')).toEqual({ + resize: null, + title: 'title', + wrap: 'right' + }) + }) + + it('extracts both wrap instructions and resize instructions', () => { + expect(extractInstructions('=200 :wrap-right title')).toEqual({ + resize: 200, + title: 'title', + wrap: 'right' + }) + }) +}) diff --git a/plugins/resize-image-plugin/package.json b/plugins/resize-image-plugin/package.json new file mode 100644 index 00000000000..69ca34a37ff --- /dev/null +++ b/plugins/resize-image-plugin/package.json @@ -0,0 +1,7 @@ +{ + "name": "resize-image-plugin", + "version": "1.0.0", + "main": "index.js", + "author": "Franco", + "license": "Apache-2.0" +} diff --git a/plugins/utils/convertHast.js b/plugins/utils/convertHast.js new file 
mode 100644 index 00000000000..395d41f707e --- /dev/null +++ b/plugins/utils/convertHast.js @@ -0,0 +1,21 @@ +const unified = require('unified') +const parse = require('rehype-parse') +const stringify = require('rehype-stringify') + +/** HAST - Hypertext Abstract Syntax Tree */ +function convertHtmlToHast(htmlString) { + return unified() + .use(parse, { fragment: true }) + .parse(htmlString) +} + +function convertHastToHtml(htmlAst) { + return unified() + .use(stringify) + .stringify(htmlAst) +} + +module.exports = { + convertHastToHtml, + convertHtmlToHast +} diff --git a/postcss.config.js b/postcss.config.js new file mode 100644 index 00000000000..d7bd01f18d2 --- /dev/null +++ b/postcss.config.js @@ -0,0 +1,22 @@ +const nested = require('postcss-nested') +const autoprefixer = require('autoprefixer') +const customMedia = require('postcss-custom-media') +const customProperties = require('postcss-custom-properties') +const mixins = require('postcss-mixins') + +const mediaConfig = require('./config/postcss/media') +const mixinsConfig = require('./config/postcss/mixins') + +module.exports = function postcssConfig() { + return { + plugins: [ + mixins(mixinsConfig), + customMedia({ importFrom: mediaConfig }), + customProperties({ + importFrom: ['src/components/Layout/base.css'] + }), + nested, + autoprefixer + ] + } +} diff --git a/src/api/index.ts b/src/api/index.ts new file mode 100644 index 00000000000..7de29b89dd0 --- /dev/null +++ b/src/api/index.ts @@ -0,0 +1,2 @@ +export const getCommentsCount = (url: string) => + fetch(`/api/comments?url=${url}`) diff --git a/src/components/404/index.js b/src/components/404/index.js deleted file mode 100644 index c2869a7ce53..00000000000 --- a/src/components/404/index.js +++ /dev/null @@ -1,20 +0,0 @@ -import React from 'react' -import Subscribe from '../Subscribe' - -import { Wrapper, Title, Content } from './styles' - -function Page404() { - return ( - <> - - Not Found - - You just hit a route that doesn't exist... 
the sadness. - - - - - ) -} - -export default Page404 diff --git a/src/components/404/styles.js b/src/components/404/styles.js deleted file mode 100644 index 97dc2bcc750..00000000000 --- a/src/components/404/styles.js +++ /dev/null @@ -1,38 +0,0 @@ -import styled from 'styled-components' -import { media } from '../../styles' - -export const Wrapper = styled.div` - display: flex; - flex-direction: column; - justify-content: center; - align-items: center; - margin: 100px auto 150px; -` - -export const Title = styled.h1` - ${media.desktop` - font-weight: 500; - font-size: 30px; - line-height: 40px; - `} - - font-family: BrandonGrotesqueMed; - font-size: 40px; - line-height: 60px; - margin-top: 0.67em; - margin-bottom: 0.67em; -` - -export const Content = styled.div` - ${media.desktop` - padding: 0 15px; - text-align: center; - font-size: 20px; - line-height: 30px; - `} - - padding: 0; - text-align: left; - font-size: 24px; - line-height: 34px; -` diff --git a/src/components/BlogFeed/Item/index.tsx b/src/components/BlogFeed/Item/index.tsx new file mode 100644 index 00000000000..acffe4e7cba --- /dev/null +++ b/src/components/BlogFeed/Item/index.tsx @@ -0,0 +1,161 @@ +import React, { useEffect, useRef } from 'react' +import { useRafState, useWindowSize } from 'react-use' +import { graphql } from 'gatsby' +import Link from '../../Link' +import Image, { FixedObject, FluidObject } from 'gatsby-image' +import cn from 'classnames' + +import BlogFeedMeta from '../../BlogFeedMeta' + +import styles from './styles.module.css' + +import { ReactComponent as Placeholder } from './placeholder.svg' + +export interface IBlogFeedPostData { + id: string + timeToRead: string + fields: { + slug: string + } + frontmatter: { + title: string + date: string + description: string + descriptionLong: string + picture?: { + childImageSharp: { + big: FluidObject + small: FluidObject + } + } + author: { + childMarkdownRemark: { + frontmatter: { + name: string + avatar: { + childImageSharp: { + 
fixed: FixedObject + } + } + } + } + } + } +} + +interface IBlogFeedItemProps { + big?: boolean + feedPost: IBlogFeedPostData +} + +const BlogFeedItem: React.SFC = ({ + big, + feedPost: { fields, frontmatter, timeToRead } +}) => { + const { title, description, date, picture, author } = frontmatter + const { avatar, name } = author.childMarkdownRemark.frontmatter + const bodyRef = useRef(null) + const { width } = useWindowSize() + const [isOverflown, setIsOverflown] = useRafState(true) + + useEffect(() => { + if (bodyRef.current) { + const { scrollHeight, clientHeight } = bodyRef.current + + setIsOverflown(scrollHeight <= clientHeight) + } + }, [width]) + + const image = picture + ? big + ? picture.childImageSharp.big + : picture.childImageSharp.small + : undefined + + return ( +
+ + {picture ? ( + + ) : ( + + )} + +
+ + {title} + +
{description}
+
+
+ +
+
+ ) +} + +export const query = graphql` + fragment FeedPost on MarkdownRemark { + id + timeToRead + fields { + slug + } + frontmatter { + date(formatString: "MMM DD, YYYY") + title + description + descriptionLong + picture { + childImageSharp { + big: fluid( + maxWidth: 650 + maxHeight: 450 + cropFocus: CENTER + quality: 90 + ) { + ...GatsbyImageSharpFluid_withWebp + } + small: fluid( + maxWidth: 300 + maxHeight: 250 + cropFocus: CENTER + quality: 90 + ) { + ...GatsbyImageSharpFluid_withWebp + } + } + } + author { + childMarkdownRemark { + frontmatter { + name + avatar { + childImageSharp { + fixed(width: 40, height: 40, quality: 50, cropFocus: CENTER) { + ...GatsbyImageSharpFixed_withWebp + } + } + } + } + } + } + } + } +` + +export default BlogFeedItem diff --git a/src/components/BlogFeed/Item/placeholder.svg b/src/components/BlogFeed/Item/placeholder.svg new file mode 100644 index 00000000000..7b6992815b1 --- /dev/null +++ b/src/components/BlogFeed/Item/placeholder.svg @@ -0,0 +1,8 @@ + + + + + + + + diff --git a/src/components/BlogFeed/Item/styles.module.css b/src/components/BlogFeed/Item/styles.module.css new file mode 100644 index 00000000000..5419843e35a --- /dev/null +++ b/src/components/BlogFeed/Item/styles.module.css @@ -0,0 +1,187 @@ +.wrapper { + position: relative; + width: 300px; + height: 500px; + margin: 0 50px 50px 0; + background-color: var(--color-light-blue); + + @media (--md-scr) { + &.big { + width: 650px; + height: auto; + } + } + + @media (--sm-scr) { + width: auto; + height: auto; + margin: 0; + + &.big { + width: auto; + } + + & + & { + margin-top: 10px; + } + } + + @media (--lg-scr) { + &.big { + width: 1005px; + height: 450px; + } + } +} + +.pictureLink { + @mixin active; + + display: block; + + &:hover, + &:focus { + outline: none; + opacity: 0.7; + } +} + +.picture { + display: block; + width: 300px; + height: 250px; + background-color: #dee8ed; + + @media (--md-scr) { + .big & { + width: 650px; + height: 450px; + } + } + + @media 
(--sm-scr) { + width: auto; + height: auto; + + .big & { + width: auto; + height: auto; + } + + .placeholder & { + display: none; + } + } + + @media (--lg-scr) { + .big & { + float: left; + width: 650px; + height: 450px; + margin-right: 50px; + } + } +} + +.body { + position: relative; + overflow: hidden; + padding: 10px 20px; + max-height: 166px; + + &.overflown::after { + content: ''; + position: absolute; + right: 0; + bottom: 0; + left: 0; + height: 30px; + background: linear-gradient( + 0deg, + rgba(238, 244, 248, 1) 0%, + rgba(238, 244, 248, 0) 100% + ); + } + + @media (--md-scr) { + .big & { + padding-top: 30px; + } + } + + @media (--sm-scr) { + .placeholder & { + padding-top: 30px; + } + } + + @media (--lg-scr) { + .big & { + max-height: 314px; + padding: 30px; + } + } +} + +.title { + @mixin h3-desktop; + @mixin active; + @mixin focus; + @mixin hover; + + display: block; + text-decoration: none; + color: var(--color-black); + + @media (--sm-scr) { + @mixin h3-mobile; + } + + @media (--lg-scr) { + .big & { + @mixin h2-desktop; + } + } + + &:hover { + opacity: 0.7; + } +} + +.description { + @mixin text-secondary; + + margin-top: 10px; + color: var(--color-gray); + + @media (--lg-scr) { + .big & { + margin-top: 40px; + } + } +} + +.meta { + position: absolute; + right: 20px; + bottom: 15px; + left: 20px; + + @media (--md-scr) { + .big & { + position: static; + padding: 0 30px 15px; + } + } + + @media (--sm-scr) { + position: static; + padding: 0 30px 15px; + } + + @media (--lg-scr) { + .big & { + left: 700px; + } + } +} diff --git a/src/components/BlogFeed/index.tsx b/src/components/BlogFeed/index.tsx new file mode 100644 index 00000000000..7d01d08e7a7 --- /dev/null +++ b/src/components/BlogFeed/index.tsx @@ -0,0 +1,67 @@ +import { graphql } from 'gatsby' + +import React from 'react' + +import cn from 'classnames' + +import Paginator, { IPageInfo } from '../Paginator' +import BlogFeedItem, { IBlogFeedPostData } from './Item' + +import styles from 
'./styles.module.css' + +export interface IBlogFeedPostList { + edges: Array<{ + node: IBlogFeedPostData + }> +} + +interface IBlogFeedProps { + feedPostList: IBlogFeedPostList + bigFirst?: boolean + header: React.ReactNode + leadParagraph?: React.ReactNode + pageInfo: IPageInfo +} + +const BlogFeed: React.SFC = ({ + feedPostList: { edges }, + pageInfo, + bigFirst = true, + header, + leadParagraph +}) => { + return ( +
+
+

{header}

+ {leadParagraph &&
{leadParagraph}
} +
+
+ {edges.map(({ node }, index) => ( + + ))} +
+ +
+ ) +} + +export const query = graphql` + fragment FeedPostList on MarkdownRemarkConnection { + edges { + node { + ...FeedPost + } + } + } +` + +export default BlogFeed diff --git a/src/components/BlogFeed/styles.module.css b/src/components/BlogFeed/styles.module.css new file mode 100644 index 00000000000..aef9be33022 --- /dev/null +++ b/src/components/BlogFeed/styles.module.css @@ -0,0 +1,63 @@ +.wrapper { + overflow: hidden; + max-width: 1005px; + margin: 0 auto; + + @media (--md-scr) { + max-width: 650px; + } + + @media (--sm-scr) { + padding: 0 15px; + } +} + +.meta { + margin: 20px 0 40px; + + @media (--sm-scr) { + max-width: auto; + margin: 30px; + } +} + +.metaSlim { + max-width: 650px; +} + +.header { + @media (--md-scr) { + @mixin h1-mobile; + } + + @media (--lg-scr) { + @mixin h1-desktop; + } + + margin: 0; +} + +.lead { + @mixin text-secondary; + + margin-top: 10px; + color: var(--color-gray); +} + +.posts { + display: flex; + flex-wrap: wrap; + + @media (--md-scr) { + width: 700px; + } + + @media (--sm-scr) { + display: block; + width: auto; + } + + @media (--lg-scr) { + width: 1050px; + } +} diff --git a/src/components/BlogFeedMeta/index.tsx b/src/components/BlogFeedMeta/index.tsx new file mode 100644 index 00000000000..6faae1de2e5 --- /dev/null +++ b/src/components/BlogFeedMeta/index.tsx @@ -0,0 +1,54 @@ +import Image, { FixedObject } from 'gatsby-image' +import React from 'react' + +import { pluralizeComments } from '../../utils/i18n' + +import styles from './styles.module.css' + +interface IBlogFeedMetaProps { + avatar: { + childImageSharp: { + fixed: FixedObject + } + } + commentsUrl?: string + commentsCount?: number + date: string + name: string + timeToRead: string +} + +const BlogFeedMeta: React.SFC = ({ + avatar, + commentsUrl, + commentsCount, + date, + name, + timeToRead +}) => { + return ( +
+ ) +} + +export default BlogFeedMeta diff --git a/src/components/BlogFeedMeta/styles.module.css b/src/components/BlogFeedMeta/styles.module.css new file mode 100644 index 00000000000..e78afebeff3 --- /dev/null +++ b/src/components/BlogFeedMeta/styles.module.css @@ -0,0 +1,43 @@ +.wrapper { + position: relative; + display: flex; + flex-wrap: wrap; + align-items: center; + min-height: 44px; + padding-left: 40px; +} + +.avatar { + position: absolute !important; + top: 50%; + left: 0; + margin-top: -20px; + border-radius: 20px; +} + +.list { + overflow: hidden; + margin: 0 0 0 10px; + padding: 0; +} + +.item { + @mixin text-secondary; + + position: relative; + display: inline-block; + margin-right: 14px; + white-space: nowrap; + line-height: 20px; + color: var(--color-gray); + + &::before { + content: '• '; + position: absolute; + right: 100%; + } +} + +.link { + @mixin link; +} diff --git a/src/components/BlogHome/index.tsx b/src/components/BlogHome/index.tsx new file mode 100644 index 00000000000..3d71b7087a2 --- /dev/null +++ b/src/components/BlogHome/index.tsx @@ -0,0 +1,35 @@ +import React from 'react' + +import { IPageInfo } from '../Paginator' +import PageContent from '../PageContent' +import BlogFeed, { IBlogFeedPostList } from '../BlogFeed' +import Subscribe from '../Subscribe' + +interface IBlogHomeProps { + posts: IBlogFeedPostList + pageInfo: IPageInfo +} + +const BlogHome: React.SFC = ({ posts, pageInfo }) => { + return ( + <> + + + We write about machine learning workflow. From data versioning and + processing to model productionization. We share our news, + findings, interesting reads, community takeaways. 
+ + } + /> + + + + ) +} + +export default BlogHome diff --git a/src/components/BlogLayout/index.tsx b/src/components/BlogLayout/index.tsx new file mode 100644 index 00000000000..cee5eb4e68a --- /dev/null +++ b/src/components/BlogLayout/index.tsx @@ -0,0 +1,31 @@ +import React from 'react' +import { Helmet } from 'react-helmet' +import { LayoutComponent } from '../Layout' +import MainLayout from '../MainLayout' + +const keywords = + 'git, data, version control, machine learning models management, datasets' +const description = + 'Data Version Control Blog. We write about machine learning workflow. From data versioning and processing to model productionization. We share our news, findings, interesting reads, community takeaways.' + +const BlogLayout: LayoutComponent = ({ children, ...restProps }) => ( + + +