From 26e203242fd4d8c64310ed5dc1c60891f36eff30 Mon Sep 17 00:00:00 2001 From: Casper da Costa-Luis Date: Wed, 6 May 2020 15:02:16 +0100 Subject: [PATCH 1/3] gdrive: download: stream & add progress (#3722) * wip * gdrive: add progress Part of #2865 See https://github.com/iterative/dvc/issues/2865#issuecomment-622984785 * gdrive: move towards next pydrive2 release - depends on https://github.com/iterative/PyDrive2/pull/30 * update to latest pydrive>=1.4.11 * avoid unneeded API call * progress: gdrive: ensure proper bar_format --- dvc/remote/gdrive.py | 30 ++++++++++-------------------- setup.py | 2 +- 2 files changed, 11 insertions(+), 21 deletions(-) diff --git a/dvc/remote/gdrive.py b/dvc/remote/gdrive.py index 4e004b53d4..6ce88eb057 100644 --- a/dvc/remote/gdrive.py +++ b/dvc/remote/gdrive.py @@ -44,17 +44,6 @@ def __init__(self, cred_location): super().__init__(message) -def _extract(exc, field): - from pydrive2.files import ApiRequestError - - assert isinstance(exc, ApiRequestError) - - # https://cloud.google.com/storage/docs/json_api/v1/status-codes#errorformat - return ( - exc.error["errors"][0].get(field, "") if "errors" in exc.error else "" - ) - - def _gdrive_retry(func): def should_retry(exc): from pydrive2.files import ApiRequestError @@ -68,7 +57,7 @@ def should_retry(exc): result = True if error_code == 403: - result = _extract(exc, "reason") in [ + result = exc.GetField("reason") in [ "userRateLimitExceeded", "rateLimitExceeded", ] @@ -396,14 +385,15 @@ def _gdrive_download_file( param = {"id": item_id} # it does not create a file on the remote gdrive_file = self._drive.CreateFile(param) - bar_format = ( - "Downloading {desc:{ncols_desc}.{ncols_desc}}... " - + Tqdm.format_sizeof(int(gdrive_file["fileSize"]), "B", 1024) - ) + with Tqdm( - bar_format=bar_format, desc=progress_desc, disable=no_progress_bar - ): - gdrive_file.GetContentFile(to_file) + desc=progress_desc, + disable=no_progress_bar, + bytes=True, + # explicit `bar_format` as `total` will be set by `update_to` + bar_format=Tqdm.BAR_FMT_DEFAULT, + ) as pbar: + gdrive_file.GetContentFile(to_file, callback=pbar.update_to) @_gdrive_retry def _gdrive_delete_file(self, item_id): @@ -420,7 +410,7 @@ def _gdrive_delete_file(self, item_id): if ( http_error_code == 403 and self._list_params["corpora"] == "drive" - and _extract(exc, "location") == "file.permissions" + and exc.GetField("location") == "file.permissions" ): raise DvcException( "Insufficient permissions to {}. You should have {} " diff --git a/setup.py b/setup.py index e27b30843e..3ede96a2aa 100644 --- a/setup.py +++ b/setup.py @@ -87,7 +87,7 @@ def run(self): # Extra dependencies for remote integrations gs = ["google-cloud-storage==1.19.0"] -gdrive = ["pydrive2>=1.4.10"] +gdrive = ["pydrive2>=1.4.11"] s3 = ["boto3>=1.9.201"] azure = ["azure-storage-blob==2.1.0"] oss = ["oss2==2.6.1"] From aa99cc99344c3cb05990fb7a9d0a72bf8c707b36 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pawe=C5=82=20Redzy=C5=84ski?= Date: Wed, 6 May 2020 17:33:10 +0200 Subject: [PATCH 2/3] plot: check field exists (#3742) --- dvc/repo/plot/template.py | 20 ++++++++++++++++++++ tests/func/test_plot.py | 13 +++++++++++++ 2 files changed, 33 insertions(+) diff --git a/dvc/repo/plot/template.py b/dvc/repo/plot/template.py index 2850a6220f..f36d90a9a0 100644 --- a/dvc/repo/plot/template.py +++ b/dvc/repo/plot/template.py @@ -24,6 +24,13 @@ def __init__(self, template_path): ) +class NoFieldInDataError(DvcException): + def __init__(self, field_name): + super().__init__( + "Field '{}' does not exist in provided data.".format(field_name) + ) + + class Template: INDENT = 4 SEPARATORS = (",", ": ") @@ -91,6 +98,11 @@ def fill( with open(template_path, "r") as fobj: result_content = fobj.read() + if x_field: + Template._check_field_exists(data, x_field) + if y_field: + Template._check_field_exists(data, y_field) + result_content = Template._replace_data_anchors( result_content, data, priority_datafile ) @@ -101,6 +113,14 @@ def fill( return result_content + @staticmethod + def _check_field_exists(data, field): + for file, data_points in data.items(): + if not any( + field in data_point.keys() for data_point in data_points + ): + raise NoFieldInDataError(field) + @staticmethod def _replace_metadata_anchors( result_content, title, x_field, x_title, y_field, y_title diff --git a/tests/func/test_plot.py b/tests/func/test_plot.py index 683192a331..a2cba47efb 100644 --- a/tests/func/test_plot.py +++ b/tests/func/test_plot.py @@ -18,6 +18,7 @@ from dvc.repo.plot.template import ( TemplateNotFoundError, NoDataForTemplateError, + NoFieldInDataError, ) from dvc.repo.plot import NoDataOrTemplateProvided @@ -502,3 +503,15 @@ def test_plot_yaml(tmp_dir, scm, dvc): {"val": 2, PlotData.INDEX_FIELD: 0, "rev": "workspace"}, {"val": 3, PlotData.INDEX_FIELD: 1, "rev": "workspace"}, ] + + +def test_raise_on_wrong_field(tmp_dir, scm, dvc): + metric = [{"val": 2}, {"val": 3}] + _write_json(tmp_dir, metric, "metric.json") + _run_with_metric(tmp_dir, "metric.json", "first run") + + with pytest.raises(NoFieldInDataError): + dvc.plot("metric.json", x_field="no_val") + + with pytest.raises(NoFieldInDataError): + dvc.plot("metric.json", y_field="no_val") From e3831066ded4f84f52bfe6e70a509e08d40c5695 Mon Sep 17 00:00:00 2001 From: Saugat Pachhai Date: Wed, 6 May 2020 15:33:42 +0000 Subject: [PATCH 3/3] repro: use dvc.yaml by default (#3747) Note: This does not change default for `pipeline` as it cannot show multiple DAGs at the same time. If we are able to show multiple DAGs, we can just default to `dvc.yaml`. --- dvc/command/base.py | 8 ++++---- dvc/command/pipeline.py | 4 +++- tests/func/test_repro.py | 10 +++++----- tests/unit/command/test_repro.py | 5 +++-- 4 files changed, 15 insertions(+), 12 deletions(-) diff --git a/dvc/command/base.py b/dvc/command/base.py index 21838a9227..6831335596 100644 --- a/dvc/command/base.py +++ b/dvc/command/base.py @@ -41,12 +41,12 @@ def __init__(self, args): @property def default_targets(self): - """Default targets for `dvc repro` and `dvc pipeline`.""" - from dvc.dvcfile import DVC_FILE + """Default targets for `dvc repro`.""" + from dvc.dvcfile import PIPELINE_FILE - msg = "assuming default target '{}'.".format(DVC_FILE) + msg = "assuming default target '{}'.".format(PIPELINE_FILE) logger.warning(msg) - return [DVC_FILE] + return [PIPELINE_FILE] # Abstract methods that have to be implemented by any inheritance class def run(self): diff --git a/dvc/command/pipeline.py b/dvc/command/pipeline.py index d788031660..77f525c311 100644 --- a/dvc/command/pipeline.py +++ b/dvc/command/pipeline.py @@ -119,8 +119,10 @@ def _write_dot(self, target, commands, outs): logger.info(dot_file.getvalue()) def run(self): + from dvc.dvcfile import DVC_FILE + if not self.args.targets: - self.args.targets = self.default_targets + self.args.targets = [DVC_FILE] for target in self.args.targets: try: diff --git a/tests/func/test_repro.py b/tests/func/test_repro.py index c0ba14c412..d2e5eb2552 100644 --- a/tests/func/test_repro.py +++ b/tests/func/test_repro.py @@ -28,7 +28,7 @@ from dvc.remote.local import LocalRemote from dvc.repo import Repo as DvcRepo from dvc.stage import Stage -from dvc.dvcfile import Dvcfile +from dvc.dvcfile import Dvcfile, DVC_FILE from dvc.stage.exceptions import StageFileDoesNotExistError from dvc.system import System from dvc.utils import file_md5 @@ -407,11 +407,11 @@ def test(self): self.assertEqual(ret, 0) ret = main( - ["run", "--no-exec", "--single-stage", "-f", "Dvcfile"] + deps + ["run", "--no-exec", "--single-stage", "-f", DVC_FILE] + deps ) self.assertEqual(ret, 0) - ret = main(["repro", "--dry"]) + ret = main(["repro", "--dry", DVC_FILE]) self.assertEqual(ret, 0) @@ -820,7 +820,7 @@ def test(self): os.unlink(bar) - ret = main(["repro", "-c", dname]) + ret = main(["repro", "-c", dname, DVC_FILE]) self.assertEqual(ret, 0) self.assertTrue(os.path.isfile(foo)) self.assertTrue(os.path.isfile(bar)) @@ -1458,7 +1458,7 @@ def repro_dir(tmp_dir, dvc, run_copy): assert dir_file_copy.read_text() == "dir file content" stages["dir_file_copy"] = stage - last_stage = tmp_dir / "dir" / "Dvcfile" + last_stage = tmp_dir / "dir" / DVC_FILE stage = dvc.run( fname=fspath(last_stage), deps=[fspath(origin_copy_2), fspath(dir_file_copy)], diff --git a/tests/unit/command/test_repro.py b/tests/unit/command/test_repro.py index dcdfdbe743..1e20a04cec 100644 --- a/tests/unit/command/test_repro.py +++ b/tests/unit/command/test_repro.py @@ -1,5 +1,6 @@ from dvc.cli import parse_args from dvc.command.repro import CmdRepro +from dvc.dvcfile import PIPELINE_FILE default_arguments = { "all_pipelines": False, @@ -19,7 +20,7 @@ def test_default_arguments(dvc, mocker): cmd = CmdRepro(parse_args(["repro"])) mocker.patch.object(cmd.repo, "reproduce") cmd.run() - cmd.repo.reproduce.assert_called_with("Dvcfile", **default_arguments) + cmd.repo.reproduce.assert_called_with(PIPELINE_FILE, **default_arguments) def test_downstream(dvc, mocker): @@ -28,4 +29,4 @@ def test_downstream(dvc, mocker): cmd.run() arguments = default_arguments.copy() arguments.update({"downstream": True}) - cmd.repo.reproduce.assert_called_with("Dvcfile", **arguments) + cmd.repo.reproduce.assert_called_with(PIPELINE_FILE, **arguments)