Skip to content

Commit

Permalink
experiments: exp res bug fixes (#5258)
Browse files Browse the repository at this point in the history
* dvc repro: fail on checkpoint stages

* exp res: check for moved HEAD/baseline before resuming

* exp run/res: always consider last checkpoint as applied for workspace runs
  • Loading branch information
pmrowla authored Jan 13, 2021
1 parent 1ab1abf commit 228b3b1
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 16 deletions.
50 changes: 38 additions & 12 deletions dvc/repo/experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -417,22 +417,44 @@ def _resume_checkpoint(
)

last_applied = self.scm.get_ref(EXEC_APPLY)
try:
if last_applied:
self.check_baseline(last_applied)
self.check_baseline(resume_rev)
except BaselineMismatchError:
# If HEAD has moved since the last applied checkpoint,
# the applied checkpoint is no longer valid
self.scm.remove_ref(EXEC_APPLY)
last_applied = None
checkpoint_resume = None
if resume_rev != last_applied:
if last_applied is None:
msg = "Current workspace does not contain an experiment. "
if checkpoint_resume == self.LAST_CHECKPOINT:
display_rev: Optional[str] = resume_rev[:7]
else:
display_rev = checkpoint_resume

if display_rev:
if last_applied is None:
msg = (
f"Checkpoint '{display_rev}' cannot be resumed until "
"it is applied to your workspace."
)
else:
msg = (
f"Checkpoint '{display_rev}' does not match the "
"most recently applied experiment in your workspace "
f"('{last_applied[:7]}')."
)
msg = (
f"Checkpoint '{checkpoint_resume[:7]}' does not match the "
"most recently applied experiment in your workspace "
f"('{last_applied[:7]}')."
f"{msg}\n"
"To resume this experiment run:\n\n"
f"\tdvc exp apply {display_rev}\n\n"
"And then retry this 'dvc exp res' command."
)
else:
msg = "No existing checkpoint to resume in your workspace."

raise DvcException(
f"{msg}\n"
"To resume this experiment run:\n\n"
f"\tdvc exp apply {checkpoint_resume[:7]}\n\n"
"And then retry this 'dvc exp res' command."
)
raise DvcException(msg)

baseline_rev = self._get_baseline(branch)
logger.debug(
Expand All @@ -449,7 +471,7 @@ def _resume_checkpoint(
**kwargs,
)

def _get_last_checkpoint(self):
def _get_last_checkpoint(self) -> str:
rev = self.scm.get_ref(EXEC_CHECKPOINT)
if rev:
return rev
Expand Down Expand Up @@ -674,6 +696,7 @@ def _workspace_repro(self) -> Mapping[str, str]:
elif self.scm.get_ref(EXEC_BRANCH):
self.scm.remove_ref(EXEC_BRANCH)
try:
orig_checkpoint = self.scm.get_ref(EXEC_CHECKPOINT)
exec_result = BaseExecutor.reproduce(
None,
rev,
Expand Down Expand Up @@ -701,6 +724,9 @@ def _workspace_repro(self) -> Mapping[str, str]:
self.scm.remove_ref(EXEC_BASELINE)
if entry.branch:
self.scm.remove_ref(EXEC_BRANCH)
checkpoint = self.scm.get_ref(EXEC_CHECKPOINT)
if checkpoint and checkpoint != orig_checkpoint:
self.scm.set_ref(EXEC_APPLY, checkpoint)

def check_baseline(self, exp_rev):
baseline_sha = self.repo.scm.get_rev()
Expand Down
8 changes: 4 additions & 4 deletions dvc/repo/reproduce.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import typing
from functools import partial

from dvc.exceptions import ReproductionError
from dvc.exceptions import DvcException, ReproductionError
from dvc.repo.scm_context import scm_context

from . import locked
Expand All @@ -24,9 +24,9 @@ def _run_callback(repro_callback):
if checkpoint_func:
kwargs["checkpoint_func"] = partial(_run_callback, checkpoint_func)
else:
logger.warning(
"Checkpoint stages are not fully supported in 'dvc repro'. "
"Checkpoint stages should be reproduced with 'dvc exp run' "
raise DvcException(
"Checkpoint stages are not supported in 'dvc repro'. "
"Checkpoint stages must be reproduced with 'dvc exp run' "
"or 'dvc exp resume'."
)

Expand Down

0 comments on commit 228b3b1

Please sign in to comment.