Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

♻️ Store source code in the database, not on S3 #64

Merged
merged 18 commits into from
Sep 3, 2024
23 changes: 18 additions & 5 deletions lamin_cli/_get.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from typing import Tuple
from lamin_utils import logger
import lamindb_setup as ln_setup
from pathlib import Path


def decompose_url(url: str) -> Tuple[str, str, str]:
Expand All @@ -25,19 +26,31 @@ def get(url: str):
ln_setup.settings.auto_connect = False

import lamindb as ln
from lamindb._finish import script_to_notebook

ln_setup.settings.auto_connect = auto_connect
ln.connect(instance_slug)
ln.settings.verbosity = "success"

if entity == "transform":
transform = ln.Transform.get(uid)
filepath_cache = transform._source_code_artifact.cache()
target_filename = transform.key
if not target_filename.endswith(transform._source_code_artifact.suffix):
target_filename += transform._source_code_artifact.suffix
filepath_cache.rename(target_filename)
logger.success(f"cached source code of transform {uid} as {target_filename}")
if transform._source_code_artifact_id is not None:
# backward compat
filepath_cache = transform._source_code_artifact.cache()
if not target_filename.endswith(transform._source_code_artifact.suffix):
target_filename += transform._source_code_artifact.suffix
filepath_cache.rename(target_filename)
elif transform.source_code is not None:
if transform.key.endswith(".ipynb"):
script_to_notebook(transform, target_filename)
else:
Path(target_filename).write_text(transform.source_code)
else:
raise ValueError("No source code available for this transform.")
logger.success(
f"downloaded source code of transform {uid} as {target_filename}"
)
elif entity == "artifact":
artifact = ln.Artifact.get(uid)
cache_path = artifact.cache()
Expand Down
11 changes: 4 additions & 7 deletions lamin_cli/_save.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,10 @@ def save_from_filepath_cli(
" in Transform registry. Did you run ln.context.track()?"
)
return "not-tracked-in-transform-registry"
# refactor this, save_context_core should not depend on transform_family
transform_family = transform.versions
else:
# the corresponding transform family in the transform table
transform_family = ln.Transform.filter(uid__startswith=stem_uid).all()
# the specific version
transform = transform_family.get(version=transform_version)
transform = ln.Transform.get(
uid__startswith=stem_uid, version=transform_version
)
# latest run of this transform by user
run = ln.Run.filter(transform=transform).order_by("-started_at").first()
if run.created_by.id != ln_setup.settings.user.id:
Expand All @@ -106,10 +103,10 @@ def save_from_filepath_cli(
)
if response != "y":
return "aborted-save-notebook-created-by-different-user"
print("saving", run, transform)
return save_context_core(
run=run,
transform=transform,
filepath=filepath,
transform_family=transform_family,
from_cli=True,
)
48 changes: 42 additions & 6 deletions tests/test_save_notebooks.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pytest
from nbproject.dev import read_notebook, write_notebook
from nbclient.exceptions import CellExecutionError
import json
import lamindb as ln

notebook_dir = "./sub/lamin-cli/tests/notebooks/"
Expand Down Expand Up @@ -62,6 +63,8 @@ def test_save_consecutive():
env = os.environ
env["LAMIN_TESTING"] = "true"

assert not Path("./with-title-and-initialized-consecutive.ipynb").exists()

transform = ln.Transform.filter(uid="hlsFXswrJjtt0000").one_or_none()
assert transform is None

Expand Down Expand Up @@ -92,16 +95,44 @@ def test_save_consecutive():
capture_output=True,
env=env,
)
print(result.stdout.decode())
print(result.stderr.decode())
assert result.returncode == 0

# now, we have the associated artifacts
transform = ln.Transform.filter(uid="hlsFXswrJjtt0000").one_or_none()
assert transform is not None
assert transform.latest_run.report.path.exists()
assert transform.latest_run.report.path == transform.latest_run.report.path
assert transform._source_code_artifact.hash == "EQrdZpS-fPaz5MKk_g02AA"
assert (
transform.source_code
== """# %% [markdown]
# # Transform.name

# %%
import lamindb as ln

# %%
ln.context.uid = "hlsFXswrJjtt0000"
ln.context.track()

# %%
print("my consecutive cell")
"""
)
assert transform.hash == "T1oAJS3rgPXkPoqzsJcWuQ"
# below is the test that we can use if we store the run report as `.ipynb`
# and not as HTML, as we do right now
assert transform.latest_run.report.suffix == ".html"
# with open(transform.latest_run.report.path, "r") as f:
# json_notebook = json.load(f)
# # test that title is stripped from notebook
# assert json_notebook["cells"][0] == {
# "cell_type": "markdown",
# "metadata": {},
# "source": [],
# }
# testing for the hash of the report makes no sense because it contains timestamps
assert transform.latest_run.environment.path.exists()
assert transform._source_code_artifact.path.exists()
assert transform._source_code_artifact is None

# now, assume the user modifies the notebook
nb = read_notebook(notebook_path)
Expand Down Expand Up @@ -130,9 +161,9 @@ def test_save_consecutive():
transform = ln.Transform.get("hlsFXswrJjtt0000")
assert transform.latest_run.report.path.exists()
assert transform.latest_run.report.path == transform.latest_run.report.path
assert transform._source_code_artifact.hash == "DMVEHVQqmY3ektOg2KtKKA"
assert transform.hash == "8F6i1zHA3Rv55JBHH05ZmQ"
assert transform.latest_run.environment.path.exists()
assert transform._source_code_artifact.path.exists()
assert transform._source_code_artifact is None

# get the source code via command line
result = subprocess.run(
Expand All @@ -142,6 +173,11 @@ def test_save_consecutive():
capture_output=True,
)
# print(result.stderr.decode())
assert Path("./with-title-and-initialized-consecutive.ipynb").exists()
with open("./with-title-and-initialized-consecutive.ipynb", "r") as f:
json_notebook = json.load(f)
print(json_notebook["cells"][0])
assert json_notebook["cells"][0]["source"] == ["# My test notebook (consecutive)"]
assert result.returncode == 0

# now, assume the user renames the notebook
Expand Down
12 changes: 6 additions & 6 deletions tests/test_save_scripts.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ def test_run_save_cache():
assert "created Run" in result.stdout.decode()

transform = ln.Transform.get("m5uCHTTpJnjQ")
assert transform._source_code_artifact.hash == "Cwk0OPOyUH5nzTiU2ISlDQ"
assert transform.hash == "Cwk0OPOyUH5nzTiU2ISlDQ"
assert transform.latest_run.environment.path.exists()
assert transform._source_code_artifact.path.exists()
assert transform._source_code_artifact is None

# you can rerun the same script
result = subprocess.run(
Expand Down Expand Up @@ -73,7 +73,7 @@ def test_run_save_cache():
content = filepath.read_text() + "\n # edited"
filepath.write_text(content)

# re-run the script without commiting
# re-run the script without committing
result = subprocess.run(
f"python {filepath}",
shell=True,
Expand Down Expand Up @@ -101,10 +101,10 @@ def test_run_save_cache():
capture_output=True,
env=env,
)
# print(result.stdout.decode())
# print(result.stderr.decode())
print(result.stdout.decode())
print(result.stderr.decode())
assert result.returncode == 1
assert "Source code changed, bump version by setting" in result.stderr.decode()
assert "Source code changed, bump revision by setting" in result.stderr.decode()

# try to get the source code via command line
result = subprocess.run(
Expand Down