Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support hash-only build paths #777

Merged
40 changes: 39 additions & 1 deletion conda-store-server/conda_store_server/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import hashlib
import typing

import platformdirs
Expand Down Expand Up @@ -32,6 +33,8 @@ class BuildKey:

_version2_hash_size = 8

_version3_experimental_hash_size = 32

def _version1_fmt(build: "Build") -> str: # noqa: F821
datetime_format = "%Y%m%d-%H%M%S-%f"
hash = build.specification.sha256
Expand All @@ -48,10 +51,34 @@ def _version2_fmt(build: "Build") -> str: # noqa: F821
name = build.specification.name
return f"{hash}-{timestamp}-{id}-{name}"

# Warning: this is an experimental version and can be changed at any time
def _version3_experimental_fmt(build: "Build") -> str:  # noqa: F821
    """Return the version 3 (hash-only) build key for ``build``.

    The key is a SHA-256 hex digest, truncated to
    ``BuildKey._version3_experimental_hash_size`` characters, computed over
    the concatenation (in this order, without separators) of:

    - the namespace name,
    - the specification SHA-256,
    - the ``scheduled_on`` timestamp in whole seconds,
    - the build id.

    Side effect: the computed key is stored in ``build.hash``; when
    ``build.hash`` is already set, it is returned as is and nothing is
    recomputed.
    """
    # Caches the hash value for faster lookup later
    if build.hash is not None:
        return build.hash

    # Adds namespace here to separate builds performed by different users
    # since all builds are stored in the same directory in v3, see
    # Build.build_path in orm.py. Additionally, this also hashes the
    # timestamp and build id just to make collisions very unlikely
    namespace_name = build.environment.namespace.name
    specification_hash = build.specification.sha256
    # The tzinfo is attached (not converted), so scheduled_on is read as UTC
    scheduled_on_utc = build.scheduled_on.replace(tzinfo=datetime.timezone.utc)
    timestamp = int(scheduled_on_utc.timestamp())
    hash_input = "".join(
        (namespace_name, specification_hash, str(timestamp), str(build.id))
    )
    # Named `digest`/`build_key` so the builtin `hash` is not shadowed
    digest = hashlib.sha256(hash_input.encode("utf-8")).hexdigest()
    build_key = digest[: BuildKey._version3_experimental_hash_size]
    build.hash = build_key
    return build_key

# version -> fmt function
_fmt = {
1: _version1_fmt,
2: _version2_fmt,
3: _version3_experimental_fmt,
}

@classmethod
Expand Down Expand Up @@ -88,14 +115,25 @@ def get_build_key(cls, build: "Build") -> str: # noqa: F821
return cls._fmt.get(build.build_key_version)(build)

@classmethod
def parse_build_key(
    cls, conda_store: "CondaStore", build_key: str  # noqa: F821
) -> typing.Optional[int]:
    """Return the build id for ``build_key``, or ``None`` if it is unknown.

    For version 1 and 2 keys the id is parsed out of the key itself. For
    version 3 (hash-only) keys the id is looked up in the database through
    ``conda_store.get_db()`` by matching ``Build.hash``.

    ``None`` is returned when no build matches a version 3 key or when the
    key does not contain an integer id at the expected position.
    """
    # Note: cannot rely on the number of dashes to differentiate between
    # versions because name can contain dashes. Instead, this relies on the
    # hash size to infer the format. The name is the last field, so indexing
    # to find the id is okay.
    if "-" not in build_key:  # v3
        # This import is here to avoid cyclic imports
        from conda_store_server import orm

        with conda_store.get_db() as db:
            build = db.query(orm.Build).filter(orm.Build.hash == build_key).first()
            if build is not None:
                return build.id
            return None

    hash_size = cls._version2_hash_size
    # The length check keeps short keys from raising IndexError
    is_v2 = len(build_key) > hash_size and build_key[hash_size] == "-"
    id_index = 2 if is_v2 else 4  # v2: hash-timestamp-id-name, v1 otherwise
    parts = build_key.split("-")
    try:
        return int(parts[id_index])  # build_id
    except (IndexError, ValueError):
        # Malformed key: too few fields or a non-numeric id field
        return None
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""add build hash

Revision ID: e17b4cc6e086
Revises: 03c839888c82
Create Date: 2024-03-26 04:39:24.275214

"""

import sqlalchemy as sa

from alembic import op


# revision identifiers, used by Alembic.
revision = "e17b4cc6e086"
down_revision = "03c839888c82"
branch_labels = None
depends_on = None


def upgrade():
    """Add the nullable 32-character Unicode ``hash`` column to ``build``."""
    hash_column = sa.Column("hash", sa.Unicode(length=32), nullable=True)
    op.add_column("build", hash_column)


def downgrade():
    """Drop the ``hash`` column from the ``build`` table."""
    op.drop_column(table_name="build", column_name="hash")
2 changes: 1 addition & 1 deletion conda-store-server/conda_store_server/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ class CondaStore(LoggingConfigurable):

build_key_version = Integer(
BuildKey.set_current_version(2),
help="Build key version to use: 1 (long, legacy), 2 (short, default)",
help="Build key version to use: 1 (long, legacy), 2 (shorter hash, default), 3 (hash-only, experimental)",
config=True,
)

Expand Down
6 changes: 4 additions & 2 deletions conda-store-server/conda_store_server/build.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,8 @@ def build_conda_environment(db: Session, conda_store, build):
conda_prefix.parent.mkdir(parents=True, exist_ok=True)

environment_prefix = build.environment_path(conda_store)
environment_prefix.parent.mkdir(parents=True, exist_ok=True)
if environment_prefix is not None:
environment_prefix.parent.mkdir(parents=True, exist_ok=True)

with utils.timer(conda_store.log, f"building conda_prefix={conda_prefix}"):
context = action.action_solve_lockfile(
Expand Down Expand Up @@ -247,7 +248,8 @@ def build_conda_environment(db: Session, conda_store, build):
),
)

utils.symlink(conda_prefix, environment_prefix)
if environment_prefix is not None:
utils.symlink(conda_prefix, environment_prefix)

action.action_set_conda_prefix_permissions(
conda_prefix=conda_prefix,
Expand Down
28 changes: 25 additions & 3 deletions conda-store-server/conda_store_server/orm.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,9 @@ class Build(Base):
ended_on = Column(DateTime, default=None)
deleted_on = Column(DateTime, default=None)

# Only used by build_key_version 3, not necessary for earlier versions
hash = Column(Unicode(32), default=None)

@staticmethod
def _get_build_key_version():
# Uses local import to make sure BuildKey is initialized
Expand All @@ -258,8 +261,15 @@ def build_path(self, conda_store):
build the environment

"""
# Uses local import to make sure BuildKey is initialized
from conda_store_server import BuildKey

if BuildKey.current_version() < 3:
namespace = self.environment.namespace.name
else:
namespace = ""

store_directory = os.path.abspath(conda_store.store_directory)
namespace = self.environment.namespace.name
res = (
pathlib.Path(
conda_store.build_directory.format(
Expand All @@ -285,6 +295,18 @@ def environment_path(self, conda_store):
path

"""
# Uses local import to make sure BuildKey is initialized
from conda_store_server import BuildKey

# This is not used with v3 because the whole point of v3 is to avoid any
# dependence on user-provided variable-size data on the filesystem and
# by default the environment path contains the namespace and the
# environment name, which can be arbitrarily large. By setting this to
# None, we're making it clear that this shouldn't be used by other
# functions, such as when creating symlinks
if BuildKey.current_version() >= 3:
return None

store_directory = os.path.abspath(conda_store.store_directory)
namespace = self.environment.namespace.name
name = self.specification.name
Expand Down Expand Up @@ -317,11 +339,11 @@ def build_key(self):
return BuildKey.get_build_key(self)

@staticmethod
def parse_build_key(conda_store: "CondaStore", key: str):  # noqa: F821
    """Return the build id for build key ``key``.

    Thin wrapper that forwards both arguments to
    ``BuildKey.parse_build_key`` and returns its result unchanged.
    """
    # Uses local import to make sure BuildKey is initialized
    from conda_store_server import BuildKey

    return BuildKey.parse_build_key(conda_store, key)

@property
def log_key(self):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def get_docker_image_manifest(conda_store, image, tag, timeout=10 * 60):
else:
build_key = tag

build_id = orm.Build.parse_build_key(build_key)
build_id = orm.Build.parse_build_key(conda_store, build_key)
if build_id is None:
return docker_error_message(schema.DockerRegistryError.MANIFEST_UNKNOWN)

Expand Down
9 changes: 8 additions & 1 deletion conda-store-server/conda_store_server/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,5 +244,12 @@ def get_url(self, key):

def delete(self, db, build_id, key):
    """Remove the file stored under ``key`` and delegate to the parent class.

    The file path is ``key`` joined onto ``self.storage_path``. A file that
    is already missing is not an error; ``super().delete`` is called with
    the same arguments either way.
    """
    filename = os.path.join(self.storage_path, key)
    try:
        os.remove(filename)
    except FileNotFoundError:
        # The DB can contain multiple entries pointing to the same key, like
        # a log file. This skips files that were previously processed and
        # deleted. See LocalStorage.fset and Storage.fset, which are used
        # for saving build artifacts
        pass
    super().delete(db, build_id, key)
3 changes: 2 additions & 1 deletion conda-store-server/conda_store_server/worker/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,8 @@ def task_update_environment_build(self, environment_id):
conda_prefix = environment.current_build.build_path(conda_store)
environment_prefix = environment.current_build.environment_path(conda_store)

utils.symlink(conda_prefix, environment_prefix)
if environment_prefix is not None:
utils.symlink(conda_prefix, environment_prefix)

if conda_store.post_update_environment_build_hook:
conda_store.post_update_environment_build_hook(conda_store, environment)
Expand Down
13 changes: 8 additions & 5 deletions conda-store-server/tests/test_actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,8 @@ def test_add_lockfile_packages(
[
(False, 0), # invalid
(False, 1), # long (legacy)
(False, 2), # short (default)
(False, 2), # shorter hash (default)
(False, 3), # hash-only (experimental)
(True, 1), # build_key_version doesn't matter because there's no lockfile
],
)
Expand All @@ -350,14 +351,14 @@ def test_api_get_build_lockfile(
TraitError,
match=(
r"c.CondaStore.build_key_version: invalid build key version: 0, "
r"expected: \(1, 2\)"
r"expected: \(1, 2, 3\)"
),
):
conda_store.build_key_version = build_key_version
return # invalid, nothing more to test
conda_store.build_key_version = build_key_version
assert BuildKey.current_version() == build_key_version
assert BuildKey.versions() == (1, 2)
assert BuildKey.versions() == (1, 2, 3)

# initializes data needed to get the lockfile
specification = simple_specification_with_pip
Expand Down Expand Up @@ -436,14 +437,16 @@ def lockfile_url(build_key):
)
elif build_key_version == 2:
build_key = "c7afdeff-1699156450-12345678-this-is-a-long-environment-name"
elif build_key_version == 3:
build_key = "c1f206a26263e1166e5b43548f69aa0c"
else:
raise ValueError(f"unexpected build_key_version: {build_key_version}")
assert type(res) is RedirectResponse
assert key == res.headers["location"]
assert build.build_key == build_key
assert BuildKey.get_build_key(build) == build_key
assert build.parse_build_key(build_key) == 12345678
assert BuildKey.parse_build_key(build_key) == 12345678
assert build.parse_build_key(conda_store, build_key) == 12345678
assert BuildKey.parse_build_key(conda_store, build_key) == 12345678
assert lockfile_url(build_key) == build.conda_lock_key
assert lockfile_url(build_key) == res.headers["location"]
assert res.status_code == 307
Expand Down
7 changes: 7 additions & 0 deletions docusaurus-docs/community/policies/backwards-compatibility.md
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,13 @@ objects as public if there is an explicit need to do so. Keeping code private by
default limits the public API that the conda-store project developers are
committing to supporting.

### Build keys

conda-store ships with several build key versions. The build key determines the
location of environment builds and build artifacts. Build key versions marked as
experimental can be changed at any time; see `BuildKey` and the FAQ for more
information.

#### Deprecating Python APIs

Under exceptional circumstances such as a serious security vulnerability which
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ store_directory, namespace, name. The default will put all
environments in the same namespace within the same directory.

`CondaStore.build_key_version` is the [build key version](#build-key-versions)
to use: 1 (long, legacy), 2 (short, default).
to use: 1 (long, legacy), 2 (shorter hash, default), 3 (hash-only, experimental).

`CondaStore.validate_specification` callable function taking
`conda_store` and `specification` as input arguments to apply for
Expand Down
76 changes: 66 additions & 10 deletions docusaurus-docs/conda-store/references/faq.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,22 +94,78 @@ It consists of:
3. the id of a build
4. the environment name.

The version 2 format is now the default. Environments created using the version
1 format will continue to be accessible in the UI, but new builds will use the
version 2 format. No changes are needed for existing deployments of conda-store.
However, version 2 build paths don't solve the problem completely because they
include user-provided data, like the environment name, and that data can be
arbitrarily large.

There is no real reason to use the version 1 format anymore, but it can be
explicitly set via the config:
To solve this problem, version 3 was introduced. Its build keys always have the
same size. A version 3 build key looks like this:

```bash
64a943764b70e8fe181643404894f7ae
```

:::warning
Version 3 is experimental and can be changed at any time.
:::

It's a truncated SHA-256 hex digest, which is calculated based on:

- namespace name
- specification hash (also SHA-256)
- build timestamp
- build id.

See `BuildKey._version3_experimental_fmt` for details.

:::note
When version 3 is used, `Build.build_path` will not include the namespace name,
because it's not fixed size, so all builds will be placed right into
`CondaStore.store_directory`.

Additionally, `CondaStore.environment_directory` will be completely ignored, so
no symlinks connecting an environment name to its corresponding build will be
created, because the environment directory format also includes variable-size
data (the namespace and environment names).

For context, these symlinks are created because that's how conda is usually
used: each environment name points to a particular directory on the filesystem,
and symlinks connect this directory to the current build.

In the following example, which uses the version 2 format, there are two
environments in the `default` namespace: `test` and `test2`. The former points
to build 3 and the latter points to build 2:

```python
c.CondaStore.build_key_version = 1
```
$ ls -l ~/.conda-store/state/default/envs
test -> /home/user/.conda-store/state/default/b3109fbf-1710602415-3-test
test2 -> /home/user/.conda-store/state/default/2aad045f-1710602357-2-test2
```

The lack of symlinks doesn't prevent server artifacts from being generated,
which are available for download via the UI (lockfiles, archives, etc.), because
those rely on storage or use the database.

The version 2 format can also be explicitly set if needed (this is the same as
the default):
But it does impact conda integration and tools that rely on it, such as when
conda-store is used with JupyterLab as part of a Nebari deployment. Without
environment symlinks, there'll be no way to tell conda where to look for
environments, which is done by setting `envs_dirs` in `.condarc`, so `conda env
list` will return nothing and no environments will show up in JupyterLab.
:::
nkaretnikov marked this conversation as resolved.
Show resolved Hide resolved

The version 2 format is the default because it supports environment symlinks and
doesn't usually run into path length limitations. If you do experience problems
with the latter and don't need the former, then consider using the version 3
format.

No matter what format you choose, environments that were previously created
using other version formats will be accessible in the conda-store web UI.

There is no real reason to use the version 1 format anymore, but any version can
be explicitly set via the config, for example:

```python
c.CondaStore.build_key_version = 2
c.CondaStore.build_key_version = 1
```

## Long paths on Windows
Expand Down
Loading