# Warning: this is an experimental version and can be changed at any time
def _version3_experimental_fmt(build: "Build") -> str:  # noqa: F821
    """Return the version 3 (hash-only, experimental) build key for ``build``.

    The key is a SHA-256 hex digest truncated to
    ``BuildKey._version3_experimental_hash_size`` characters. It is computed
    from the namespace name, the specification hash, the scheduling timestamp
    and the build id, and is then stored in ``build.hash`` so that later calls
    return the stored value instead of recomputing it.
    """
    # Fast path: the key was already computed for this build
    if build.hash is not None:
        return build.hash

    # The namespace is part of the input to separate builds performed by
    # different users, since all v3 builds are stored in the same directory
    # (see Build.build_path in orm.py). The timestamp and build id are mixed
    # in as well to make collisions very unlikely.
    #
    # tzinfo is overwritten (not converted), i.e. scheduled_on is read as UTC.
    scheduled_on = build.scheduled_on.replace(tzinfo=datetime.timezone.utc)
    hash_input = "".join(
        (
            build.environment.namespace.name,
            build.specification.sha256,
            str(int(scheduled_on.timestamp())),
            str(build.id),
        )
    )
    # Named `digest`/`key` rather than `hash` to avoid shadowing the builtin
    digest = hashlib.sha256(hash_input.encode("utf-8")).hexdigest()
    key = digest[: BuildKey._version3_experimental_hash_size]
    build.hash = key
    return key


@classmethod
def parse_build_key(
    cls, conda_store: "CondaStore", build_key: str  # noqa: F821
) -> typing.Optional[int]:
    """Return the build id for ``build_key``, or ``None`` if it has none.

    The key format is inferred from the key itself:

    * v3: no dashes; the key is the stored build hash, so the id is looked
      up in the database through ``conda_store.get_db()``.
    * v2: a dash right after the ``_version2_hash_size``-character hash;
      the id is the third dash-separated field.
    * v1: anything else; the id is the fifth dash-separated field.

    The number of dashes cannot be used to tell v1 and v2 apart because the
    trailing environment name may itself contain dashes.

    Returns ``None`` for an empty or malformed key and for a v3 key that
    matches no build, instead of raising ``IndexError``/``ValueError``.
    ``conda_store`` is only used for v3 keys.
    """
    if not build_key:
        return None

    if "-" not in build_key:  # v3
        # Imported here to avoid cyclic imports
        from conda_store_server import orm

        with conda_store.get_db() as db:
            build = db.query(orm.Build).filter(orm.Build.hash == build_key).first()
            if build is not None:
                return build.id
            return None

    hash_size = cls._version2_hash_size
    # Length is checked first so that short keys cannot raise IndexError
    is_v2 = len(build_key) > hash_size and build_key[hash_size] == "-"
    id_index = 2 if is_v2 else 4

    parts = build_key.split("-")
    try:
        return int(parts[id_index])
    except (IndexError, ValueError):
        # Not enough fields, or the id field is not a number
        return None
build_conda_environment(db: Session, conda_store, build): conda_prefix.parent.mkdir(parents=True, exist_ok=True) environment_prefix = build.environment_path(conda_store) - environment_prefix.parent.mkdir(parents=True, exist_ok=True) + if environment_prefix is not None: + environment_prefix.parent.mkdir(parents=True, exist_ok=True) with utils.timer(conda_store.log, f"building conda_prefix={conda_prefix}"): context = action.action_solve_lockfile( @@ -247,7 +248,8 @@ def build_conda_environment(db: Session, conda_store, build): ), ) - utils.symlink(conda_prefix, environment_prefix) + if environment_prefix is not None: + utils.symlink(conda_prefix, environment_prefix) action.action_set_conda_prefix_permissions( conda_prefix=conda_prefix, diff --git a/conda-store-server/conda_store_server/orm.py b/conda-store-server/conda_store_server/orm.py index ba815bbc3..42533bc29 100644 --- a/conda-store-server/conda_store_server/orm.py +++ b/conda-store-server/conda_store_server/orm.py @@ -233,6 +233,9 @@ class Build(Base): ended_on = Column(DateTime, default=None) deleted_on = Column(DateTime, default=None) + # Only used by build_key_version 3, not necessary for earlier versions + hash = Column(Unicode(32), default=None) + @staticmethod def _get_build_key_version(): # Uses local import to make sure BuildKey is initialized @@ -258,8 +261,15 @@ def build_path(self, conda_store): build the environment """ + # Uses local import to make sure BuildKey is initialized + from conda_store_server import BuildKey + + if BuildKey.current_version() < 3: + namespace = self.environment.namespace.name + else: + namespace = "" + store_directory = os.path.abspath(conda_store.store_directory) - namespace = self.environment.namespace.name res = ( pathlib.Path( conda_store.build_directory.format( @@ -285,6 +295,18 @@ def environment_path(self, conda_store): path """ + # Uses local import to make sure BuildKey is initialized + from conda_store_server import BuildKey + + # This is not used with v3 
because the whole point of v3 is to avoid any + # dependence on user-provided variable-size data on the filesystem and + # by default the environment path contains the namespace and the + # environment name, which can be arbitrary large. By setting this to + # None, we're making it clear that this shouldn't be used by other + # functions, such as when creating symlinks + if BuildKey.current_version() >= 3: + return None + store_directory = os.path.abspath(conda_store.store_directory) namespace = self.environment.namespace.name name = self.specification.name @@ -317,11 +339,11 @@ def build_key(self): return BuildKey.get_build_key(self) @staticmethod - def parse_build_key(key): + def parse_build_key(conda_store: "CondaStore", key: str): # noqa: F821 # Uses local import to make sure BuildKey is initialized from conda_store_server import BuildKey - return BuildKey.parse_build_key(key) + return BuildKey.parse_build_key(conda_store, key) @property def log_key(self): diff --git a/conda-store-server/conda_store_server/server/views/registry.py b/conda-store-server/conda_store_server/server/views/registry.py index b6dd323d5..1325eb02d 100644 --- a/conda-store-server/conda_store_server/server/views/registry.py +++ b/conda-store-server/conda_store_server/server/views/registry.py @@ -103,7 +103,7 @@ def get_docker_image_manifest(conda_store, image, tag, timeout=10 * 60): else: build_key = tag - build_id = orm.Build.parse_build_key(build_key) + build_id = orm.Build.parse_build_key(conda_store, build_key) if build_id is None: return docker_error_message(schema.DockerRegistryError.MANIFEST_UNKNOWN) diff --git a/conda-store-server/conda_store_server/storage.py b/conda-store-server/conda_store_server/storage.py index 4efb4765f..efddf7bd7 100644 --- a/conda-store-server/conda_store_server/storage.py +++ b/conda-store-server/conda_store_server/storage.py @@ -244,5 +244,12 @@ def get_url(self, key): def delete(self, db, build_id, key): filename = os.path.join(self.storage_path, key) - 
os.remove(filename) + try: + os.remove(filename) + except FileNotFoundError: + # The DB can contain multiple entries pointing to the same key, like + # a log file. This skips files that were previously processed and + # deleted. See LocalStorage.fset and Storage.fset, which are used + # for saving build artifacts + pass super().delete(db, build_id, key) diff --git a/conda-store-server/conda_store_server/worker/tasks.py b/conda-store-server/conda_store_server/worker/tasks.py index 9c601b1c3..80cd6db62 100644 --- a/conda-store-server/conda_store_server/worker/tasks.py +++ b/conda-store-server/conda_store_server/worker/tasks.py @@ -262,7 +262,8 @@ def task_update_environment_build(self, environment_id): conda_prefix = environment.current_build.build_path(conda_store) environment_prefix = environment.current_build.environment_path(conda_store) - utils.symlink(conda_prefix, environment_prefix) + if environment_prefix is not None: + utils.symlink(conda_prefix, environment_prefix) if conda_store.post_update_environment_build_hook: conda_store.post_update_environment_build_hook(conda_store, environment) diff --git a/conda-store-server/tests/test_actions.py b/conda-store-server/tests/test_actions.py index de9419548..33d39bf6d 100644 --- a/conda-store-server/tests/test_actions.py +++ b/conda-store-server/tests/test_actions.py @@ -331,7 +331,8 @@ def test_add_lockfile_packages( [ (False, 0), # invalid (False, 1), # long (legacy) - (False, 2), # short (default) + (False, 2), # shorter hash (default) + (False, 3), # hash-only (experimental) (True, 1), # build_key_version doesn't matter because there's no lockfile ], ) @@ -350,14 +351,14 @@ def test_api_get_build_lockfile( TraitError, match=( r"c.CondaStore.build_key_version: invalid build key version: 0, " - r"expected: \(1, 2\)" + r"expected: \(1, 2, 3\)" ), ): conda_store.build_key_version = build_key_version return # invalid, nothing more to test conda_store.build_key_version = build_key_version assert 
BuildKey.current_version() == build_key_version - assert BuildKey.versions() == (1, 2) + assert BuildKey.versions() == (1, 2, 3) # initializes data needed to get the lockfile specification = simple_specification_with_pip @@ -436,14 +437,16 @@ def lockfile_url(build_key): ) elif build_key_version == 2: build_key = "c7afdeff-1699156450-12345678-this-is-a-long-environment-name" + elif build_key_version == 3: + build_key = "c1f206a26263e1166e5b43548f69aa0c" else: raise ValueError(f"unexpected build_key_version: {build_key_version}") assert type(res) is RedirectResponse assert key == res.headers["location"] assert build.build_key == build_key assert BuildKey.get_build_key(build) == build_key - assert build.parse_build_key(build_key) == 12345678 - assert BuildKey.parse_build_key(build_key) == 12345678 + assert build.parse_build_key(conda_store, build_key) == 12345678 + assert BuildKey.parse_build_key(conda_store, build_key) == 12345678 assert lockfile_url(build_key) == build.conda_lock_key assert lockfile_url(build_key) == res.headers["location"] assert res.status_code == 307 diff --git a/docusaurus-docs/community/policies/backwards-compatibility.md b/docusaurus-docs/community/policies/backwards-compatibility.md index 068b12f3f..f243f31b7 100644 --- a/docusaurus-docs/community/policies/backwards-compatibility.md +++ b/docusaurus-docs/community/policies/backwards-compatibility.md @@ -222,6 +222,13 @@ objects as public if there is an explicit need to do so. Keeping code private by default limits the public API that the conda-store project developers are committing to supporting. +### Build keys + +conda-store ships with several build key versions. The build key determines the +location of environment builds and build artifacts. Build key versions marked as +experimental can be changed at any time, see `BuildKey` and the FAQ for more +information. 
+ #### Deprecating Python APIs Under exceptional circumstances such as a serious security vulnerability which diff --git a/docusaurus-docs/conda-store/references/configuration-options.md b/docusaurus-docs/conda-store/references/configuration-options.md index bd23525b3..fc9900fa5 100644 --- a/docusaurus-docs/conda-store/references/configuration-options.md +++ b/docusaurus-docs/conda-store/references/configuration-options.md @@ -73,7 +73,7 @@ store_directory, namespace, name. The default will put all environments in the same namespace within the same directory. `CondaStore.build_key_version` is the [build key version](#build-key-versions) -to use: 1 (long, legacy), 2 (short, default). +to use: 1 (long, legacy), 2 (shorter hash, default), 3 (hash-only, experimental). `CondaStore.validate_specification` callable function taking `conda_store` and `specification` as input arguments to apply for diff --git a/docusaurus-docs/conda-store/references/faq.md b/docusaurus-docs/conda-store/references/faq.md index 356d13548..a08b86adf 100644 --- a/docusaurus-docs/conda-store/references/faq.md +++ b/docusaurus-docs/conda-store/references/faq.md @@ -94,22 +94,78 @@ It consists of: 3. the id of a build 4. the environment name. -The version 2 format is now the default. Environments created using the version -1 format will continue to be accessible in the UI, but new builds will use the -version 2 format. No changes are needed for existing deployments of conda-store. +However, version 2 build paths don't solve the problem completely because they +include user-provided data, like the environment name, and that data can be +arbitrarily large. -There is no real reason to use the version 1 format anymore, but it can be -explicitly set via the config: +To solve this problem, version 3 was introduced, which will always have the same +size. It looks like this: + +```bash +64a943764b70e8fe181643404894f7ae +``` + +:::warning +Version 3 is experimental and can be changed at any time. 
+::: + +It's a truncated SHA-256 hex digest, which is calculated based on: + +- namespace name +- specification hash (also SHA-256) +- build timestamp +- build id. + +See `BuildKey._version3_experimental_fmt` for details. + +:::note +When version 3 is used, `Build.build_path` will not include the namespace name, +because it's not fixed size, so all builds will be placed right into +`CondaStore.store_directory`. + +Additionally, `CondaStore.environment_directory` will be completely ignored, so +no symlinks connecting an environment name to its corresponding build will be +created, because the environment directory format also includes variable-size +data (the namespace and environment names). + +For context, these symlinks are created because that's how conda is usually +used: each environment name points to a particular directory on the filesystem, +and symlinks connect this directory to the current build. + +In the following example, which uses the version 2 format, there are two +environments in the `default` namespace: `test` and `test2`. The former points +to build 3 and the latter points to build 2: -```python -c.CondaStore.build_key_version = 1 ``` +$ ls -l ~/.conda-store/state/default/envs +test -> /home/user/.conda-store/state/default/b3109fbf-1710602415-3-test +test2 -> /home/user/.conda-store/state/default/2aad045f-1710602357-2-test2 +``` + +The lack of symlinks doesn't prevent server artifacts from being generated, +which are available for download via the UI (lockfiles, archives, etc.), because +those rely on storage or use the database. -The version 2 format can also be explicitly set if needed (this is the same as -the default): +But it does impact conda integration or tools that rely on it, like when +conda-store is used with JupyterLab as part of a Nebari deployment. 
Without +environment symlinks, there'll be no way to tell conda where to look for +environments, which is done by setting `envs_dirs` in `.condarc`, so `conda env +list` will return nothing and no environments will show up in JupyterLab. +::: + +The version 2 format is the default because it supports environment symlinks and +doesn't usually run into path length limitations. If you do experience problems +with the latter and don't need the former, then consider using the version 3 +format. + +No matter what format you choose, environments that were previously created +using other version formats will be accessible in the conda-store web UI. + +There is no real reason to use the version 1 format anymore, but any version can +be explicitly set via the config, for example: ```python -c.CondaStore.build_key_version = 2 +c.CondaStore.build_key_version = 1 ``` ## Long paths on Windows