Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Redesign base path file structure #373

Merged
merged 10 commits into from
Aug 23, 2023
6 changes: 4 additions & 2 deletions src/fondant/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,8 @@ def _generate_spec(
command.extend(
[
"--output_manifest_path",
f"{path}/{component_name}/manifest.json",
f"{path}/{metadata.pipeline_name}/{metadata.run_id}/"
f"{component_name}/manifest.json",
],
)

Expand All @@ -160,7 +161,8 @@ def _generate_spec(
command.extend(
[
"--input_manifest_path",
f"{path}/{dependency}/manifest.json",
f"{path}/{metadata.pipeline_name}/{metadata.run_id}/"
f"{dependency}/manifest.json",
],
)

Expand Down
4 changes: 2 additions & 2 deletions src/fondant/executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -212,9 +212,9 @@ def upload_manifest(self, manifest: Manifest, save_path: t.Union[str, Path]):

if is_kubeflow_output:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not the biggest fan of having kubeflow specific code in here. Opened #376 to remove this.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree, though I don't see a clear-cut solution for this just yet

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

will vertex have the same issue ?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

probably, ideally we have some control over the format and save path of the artifact.

# Save to the expected base path directory
safe_component_name = self.spec.name.replace(" ", "_").lower()
save_path_base_path = (
f"{manifest.base_path}/{safe_component_name}/manifest.json"
f"{manifest.base_path}/{manifest.pipeline_name}/{manifest.run_id}/"
f"{manifest.component_id}/manifest.json"
)
Path(save_path_base_path).parent.mkdir(parents=True, exist_ok=True)
manifest.to_file(save_path_base_path)
Expand Down
8 changes: 4 additions & 4 deletions src/fondant/manifest.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def create(

specification = {
"metadata": metadata.to_dict(),
"index": {"location": f"/index/{run_id}/{component_id}"},
"index": {"location": f"/{pipeline_name}/{run_id}/{component_id}/index"},
"subsets": {},
}
return cls(specification)
Expand Down Expand Up @@ -226,7 +226,7 @@ def add_subset(
raise ValueError(msg)

self._specification["subsets"][name] = {
"location": f"/{name}/{self.run_id}/{self.component_id}",
"location": f"/{self.pipeline_name}/{self.run_id}/{self.component_id}/{name}",
"fields": {name: type_.to_json() for name, type_ in fields},
}

Expand Down Expand Up @@ -254,7 +254,7 @@ def evolve( # noqa : PLR0912 (too many branches)
# Update index location as this is currently always rewritten
evolved_manifest.index._specification[
"location"
] = f"/index/{self.run_id}/{component_id}"
] = f"/{self.pipeline_name}/{self.run_id}/{component_id}/index"

# If additionalSubsets is False in consumes,
# Remove all subsets from the manifest that are not listed
Expand Down Expand Up @@ -305,7 +305,7 @@ def evolve( # noqa : PLR0912 (too many branches)
# Update subset location as this is currently always rewritten
evolved_manifest.subsets[subset_name]._specification[
"location"
] = f"/{subset_name}/{self.run_id}/{component_id}"
] = f"{self.pipeline_name}/{self.run_id}/{component_id}/{subset_name}"

# Subset is not yet in manifest, add it
else:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ services:
- '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000",
"component_id": "first_component"}'
- --output_manifest_path
- /foo/bar/first_component/manifest.json
- /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json
- --storage_args
- a dummy string arg
- --input_partition_rows
Expand All @@ -30,7 +30,7 @@ services:
- '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000",
"component_id": "second_component"}'
- --output_manifest_path
- /foo/bar/second_component/manifest.json
- /foo/bar/test_pipeline/test_pipeline-20230101000000/second_component/manifest.json
- --storage_args
- a dummy string arg
- --input_partition_rows
Expand All @@ -42,7 +42,7 @@ services:
"array", "items": {"type": "float32"}}}}}, "args": {"storage_args": {"description":
"Storage arguments", "type": "str"}}}'
- --input_manifest_path
- /foo/bar/first_component/manifest.json
- /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json
depends_on:
first_component:
condition: service_completed_successfully
Expand All @@ -56,7 +56,7 @@ services:
- '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000",
"component_id": "third_component"}'
- --output_manifest_path
- /foo/bar/third_component/manifest.json
- /foo/bar/test_pipeline/test_pipeline-20230101000000/third_component/manifest.json
- --storage_args
- a dummy string arg
- --input_partition_rows
Expand All @@ -70,7 +70,7 @@ services:
false}, "args": {"storage_args": {"description": "Storage arguments", "type":
"str"}}}'
- --input_manifest_path
- /foo/bar/second_component/manifest.json
- /foo/bar/test_pipeline/test_pipeline-20230101000000/second_component/manifest.json
depends_on:
second_component:
condition: service_completed_successfully
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ services:
- '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000",
"component_id": "first_component"}'
- --output_manifest_path
- /foo/bar/first_component/manifest.json
- /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json
- --storage_args
- a dummy string arg
- --input_partition_rows
Expand All @@ -27,7 +27,7 @@ services:
- '{"base_path": "/foo/bar", "pipeline_name": "test_pipeline", "run_id": "test_pipeline-20230101000000",
"component_id": "image_cropping"}'
- --output_manifest_path
- /foo/bar/image_cropping/manifest.json
- /foo/bar/test_pipeline/test_pipeline-20230101000000/image_cropping/manifest.json
- --cropping_threshold
- '0'
- --padding
Expand All @@ -46,7 +46,7 @@ services:
for the image cropping. The padding is added to all borders of the image.",
"type": "int", "default": 10}}}'
- --input_manifest_path
- /foo/bar/first_component/manifest.json
- /foo/bar/test_pipeline/test_pipeline-20230101000000/first_component/manifest.json
depends_on:
first_component:
condition: service_completed_successfully
Expand Down
88 changes: 44 additions & 44 deletions tests/example_specs/evolution_examples/1/output_manifest.json
Original file line number Diff line number Diff line change
@@ -1,46 +1,46 @@
{
"metadata": {
"pipeline_name": "test_pipeline",
"base_path": "gs://bucket",
"run_id": "12345",
"component_id": "example_component"
},
"index": {
"location": "/index/12345/example_component"
},
"subsets": {
"images": {
"location": "/images/12345/example_component",
"fields": {
"width": {
"type": "int32"
},
"height": {
"type": "int32"
},
"data": {
"type": "binary"
}
"metadata":{
"pipeline_name":"test_pipeline",
"base_path":"gs://bucket",
"run_id":"12345",
"component_id":"example_component"
},
"index":{
"location":"/test_pipeline/12345/example_component/index"
},
"subsets":{
"images":{
"location":"/test_pipeline/12345/example_component/images",
"fields":{
"width":{
"type":"int32"
},
"height":{
"type":"int32"
},
"data":{
"type":"binary"
}
}
},
"captions":{
"location":"/test_pipeline/12345/example_component/captions",
"fields":{
"data":{
"type":"binary"
}
}
},
"embeddings":{
"location":"/test_pipeline/12345/example_component/embeddings",
"fields":{
"data":{
"type":"array",
"items":{
"type":"float32"
}
}
}
}
},
"captions": {
"location": "/captions/12345/example_component",
"fields": {
"data": {
"type": "binary"
}
}
},
"embeddings": {
"location": "/embeddings/12345/example_component",
"fields": {
"data": {
"type": "array",
"items": {
"type": "float32"
}
}
}
}
}
}
}
}
72 changes: 36 additions & 36 deletions tests/example_specs/evolution_examples/2/output_manifest.json
Original file line number Diff line number Diff line change
@@ -1,38 +1,38 @@
{
"metadata": {
"pipeline_name": "test_pipeline",
"base_path": "gs://bucket",
"run_id": "12345",
"component_id": "example_component"
},
"index": {
"location": "/index/12345/example_component"
},
"subsets": {
"images": {
"location": "/images/12345/example_component",
"fields": {
"width": {
"type": "int32"
},
"height": {
"type": "int32"
},
"data": {
"type": "binary"
}
"metadata":{
"pipeline_name":"test_pipeline",
"base_path":"gs://bucket",
"run_id":"12345",
"component_id":"example_component"
},
"index":{
"location":"/test_pipeline/12345/example_component/index"
},
"subsets":{
"images":{
"location":"/test_pipeline/12345/example_component/images",
"fields":{
"width":{
"type":"int32"
},
"height":{
"type":"int32"
},
"data":{
"type":"binary"
}
}
},
"embeddings":{
"location":"/test_pipeline/12345/example_component/embeddings",
"fields":{
"data":{
"type":"array",
"items":{
"type":"float32"
}
}
}
}
},
"embeddings": {
"location": "/embeddings/12345/example_component",
"fields": {
"data": {
"type": "array",
"items": {
"type": "float32"
}
}
}
}
}
}
}
}
60 changes: 30 additions & 30 deletions tests/example_specs/evolution_examples/3/output_manifest.json
Original file line number Diff line number Diff line change
@@ -1,32 +1,32 @@
{
"metadata": {
"pipeline_name": "test_pipeline",
"base_path": "gs://bucket",
"run_id": "12345",
"component_id": "example_component"
},
"index": {
"location": "/index/12345/example_component"
},
"subsets": {
"images": {
"location": "/images/12345/example_component",
"fields": {
"data": {
"type": "binary"
}
"metadata":{
"pipeline_name":"test_pipeline",
"base_path":"gs://bucket",
"run_id":"12345",
"component_id":"example_component"
},
"index":{
"location":"/test_pipeline/12345/example_component/index"
},
"subsets":{
"images":{
"location":"/test_pipeline/12345/example_component/images",
"fields":{
"data":{
"type":"binary"
}
}
},
"embeddings":{
"location":"/test_pipeline/12345/example_component/embeddings",
"fields":{
"data":{
"type":"array",
"items":{
"type":"float32"
}
}
}
}
},
"embeddings": {
"location": "/embeddings/12345/example_component",
"fields": {
"data": {
"type": "array",
"items": {
"type": "float32"
}
}
}
}
}
}
}
}
Loading