From eeed0fd4d84f8e29b29b114e0a1320c0bdeff7d2 Mon Sep 17 00:00:00 2001 From: niant Date: Thu, 11 Mar 2021 16:44:59 +0100 Subject: [PATCH] Move proto from datacatalog Signed-off-by: niant --- generate_mocks.sh | 2 +- protos/flyteidl/datacatalog/datacatalog.proto | 271 ++++++++++++------ protos/flyteidl/datacatalog/title.rst | 6 - 3 files changed, 184 insertions(+), 95 deletions(-) delete mode 100644 protos/flyteidl/datacatalog/title.rst diff --git a/generate_mocks.sh b/generate_mocks.sh index e3d40dc19..8d2c9b898 100755 --- a/generate_mocks.sh +++ b/generate_mocks.sh @@ -3,4 +3,4 @@ set -e set -x mockery -dir=gen/pb-go/flyteidl/service/ -name=AdminServiceClient -output=clients/go/admin/mocks -mockery -dir=gen/pb-go/flyteidl/datacatalog/ -name=ArtifactsClient -output=clients/go/datacatalog/mocks +mockery -dir=gen/pb-go/flyteidl/datacatalog/ -name=DataCatalogClient -output=clients/go/datacatalog/mocks diff --git a/protos/flyteidl/datacatalog/datacatalog.proto b/protos/flyteidl/datacatalog/datacatalog.proto index 7e8fb9810..45cc0a4e2 100644 --- a/protos/flyteidl/datacatalog/datacatalog.proto +++ b/protos/flyteidl/datacatalog/datacatalog.proto @@ -1,128 +1,223 @@ syntax = "proto3"; -package pb.lyft.datacatalog; +package datacatalog; -option go_package = "datacatalog"; -option py_generic_services = true; +import "flyteidl/core/literals.proto"; +import "google/protobuf/timestamp.proto"; -message Parameter { +service DataCatalog { + rpc CreateDataset (CreateDatasetRequest) returns (CreateDatasetResponse); + rpc GetDataset (GetDatasetRequest) returns (GetDatasetResponse); + rpc CreateArtifact (CreateArtifactRequest) returns (CreateArtifactResponse); + rpc GetArtifact (GetArtifactRequest) returns (GetArtifactResponse); + rpc AddTag (AddTagRequest) returns (AddTagResponse); + rpc ListArtifacts (ListArtifactsRequest) returns (ListArtifactsResponse); + rpc ListDatasets (ListDatasetsRequest) returns (ListDatasetsResponse); +} - string name = 1; - string value = 2; +message CreateDatasetRequest { + Dataset dataset = 1; } -// Before jumping to message definition, lets go over the expected flow- -// An Artifact represents an unit-of-work identified by (task, version, inputs). This is -// encoded as unique hash for faster queries(called provenance). An artifact is persisted with some other -// attributes (revision, createdAt, reference_id, outputs). -// Only Discovery service knows about the hashing algorithm; one can use the closure (task, version, inputs) -// to query an artifact if it doesnt have the provenance value. -// -// Before starting the work on a task, programming-model first checks if the task has been done. -// Request: GET (task, version, inputs) -// Response: (Exists, Artifact) or (NotFound, nil) -// if not found, Task executor goes ahead with the execution and at the end of execution creates a new entry in -// the discovery service -// Request: CREATE (task, version, inputs) + (revision, reference_id, outputs) -// Response: (Exists, Artifact) or (Created, Artifact) -// -// One can also Query all the artifacts by querying any subset of properties. -// Message Artifact represents the complete information of an artifact- field that unique define the artifact + -// properties. -// Message ArtifactInternal is our storage model where we create an additional derived column for faster queries. -// Message ArtifactId only represents field that uniquely define the artifact. -message Artifact { - string provenance = 1; - string name = 2; - string version = 3; - int64 revision = 4; // strictly increasing value set by users. users can do range query on this attribute. - int64 created_at = 5; // the time when discovery service received the request. - string reference_id = 6; // this could be a workflow runid or something that ties it data elsewhere - repeated Parameter inputs = 7; - repeated Parameter outputs = 8; +message CreateDatasetResponse { + } -message ArtifactId { - string name = 1; - string version = 2; - repeated Parameter inputs = 3; +message GetDatasetRequest { + DatasetID dataset = 1; +} + +message GetDatasetResponse { + Dataset dataset = 1; } -message GetRequest { - oneof id { - string provenance = 1; - ArtifactId artifact_id = 2; +message GetArtifactRequest { + DatasetID dataset = 1; + + oneof query_handle { + string artifact_id = 2; + string tag_name = 3; } } -message GetResponse { +message GetArtifactResponse { Artifact artifact = 1; } -enum QueryOperator { - EQUAL = 0; - GREATER_THAN = 1; - LESSER_THAN = 2; +message CreateArtifactRequest { + Artifact artifact = 1; } -message IntFilter { - int64 value = 1; - QueryOperator operator = 2; +message CreateArtifactResponse { + } -message IntRangeFilter { - int64 min = 1; - int64 max = 2; +message AddTagRequest { + Tag tag = 1; } -message IntQueryKey { - oneof filter { - IntFilter val = 1; - IntRangeFilter range = 2; - } +message AddTagResponse { + +} + +// List the artifacts that belong to the Dataset +message ListArtifactsRequest { + DatasetID dataset = 1; + // Apply the filter expression to this query + FilterExpression filter = 2; + // Pagination options to get a page of artifacts + PaginationOptions pagination = 3; } -// QueryRequest allows queries on a range of values for revision column and point queries on created_at -// and reference_id -message QueryRequest { +// Response to list artifacts +message ListArtifactsResponse { + // The list of artifacts + repeated Artifact artifacts = 1; + // Token to use to request the next page, pass this into the next requests PaginationOptions + string next_token = 2; +} + +// List the datasets for the given query +message ListDatasetsRequest { + // Apply the filter expression to this query + FilterExpression filter = 1; + // Pagination options to get a page of datasets + PaginationOptions pagination = 2; +} + +// List the datasets response with token for next pagination +message ListDatasetsResponse { + // The list of datasets + repeated Dataset datasets = 1; + // Token to use to request the next page, pass this into the next requests PaginationOptions + string next_token = 2; +} + +message Dataset { + DatasetID id = 1; + Metadata metadata = 2; + repeated string partitionKeys = 3; +} + +message Partition { + string key = 1; + string value = 2; +} + +message DatasetID { + string project = 1; // The name of the project + string name = 2; // The name of the dataset + string domain = 3; // The domain (eg. environment) + string version = 4; // Version of the data schema + string UUID = 5; // UUID for the dataset (if set the above fields are optional) +} + +message Artifact { + string id = 1; + DatasetID dataset = 2; + repeated ArtifactData data = 3; + Metadata metadata = 4; + repeated Partition partitions = 5; + repeated Tag tags = 6; + google.protobuf.Timestamp created_at = 7; // creation timestamp of artifact, autogenerated by service +} + +message ArtifactData { string name = 1; - string version = 2; - IntQueryKey revision = 3; - int64 created_at = 4; - string reference_id = 5; + flyteidl.core.Literal value = 2; } -message QueryResponse { - repeated Artifact artifact = 1; +message Tag { + string name = 1; + string artifact_id = 2; + DatasetID dataset = 3; } -message CreateRequest { - ArtifactId ref = 1; - string reference_id = 2; - int64 revision = 3; - repeated Parameter outputs = 4; +message Metadata { + map key_map = 1; // key map is a dictionary of key/val strings that represent metadata } -message CreateResponse { - Artifact artifact = 1; - enum Status { - ALREADY_EXISTS = 0; - CREATED = 1; +// Filter expression that is composed of a combination of single filters +message FilterExpression { + repeated SinglePropertyFilter filters = 1; +} + +// A single property to filter on. +message SinglePropertyFilter { + oneof property_filter { + TagPropertyFilter tag_filter = 1; + PartitionPropertyFilter partition_filter = 2; + ArtifactPropertyFilter artifact_filter = 3; + DatasetPropertyFilter dataset_filter = 4; + } + + // as use-cases come up we can add more operators, ex: gte, like, not eq etc. + enum ComparisonOperator { + EQUALS = 0; + } + + ComparisonOperator operator = 10; // field 10 in case we add more entities to query + // Next field number: 11 +} + +// Artifact properties we can filter by +message ArtifactPropertyFilter { + // oneof because we can add more properties in the future + oneof property { + string artifact_id = 1; + } +} + +// Tag properties we can filter by +message TagPropertyFilter { + oneof property { + string tag_name = 1; } - Status status = 2; } -message GenerateProvenanceRequest { - ArtifactId id = 1; +// Partition properties we can filter by +message PartitionPropertyFilter { + oneof property { + KeyValuePair key_val = 1; + } } -message GenerateProvenanceResponse { - string provenance = 1; +message KeyValuePair { + string key = 1; + string value = 2; } -service Artifacts { - rpc Get (GetRequest) returns (GetResponse) {} - rpc Query (QueryRequest) returns (QueryResponse) {} - rpc Create (CreateRequest) returns (CreateResponse) {} - rpc GenerateProvenance (GenerateProvenanceRequest) returns (GenerateProvenanceResponse) {} +// Dataset properties we can filter by +message DatasetPropertyFilter { + oneof property { + string project = 1; + string name = 2; + string domain = 3; + string version = 4; + } +} + +// Pagination options for making list requests +message PaginationOptions { + + // the max number of results to return + uint32 limit = 1; + + // the token to pass to fetch the next page + string token = 2; + + // the property that we want to sort the results by + SortKey sortKey = 3; + + // the sort order of the results + SortOrder sortOrder = 4; + + enum SortOrder { + DESCENDING = 0; + ASCENDING = 1; + } + + enum SortKey { + CREATION_TIME = 0; + } } diff --git a/protos/flyteidl/datacatalog/title.rst b/protos/flyteidl/datacatalog/title.rst deleted file mode 100644 index 02f9174a3..000000000 --- a/protos/flyteidl/datacatalog/title.rst +++ /dev/null @@ -1,6 +0,0 @@ -Flyte Legacy Catalog Service -============================ - -These protos provide interface definition for the datacatalog -service. This proto is under rapid development and not currently recommended for use. -