Skip to content

Commit

Permalink
feat: added new Dataplex APIs and new features for existing APIs (e.g…
Browse files Browse the repository at this point in the history
…. DataScans)

docs: updated comments for multiple Dataplex APIs

PiperOrigin-RevId: 528906555
  • Loading branch information
Google APIs authored and copybara-github committed May 2, 2023
1 parent 3e316f1 commit b7429bc
Show file tree
Hide file tree
Showing 9 changed files with 244 additions and 151 deletions.
5 changes: 5 additions & 0 deletions google/cloud/dataplex/v1/analyze.proto
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,7 @@ message Environment {
}
}

// Configuration for sessions created for this environment.
message SessionSpec {
// Optional. The idle time configuration of the session. The session will be
// auto-terminated at the end of this period.
Expand All @@ -104,12 +105,14 @@ message Environment {
bool enable_fast_startup = 2 [(google.api.field_behavior) = OPTIONAL];
}

// Status of sessions created for this environment.
message SessionStatus {
// Output only. Queries over sessions to mark whether the environment is
// currently active or not.
bool active = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
}

// URI Endpoints to access sessions associated with the Environment.
message Endpoints {
// Output only. URI to serve notebook APIs
string notebooks = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
Expand Down Expand Up @@ -244,6 +247,7 @@ message Content {
string data_text = 9 [(google.api.field_behavior) = REQUIRED];
}

// Types of content
oneof content {
// Sql Script related configurations.
SqlScript sql_script = 100;
Expand Down Expand Up @@ -276,5 +280,6 @@ message Session {
google.protobuf.Timestamp create_time = 3
[(google.api.field_behavior) = OUTPUT_ONLY];

// Output only. State of Session
State state = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
}
156 changes: 67 additions & 89 deletions google/cloud/dataplex/v1/data_profile.proto
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ syntax = "proto3";

package google.cloud.dataplex.v1;

import "google/api/field_behavior.proto";
import "google/cloud/dataplex/v1/processing.proto";

option go_package = "cloud.google.com/go/dataplex/apiv1/dataplexpb;dataplexpb";
Expand All @@ -26,182 +27,159 @@ option java_package = "com.google.cloud.dataplex.v1";
// DataProfileScan related setting.
message DataProfileSpec {}

// DataProfileResult defines the output of DataProfileScan.
// Each field of the table will have field type specific profile result.
// DataProfileResult defines the output of DataProfileScan. Each field of the
// table will have field type specific profile result.
message DataProfileResult {
// Profile information describing the structure and layout of the data
// and contains the profile info.
// Contains name, type, mode and field type specific profile information.
message Profile {
// Represents a column field within a table schema.
// A field within a table.
message Field {
// ProfileInfo defines the profile information for each schema field type.
// The profile information for each field type.
message ProfileInfo {
// StringFieldInfo defines output info for any string type field.
// The profile information for a string type field.
message StringFieldInfo {
// The minimum length of the string field in the sampled data.
// Optional if zero non-null rows.
// Minimum length of non-null values in the scanned data.
int64 min_length = 1;

// The maximum length of a string field in the sampled data.
// Optional if zero non-null rows.
// Maximum length of non-null values in the scanned data.
int64 max_length = 2;

// The average length of a string field in the sampled data.
// Optional if zero non-null rows.
// Average length of non-null values in the scanned data.
double average_length = 3;
}

// IntegerFieldInfo defines output for any integer type field.
// The profile information for an integer type field.
message IntegerFieldInfo {
// The average of non-null values of integer field in the sampled
// data. Return NaN, if the field has a NaN. Optional if zero non-null
// rows.
// Average of non-null values in the scanned data. NaN, if the field
// has a NaN.
double average = 1;

// The standard deviation of non-null of integer field in the sampled
// data. Return NaN, if the field has a NaN. Optional if zero non-null
// rows.
// Standard deviation of non-null values in the scanned data. NaN, if
// the field has a NaN.
double standard_deviation = 3;

// The minimum value of an integer field in the sampled data.
// Return NaN, if the field has a NaN. Optional if zero non-null
// rows.
// Minimum of non-null values in the scanned data. NaN, if the field
// has a NaN.
int64 min = 4;

// A quartile divide the number of data points into four parts, or
// A quartile divides the number of data points into four parts, or
// quarters, of more-or-less equal size. Three main quartiles used
// are: The first quartile (Q1) splits off the lowest 25% of data from
// the highest 75%. It is also known as the lower or 25th empirical
// quartile, as 25% of the data is below this point. The second
// quartile (Q2) is the median of a data set. So, 50% of the data lies
// below this point. The third quartile (Q3) splits off the highest
// 25% of data from the lowest 75%. It is known as the upper or 75th
// empirical quartile, as 75% of the data lies below this point. So,
// here the quartiles is provided as an ordered list of quartile
// values, occurring in order Q1, median, Q3.
// empirical quartile, as 75% of the data lies below this point.
// Here, the quartiles are provided as an ordered list of quartile
// values for the scanned data, occurring in order Q1, median, Q3.
repeated int64 quartiles = 6;

// The maximum value of an integer field in the sampled data.
// Return NaN, if the field has a NaN. Optional if zero non-null
// rows.
// Maximum of non-null values in the scanned data. NaN, if the field
// has a NaN.
int64 max = 5;
}

// DoubleFieldInfo defines output for any double type field.
// The profile information for a double type field.
message DoubleFieldInfo {
// The average of non-null values of double field in the sampled data.
// Return NaN, if the field has a NaN. Optional if zero non-null rows.
// Average of non-null values in the scanned data. NaN, if the field
// has a NaN.
double average = 1;

// The standard deviation of non-null of double field in the sampled
// data. Return NaN, if the field has a NaN. Optional if zero non-null
// rows.
// Standard deviation of non-null values in the scanned data. NaN, if
// the field has a NaN.
double standard_deviation = 3;

// The minimum value of a double field in the sampled data.
// Return NaN, if the field has a NaN. Optional if zero non-null
// rows.
// Minimum of non-null values in the scanned data. NaN, if the field
// has a NaN.
double min = 4;

// A quartile divide the numebr of data points into four parts, or
// A quartile divides the number of data points into four parts, or
// quarters, of more-or-less equal size. Three main quartiles used
// are: The first quartile (Q1) splits off the lowest 25% of data from
// the highest 75%. It is also known as the lower or 25th empirical
// quartile, as 25% of the data is below this point. The second
// quartile (Q2) is the median of a data set. So, 50% of the data lies
// below this point. The third quartile (Q3) splits off the highest
// 25% of data from the lowest 75%. It is known as the upper or 75th
// empirical quartile, as 75% of the data lies below this point. So,
// here the quartiles is provided as an ordered list of quartile
// values, occurring in order Q1, median, Q3.
// empirical quartile, as 75% of the data lies below this point.
// Here, the quartiles are provided as an ordered list of quartile
// values for the scanned data, occurring in order Q1, median, Q3.
repeated double quartiles = 6;

// The maximum value of a double field in the sampled data.
// Return NaN, if the field has a NaN. Optional if zero non-null
// rows.
// Maximum of non-null values in the scanned data. NaN, if the field
// has a NaN.
double max = 5;
}

// The TopNValue defines the structure of output of top N values of a
// field.
// Top N non-null values in the scanned data.
message TopNValue {
// The value is the string value of the actual value from the field.
// String value of a top N non-null value.
string value = 1;

// The frequency count of the corresponding value in the field.
// Count of the corresponding value in the scanned data.
int64 count = 2;
}

// The ratio of null rows against the rows in the sampled data.
// Ratio of rows with null value against total scanned rows.
double null_ratio = 2;

// The ratio of rows that are distinct against the rows in the sampled
// data.
// Ratio of rows with distinct values against total scanned rows.
// Not available for complex non-groupable field type RECORD and fields
// with REPEATABLE mode.
double distinct_ratio = 3;

// The array of top N values of the field in the sampled data.
// Currently N is set as 10 or equal to distinct values in the field,
// whichever is smaller. This will be optional for complex non-groupable
// data-types such as JSON, ARRAY, JSON, STRUCT.
// The list of top N non-null values and number of times they occur in
// the scanned data. N is 10 or equal to the number of distinct values
// in the field, whichever is smaller. Not available for complex
// non-groupable field type RECORD and fields with REPEATABLE mode.
repeated TopNValue top_n_values = 4;

// The corresponding profile for specific field type.
// Each field will have only one field type specific profile output.
// Structural and profile information for specific field type. Not
// available, if mode is REPEATABLE.
oneof field_info {
// The corresponding string field profile.
// String type field information.
StringFieldInfo string_profile = 101;

// The corresponding integer field profile.
// Integer type field information.
IntegerFieldInfo integer_profile = 102;

// The corresponding double field profile.
// Double type field information.
DoubleFieldInfo double_profile = 103;
}
}

// The name of the field.
string name = 1;

// The field data type. Possible values include:
//
// * STRING
// * BYTE
// * INT64
// * INT32
// * INT16
// * DOUBLE
// * FLOAT
// * DECIMAL
// * BOOLEAN
// * BINARY
// * TIMESTAMP
// * DATE
// * TIME
// * NULL
// * RECORD
// The data type retrieved from the schema of the data source. For
// instance, for a BigQuery native table, it is the [BigQuery Table
// Schema](https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#tablefieldschema).
// For a Dataplex Entity, it is the [Entity
// Schema](https://cloud.google.com/dataplex/docs/reference/rpc/google.cloud.dataplex.v1#type_3).
string type = 2;

// The mode of the field. Its value will be:
// REQUIRED, if it is a required field.
// NULLABLE, if it is an optional field.
// REPEATED, if it is a repeated field.
// The mode of the field. Possible values include:
//
// * REQUIRED, if it is a required field.
// * NULLABLE, if it is an optional field.
// * REPEATED, if it is a repeated field.
string mode = 3;

// The profile information for the corresponding field.
// Profile information for the corresponding field.
ProfileInfo profile = 4;
}

// The sequence of fields describing data in table entities.
// List of fields with structural and profile information for each field.
repeated Field fields = 2;
}

// The count of all rows in the sampled data.
// Return 0, if zero rows.
// The count of rows scanned.
int64 row_count = 3;

// This represents the profile information per field.
// The profile information per field.
Profile profile = 4;

// The data scanned for this profile.
// The data scanned for this result.
ScannedData scanned_data = 5;
}
Loading

0 comments on commit b7429bc

Please sign in to comment.