From 88ea0ca6bcd240c1d687926d5a2a0770199f3db3 Mon Sep 17 00:00:00 2001 From: Bohdan Yurov Date: Mon, 10 Jun 2019 19:23:40 +0300 Subject: [PATCH] Fixes #105: DM/bigquery: refactoring https://github.com/GoogleCloudPlatform/cloud-foundation-toolkit/issues/105 - Added version, links to docs - Switched to using type provider - Added support for cross-project resource creation - Added missing fields to datasets: "friendlyName", "defaultPartitionExpirationMs", "labels", "access" - Added missing fields to tables: "description", "labels", "clustering", "requirePartitionFilter", "externalDataConfiguration", "encryptionConfiguration" - Fixed resource names - Added uniqueItems: true and additionalProperties: false --- dm/templates/bigquery/bigquery_dataset.py | 22 +- .../bigquery/bigquery_dataset.py.schema | 82 +++-- dm/templates/bigquery/bigquery_table.py | 32 +- .../bigquery/bigquery_table.py.schema | 344 +++++++++++++++++- 4 files changed, 431 insertions(+), 49 deletions(-) diff --git a/dm/templates/bigquery/bigquery_dataset.py b/dm/templates/bigquery/bigquery_dataset.py index 11d3849e1f2b..e2725c7d18e0 100644 --- a/dm/templates/bigquery/bigquery_dataset.py +++ b/dm/templates/bigquery/bigquery_dataset.py @@ -20,15 +20,18 @@ def generate_config(context): # You can modify the roles you wish to whitelist. whitelisted_roles = ['READER', 'WRITER', 'OWNER'] - name = context.properties['name'] + properties = context.properties + name = properties.get('name', context.env['name']) + project_id = properties.get('project', context.env['project']) properties = { 'datasetReference': { 'datasetId': name, - 'projectId': context.env['project'] + 'projectId': project_id }, - 'location': context.properties['location'] + 'location': context.properties['location'], + 'projectId': project_id, } optional_properties = ['description', 'defaultTableExpirationMs'] @@ -68,8 +71,9 @@ def generate_config(context): resources = [ { - 'type': 'bigquery.v2.dataset', - 'name': name, + # https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets + 'type': 'gcp-types/bigquery-v2:datasets', + 'name': context.env['name'], 'properties': properties } ] @@ -77,7 +81,7 @@ def generate_config(context): outputs = [ { 'name': 'selfLink', - 'value': '$(ref.{}.selfLink)'.format(name) + 'value': '$(ref.{}.selfLink)'.format(context.env['name']) }, { 'name': 'datasetId', @@ -85,15 +89,15 @@ def generate_config(context): }, { 'name': 'etag', - 'value': '$(ref.{}.etag)'.format(name) + 'value': '$(ref.{}.etag)'.format(context.env['name']) }, { 'name': 'creationTime', - 'value': '$(ref.{}.creationTime)'.format(name) + 'value': '$(ref.{}.creationTime)'.format(context.env['name']) }, { 'name': 'lastModifiedTime', - 'value': '$(ref.{}.lastModifiedTime)'.format(name) + 'value': '$(ref.{}.lastModifiedTime)'.format(context.env['name']) } ] diff --git a/dm/templates/bigquery/bigquery_dataset.py.schema b/dm/templates/bigquery/bigquery_dataset.py.schema index 1e07c2fcd37a..d53678ffdd55 100644 --- a/dm/templates/bigquery/bigquery_dataset.py.schema +++ b/dm/templates/bigquery/bigquery_dataset.py.schema @@ -15,11 +15,17 @@ info: title: BigQuery Dataset author: Sourced Group Inc. + version: 1.0.0 description: | Creates a BigQuery dataset. + For information on this resource: https://cloud.google.com/bigquery/docs/. 
+  API endpoints used by this template:
+  - gcp-types/bigquery-v2:datasets =>
+    https://cloud.google.com/bigquery/docs/reference/rest/v2/datasets
+

 imports:
   - path: bigquery_dataset.py

@@ -31,7 +37,21 @@ required:

 properties:
   name:
     type: string
-    description: The resource name.
+    description: |
+      The dataset name. If omitted, the resource name is used.
+  project:
+    type: string
+    description: |
+      The project ID of the project containing the dataset. The
+      Google apps domain is prefixed if applicable.
+  friendlyName:
+    type: string
+    description: |
+      A descriptive name for the dataset.
+  description:
+    type: string
+    description: |
+      A user-friendly description of the dataset.
   location:
     type: string
     description: |
@@ -45,6 +65,7 @@ properties:
       - US
   access:
     type: array
+    uniqueItems: true
     description: |
       An array of objects that define dataset access for one or more
       entities. You can set this property when inserting or updating
       a dataset in order to control who is allowed to access the data.
       If unspecified at dataset creation time, BigQuery adds default
       dataset access for the following entities:
       access.specialGroup: projectReaders; access.role: READER
       access.specialGroup: projectWriters; access.role: WRITER
       access.specialGroup: projectOwners; access.role: OWNER
       access.userByEmail: [dataset creator email]; access.role: OWNER
     items:
-      role:
-        type: string
-        description: |
-          The role (rights) granted to the user specified by the other
-          member of the access object. The following string values are
-          supported: READER, WRITER, OWNER. See details at
-          https://cloud.google.com/bigquery/docs/access-control.
-        enum:
-          - READER
-          - WRITER
-          - OWNER
-      oneOf:
-        - domain:
+      type: object
+      additionalProperties: false
+      required:
+        - role
+      properties:
+        role:
+          type: string
+          description: |
+            An IAM role ID to grant to the user, group, or domain specified in this access entry.
+            The following legacy mappings are applied:
+            OWNER <=> roles/bigquery.dataOwner
+            WRITER <=> roles/bigquery.dataEditor
+            READER <=> roles/bigquery.dataViewer
+            This field accepts any of the above formats but returns only the legacy format. For example,
+            if you set this field to "roles/bigquery.dataOwner", it is returned back as "OWNER".
+        domain:
           type: string
           description: |
             The domain to grant access to. All users signed in with the
             specified domain are granted the corresponding access.
             Example: "example.com".
-        - userByEmail:
+        userByEmail:
           type: string
           description: |
             The email address of a user to grant access to. For example:
             fred@example.com.
-        - groupByEmail:
+        groupByEmail:
           type: string
           description: The email address of a Google Group to grant access to.
-        - specialGroup:
+        specialGroup:
           type: string
           description: |
             The special group to grant access to. Possible values include:
             projectOwners: owners of the enclosing project
             projectReaders: readers of the enclosing project
             projectWriters: writers of the enclosing project
             allAuthenticatedUsers: all authenticated BigQuery users
-        - view:
+        view:
           type: object
+          additionalProperties: false
           description: |
             A view from a different dataset to grant access to. Queries
             executed against that view have the Read access to tables in that
             dataset. The role field is not required when this field is
             set. If that view is updated by any user, access to the view
             needs to be granted again via an update operation.
           properties:
             projectId:
               type: string
               description: The ID of the project containing this table.
             datasetId:
               type: string
               description: The ID of the dataset containing this table.
             tableId:
               type: string
               description: |
                 The table ID. The ID must contain only letters (a-z, A-Z),
                 numbers (0-9), or underscores (_). The maximum length is
                 1,024 characters.
-  description:
-    type: string
-    description: A user-friendly description of the dataset.
   setDefaultOwner:
     type: boolean
     default: False
@@ -136,6 +156,26 @@ properties:
       expirationTime while creating the table, that value takes
       precedence over the default expiration time indicated by this
       property.
    minimum: 3600000
+  defaultPartitionExpirationMs:
+    type: string
+    format: int64
+    description: |
+      The default partition expiration for all partitioned tables in the dataset, in milliseconds.
+      Once this property is set, all newly-created partitioned tables in the dataset will have an expirationMs
+      property in the timePartitioning settings set to this value, and changing the value will only affect new tables,
+      not existing ones. The storage in a partition will have an expiration time of its partition time plus this value.
+      Setting this property overrides the use of defaultTableExpirationMs for partitioned tables: only one of
+      defaultTableExpirationMs and defaultPartitionExpirationMs will be used for any new partitioned table.
+      If you provide an explicit timePartitioning.expirationMs when creating or updating a partitioned table,
+      that value takes precedence over the default partition expiration time indicated by this property.
+  labels:
+    type: object
+    description: |
+      A map of labels associated with this dataset.
+      Example:
+        name: wrench
+        mass: 1.3kg
+        count: 3

 outputs:
   properties:
diff --git a/dm/templates/bigquery/bigquery_table.py b/dm/templates/bigquery/bigquery_table.py
index ea527d3366fd..d4ec420e3021 100644
--- a/dm/templates/bigquery/bigquery_table.py
+++ b/dm/templates/bigquery/bigquery_table.py
@@ -18,16 +18,19 @@ def generate_config(context):
     """ Entry point for the deployment resources. """

-    name = context.properties['name']
+    properties = context.properties
+    name = properties.get('name', context.env['name'])
+    project_id = properties.get('project', context.env['project'])

     properties = {
         'tableReference': {
             'tableId': name,
             'datasetId': context.properties['datasetId'],
-            'projectId': context.env['project']
+            'projectId': project_id
         },
-        'datasetId': context.properties['datasetId']
+        'datasetId': context.properties['datasetId'],
+        'projectId': project_id,
     }

     optional_properties = [
@@ -48,8 +51,9 @@
     resources = [
         {
-            'type': 'bigquery.v2.table',
-            'name': name,
+            # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables
+            'type': 'gcp-types/bigquery-v2:tables',
+            'name': context.env['name'],
             'properties': properties,
             'metadata': {
                 'dependsOn': [context.properties['datasetId']]
@@ -60,39 +64,39 @@
     outputs = [
         {
             'name': 'selfLink',
-            'value': '$(ref.{}.selfLink)'.format(name)
+            'value': '$(ref.{}.selfLink)'.format(context.env['name'])
         },
         {
             'name': 'etag',
-            'value': '$(ref.{}.etag)'.format(name)
+            'value': '$(ref.{}.etag)'.format(context.env['name'])
         },
         {
             'name': 'creationTime',
-            'value': '$(ref.{}.creationTime)'.format(name)
+            'value': '$(ref.{}.creationTime)'.format(context.env['name'])
         },
         {
             'name': 'lastModifiedTime',
-            'value': '$(ref.{}.lastModifiedTime)'.format(name)
+            'value': '$(ref.{}.lastModifiedTime)'.format(context.env['name'])
         },
         {
             'name': 'location',
-            'value': '$(ref.{}.location)'.format(name)
+            'value': '$(ref.{}.location)'.format(context.env['name'])
         },
         {
             'name': 'numBytes',
-            'value': '$(ref.{}.numBytes)'.format(name)
+            'value': '$(ref.{}.numBytes)'.format(context.env['name'])
         },
         {
             'name': 'numLongTermBytes',
-            'value': '$(ref.{}.numLongTermBytes)'.format(name)
+            'value': '$(ref.{}.numLongTermBytes)'.format(context.env['name'])
         },
         {
             'name': 'numRows',
-            'value': '$(ref.{}.numRows)'.format(name)
+            'value': '$(ref.{}.numRows)'.format(context.env['name'])
         },
         {
             'name': 'type',
-            'value': '$(ref.{}.type)'.format(name)
+            'value': '$(ref.{}.type)'.format(context.env['name'])
         }
     ]

diff --git a/dm/templates/bigquery/bigquery_table.py.schema b/dm/templates/bigquery/bigquery_table.py.schema
index ad0cbb8865ce..e36cf31a9216 100644
--- a/dm/templates/bigquery/bigquery_table.py.schema
+++ b/dm/templates/bigquery/bigquery_table.py.schema
@@ -15,11 +15,17 @@
 info:
   title: BigQuery Table
   author: Sourced Group Inc.
+  version: 1.0.0
   description: |
     Creates a BigQuery table.
-    For more information on this resource:
+
+    For information on this resource:
     https://cloud.google.com/bigquery/docs/.

+  API endpoints used by this template:
+  - gcp-types/bigquery-v2:tables =>
+    https://cloud.google.com/bigquery/docs/reference/rest/v2/tables
+
 imports:
   - path: bigquery_table.py

@@ -31,14 +37,25 @@ required:

 properties:
   name:
     type: string
-    description: The resource name.
+    description: |
+      The table name. If omitted, the resource name is used.
+  project:
+    type: string
+    description: |
+      The project ID of the project containing the table. The
+      Google apps domain is prefixed if applicable.
   datasetId:
     type: string
     description: |
       The ID of the dataset the table belongs to.
   friendlyName:
     type: string
-    description: A descriptive name for the table.
+    description: |
+      A descriptive name for the table.
+  description:
+    type: string
+    description: |
+      A user-friendly description of the table.
   expirationTime:
     type: string
     description: |
@@ -47,9 +64,314 @@ properties:
       deleted, and their storage is reclaimed. The
       defaultTableExpirationMs property of the encapsulating dataset
       can be used to set a default expirationTime on newly created
       tables. For example, 1535739430.
+  encryptionConfiguration:
+    type: object
+    additionalProperties: false
+    description: |
+      Custom encryption configuration (e.g., Cloud KMS keys).
+    properties:
+      kmsKeyName:
+        type: string
+        description: |
+          Describes the Cloud KMS encryption key that will be used to protect the destination BigQuery table.
+          The BigQuery Service Account associated with your project requires access to this encryption key.
+  externalDataConfiguration:
+    type: object
+    additionalProperties: false
+    description: |
+      Describes the data format, location, and other properties of a table stored outside of BigQuery.
+      By defining these properties, the data source can then be queried as if it were a standard BigQuery table.
+    required:
+      - sourceUris
+      - sourceFormat
+    properties:
+      sourceUris:
+        type: array
+        minItems: 1
+        uniqueItems: true
+        description: |
+          The fully-qualified URIs that point to your data in Google Cloud. For Google Cloud Storage URIs:
+          Each URI can contain one '*' wildcard character and it must come after the 'bucket' name.
+          Size limits related to load jobs apply to external data sources. For Google Cloud Bigtable URIs:
+          Exactly one URI can be specified and it has to be a fully specified and valid HTTPS URL for a
+          Google Cloud Bigtable table. For Google Cloud Datastore backups, exactly one URI can be specified.
+          Also, the '*' wildcard character is not allowed.
+        items:
+          type: string
+      schema:
+        type: object
+        description: |
+          The schema for the data. Schema is required for CSV and JSON formats. Schema is disallowed for
+          Google Cloud Bigtable, Cloud Datastore backups, and Avro formats.
+      sourceFormat:
+        type: string
+        description: |
+          The data format. For CSV files, specify "CSV". For Google sheets, specify "GOOGLE_SHEETS".
+          For newline-delimited JSON, specify "NEWLINE_DELIMITED_JSON". For Avro files, specify "AVRO".
+          For Google Cloud Datastore backups, specify "DATASTORE_BACKUP".
+          [Beta] For Google Cloud Bigtable, specify "BIGTABLE".
+        enum:
+          - CSV
+          - GOOGLE_SHEETS
+          - NEWLINE_DELIMITED_JSON
+          - AVRO
+          - DATASTORE_BACKUP
+          - BIGTABLE
+      maxBadRecords:
+        type: number
+        description: |
+          The maximum number of bad records that BigQuery can ignore when reading data. If the number of
+          bad records exceeds this value, an invalid error is returned in the job result.
+          The default value is 0, which requires that all records are valid. This setting is ignored
+          for Google Cloud Bigtable, Google Cloud Datastore backups and Avro formats.
+      autodetect:
+        type: boolean
+        description: |
+          Try to detect schema and format options automatically.
+          Any option specified explicitly will be honored.
+      compression:
+        type: string
+        description: |
+          The compression type of the data source. Possible values include GZIP and NONE. The default value is NONE.
+          This setting is ignored for Google Cloud Bigtable, Google Cloud Datastore backups and Avro formats.
+          An empty string is an invalid value.
+        enum:
+          - NONE
+          - GZIP
+      csvOptions:
+        type: object
+        additionalProperties: false
+        description: |
+          Additional properties to set if sourceFormat is set to CSV.
+        properties:
+          fieldDelimiter:
+            type: string
+            description: |
+              The separator for fields in a CSV file. BigQuery converts the string to ISO-8859-1 encoding,
+              and then uses the first byte of the encoded string to split the data in its raw, binary state.
+              BigQuery also supports the escape sequence "\t" to specify a tab separator.
+              The default value is a comma (',').
+          skipLeadingRows:
+            type: number
+            description: |
+              The number of rows at the top of a CSV file that BigQuery will skip when reading the data.
+              The default value is 0. This property is useful if you have header rows in the file that should be skipped.
+          quote:
+            type: string
+            description: |
+              The value that is used to quote data sections in a CSV file. BigQuery converts the string to
+              ISO-8859-1 encoding, and then uses the first byte of the encoded string to split the data in its raw,
+              binary state. The default value is a double-quote ('"'). If your data does not contain quoted sections,
+              set the property value to an empty string. If your data contains quoted newline characters,
+              you must also set the allowQuotedNewlines property to true.
+          allowQuotedNewlines:
+            type: boolean
+            description: |
+              Indicates if BigQuery should allow quoted data sections that contain newline characters in a CSV file.
+              The default value is false.
+          allowJaggedRows:
+            type: boolean
+            description: |
+              Indicates if BigQuery should accept rows that are missing trailing optional columns.
+              If true, BigQuery treats missing trailing columns as null values.
+              If false, records with missing trailing columns are treated as bad records, and if there are
+              too many bad records, an invalid error is returned in the job result. The default value is false.
+          encoding:
+            type: string
+            description: |
+              The character encoding of the data. The supported values are UTF-8 or ISO-8859-1.
+              The default value is UTF-8.
BigQuery decodes the data after the raw, binary data has
+              been split using the values of the quote and fieldDelimiter properties.
+            enum:
+              - UTF-8
+              - ISO-8859-1
+      bigtableOptions:
+        type: object
+        additionalProperties: false
+        description: |
+          Additional options if sourceFormat is set to BIGTABLE.
+        properties:
+          columnFamilies:
+            type: array
+            uniqueItems: true
+            description: |
+              List of column families to expose in the table schema along with their types.
+              This list restricts the column families that can be referenced in queries and specifies their value types.
+              You can use this list to do type conversions - see the 'type' field for more details.
+              If you leave this list empty, all column families are present in the table schema and their values
+              are read as BYTES. During a query only the column families referenced in that query are read from Bigtable.
+            items:
+              type: object
+              additionalProperties: false
+              properties:
+                familyId:
+                  type: string
+                  description: |
+                    Identifier of the column family.
+                type:
+                  type: string
+                  description: |
+                    The type to convert the value in cells of this column family. The values are expected to be
+                    encoded using the HBase Bytes.toBytes function when using the BINARY encoding value.
+                    The following BigQuery types are allowed (case-sensitive): BYTES, STRING, INTEGER, FLOAT, BOOLEAN.
+                    The default type is BYTES. This can be overridden for a specific column by listing that
+                    column in 'columns' and specifying a type for it.
+                  enum:
+                    - BYTES
+                    - STRING
+                    - INTEGER
+                    - FLOAT
+                    - BOOLEAN
+                encoding:
+                  type: string
+                  description: |
+                    The encoding of the values when the type is not STRING. Acceptable encoding values are:
+                    - TEXT - indicates values are alphanumeric text strings.
+                    - BINARY - indicates values are encoded using the HBase Bytes.toBytes family of functions.
+                    This can be overridden for a specific column by listing that column in
+                    'columns' and specifying an encoding for it.
+                  enum:
+                    - TEXT
+                    - BINARY
+                columns:
+                  type: array
+                  uniqueItems: true
+                  description: |
+                    A list of columns that should be exposed as individual fields as opposed to a list of
+                    (column name, value) pairs. All columns whose qualifier matches a qualifier in this list
+                    can be accessed as [family field name].[column field name]. Other columns can be accessed
+                    as a list through the [family field name].Column field.
+                  items:
+                    type: object
+                    additionalProperties: false
+                    required:
+                      - qualifierEncoded
+                    properties:
+                      qualifierEncoded:
+                        type: string
+                        description: |
+                          Qualifier of the column. Columns in the parent column family that have this exact
+                          qualifier are exposed as the [family field name].[column field name] field.
+                          If the qualifier is a valid UTF-8 string, it can be specified in the qualifierString
+                          field. Otherwise, a base-64 encoded value must be set to qualifierEncoded.
+                          The column field name is the same as the column qualifier. However, if the qualifier
+                          is not a valid BigQuery field identifier, i.e. does not match [a-zA-Z][a-zA-Z0-9_]*,
+                          a valid identifier must be provided as fieldName.
+                      qualifierString:
+                        type: string
+                      fieldName:
+                        type: string
+                        description: |
+                          If the qualifier is not a valid BigQuery field identifier, i.e. does not match
+                          [a-zA-Z][a-zA-Z0-9_]*, a valid identifier must be provided as the column field name
+                          and is used as the field name in queries.
+                      type:
+                        type: string
+                        description: |
+                          The type to convert the value in cells of this column. The values are expected to be
+                          encoded using the HBase Bytes.toBytes function when using the BINARY encoding value.
+                          The following BigQuery types are allowed (case-sensitive): BYTES, STRING, INTEGER, FLOAT, BOOLEAN.
+                          The default type is BYTES. 'type' can also be set at the column family level.
+                          However, the setting at this level takes precedence if 'type' is set at both levels.
+                        enum:
+                          - BYTES
+                          - STRING
+                          - INTEGER
+                          - FLOAT
+                          - BOOLEAN
+                      encoding:
+                        type: string
+                        description: |
+                          The encoding of the values when the type is not STRING. Acceptable encoding values are:
+                          - TEXT - indicates values are alphanumeric text strings.
+                          - BINARY - indicates values are encoded using the HBase Bytes.toBytes family of functions.
+                          'encoding' can also be set at the column family level. However, the setting at this level
+                          takes precedence if 'encoding' is set at both levels.
+                        enum:
+                          - TEXT
+                          - BINARY
+                      onlyReadLatest:
+                        type: boolean
+                        description: |
+                          If this is set, only the latest version of the value in this column is exposed.
+                          'onlyReadLatest' can also be set at the column family level. However, the setting at
+                          this level takes precedence if 'onlyReadLatest' is set at both levels.
+          ignoreUnspecifiedColumnFamilies:
+            type: boolean
+            description: |
+              If true, the column families that are not specified in the columnFamilies list are not
+              exposed in the table schema. Otherwise, they are read with BYTES type values. The default value is false.
+          readRowkeyAsString:
+            type: boolean
+            description: |
+              If true, the rowkey column families will be read and converted to string. Otherwise, they are
+              read with BYTES type values, and users need to manually cast them with CAST if necessary.
+              The default value is false.
+      googleSheetsOptions:
+        type: object
+        additionalProperties: false
+        description: |
+          Additional options if sourceFormat is set to GOOGLE_SHEETS.
+        properties:
+          skipLeadingRows:
+            type: number
+            description: |
+              The number of rows at the top of a sheet that BigQuery will skip when reading the data.
+              The default value is 0. This property is useful if you have header rows that should be skipped.
+              When autodetect is on, the behavior is the following:
+              - skipLeadingRows unspecified - Autodetect tries to detect headers in the first row. If they
+                are not detected, the row is read as data. Otherwise, data is read starting from the second row.
+              - skipLeadingRows is 0 - Instructs autodetect that there are no headers, and data should be
+                read starting from the first row.
+              - skipLeadingRows = N > 0 - Autodetect skips N-1 rows and tries to detect headers in row N.
+                If headers are not detected, row N is just skipped. Otherwise, row N is used to extract
+                column names for the detected schema.
+          range:
+            type: string
+            description: |
+              [Beta] Range of a sheet to query from. Only used when non-empty.
+      hivePartitioningMode:
+        type: string
+        description: |
+          [Experimental] When set, what mode of hive partitioning to use when reading data.
+          Two modes are supported:
+          - AUTO: automatically infer partition key name(s) and type(s).
+          - STRINGS: automatically infer partition key name(s). All types are strings.
+          Not all storage formats support hive partitioning -- requesting hive partitioning
+          on an unsupported format will lead to an error.
+        enum:
+          - AUTO
+          - STRINGS
+  clustering:
+    type: object
+    additionalProperties: false
+    description: |
+      Clustering specification for the table. Must be used together with time-based partitioning:
+      data in the table is first partitioned and subsequently clustered.
+    required:
+      - fields
+    properties:
+      fields:
+        type: array
+        minItems: 1
+        uniqueItems: true
+        description: |
+          One or more fields on which data should be clustered. Only top-level, non-repeated, simple-type fields
+          are supported. The order of the fields determines how clusters are generated, so it is important.
+        items:
+          type: string
+  requirePartitionFilter:
+    type: boolean
+    description: |
+      [Beta] If set to true, queries over this table must specify a partition filter that can be used
+      for partition elimination.
   timePartitioning:
     type: object
-    description: The time-based partitioning specification for this table.
+    additionalProperties: false
+    description: |
+      The time-based partitioning specification for this table.
     properties:
       expirationMs:
         type: string
@@ -69,7 +391,7 @@ properties:
       requirePartitionFilter:
         type: boolean
         description: |
-          If True, queries over the table require a partition filter
+          [Beta] If True, queries over the table require a partition filter
           (that can be used for partition elimination) to be specified.
       type:
         type: string
@@ -78,6 +400,7 @@ properties:
           per day.
   view:
     type: object
+    additionalProperties: false
     description: The view definition.
     properties:
       query:
@@ -94,6 +417,7 @@ properties:
           value.
       userDefinedFunctionResources:
         type: array
+        uniqueItems: true
         description: |
           User-defined function resources used in the query.
         items:
@@ -111,12 +435,14 @@ properties:
           (gs://bucket/path).
   schema:
     type: array
+    uniqueItems: true
     description: |
       The schema for the data. Required for the CSV and JSON formats.
       Disallowed for the Google Cloud Bigtable, Cloud Datastore
       backups, and Avro formats.
     items:
       type: object
+      additionalProperties: false
       description: Defines the table fields.
       required:
         - name
@@ -167,6 +493,14 @@ properties:
         type: string
         description: |
           The field description. The maximum length is 1,024 characters.
+  labels:
+    type: object
+    description: |
+      A map of labels associated with this table.
+      Example:
+        name: wrench
+        mass: 1.3kg
+        count: 3

 outputs:
   properties:
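
Reviewer note: to exercise the refactored templates end to end, here is a minimal
deployment-config sketch. It assumes the templates forward the new optional fields
exactly as the schemas above describe; the project ID, resource names, and field
values are illustrative only and are not part of this patch.

    imports:
      - path: dm/templates/bigquery/bigquery_dataset.py
        name: bigquery_dataset.py
      - path: dm/templates/bigquery/bigquery_table.py
        name: bigquery_table.py

    resources:
      # Dataset: `name` is omitted, so the template falls back to the
      # deployment resource name ("analytics") via properties.get().
      - name: analytics
        type: bigquery_dataset.py
        properties:
          project: some-other-project  # hypothetical; exercises cross-project creation
          location: US
          friendlyName: Analytics data
          labels:
            env: dev
          access:
            - role: READER
              domain: example.com

      # Table: clustering must be combined with time-based partitioning,
      # per the schema above.
      - name: events
        type: bigquery_table.py
        properties:
          project: some-other-project
          datasetId: analytics
          timePartitioning:
            type: DAY
          requirePartitionFilter: true
          clustering:
            fields:
              - user_id

One caveat worth flagging in review: bigquery_table.py uses
context.properties['datasetId'] as its dependsOn target, so this sketch only works
when the dataset's deployment resource name equals its dataset ID, as it does here.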