From 61eda8f91edf4264b6e211b5254e04eae1c3c8db Mon Sep 17 00:00:00 2001
From: Fai Fung
Date: Mon, 29 Jun 2020 16:57:03 +0200
Subject: [PATCH 1/4] support custom external_data_configuration.schema for
 CSV and NDJSON formats

---
 .../resources/resource_bigquery_table.go      | 48 ++++++++++--
 .../tests/resource_bigquery_table_test.go     | 74 +++++++++++++++++++
 .../docs/r/bigquery_table.html.markdown       | 20 +++--
 3 files changed, 132 insertions(+), 10 deletions(-)

diff --git a/third_party/terraform/resources/resource_bigquery_table.go b/third_party/terraform/resources/resource_bigquery_table.go
index 01e92e4e85c9..c587a0c620c0 100644
--- a/third_party/terraform/resources/resource_bigquery_table.go
+++ b/third_party/terraform/resources/resource_bigquery_table.go
@@ -109,6 +109,21 @@ func resourceBigQueryTable() *schema.Resource {
 				Default:     "NONE",
 				Description: `The compression type of the data source. Valid values are "NONE" or "GZIP".`,
 			},
+			// Schema: [Optional] The schema for the data.
+			// Schema is required for CSV and JSON formats if autodetect is not on.
+			// Schema is disallowed for Google Cloud Bigtable, Cloud Datastore backups, Avro, ORC and Parquet formats.
+			"schema": {
+				Type:         schema.TypeString,
+				Optional:     true,
+				Computed:     true,
+				ForceNew:     true,
+				ValidateFunc: validation.ValidateJsonString,
+				StateFunc: func(v interface{}) string {
+					json, _ := structure.NormalizeJsonString(v)
+					return json
+				},
+				Description: `A JSON schema for the external table. Schema is required for CSV and JSON formats if autodetect is not on. Schema is disallowed for Google Cloud Bigtable, Cloud Datastore backups, Avro, ORC and Parquet formats.`,
+			},
 			// CsvOptions: [Optional] Additional properties to set if
 			// sourceFormat is set to CSV.
 			"csv_options": {
@@ -275,9 +290,6 @@ func resourceBigQueryTable() *schema.Resource {
 			},

 			// Schema: [Optional] Describes the schema of this table.
-			// Schema is required for external tables in CSV and JSON formats
-			// and disallowed for Google Cloud Bigtable, Cloud Datastore backups,
-			// and Avro formats.
 			"schema": {
 				Type:         schema.TypeString,
 				Optional:     true,
@@ -287,7 +299,7 @@ func resourceBigQueryTable() *schema.Resource {
 					json, _ := structure.NormalizeJsonString(v)
 					return json
 				},
-				Description: `A JSON schema for the table. Schema is required for CSV and JSON formats and is disallowed for Google Cloud Bigtable, Cloud Datastore backups, and Avro formats when using external tables.`,
+				Description: `A JSON schema for the table.`,
 			},

 			// View: [Optional] If specified, configures this table as a view.
@@ -636,7 +648,6 @@ func resourceBigQueryTableCreate(d *schema.ResourceData, meta interface{}) error
 	}

 	log.Printf("[INFO] BigQuery table %s has been created", res.Id)
-
 	d.SetId(fmt.Sprintf("projects/%s/datasets/%s/tables/%s", res.TableReference.ProjectId, res.TableReference.DatasetId, res.TableReference.TableId))

 	return resourceBigQueryTableRead(d, meta)
@@ -683,6 +694,26 @@ func resourceBigQueryTableRead(d *schema.ResourceData, meta interface{}) error {
 		return err
 	}

+	edc := map[string]interface{}(nil)
+
+	if v, ok := d.GetOk("external_data_configuration"); ok {
+		// The API response doesn't return the `external_data_configuration.schema`
+		// used when creating the table and it cannot be queried.
+		// After creation, a computed schema is stored in the top-level `schema`,
+		// which combines `external_data_configuration.schema`
+		// with any hive partitioning fields found in the `source_uri_prefix`.
+		// So just assume the configured schema has been applied after successful
+		// creation, by copying the configured value back into the resource schema.
+		// This avoids the value read back from the API being identified as a change.
+		// The `ForceNew=true` on `external_data_configuration.schema` meets
+		// the users' expectation that changing the configured input schema
+		// will recreate the resource.
+		edc = v.([]interface{})[0].(map[string]interface{})
+		if edc["schema"] != nil {
+			externalDataConfiguration[0]["schema"] = edc["schema"]
+		}
+	}
+
 	d.Set("external_data_configuration", externalDataConfiguration)
 }

@@ -804,6 +835,13 @@ func expandExternalDataConfiguration(cfg interface{}) (*bigquery.ExternalDataCon
 	if v, ok := raw["max_bad_records"]; ok {
 		edc.MaxBadRecords = int64(v.(int))
 	}
+	if v, ok := raw["schema"]; ok {
+		schema, err := expandSchema(v)
+		if err != nil {
+			return nil, err
+		}
+		edc.Schema = schema
+	}
 	if v, ok := raw["source_format"]; ok {
 		edc.SourceFormat = v.(string)
 	}
diff --git a/third_party/terraform/tests/resource_bigquery_table_test.go b/third_party/terraform/tests/resource_bigquery_table_test.go
index 5a6228d3f3bb..236762c174f1 100644
--- a/third_party/terraform/tests/resource_bigquery_table_test.go
+++ b/third_party/terraform/tests/resource_bigquery_table_test.go
@@ -119,6 +119,31 @@ func TestAccBigQueryTable_HivePartitioning(t *testing.T) {
 	})
 }

+func TestAccBigQueryTable_HivePartitioningCustomSchema(t *testing.T) {
+	t.Parallel()
+	bucketName := testBucketName(t)
+	resourceName := "google_bigquery_table.test"
+	datasetID := fmt.Sprintf("tf_test_%s", randString(t, 10))
+	tableID := fmt.Sprintf("tf_test_%s", randString(t, 10))
+
+	vcrTest(t, resource.TestCase{
+		PreCheck:     func() { testAccPreCheck(t) },
+		Providers:    testAccProviders,
+		CheckDestroy: testAccCheckBigQueryTableDestroyProducer(t),
+		Steps: []resource.TestStep{
+			{
+				Config: testAccBigQueryTableHivePartitioningCustomSchema(bucketName, datasetID, tableID),
+			},
+			{
+				ResourceName:            resourceName,
+				ImportState:             true,
+				ImportStateVerify:       true,
+				ImportStateVerifyIgnore: []string{"external_data_configuration.0.schema"},
+			},
+		},
+	})
+}
+
 func TestAccBigQueryTable_RangePartitioning(t *testing.T) {
 	t.Parallel()
 	resourceName := "google_bigquery_table.test"
@@ -503,6 +528,55 @@ resource "google_bigquery_table" "test" {
 `, bucketName, datasetID, tableID)
 }

+func testAccBigQueryTableHivePartitioningCustomSchema(bucketName, datasetID, tableID string) string {
+	return fmt.Sprintf(`
+resource "google_storage_bucket" "test" {
+  name = "%s"
+  force_destroy = true
+}
+
+resource "google_storage_bucket_object" "test" {
+  name = "key1=20200330/data.json"
+  content = "{\"name\":\"test\", \"last_modification\":\"2020-04-01\"}"
+  bucket = google_storage_bucket.test.name
+}
+
+resource "google_bigquery_dataset" "test" {
+  dataset_id = "%s"
+}
+
+resource "google_bigquery_table" "test" {
+  table_id = "%s"
+  dataset_id = google_bigquery_dataset.test.dataset_id
+
+  external_data_configuration {
+    source_format = "NEWLINE_DELIMITED_JSON"
+    autodetect = false
+    source_uris= ["gs://${google_storage_bucket.test.name}/*"]
+
+    hive_partitioning_options {
+      mode = "CUSTOM"
+      source_uri_prefix = "gs://${google_storage_bucket.test.name}/{key1:STRING}"
+    }
+
+    schema = <
   ~>**NOTE**: Because this field expects a JSON string, any changes to the
   string will create a diff, even if the JSON itself hasn't changed.
   If the API returns a different value for the same schema, e.g.
  it
@@ -167,6 +163,20 @@ The `external_data_configuration` block supports:

 * `max_bad_records` (Optional) - The maximum number of bad records that
   BigQuery can ignore when reading data.

+* `schema` - (Optional) A JSON schema for the external table. Schema is required
+  for CSV and JSON formats if autodetect is not on. Schema is disallowed
+  for Google Cloud Bigtable, Cloud Datastore backups, Avro, ORC and Parquet formats.
+  A JSON schema for the table. Schema is required
+  for CSV and JSON formats and is disallowed for Google Cloud
+  Bigtable, Cloud Datastore backups, and Avro formats when using
+  external tables.
+  ~>**NOTE**: Because this field expects a JSON string, any changes to the
+  string will create a diff, even if the JSON itself hasn't changed.
+  Any diff on this schema will force the table to be recreated.
+  This schema is only applied when creating a table from an external
+  datasource, after creation the computed schema will be stored in
+  `google_bigquery_dataset.schema`
+
 * `source_format` (Required) - The data format.
   Supported values are: "CSV", "GOOGLE_SHEETS",
   "NEWLINE_DELIMITED_JSON", "AVRO", "PARQUET", and "DATSTORE_BACKUP". To use "GOOGLE_SHEETS"

From baa387de7845c4065b76e86e8d54f1548e374f3e Mon Sep 17 00:00:00 2001
From: Fai Fung
Date: Tue, 30 Jun 2020 09:20:03 +0200
Subject: [PATCH 2/4] fix linting error

---
 third_party/terraform/resources/resource_bigquery_table.go | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/third_party/terraform/resources/resource_bigquery_table.go b/third_party/terraform/resources/resource_bigquery_table.go
index c587a0c620c0..379db3718564 100644
--- a/third_party/terraform/resources/resource_bigquery_table.go
+++ b/third_party/terraform/resources/resource_bigquery_table.go
@@ -694,8 +694,6 @@ func resourceBigQueryTableRead(d *schema.ResourceData, meta interface{}) error {
 		return err
 	}

-	edc := map[string]interface{}(nil)
-
 	if v, ok := d.GetOk("external_data_configuration"); ok {
 		// The API response doesn't return the `external_data_configuration.schema`
 		// used when creating the table and it cannot be queried.
@@ -708,7 +706,7 @@ func resourceBigQueryTableRead(d *schema.ResourceData, meta interface{}) error {
 		// The `ForceNew=true` on `external_data_configuration.schema` meets
 		// the users' expectation that changing the configured input schema
 		// will recreate the resource.
- edc = v.([]interface{})[0].(map[string]interface{}) + edc := v.([]interface{})[0].(map[string]interface{}) if edc["schema"] != nil { externalDataConfiguration[0]["schema"] = edc["schema"] } From 54f372413f2421052da3eccc4e4dffdb43927bc4 Mon Sep 17 00:00:00 2001 From: Fai Fung Date: Tue, 30 Jun 2020 09:20:30 +0200 Subject: [PATCH 3/4] fix mixed indentation --- .../tests/resource_bigquery_table_test.go | 38 +++++++++---------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/third_party/terraform/tests/resource_bigquery_table_test.go b/third_party/terraform/tests/resource_bigquery_table_test.go index 236762c174f1..28777be94af0 100644 --- a/third_party/terraform/tests/resource_bigquery_table_test.go +++ b/third_party/terraform/tests/resource_bigquery_table_test.go @@ -505,7 +505,7 @@ resource "google_storage_bucket_object" "test" { } resource "google_bigquery_dataset" "test" { - dataset_id = "%s" + dataset_id = "%s" } resource "google_bigquery_table" "test" { @@ -513,16 +513,16 @@ resource "google_bigquery_table" "test" { dataset_id = google_bigquery_dataset.test.dataset_id external_data_configuration { - source_format = "CSV" - autodetect = true - source_uris= ["gs://${google_storage_bucket.test.name}/*"] + source_format = "CSV" + autodetect = true + source_uris= ["gs://${google_storage_bucket.test.name}/*"] - hive_partitioning_options { - mode = "AUTO" - source_uri_prefix = "gs://${google_storage_bucket.test.name}/" - } + hive_partitioning_options { + mode = "AUTO" + source_uri_prefix = "gs://${google_storage_bucket.test.name}/" + } - } + } depends_on = ["google_storage_bucket_object.test"] } `, bucketName, datasetID, tableID) @@ -537,12 +537,12 @@ resource "google_storage_bucket" "test" { resource "google_storage_bucket_object" "test" { name = "key1=20200330/data.json" - content = "{\"name\":\"test\", \"last_modification\":\"2020-04-01\"}" + content = "{\"name\":\"test\", \"last_modification\":\"2020-04-01\"}" bucket = google_storage_bucket.test.name } resource "google_bigquery_dataset" "test" { - dataset_id = "%s" + dataset_id = "%s" } resource "google_bigquery_table" "test" { @@ -550,16 +550,16 @@ resource "google_bigquery_table" "test" { dataset_id = google_bigquery_dataset.test.dataset_id external_data_configuration { - source_format = "NEWLINE_DELIMITED_JSON" - autodetect = false - source_uris= ["gs://${google_storage_bucket.test.name}/*"] + source_format = "NEWLINE_DELIMITED_JSON" + autodetect = false + source_uris= ["gs://${google_storage_bucket.test.name}/*"] - hive_partitioning_options { - mode = "CUSTOM" - source_uri_prefix = "gs://${google_storage_bucket.test.name}/{key1:STRING}" - } + hive_partitioning_options { + mode = "CUSTOM" + source_uri_prefix = "gs://${google_storage_bucket.test.name}/{key1:STRING}" + } - schema = < Date: Thu, 2 Jul 2020 09:20:49 +0200 Subject: [PATCH 4/4] improve on documentation --- .../website/docs/r/bigquery_table.html.markdown | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/third_party/terraform/website/docs/r/bigquery_table.html.markdown b/third_party/terraform/website/docs/r/bigquery_table.html.markdown index 4a86cf3d4d7f..5a98205d2c94 100644 --- a/third_party/terraform/website/docs/r/bigquery_table.html.markdown +++ b/third_party/terraform/website/docs/r/bigquery_table.html.markdown @@ -166,16 +166,14 @@ The `external_data_configuration` block supports: * `schema` - (Optional) A JSON schema for the external table. Schema is required for CSV and JSON formats if autodetect is not on. 
Schema is disallowed
   for Google Cloud Bigtable, Cloud Datastore backups, Avro, ORC and Parquet formats.
-  A JSON schema for the table. Schema is required
-  for CSV and JSON formats and is disallowed for Google Cloud
-  Bigtable, Cloud Datastore backups, and Avro formats when using
-  external tables.
   ~>**NOTE**: Because this field expects a JSON string, any changes to the
   string will create a diff, even if the JSON itself hasn't changed.
-  Any diff on this schema will force the table to be recreated.
-  This schema is only applied when creating a table from an external
+  Furthermore, drift for this field cannot be detected because BigQuery
+  only uses this schema to compute the effective schema for the table; therefore
+  any change to the configured value will force the table to be recreated.
+  This schema is effectively only applied when creating a table from an external
   datasource, after creation the computed schema will be stored in
-  `google_bigquery_dataset.schema`
+  `google_bigquery_table.schema`

 * `source_format` (Required) - The data format.
   Supported values are: "CSV", "GOOGLE_SHEETS",
   "NEWLINE_DELIMITED_JSON", "AVRO", "PARQUET",
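
A quick usage sketch for reviewers: the HCL below shows how the `external_data_configuration.schema` field added by this series would be written in practice. It is loosely modeled on the acceptance test above; the dataset and table names, bucket path, and the two-field schema are illustrative assumptions, not part of the patches.

```hcl
# Hypothetical example config; all names and paths are placeholders.
resource "google_bigquery_dataset" "example" {
  dataset_id = "example_dataset"
}

resource "google_bigquery_table" "example" {
  dataset_id = google_bigquery_dataset.example.dataset_id
  table_id   = "example_table"

  external_data_configuration {
    source_format = "NEWLINE_DELIMITED_JSON"
    # With an explicit schema, autodetect is switched off so BigQuery
    # uses the declared types instead of inferring them.
    autodetect  = false
    source_uris = ["gs://example-bucket/*"]

    # New in this series: an explicit schema for the external data.
    # The field is ForceNew, so editing it recreates the table.
    schema = <<EOF
[
  {
    "name": "name",
    "type": "STRING"
  },
  {
    "name": "last_modification",
    "type": "DATE"
  }
]
EOF
  }
}
```

Because the API never returns this value, the provider copies the configured schema back into state on read (patch 1) rather than diffing against the computed top-level `schema`.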