From 7a9c95d583612b7241712456b2ef1f7d75f08209 Mon Sep 17 00:00:00 2001
From: Steffany Brown <30247553+steffnay@users.noreply.github.com>
Date: Wed, 27 Oct 2021 14:09:08 -0700
Subject: [PATCH] docs(samples): add create external table with hive partitioning (#1033)

* docs(samples): add create table hive partitioning sample

* refactor
---
 .../create_table_external_hive_partitioned.py | 84 +++++++++++++++++++
 ...te_table_external_hive_partitioned_test.py | 32 +++++++
 2 files changed, 116 insertions(+)
 create mode 100644 samples/snippets/create_table_external_hive_partitioned.py
 create mode 100644 samples/snippets/create_table_external_hive_partitioned_test.py

diff --git a/samples/snippets/create_table_external_hive_partitioned.py b/samples/snippets/create_table_external_hive_partitioned.py
new file mode 100644
index 000000000..2ff8a2220
--- /dev/null
+++ b/samples/snippets/create_table_external_hive_partitioned.py
@@ -0,0 +1,84 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def create_table_external_hive_partitioned(table_id: str):
+    """Create an external Parquet table that uses hive partitioning.
+
+    Args:
+        table_id: Fully qualified ID of the table to create, written as
+            ``project.dataset.table``.
+
+    Returns:
+        The table object returned by ``client.create_table``.
+    """
+    # The region-tagged snippet below overwrites table_id with a placeholder;
+    # keep the caller's value so it can be restored outside the tagged region.
+    original_table_id = table_id
+    # [START bigquery_create_table_external_hivepartitioned]
+    # Demonstrates creating an external table with hive partitioning.
+
+    # TODO(developer): Set table_id to the ID of the table to create.
+    table_id = "your-project.your_dataset.your_table_name"
+
+    # TODO(developer): Set source uri.
+    # Example file:
+    # gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/dt=2020-11-15/file1.parquet
+    uri = "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/*"
+
+    # TODO(developer): Set source uri prefix.
+    source_uri_prefix = (
+        "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    )
+
+    # [END bigquery_create_table_external_hivepartitioned]
+    table_id = original_table_id
+    # [START bigquery_create_table_external_hivepartitioned]
+    from google.cloud import bigquery
+
+    # Construct a BigQuery client object.
+    client = bigquery.Client()
+
+    # Configure the external data source.
+    external_config = bigquery.ExternalConfig("PARQUET")
+    external_config.source_uris = [uri]
+    external_config.autodetect = True
+
+    # Configure partitioning options.
+    hive_partitioning_opts = bigquery.external_config.HivePartitioningOptions()
+
+    # The layout of the files in here is compatible with the layout requirements for hive partitioning,
+    # so we can add an optional Hive partitioning configuration to leverage the object paths for deriving
+    # partitioning column information.
+
+    # For more information on how partitions are extracted, see:
+    # https://cloud.google.com/bigquery/docs/hive-partitioned-queries-gcs
+
+    # We have a "/dt=YYYY-MM-DD/" path component in our example files as documented above.
+    # Autolayout will expose this as a column named "dt" of type DATE.
+    hive_partitioning_opts.mode = "AUTO"
+    hive_partitioning_opts.require_partition_filter = True
+    hive_partitioning_opts.source_uri_prefix = source_uri_prefix
+
+    external_config.hive_partitioning = hive_partitioning_opts
+
+    table = bigquery.Table(table_id)
+    table.external_data_configuration = external_config
+
+    table = client.create_table(table)  # Make an API request.
+    print(
+        "Created table {}.{}.{}".format(table.project, table.dataset_id, table.table_id)
+    )
+    # [END bigquery_create_table_external_hivepartitioned]
+    return table
diff --git a/samples/snippets/create_table_external_hive_partitioned_test.py b/samples/snippets/create_table_external_hive_partitioned_test.py
new file mode 100644
index 000000000..c3cdddb55
--- /dev/null
+++ b/samples/snippets/create_table_external_hive_partitioned_test.py
@@ -0,0 +1,32 @@
+# Copyright 2021 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import create_table_external_hive_partitioned
+
+
+def test_create_table_external_hive_partitioned(capsys, random_table_id):
+    """Verify the sample creates a table with the expected hive partitioning."""
+    table = create_table_external_hive_partitioned.create_table_external_hive_partitioned(
+        random_table_id
+    )
+
+    out, _ = capsys.readouterr()
+    hive_partitioning = table.external_data_configuration.hive_partitioning
+    assert "Created table {}".format(random_table_id) in out
+    assert (
+        hive_partitioning.source_uri_prefix
+        == "gs://cloud-samples-data/bigquery/hive-partitioning-samples/autolayout/"
+    )
+    assert hive_partitioning.require_partition_filter is True
+    assert hive_partitioning.mode == "AUTO"