🎉 BigQuery destination mlp #11238

Merged 21 commits on Mar 23, 2022
@@ -17,5 +17,5 @@ ENV ENABLE_SENTRY true

COPY --from=build /airbyte /airbyte

LABEL io.airbyte.version=0.6.11
LABEL io.airbyte.version=0.6.12
LABEL io.airbyte.name=airbyte/destination-bigquery
@@ -8,12 +8,13 @@
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "BigQuery Destination Spec",
"type": "object",
"required": ["project_id", "dataset_id"],
"required": ["project_id", "dataset_id", "big_query_client_buffer_size_mb", "dataset_location",
"transformation_priority", "loading_method"],
"additionalProperties": true,
"properties": {
"big_query_client_buffer_size_mb": {
"title": "Google BigQuery client chunk size",
"description": "Google BigQuery client's chunk(buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MiB value is used if not set explicitly. It's recommended to decrease value for big data sets migration for less HEAP memory consumption and avoiding crashes. For more details refer to https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html",
"title": "Google BigQuery Client Chunk Size",
"description": "Google BigQuery client's chunk (buffer) size (MIN=1, MAX = 15) for each table. The size that will be written by a single RPC. Written data will be buffered and only flushed upon reaching this size or closing the channel. The default 15MB value is used if not set explicitly. Read more <a href=\"https://googleapis.dev/python/bigquery/latest/generated/google.cloud.bigquery.client.Client.html\">here</a>.",
"type": "integer",
"minimum": 1,
"maximum": 15,
@@ -22,17 +23,17 @@
},
"project_id": {
"type": "string",
"description": "The GCP project ID for the project containing the target BigQuery dataset.",
"description": "The GCP project ID for the project containing the target BigQuery dataset. Read more <a href=\"https://cloud.google.com/iam/docs/creating-managing-service-accounts#creating\">here</a>.",
"title": "Project ID"
},
"dataset_id": {
"type": "string",
"description": "Default BigQuery Dataset ID tables are replicated to if the source does not specify a namespace.",
"description": "The default BigQuery Dataset ID that tables are replicated to if the source does not specify a namespace. Read more <a href=\"https://cloud.google.com/bigquery/docs/datasets#create-dataset\">here</a>.",
"title": "Default Dataset ID"
},
"dataset_location": {
"type": "string",
"description": "The location of the dataset. Warning: Changes made after creation will not be applied.",
"description": "The location of the dataset. Warning: Changes made after creation will not be applied. Read more <a href=\"https://cloud.google.com/bigquery/docs/locations\">here</a>.",
"title": "Dataset Location",
"default": "US",
"enum": [
@@ -71,26 +72,25 @@
},
"credentials_json": {
"type": "string",
"description": "The contents of the JSON service account key. Check out the <a href=\"https://docs.airbyte.io/integrations/destinations/bigquery\">docs</a> if you need help generating this key. Default credentials will be used if this field is left empty.",
"title": "Credentials JSON",
"description": "The contents of the JSON service account key. Check out the <a href=\"https://docs.airbyte.com/integrations/destinations/bigquery#service-account-key\">docs</a> if you need help generating this key. Default credentials will be used if this field is left empty.",
"title": "Credentials JSON (Optional)",
"airbyte_secret": true
},
"transformation_priority": {
"type": "string",
"description": "Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don’t count towards your concurrent rate limit.",
"description": "Interactive run type means that the query is executed as soon as possible, and these queries count towards concurrent rate limit and daily limit. Read more about interactive run type <a href=\"https://cloud.google.com/bigquery/docs/running-queries#queries\">here</a>. Batch queries are queued and started as soon as idle resources are available in the BigQuery shared resource pool, which usually occurs within a few minutes. Batch queries don’t count towards your concurrent rate limit. Read more about batch queries <a href=\"https://cloud.google.com/bigquery/docs/running-queries#batch\">here</a>.",
"title": "Transformation Query Run Type",
"default": "interactive",
"enum": ["interactive", "batch"]
},
"loading_method": {
"type": "object",
"title": "Loading Method",
"description": "Loading method used to send select the way data will be uploaded to BigQuery.",
"description": "Loading method used to send select the way data will be uploaded to BigQuery. <br><b>Standard Inserts</b> - Direct uploading using streams. <br><b>GCS Staging</b> - Writes large batches of records to a file, uploads the file to GCS, then uses <b>COPY INTO table</b> to upload the file. Recommended for large production workloads for better speed and scalability. Read more about GCS Staging <a href=\"https://docs.airbyte.com/integrations/destinations/bigquery#gcs-staging\">here</a>.",
"oneOf": [
{
"title": "Standard Inserts",
"additionalProperties": false,
"description": "Direct uploading using streams.",
"required": ["method"],
"properties": {
"method": {
@@ -102,11 +102,12 @@
{
"title": "GCS Staging",
"additionalProperties": false,
"description": "Writes large batches of records to a file, uploads the file to GCS, then uses <pre>COPY INTO table</pre> to upload the file. Recommended for large production workloads for better speed and scalability.",
"required": [
"method",
"gcs_bucket_name",
"gcs_bucket_path",
"part_size_mb",
"keep_files_in_gcs-bucket",
"credential"
],
"properties": {
@@ -117,16 +118,17 @@
"gcs_bucket_name": {
"title": "GCS Bucket Name",
"type": "string",
"description": "The name of the GCS bucket.",
"description": "The name of the GCS bucket. Read more <a href=\"https://cloud.google.com/storage/docs/naming-buckets\">here</a>.",
"examples": ["airbyte_sync"]
},
"gcs_bucket_path": {
"title": "GCS Bucket Path",
"description": "Directory under the GCS bucket where data will be written.",
"type": "string",
"examples": ["data_sync/test"]
},
"part_size_mb": {
"title": "Block Size (MB) for GCS multipart upload",
"title": "Block Size (MB) for GCS Multipart Upload",
"description": "This is the size of a \"Part\" being buffered in memory. It limits the memory usage when writing. Larger values will allow to upload a bigger files and improve the speed, but consumes more memory. Allowed values: min=5MB, max=525MB Default: 5MB.",
"type": "integer",
"default": 5,
@@ -137,7 +139,7 @@
"keep_files_in_gcs-bucket": {
"type": "string",
"description": "This upload method is supposed to temporary store records in GCS bucket. What do you want to do with data in GCS bucket when migration has finished?",
"title": "GCS tmp files afterward processing",
"title": "GCS Tmp Files Afterward Processing",
"default": "Delete all tmp files from GCS",
"enum": [
"Delete all tmp files from GCS",
@@ -146,6 +148,7 @@
},
"credential": {
"title": "Credential",
"description": "An HMAC key is a type of credential and can be associated with a service account or a user account in Cloud Storage. Read more <a href=\"https://cloud.google.com/storage/docs/authentication/hmackeys\">here</a>.",
"type": "object",
"oneOf": [
{
129 changes: 91 additions & 38 deletions docs/integrations/destinations/bigquery.md
@@ -6,7 +6,24 @@ description: >-

# BigQuery

## Features
## Overview

The Airbyte BigQuery destination allows you to sync data to BigQuery.

### Sync overview

#### Output schema

Each stream will be output into its own table in BigQuery. Each table will contain 3 columns:

* `_airbyte_ab_id`: a uuid assigned by Airbyte to each event that is processed. The column type in BigQuery is `String`.
* `_airbyte_emitted_at`: a timestamp representing when the event was pulled from the data source. The column type in BigQuery is `Timestamp`.
* `_airbyte_data`: a JSON blob representing the event data. The column type in BigQuery is `String`.

The output tables from the BigQuery destination are partitioned and clustered by the time-unit column `_airbyte_emitted_at` at a daily granularity. Partition boundaries are based on UTC time.
This is useful for limiting the number of partitions scanned when querying these partitioned tables, by using a predicate filter (a WHERE clause). Filters on the partitioning column are used to prune the partitions and reduce the query cost. (The "Require partition filter" parameter is not enabled by Airbyte, but you may toggle it by updating the produced tables if you wish.)
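
For example, here is a minimal query sketch (assuming the `google-cloud-bigquery` Python package; the project, dataset, table, and date range below are placeholders) showing how a predicate filter on `_airbyte_emitted_at` keeps BigQuery from scanning partitions outside the filtered range:

```python
# Sketch only: the project, dataset, and table names below are placeholders.
from google.cloud import bigquery

client = bigquery.Client(project="my-project")

query = """
    SELECT _airbyte_ab_id, _airbyte_emitted_at, _airbyte_data
    FROM `my-project.airbyte_dataset._airbyte_raw_users`
    WHERE _airbyte_emitted_at >= TIMESTAMP("2022-03-01")
      AND _airbyte_emitted_at <  TIMESTAMP("2022-03-08")
"""

# Only the daily partitions for 2022-03-01 through 2022-03-07 are scanned and billed.
for row in client.query(query).result():
    print(row["_airbyte_ab_id"], row["_airbyte_emitted_at"])
```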

#### Features

| Feature | Supported?\(Yes/No\) | Notes |
| :--- | :--- | :--- |
@@ -21,31 +38,16 @@ There are two flavors of connectors for this destination:
1. Bigquery: This connector produces the standard Airbyte outputs, using `_airbyte_raw_*` tables to store the JSON blob data first. Afterward, these are transformed and normalized into separate tables, potentially "exploding" nested streams into their own tables if [basic normalization](../../understanding-airbyte/basic-normalization.md) is configured.
2. `Bigquery (Denormalized)`: Instead of splitting the final data into multiple tables, this destination leverages BigQuery capabilities with [Structured and Repeated fields](https://cloud.google.com/bigquery/docs/nested-repeated) to produce a single "big" table per stream. This does not write the `_airbyte_raw_*` tables in the destination and normalization from this connector is not supported at this time.

## Troubleshooting

Check out common troubleshooting issues for the BigQuery destination connector on our Discourse [here](https://discuss.airbyte.io/tags/c/connector/11/destination-bigquery).

## Output Schema for BigQuery

Each stream will be output into its own table in BigQuery. Each table will contain 3 columns:

* `_airbyte_ab_id`: a uuid assigned by Airbyte to each event that is processed. The column type in BigQuery is `String`.
* `_airbyte_emitted_at`: a timestamp representing when the event was pulled from the data source. The column type in BigQuery is `Timestamp`.
* `_airbyte_data`: a json blob representing with the event data. The column type in BigQuery is `String`.

The output tables from the BigQuery destination are partitioned and clustered by the Time-unit column `_airbyte_emitted_at` at a daily granularity. Partitions boundaries are based on UTC time.
This is useful to limit the number of partitions scanned when querying these partitioned tables, by using a predicate filter (a WHERE clause). Filters on the partitioning column will be used to prune the partitions and reduce the query cost. (The parameter "Require partition filter" is not enabled by Airbyte, but you may toggle this by updating the produced tables if you wish so)

## Getting Started \(Airbyte Open-Source / Airbyte Cloud\)

#### Requirements

To use the BigQuery destination, you'll need:

* A Google Cloud Project with BigQuery enabled
* A BigQuery Dataset into which Airbyte can sync your data
* A Google Cloud Service Account with the "BigQuery User" and "BigQuery Data Editor" roles in your GCP project
* A Service Account Key to authenticate into your Service Account
* [A Google Cloud Project with BigQuery enabled](https://docs.airbyte.com/integrations/destinations/bigquery#google-cloud-project)
* [A BigQuery Dataset into which Airbyte can sync your data](https://docs.airbyte.com/integrations/destinations/bigquery#bigquery-dataset-for-airbyte-syncs)
* [A Google Cloud Service Account with the "BigQuery User" and "BigQuery Data Editor" roles in your GCP project](https://docs.airbyte.com/integrations/destinations/bigquery#service-account)
* [A Service Account Key to authenticate into your Service Account](https://docs.airbyte.com/integrations/destinations/bigquery#service-account-key)

For GCS Staging upload mode:

@@ -72,7 +74,57 @@ Note that queries written in BigQuery can only reference Datasets in the same ph

#### Service account

In order for Airbyte to sync data into BigQuery, it needs credentials for a [Service Account](https://cloud.google.com/iam/docs/service-accounts) with the "BigQuery User" and "BigQuery Data Editor" roles, which grants permissions to run BigQuery jobs, write to BigQuery Datasets, and read table metadata. We highly recommend that this Service Account is exclusive to Airbyte for ease of permissioning and auditing. However, you can use a pre-existing Service Account if you already have one with the correct permissions.
In order for Airbyte to sync data into BigQuery, it needs credentials for a [Service Account](https://cloud.google.com/iam/docs/service-accounts) with the "BigQuery User" (roles/bigquery.user) and "BigQuery Data Editor" (roles/bigquery.dataEditor) roles, which grant permissions to run BigQuery jobs, write to BigQuery Datasets, and read table metadata. You can read more about BigQuery roles and permissions [here](https://cloud.google.com/bigquery/docs/access-control).
We highly recommend that this Service Account be exclusive to Airbyte for ease of permissioning and auditing. However, you can use a pre-existing Service Account if you already have one with the correct permissions. The "BigQuery User" (roles/bigquery.user) role grants the following permissions:
```
bigquery.bireservations.get
bigquery.capacityCommitments.get
bigquery.capacityCommitments.list
bigquery.config.get
bigquery.datasets.create
bigquery.datasets.get
bigquery.datasets.getIamPolicy
bigquery.jobs.create
bigquery.jobs.list
bigquery.models.list
bigquery.readsessions.*
bigquery.reservationAssignments.list
bigquery.reservationAssignments.search
bigquery.reservations.get
bigquery.reservations.list
bigquery.routines.list
bigquery.savedqueries.get
bigquery.savedqueries.list
bigquery.tables.list
bigquery.transfers.get
resourcemanager.projects.get
resourcemanager.projects.list
```

"BigQuery Data Editor"(roles/bigquery.dataEditor) role permissions:
```
bigquery.config.get
bigquery.datasets.create
bigquery.datasets.get
bigquery.datasets.getIamPolicy
bigquery.datasets.updateTag
bigquery.models.*
bigquery.routines.*
bigquery.tables.create
bigquery.tables.createSnapshot
bigquery.tables.delete
bigquery.tables.export
bigquery.tables.get
bigquery.tables.getData
bigquery.tables.getIamPolicy
bigquery.tables.list
bigquery.tables.restoreSnapshot
bigquery.tables.update
bigquery.tables.updateData
bigquery.tables.updateTag
resourcemanager.projects.get
resourcemanager.projects.list
```

The easiest way to create a Service Account is to follow GCP's guide for [Creating a Service Account](https://cloud.google.com/iam/docs/creating-managing-service-accounts). Once you've created the Service Account, make sure to keep its ID handy as you will need to reference it when granting roles. Service Account IDs typically take the form `<account-name>@<project-name>.iam.gserviceaccount.com`
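
As an optional smoke test after granting the roles, here is a minimal sketch (assuming the `google-cloud-bigquery` Python package and a downloaded key file named `service_account_key.json`; the dataset and table names are placeholders) that checks the account can create a dataset, run a job, and create a table, which is what the two roles above are meant to allow:

```python
# Sketch only: verify the Service Account's BigQuery permissions with its JSON key.
from google.cloud import bigquery

client = bigquery.Client.from_service_account_json("service_account_key.json")

# "BigQuery User" includes bigquery.datasets.create and bigquery.jobs.create.
dataset = client.create_dataset("airbyte_permission_check", exists_ok=True)
rows = client.query("SELECT 1 AS ok").result()
print(f"Dataset {dataset.dataset_id} is ready, test query returned {list(rows)[0].ok}")

# "BigQuery Data Editor" includes bigquery.tables.create.
table = bigquery.Table(f"{client.project}.{dataset.dataset_id}.permission_check")
client.create_table(table, exists_ok=True)
print("Table creation succeeded; the Service Account has the permissions Airbyte needs.")
```

The test dataset can be deleted from the BigQuery console afterwards.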

@@ -100,7 +152,23 @@ Additional options can also be customized:

Once you've configured BigQuery as a destination, delete the Service Account Key from your computer.

## Uploading Options
## Notes about BigQuery Naming Conventions

From [BigQuery Datasets Naming](https://cloud.google.com/bigquery/docs/datasets#dataset-naming):

When you create a dataset in BigQuery, the dataset name must be unique for each project. The dataset name can contain the following:

* Up to 1,024 characters.
* Letters \(uppercase or lowercase\), numbers, and underscores.

Note: In the Cloud Console, datasets that begin with an underscore are hidden from the navigation pane. You can query tables and views in these datasets even though these datasets aren't visible.

* Dataset names are case-sensitive: mydataset and MyDataset can coexist in the same project.
* Dataset names cannot contain spaces or special characters such as -, &, @, or %.

Therefore, the Airbyte BigQuery destination will convert any invalid characters into '\_' characters when writing data.
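
As a rough illustration only (the connector's actual sanitization logic lives in its own code and may differ in detail), the substitution amounts to something like the following Python sketch:

```python
import re

def sanitize_dataset_name(name: str) -> str:
    # Keep letters, digits, and underscores; replace everything else with "_",
    # and respect BigQuery's 1,024-character limit on dataset names.
    return re.sub(r"[^A-Za-z0-9_]", "_", name)[:1024]

print(sanitize_dataset_name("my-dataset@2022"))  # -> my_dataset_2022
```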

## Loading Method

There are two available options to upload data to BigQuery: `Standard` and `GCS Staging`.

@@ -131,28 +199,13 @@ This is the recommended configuration for uploading data to BigQuery. It works b
### `Standard` uploads
This uploads data directly from your source to BigQuery. While this is faster to set up initially, **we strongly recommend that you do not use this option for anything other than a quick demo**. It is more than 10x slower than the GCS uploading option and will fail for many datasets. Please be aware that you may see failures for big datasets and slow sources, e.g. if reading from the source takes more than 10-12 hours. This is caused by Google BigQuery SDK client limitations. For more details, please check [https://github.com/airbytehq/airbyte/issues/3549](https://github.com/airbytehq/airbyte/issues/3549)

## Naming Conventions

From [BigQuery Datasets Naming](https://cloud.google.com/bigquery/docs/datasets#dataset-naming):

When you create a dataset in BigQuery, the dataset name must be unique for each project. The dataset name can contain the following:

* Up to 1,024 characters.
* Letters \(uppercase or lowercase\), numbers, and underscores.

Note: In the Cloud Console, datasets that begin with an underscore are hidden from the navigation pane. You can query tables and views in these datasets even though these datasets aren't visible.

* Dataset names are case-sensitive: mydataset and MyDataset can coexist in the same project.
* Dataset names cannot contain spaces or special characters such as -, &, @, or %.

Therefore, Airbyte BigQuery destination will convert any invalid characters into '\_' characters when writing data.

## CHANGELOG

### bigquery

| Version | Date | Pull Request | Subject |
|:--------| :--- | :--- | :--- |
| 0.6.12 | 2022-03-18 | [11238](https://github.com/airbytehq/airbyte/pull/11238) | Updated spec and documentation |
| 0.6.11 | 2022-03-03 | [10755](https://github.com/airbytehq/airbyte/pull/10755) | Make sure to kill children threads and stop JVM |
| 0.6.8 | 2022-02-14 | [10256](https://github.com/airbytehq/airbyte/pull/10256) | Add `-XX:+ExitOnOutOfMemoryError` JVM option |
| 0.6.6 | 2022-02-01 | [\#9959](https://github.com/airbytehq/airbyte/pull/9959) | Fix null pointer exception from buffered stream consumer. |