From 0a938c318c4b8c56a57dd9dc703304428b41a09e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poniedzia=C5=82ek?= Date: Mon, 13 May 2024 10:49:51 +0200 Subject: [PATCH] Mooore BQ docs changes --- .../bigquery-loader/_diagram.md | 3 +- .../bigquery-loader/index.md | 2 - .../bigquery-loader-1.x/_diagram.md | 2 +- .../bigquery-loader-1.x/index.md | 40 ++++++++--------- .../2-0-0-upgrade-guide/index.md | 45 +------------------ .../storing-querying/loading-process/index.md | 1 + .../schemas-in-warehouse/index.md | 4 +- src/componentVersions.js | 1 + 8 files changed, 28 insertions(+), 70 deletions(-) diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/_diagram.md b/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/_diagram.md index 35c1e99a29..a21e963fa2 100644 --- a/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/_diagram.md +++ b/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/_diagram.md @@ -2,8 +2,7 @@ import Mermaid from '@theme/Mermaid'; import Link from '@docusaurus/Link'; ``` - -

The BigQuery Streaming Loader on {props.cloud} is a fully streaming application that continually pulls events from {props.stream} and writes to BigQuery using the BigQuery Storage API.

+

The BigQuery Streaming Loader on {props.cloud} is a fully streaming application that continually pulls events from {props.stream} and writes to BigQuery using the BigQuery Storage API.

diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/_diagram.md b/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/_diagram.md index 8d39607656..d5c9778a0c 100644 --- a/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/_diagram.md +++ b/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/_diagram.md @@ -1,4 +1,4 @@ -At the high level, BigQuery loader reads enriched Snowplow events in real time and loads them in BigQuery using the Storage Write API. +At the high level, BigQuery loader reads enriched Snowplow events in real time and loads them in BigQuery using the [legacy streaming API](https://cloud.google.com/bigquery/docs/streaming-data-into-bigquery). ```mermaid flowchart LR diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/index.md b/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/index.md index dd4aae832d..88eed982ba 100644 --- a/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/index.md +++ b/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/index.md @@ -6,7 +6,7 @@ sidebar_position: 0 ```mdx-code-block import {versions} from '@site/src/componentVersions'; import CodeBlock from '@theme/CodeBlock'; -import Diagram from '@site/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/_diagram.md'; +import Diagram from '@site/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/previous-versions/bigquery-loader-1.x/_diagram.md'; ``` 
Under the umbrella of Snowplow BigQuery Loader, we have a family of applications that can be used to load enriched Snowplow data into BigQuery. @@ -147,7 +147,7 @@ The loader takes command line arguments `--config` with a path to the configurat { `docker run \\ -v /path/to/configs:/configs \\ - snowplow/snowplow-bigquery-streamloader:1.7.1 \\ + snowplow/snowplow-bigquery-streamloader:${versions.bqLoader1x} \\ --config=/configs/bigquery.hocon \\ --resolver=/configs/resolver.json `} @@ -157,7 +157,7 @@ Or you can pass the whole config as a base64-encoded string using the `--config` { `docker run \\ -v /path/to/resolver.json:/resolver.json \\ - snowplow/snowplow-bigquery-streamloader:1.7.1 \\ + snowplow/snowplow-bigquery-streamloader:${versions.bqLoader1x} \\ --config=ewogICJwcm9qZWN0SWQiOiAiY29tLWFjbWUiCgogICJsb2FkZXIiOiB7CiAgICAiaW5wdXQiOiB7CiAgICAgICJzdWJzY3JpcHRpb24iOiAiZW5yaWNoZWQtc3ViIgogICAgfQoKICAgICJvdXRwdXQiOiB7CiAgICAgICJnb29kIjogewogICAgICAgICJkYXRhc2V0SWQiOiAic25vd3Bsb3ciCiAgICAgICAgInRhYmxlSWQiOiAiZXZlbnRzIgogICAgICB9CgogICAgICAiYmFkIjogewogICAgICAgICJ0b3BpYyI6ICJiYWQtdG9waWMiCiAgICAgIH0KCiAgICAgICJ0eXBlcyI6IHsKICAgICAgICAidG9waWMiOiAidHlwZXMtdG9waWMiCiAgICAgIH0KCiAgICAgICJmYWlsZWRJbnNlcnRzIjogewogICAgICAgICJ0b3BpYyI6ICJmYWlsZWQtaW5zZXJ0cy10b3BpYyIKICAgICAgfQogICAgfQogIH0KCiAgIm11dGF0b3IiOiB7CiAgICAiaW5wdXQiOiB7CiAgICAgICJzdWJzY3JpcHRpb24iOiAidHlwZXMtc3ViIgogICAgfQoKICAgICJvdXRwdXQiOiB7CiAgICAgICJnb29kIjogJHtsb2FkZXIub3V0cHV0Lmdvb2R9ICMgd2lsbCBiZSBhdXRvbWF0aWNhbGx5IGluZmVycmVkCiAgICB9CiAgfQoKICAicmVwZWF0ZXIiOiB7CiAgICAiaW5wdXQiOiB7CiAgICAgICJzdWJzY3JpcHRpb24iOiAiZmFpbGVkLWluc2VydHMtc3ViIgogICAgfQoKICAgICJvdXRwdXQiOiB7CiAgICAgICJnb29kIjogJHtsb2FkZXIub3V0cHV0Lmdvb2R9ICMgd2lsbCBiZSBhdXRvbWF0aWNhbGx5IGluZmVycmVkCgogICAgICAiZGVhZExldHRlcnMiOiB7CiAgICAgICAgImJ1Y2tldCI6ICJnczovL2RlYWQtbGV0dGVyLWJ1Y2tldCIKICAgICAgfQogICAgfQogIH0KCiAgIm1vbml0b3JpbmciOiB7fSAjIGRpc2FibGVkCn0= \\ --resolver=/resolver.json `} @@ -169,7 +169,7 @@ For example, to override the 
`repeater.input.subscription` setting using system { `docker run \\ -v /path/to/configs:/configs \\ - snowplow/snowplow-bigquery-streamloader:1.7.1 \\ + snowplow/snowplow-bigquery-streamloader:${versions.bqLoader1x} \\ --config=/configs/bigquery.hocon \\ --resolver=/configs/resolver.json \\ -Drepeater.input.subscription="failed-inserts-sub" @@ -180,7 +180,7 @@ Or to use environment variables for every setting: { `docker run \\ -v /path/to/resolver.json:/resolver.json \\ - snowplow/snowplow-bigquery-repeater:1.7.1 \\ + snowplow/snowplow-bigquery-repeater:${versions.bqLoader1x} \\ --resolver=/resolver.json \\ -Dconfig.override_with_env_vars=true `} @@ -197,7 +197,7 @@ StreamLoader accepts `--config` and `--resolver` arguments, as well as any JVM s { `docker run \\ -v /path/to/configs:/configs \\ - snowplow/snowplow-bigquery-streamloader:1.7.1 \\ + snowplow/snowplow-bigquery-streamloader:${versions.bqLoader1x} \\ --config=/configs/bigquery.hocon \\ --resolver=/configs/resolver.json \\ -Dconfig.override_with_env_vars=true @@ -212,7 +212,7 @@ The Dataflow Loader accepts the same two arguments as StreamLoader and [any oth { `docker run \\ -v /path/to/configs:/configs \\ - snowplow/snowplow-bigquery-loader:1.7.1 \\ + snowplow/snowplow-bigquery-loader:${versions.bqLoader1x} \\ --config=/configs/bigquery.hocon \\ --resolver=/configs/resolver.json \\ --labels={"key1":"val1","key2":"val2"} # optional Dataflow args @@ -233,7 +233,7 @@ Mutator has three subcommands: `listen`, `create` and `add-column`. { `docker run \\ -v /path/to/configs:/configs \\ - snowplow/snowplow-bigquery-mutator:1.7.1 \\ + snowplow/snowplow-bigquery-mutator:${versions.bqLoader1x} \\ listen \\ --config=/configs/bigquery.hocon \\ --resolver=/configs/resolver.json \\ @@ -247,7 +247,7 @@ Mutator has three subcommands: `listen`, `create` and `add-column`. 
{ `docker run \\ -v /path/to/configs:/configs \\ - snowplow/snowplow-bigquery-mutator:1.7.1 \\ + snowplow/snowplow-bigquery-mutator:${versions.bqLoader1x} \\ add-column \\ --config=/configs/bigquery.hocon \\ --resolver=/configs/resolver.json \\ @@ -264,7 +264,7 @@ The specified schema must be present in one of the Iglu registries in the resolv { `docker run \\ -v /path/to/configs:/configs \\ - snowplow/snowplow-bigquery-mutator:1.7.1 \\ + snowplow/snowplow-bigquery-mutator:${versions.bqLoader1x} \\ create \\ --config=/configs/bigquery.hocon \\ --resolver=/configs/resolver.json \\ @@ -281,7 +281,7 @@ We recommend constantly running Repeater on a small / cheap node or Docker conta { `docker run \\ -v /path/to/configs:/configs \\ - snowplow/snowplow-bigquery-repeater:1.7.1 \\ + snowplow/snowplow-bigquery-repeater:${versions.bqLoader1x} \\ --config=/configs/bigquery.hocon \\ --resolver=/configs/resolver.json \\ --bufferSize=20 \\ # size of the batch to send to the dead-letter bucket @@ -297,19 +297,19 @@ We recommend constantly running Repeater on a small / cheap node or Docker conta All applications are available as Docker images on Docker Hub, based on Ubuntu Focal and OpenJDK 11: { -`$ docker pull snowplow/snowplow-bigquery-streamloader:1.7.1 -$ docker pull snowplow/snowplow-bigquery-loader:1.7.1 -$ docker pull snowplow/snowplow-bigquery-mutator:1.7.1 -$ docker pull snowplow/snowplow-bigquery-repeater:1.7.1 +`$ docker pull snowplow/snowplow-bigquery-streamloader:${versions.bqLoader1x} +$ docker pull snowplow/snowplow-bigquery-loader:${versions.bqLoader1x} +$ docker pull snowplow/snowplow-bigquery-mutator:${versions.bqLoader1x} +$ docker pull snowplow/snowplow-bigquery-repeater:${versions.bqLoader1x} `} -

We also provide an alternative lightweight set of images based on Google's "distroless" base image, which may provide some security advantages for carrying fewer dependencies. These images are distinguished with the {`1.7.1-distroless`} tag:

+

We also provide an alternative lightweight set of images based on Google's "distroless" base image, which may provide some security advantages by carrying fewer dependencies. These images are distinguished with the <code>{`${versions.bqLoader1x}-distroless`}</code> tag:

{ -`$ docker pull snowplow/snowplow-bigquery-streamloader:1.7.1-distroless -$ docker pull snowplow/snowplow-bigquery-loader:1.7.1-distroless -$ docker pull snowplow/snowplow-bigquery-mutator:1.7.1-distroless -$ docker pull snowplow/snowplow-bigquery-repeater:1.7.1-distroless +`$ docker pull snowplow/snowplow-bigquery-streamloader:${versions.bqLoader1x}-distroless +$ docker pull snowplow/snowplow-bigquery-loader:${versions.bqLoader1x}-distroless +$ docker pull snowplow/snowplow-bigquery-mutator:${versions.bqLoader1x}-distroless +$ docker pull snowplow/snowplow-bigquery-repeater:${versions.bqLoader1x}-distroless `} Mutator, Repeater and Streamloader are also available as fatjar files attached to [releases](https://github.com/snowplow-incubator/snowplow-bigquery-loader/releases) in the project's Github repository. diff --git a/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/upgrade-guides/2-0-0-upgrade-guide/index.md b/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/upgrade-guides/2-0-0-upgrade-guide/index.md index 3896f4a7b0..af795489bd 100644 --- a/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/upgrade-guides/2-0-0-upgrade-guide/index.md +++ b/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/upgrade-guides/2-0-0-upgrade-guide/index.md @@ -53,53 +53,12 @@ There are two main types of schema changes: **Non-breaking**: The schema version can be changed in a minor way (`1-2-3` → `1-3-0` or `1-2-3` → `1-2-4`). Data is stored in the same database column. -### Without recovery columns - Loader tries to format the incoming data according to the latest version of the schema it saw (for a given major version, e.g. `1-*-*`). For example, if a batch contains events with schema versions `1-0-0`, `1-0-1` and `1-0-2`, the loader derives the output schema based on version `1-0-2`. 
Then the loader instructs BigQuery to adjust the database column and load the data. -This logic relies on two assumptions: - -1. **Old events compatible with new schemas.** Events with older schema versions, e.g. `1-0-0` and `1-0-1`, have to be valid against the newer ones, e.g. `1-0-2`. Those that are valid will result in failed events. - -2. **Old columns compatible with new schemas.** The corresponding BigQuery columns have to be migrated correctly from one version to another. Changes, such as altering the type of a field from `integer` to `string`, would fail. Loading would break with SQL errors and the whole batch would be stuck and hard to recover. - -These assumptions are not always clear to the users, making the process error-prone. - -### With recovery columns - -First, we support schema evolution that’s not strictly backwards compatible (although we still recommend against it since it can confuse downstream consumers of the data). This is done by _merging_ multiple schemas so that both old and new events can coexist. For example, suppose we have these two schemas: - -```json -{ - // 1-0-0 - "properties": { - "a": {"type": "integer"} - } -} -``` - -```json -{ - // 1-0-1 - "properties": { - "b": {"type": "integer"} - } -} -``` - -These would be merged into the following: -```json -{ - // merged - "properties": { - "a": {"type": "integer"}, - "b": {"type": "integer"} - } -} -``` +### Recovering from invalid schema evolution +Let's consider these two schemas as an example of breaking schema evolution (changing the type of a field from `integer` to `string`) using the same major version (`1-0-0` and `1-0-1`): -Second, the loader does not fail when it can’t modify the database column to store both old and new events. (As a reminder, an example would be changing the type of a field from `integer` to `string`.) Instead, it creates a _temporary_ column for the new data as an exception. 
The users can then run SQL statements to resolve this situation as they see fit. For instance, consider these two schemas: ```json { // 1-0-0 diff --git a/docs/storing-querying/loading-process/index.md b/docs/storing-querying/loading-process/index.md index d38e8e1a71..219d894f9e 100644 --- a/docs/storing-querying/loading-process/index.md +++ b/docs/storing-querying/loading-process/index.md @@ -27,6 +27,7 @@ We load data into Redshift using the [RDB Loader](/docs/pipeline-components-and-
+We load data into BigQuery using the [BigQuery Loader](/docs/pipeline-components-and-applications/loaders-storage-targets/bigquery-loader/index.md). diff --git a/docs/storing-querying/schemas-in-warehouse/index.md b/docs/storing-querying/schemas-in-warehouse/index.md index eafcba01da..7e14b3a6a5 100644 --- a/docs/storing-querying/schemas-in-warehouse/index.md +++ b/docs/storing-querying/schemas-in-warehouse/index.md @@ -381,9 +381,9 @@ If you are [modeling your data with dbt](/docs/modeling-your-data/modeling-your- :::info Breaking changes -While our recommendation is to use major schema versions to indicate breaking changes (e.g. changing a type of a field from a `string` to a `number`), this is not particularly relevant for BigQuery. Indeed, each schema version gets its own column, so there is no difference between major and minor versions. That said, we believe sticking to our recommendation is a good idea: +While our recommendation is to use major schema versions to indicate breaking changes (e.g. changing a type of a field from a `string` to a `number`), this is not particularly relevant for BigQuery Loader version 1.x. Indeed, each schema version gets its own column, so there is no difference between major and minor versions. That said, we believe sticking to our recommendation is a good idea: * Breaking changes might affect downstream consumers of the data, even if they don’t affect BigQuery -* In the future, you might decide to migrate to a different data warehouse where our rules are stricter (e.g. Databricks) +* Version 2 of the loader has stricter behavior that matches our loaders for other warehouses and lakes ::: diff --git a/src/componentVersions.js b/src/componentVersions.js index c9108d00d5..0cdf4fb219 100644 --- a/src/componentVersions.js +++ b/src/componentVersions.js @@ -30,6 +30,7 @@ export const versions = { // Loaders bqLoader: '2.0.0', + bqLoader1x: '1.7.1', esLoader: '2.1.2', gcsLoader: '0.5.5', postgresLoader: '0.3.3',