config.pubsub.reference.hocon
{
# -- Full license text available in LICENSE.md
"license" {
"accept": true
}
"input": {
# -- pubsub subscription for the source of enriched events
"subscription": "projects/myproject/subscriptions/snowplow-enriched"
# -- Controls how many threads are used internally by the pubsub client library for fetching events.
# -- The number of threads is equal to this factor multiplied by the number of available cpu cores
"parallelPullFactor": 0.5
# -- How many bytes can be buffered by the loader app before blocking the pubsub client library
# -- from fetching more events.
# -- This is a trade-off between memory usage and how efficiently the app can operate. The default value works well.
"bufferMaxBytes": 1000000
# -- For how long the pubsub client library will continue to re-extend the ack deadline of an unprocessed event.
"maxAckExtensionPeriod": "1 hour"
# -- Sets min/max boundaries on the value by which an ack deadline is extended.
# -- The actual value used is guided by runtime statistics collected by the pubsub client library.
"minDurationPerAckExtension": "60 seconds"
"maxDurationPerAckExtension": "600 seconds"
# -- The maximum number of streaming pulls we allow on a single GRPC transport channel before opening another channel.
# -- This advanced setting is only relevant on extremely large VMs, or with a high value of `parallelPullCount`.
"maxPullsPerTransportChannel": 16
}
"output": {
"good": {
# -- URL of the snowflake account
"url": "https://orgname.accountname.snowflakecomputing.com"
# -- snowflake user who has necessary privileges
"user": "snowplow"
# -- snowflake private key, used to connect to the account
"privateKey": ${SNOWFLAKE_PRIVATE_KEY}
# -- optional, passphrase for the private key
"privateKeyPassphrase": ${?SNOWFLAKE_PRIVATE_KEY_PASSPHRASE}
# -- optional, snowflake role which the snowflake user should assume
"role": "snowplow_loader"
# -- name of the snowflake database containing the events table
"database": "snowplow"
# -- name of the snowflake schema containing the events table
"schema": "atomic"
# -- name to use for the events table.
"table": "events"
# -- Prefix to use for the snowflake channels.
# -- The full name will be suffixed with a number, e.g. `snowplow-1`
# -- The prefix must be unique per loader VM
"channel": "snowplow"
# -- Timeouts used for JDBC operations
"jdbcLoginTimeout": "60 seconds"
"jdbcNetworkTimeout": "60 seconds"
"jdbcQueryTimeout": "60 seconds"
}
"bad": {
# -- output pubsub topic for emitting failed events that could not be processed
"topic": "projects/myproject/topics/snowplow-bad"
# -- Failed events are sent to pubsub in batches not exceeding this number of events.
"batchSize": 100
# -- Failed events are sent to pubsub in batches not exceeding this number of bytes.
"requestByteThreshold": 1000000
}
}
"batching": {
# -- Events are emitted to Snowflake when the batch reaches this size in bytes
"maxBytes": 16000000
# -- Events are emitted to Snowflake after at most this duration, even if the `maxBytes` size has not been reached
"maxDelay": "1 second"
# -- Controls how many batches we can send simultaneously over the network to Snowflake.
# -- E.g. If there are 4 available processors, and uploadParallelismFactor = 2.5, then we send up to 10 batches in parallel
# -- Adjusting this value can cause the app to use more or less of the available CPU.
"uploadParallelismFactor": 2.5
}
# -- Controls how the app splits the workload into concurrent batches which can be run in parallel.
# -- E.g. If there are 4 available processors, and cpuParallelismFactor = 0.75, then we process 3 batches concurrently.
# -- Adjusting this value can cause the app to use more or less of the available CPU.
"cpuParallelismFactor": 0.75
# -- Retry configuration for Snowflake operation failures
"retries": {
# -- Configures exponential backoff on errors related to how Snowflake is set up for this loader.
# -- Examples include authentication errors and permissions errors.
# -- This class of errors is reported periodically to the monitoring webhook.
"setupErrors": {
"delay": "30 seconds"
}
# -- Configures exponential backoff on errors that are likely to be transient.
# -- Examples include server errors and network errors
"transientErrors": {
"delay": "1 second"
"attempts": 5
}
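# -- For illustration only, assuming delays roughly double between attempts: with the values above,
# -- the app would wait roughly 1s, 2s, 4s and 8s between the 5 attempts before giving up.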
}
# -- Schemas that won't be loaded to Snowflake. Optional, default value []
"skipSchemas": [
"iglu:com.acme/skipped1/jsonschema/1-0-0",
"iglu:com.acme/skipped2/jsonschema/1-0-*",
"iglu:com.acme/skipped3/jsonschema/1-*-*",
"iglu:com.acme/skipped4/jsonschema/*-*-*"
]
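# -- Note (not a setting): the three numbers in each version are MODEL-REVISION-ADDITION, so `1-0-*`
# -- above skips every addition of model 1 revision 0, `1-*-*` skips all of model 1, and `*-*-*`
# -- skips every version of that schema.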
"monitoring": {
"metrics": {
# -- Send runtime metrics to a statsd server
"statsd": {
"hostname": "127.0.0.1"
"port": 8125
# -- Map of key/value pairs to be sent along with every metric
"tags": {
"myTag": "xyz"
}
# -- How often to report metrics
"period": "1 minute"
# -- Prefix used for the metric name when sending to statsd
"prefix": "snowplow.snowflake.loader"
}
}
# -- Report unexpected runtime exceptions to Sentry
"sentry": {
"dsn": "https://[email protected]/1"
# -- Map of key/value pairs to be included as tags
"tags": {
"myTag": "xyz"
}
}
# -- Report alerts and heartbeats to the webhook
"webhook": {
# -- HTTP endpoint where alerts and heartbeats are sent
"endpoint": "https://webhook.acme.com"
# -- Set of arbitrary key-value pairs attached to the payload
"tags": {
"pipeline": "production"
}
# -- How often to send the heartbeat event
"heartbeat": "60.minutes"
}
}
# -- Configuration of internal http client used for alerts and telemetry
"http": {
"client": {
"maxConnectionsPerServer": 4
}
}
# -- Optional, configure telemetry
# -- All the fields are optional
"telemetry": {
# -- Set to true to disable telemetry
"disable": false
# -- Interval for the heartbeat event
"interval": 15 minutes
# -- HTTP method used to send the heartbeat event
"method": POST
# -- URI of the collector receiving the heartbeat event
"collectorUri": collector-g.snowplowanalytics.com
# -- Port of the collector receiving the heartbeat event
"collectorPort": 443
# -- Whether to use https or not
"secure": true
# -- Identifier intended to tie events together across modules,
# -- infrastructure and apps when used consistently
"userProvidedId": my_pipeline
# -- ID automatically generated upon running a module's deployment script
# -- Intended to identify each independent module, and the infrastructure it controls
"autoGeneratedId": hfy67e5ydhtrd
# -- Unique identifier for the VM instance
# -- Unique for each instance of the app running within a module
"instanceId": 665bhft5u6udjf
# -- Name of the terraform module that deployed the app
"moduleName": snowflake-loader-vmss
# -- Version of the terraform module that deployed the app
"moduleVersion": 1.0.0
}
}