{
# -- Accept the terms of the Snowplow Limited Use License Agreement
# -- See https://docs.snowplow.io/limited-use-license-1.0/
"license": {
"accept": ${?ACCEPT_LIMITED_USE_LICENSE}
}
"input": {
# -- kinesis stream for the source of enriched events
"streamName": "snowplow-enriched-events"
# -- name to use for the KCL dynamodb table
"appName": "snowplow-lake-loader"
# -- From where the loader should start consuming if this is the first time it is run.
# -- On subsequent runs, it will always resume from where it last checkpointed.
"initialPosition": {
# -- Options are `TRIM_HORIZON` for the oldest available events, `LATEST` for latest events,
# -- or `AT_TIMESTAMP` to start consuming from events written at a particular time.
"type": "TRIM_HORIZON"
# -- Only required if `initialPosition.type` is AT_TIMESTAMP
"timestamp": "2023-01-01T00:00:00Z"
}
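# -- For example, a minimal sketch to start from the newest events instead (replaces the block above):
# "initialPosition": {
#   "type": "LATEST"
# }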
# -- How the underlying Kinesis client should fetch events from the stream
"retrievalMode": {
# -- Options are "Polling" for the client to poll Kinesis for more events when needed
# -- or "FanOut" to enabled Kinesis's Enhanced Fan Out feature using HTTP/2
"type": "Polling"
# -- Only used if retrieval mode is type Polling. How many events the client may fetch in a single poll.
"maxRecords": 1000
}
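# -- For example, a minimal sketch of the fan-out alternative (replaces the block above):
# "retrievalMode": {
#   "type": "FanOut"
# }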
# -- Name of this KCL worker used in the dynamodb lease table
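# -- `${HOSTNAME}` is a required HOCON substitution, resolved from the environment when the config is loaded.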
"workerIdentifier": ${HOSTNAME}
# -- Duration of shard leases. KCL workers must periodically refresh leases in the dynamodb table before this duration expires.
"leaseDuration": "10 seconds"
}
"output": {
## -- DELTA OUTPUT FORMAT -- ##
"good": {
# -- URI of the bucket where the data lake will be written (required)
# -- For an S3 bucket, the URI should start with `s3a://`
"location": "s3a://my-bucket/events"
# -- Any valid Delta table property
# -- This can be blank in most setups because the loader already sets sensible defaults.
"deltaTableProperties": {
"delta.dataSkippingStatsColumns": "load_tstamp,collector_tstamp,derived_tstamp,dvce_created_tstamp"
"delta.checkpointInterval": "50"
}
}
## -- HUDI OUTPUT FORMAT -- ##
# "good": {
#
# # -- Tell the loader to use Hudi output format
# "type": "Hudi"
#
# # -- URI of the bucket where the data lake will be written (required)
# # -- For an S3 bucket, the URI should start with `s3a://`
# "location": "s3a://my-bucket/events"
#
# # -- Any valid hudi configuration key/value.
# # -- This can be blank in most setups because the loader already sets sensible defaults.
# "hudiWriteOptions": {
# "hoodie.metadata.index.column.stats.column.list": "load_tstamp,collector_tstamp,derived_tstamp,dvce_created_tstamp"
# }
#
# # -- Any valid hudi table property
# "hudiTableProperties": {
# "hoodie.keygen.timebased.output.dateformat": "yyyy-MM-dd"
# }
# }
## -- ICEBERG OUTPUT FORMAT -- ##
# "good": {
#
# # -- Tell the loader to use Iceberg
# "type": "Iceberg"
#
# # -- URI of the bucket where the data lake will be written (required)
# # -- For an S3 bucket, the URI should start with `s3a://`
# "location": "s3a://my-bucket/events"
#
# # -- Name of the database in the catalog (required)
# "database": "snowplow"
#
# # -- Name of the table in the catalog (required)
# "table": "events"
#
# # -- Details of the Iceberg catalog
# "catalog": {
#
# # -- The catalog implementation.
# # -- Options are `Glue` for the AWS Glue catalog, or `Hadoop`.
# # -- Optional, default `Hadoop`.
# "type": "Glue"
#
# # -- Any other valid catalog config option from the Iceberg documentation
# "options": {
# # -- For example, to use a Glue catalog in a different account:
# "glue.id": "123456789"
# }
# }
#
# # -- Any valid Iceberg table property
# # -- This can be blank in most setups because the loader already sets sensible defaults.
# "icebergTableProperties": {
# "write.metadata.metrics.column.event_id": "count"
# }
# }
"bad": {
# -- output kinesis stream for emitting failed events that could not be processed
"streamName": "bad"
# -- how to retry sending failed events if we exceed the kinesis write throughput limits
"throttledBackoffPolicy": {
"minBackoff": "100 milliseconds"
"maxBackoff": "1 second"
}
# -- the maximum number of records we are allowed to send to Kinesis in 1 PutRecords request
"recordLimit": 500
# -- the maximum number of bytes we are allowed to send to Kinesis in 1 PutRecords request (5 MiB, the API maximum)
"byteLimit": 5242880
}
}
# -- Controls how many events are buffered in memory before saving the batch to local disk.
# -- The default value works well for most reasonably sized VMs.
"inMemBatchBytes": 25600000
# -- Controls how the app splits the workload into concurrent batches which can be run in parallel.
# -- E.g. If there are 4 available processors, and cpuParallelismFraction = 0.75, then we process 3 batches concurrently.
# -- Adjusting this value can cause the app to use more or less of the available CPU.
"cpuParallelismFraction": 0.75
# -- Controls how often we write/commit pending events to the data lake.
"windowing": "5 minutes"
# -- Controls how eagerly the loader starts processing the next timed window even when the previous timed window is still
# -- finalizing (committing into the lake). By default, we start processing a timed window if the previous 1 window is
# -- still finalizing, but we do not start processing a timed window if any older windows beyond that are still finalizing.
# -- The default value is known to work well for most workloads.
"numEagerWindows": 1
# -- Settings relating to the local Spark context used internally by this loader.
"spark": {
# -- How many times a Spark task should be retried in case of failure.
"taskRetries": 3
# -- Any valid spark configuration key/value.
# -- This can be blank in most setups because the loader already sets sensible defaults.
"conf": {
# -- E.g. to enable the spark ui for debugging:
"spark.ui.enabled": true
# -- E.g. to change credentials provider
"fs.s3a.aws.credentials.provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider"
}
# -- Controls how many spark tasks run in parallel when writing the events to cloud storage.
# -- E.g. If there are 8 available processors, and writerParallelismFraction = 0.5, then we have 4 spark tasks for writing.
# -- The default value is known to work well. Changing this setting might affect memory usage, file sizes, and/or latency.
"writerParallelismFraction": 0.5
}
# -- Retry configuration for lake operation failures
"retries": {
# -- Configures exponential backoff on errors related to how the lake is set up for this loader.
# -- Examples include authentication errors and permissions errors.
# -- This class of errors is reported periodically to the monitoring webhook.
"setupErrors": {
"delay": "30 seconds"
}
# -- Configures exponential backoff on errors that are likely to be transient.
# -- Examples include server errors and network errors.
"transientErrors": {
"delay": "1 second"
"attempts": 5
}
}
# -- Schemas that won't be loaded to the lake. Optional, default value []
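# -- A `*` wildcard matches any value for that component of the schema version, as the examples below show.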
"skipSchemas": [
"iglu:com.acme/skipped1/jsonschema/1-0-0"
"iglu:com.acme/skipped2/jsonschema/1-0-*"
"iglu:com.acme/skipped3/jsonschema/1-*-*"
"iglu:com.acme/skipped4/jsonschema/*-*-*"
]
# -- Whether the loader should crash and exit if it fails to resolve an Iglu Schema.
# -- We recommend `true` because Snowplow enriched events have already passed validation, so a missing schema normally
# -- indicates an error that needs addressing.
# -- Change to `false` so that events go to the failed events stream instead of crashing the loader.
"exitOnMissingIgluSchema": true
# -- Whether the output parquet files should declare nested fields as non-nullable according to the Iglu schema.
# -- When true (default), nested fields are nullable only if they are not required fields according to the Iglu schema.
# -- When false, all nested fields are defined as nullable in the output table's schema.
# -- Set this to false if you use a query engine that dislikes non-nullable nested fields of a nullable struct.
"respectIgluNullability": true
# -- Configuration of the internal HTTP client used for the Iglu resolver, alerts and telemetry
"http": {
"client": {
"maxConnectionsPerServer": 4
}
}
"monitoring": {
"metrics": {
# -- Send runtime metrics to a statsd server
# -- `hostname` is the only required field in order to turn on this feature.
"statsd": {
# -- Hostname or IP of a statsd server.
"hostname": "127.0.0.1"
# -- Port of the statsd server.
"port": 8125
# -- Map of key/value pairs to be sent along with the statsd metric.
"tags": {
"myTag": "xyz"
}
# -- How often to report metrics to statsd.
"period": "1 minute"
# -- Prefix used for the metric name when sending to statsd.
"prefix": "snowplow.lakeloader"
}
}
# -- Report unexpected runtime exceptions to Sentry
"sentry": {
"dsn": "https://[email protected]/1"
# -- Map of key/value pairs to be included as tags
"tags": {
"myTag": "xyz"
}
}
# -- Report alerts and heartbeats to the webhook
"webhook": {
# -- An actual HTTP endpoint
"endpoint": "https://webhook.acme.com"
# -- Set of arbitrary key-value pairs attached to the payload
"tags": {
"pipeline": "production"
}
# -- How often to send the heartbeat event
"heartbeat": "60.minutes"
}
# -- Open an HTTP server that returns OK only if the app is healthy
"healthProbe": {
"port": 8000
# -- The health probe becomes unhealthy if any received event has still not been fully processed
# -- within this cutoff time
"unhealthyLatency": "15 minutes"
}
}
# -- Optional, configure telemetry
# -- All the fields are optional
"telemetry": {
# -- Set to true to disable telemetry
"disable": false
# -- Identifier intended to tie events together across modules,
# -- infrastructure and apps when used consistently
"userProvidedId": "my_company"
}
}