{
# -- Accept the terms of the Snowplow Limited Use License Agreement
# -- See https://docs.snowplow.io/limited-use-license-1.0/
"license": {
"accept": ${?ACCEPT_LIMITED_USE_LICENSE}
}
"input": {
# -- kinesis stream for the source of enriched events
"streamName": "snowplow-enriched-events"
# -- name to use for the KCL dynamodb table
"appName": "snowplow-lake-loader"
# -- From where the loader should start consuming if this is the first time it is run.
# -- On subsequent runs, it will always resume from where it last checkpointed.
"initialPosition": {
# -- Options are `TRIM_HORIZON` for the oldest available events, `LATEST` for latest events,
# -- or `AT_TIMESTAMP` to start consuming from events written at a particular time.
"type": "TRIM_HORIZON"
# -- Only required if `initialPosition.type` is AT_TIMESTAMP
"timestamp": "2023-01-01T00:00:00Z"
}
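# -- For example, a minimal sketch to start from the newest events instead (replaces the block above):
# "initialPosition": {
#   "type": "LATEST"
# }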
# -- How the underlying Kinesis client should fetch events from the stream
"retrievalMode": {
# -- Options are "Polling" for the client to poll Kinesis for more events when needed
# -- or "FanOut" to enabled Kinesis's Enhanced Fan Out feature using HTTP/2
"type": "Polling"
# -- Only used if retrieval mode is type Polling. How many events the client may fetch in a single poll.
"maxRecords": 1000
}
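# -- For example, a minimal sketch of the fan-out alternative (replaces the block above):
# "retrievalMode": {
#   "type": "FanOut"
# }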
# -- Name of this KCL worker used in the dynamodb lease table
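# -- `${HOSTNAME}` is a required HOCON substitution, resolved from the environment when the config is loaded.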
"workerIdentifier": ${HOSTNAME}
# -- Duration of shard leases. KCL workers must periodically refresh leases in the dynamodb table before this duration expires.
"leaseDuration": "10 seconds"
}
"output": {
## -- DELTA OUTPUT FORMAT -- ##
"good": {
# -- URI of the bucket where the data lake will be written (required)
# -- For an S3 bucket, the URI should start with `s3a://`
"location": "s3a://my-bucket/events"
# -- Any valid Delta table property
# -- This can be blank in most setups because the loader already sets sensible defaults.
"deltaTableProperties": {
"delta.dataSkippingStatsColumns": "load_tstamp,collector_tstamp,derived_tstamp,dvce_created_tstamp"
"delta.checkpointInterval": "50"
}
}
## -- HUDI OUTPUT FORMAT -- ##
# "good": {
#
# # -- Tell the loader to use Hudi output format
# "type": "Hudi"
#
# # -- URI of the bucket where the data lake will be written (required)
# # -- For an S3 bucket, the URI should start with `s3a://`
# "location": "s3a://my-bucket/events"
#
# # -- Any valid hudi configuration key/value.
# # -- This can be blank in most setups because the loader already sets sensible defaults.
# "hudiWriteOptions": {
# "hoodie.metadata.index.column.stats.column.list": "load_tstamp,collector_tstamp,derived_tstamp,dvce_created_tstamp"
# }
#
# # -- Any valid hudi table property
# "hudiTableProperties": {
# "hoodie.keygen.timebased.output.dateformat": "yyyy-MM-dd"
# }
# }
## -- ICEBERG OUTPUT FORMAT -- ##
# "good": {
#
# # -- Tell the loader to use Iceberg
# "type": "Iceberg"
#
# # -- URI of the bucket where the data lake will be written (required)
# # -- For an S3 bucket, the URI should start with `s3a://`
# "location": "s3a://my-bucket/events"
#
# # -- Name of the database in the catalog (required)
# "database": "snowplow"
#
# # -- Name of the table in the catalog (required)
# "table": "events"
#
# # -- Details of the Iceberg catalog
# "catalog": {
#
# # -- The catalog implementation.
# # -- Options are `Glue` for the AWS Glue catalog, or `Hadoop`.
# # -- Optional, default `Hadoop`.
# "type": "Glue"
#
# # -- Any other valid catalog config option from the Iceberg documentation
# "options": {
# # -- For example, to use a Glue catalog in a different account:
# "glue.id": "123456789"
# }
# }
#
# # -- Any valid Iceberg table property
# # -- This can be blank in most setups because the loader already sets sensible defaults.
# "icebergTableProperties": {
# "write.metadata.metrics.column.event_id": "count"
# }
# }
"bad": {
# -- output kinesis stream for emitting failed events that could not be processed
"streamName": "bad"
# -- how to retry sending failed events if we exceed the kinesis write throughput limits
"throttledBackoffPolicy": {
"minBackoff": "100 milliseconds"
"maxBackoff": "1 second"
}
# -- the maximum number of records we are allowed to send to Kinesis in 1 PutRecords request
"recordLimit": 500
# -- the maximum number of bytes we are allowed to send to Kinesis in 1 PutRecords request (5 MiB, the API maximum)
"byteLimit": 5242880
}
}
# -- Controls how many events are buffered in memory before saving the batch to local disk.
# -- The default value works well for most reasonably sized VMs.
"inMemBatchBytes": 25600000
# -- Controls how the app splits the workload into concurrent batches which can be run in parallel.
# -- E.g. If there are 4 available processors, and cpuParallelismFraction = 0.75, then we process 3 batches concurrently.
# -- Adjusting this value can cause the app to use more or less of the available CPU.
"cpuParallelismFraction": 0.75
# -- Controls how often we write/commit pending events to the data lake.
"windowing": "5 minutes"
# -- Controls how eagerly the loader starts processing the next timed window even when the previous timed window is still
# -- finalizing (committing into the lake). By default, we start processing a timed window if the previous 1 window is
# -- still finalizing, but we do not start processing a timed window if any older windows beyond that are still finalizing.
# -- The default value is known to work well for most workloads.
"numEagerWindows": 1
# -- Settings relating to the local Spark context used internally by this loader.
"spark": {
# -- How many times a Spark task should be retried in case of failure.
"taskRetries": 3
# -- Any valid spark configuration key/value.
# -- This can be blank in most setups because the loader already sets sensible defaults.
"conf": {
# -- E.g. to enable the spark ui for debugging:
"spark.ui.enabled": true
# -- E.g. to change credentials provider
"fs.s3a.aws.credentials.provider": "com.amazonaws.auth.InstanceProfileCredentialsProvider"
}
# -- Controls how many spark tasks run in parallel when writing the events to cloud storage.
# -- E.g. If there are 8 available processors, and writerParallelismFraction = 0.5, then we have 4 spark tasks for writing.
# -- The default value is known to work well. Changing this setting might affect memory usage, file sizes, and/or latency.
"writerParallelismFraction": 0.5
}
# -- Retry configuration for lake operation failures
"retries": {
# -- Configures exponential backoff on errors related to how the lake is set up for this loader.
# -- Examples include authentication errors and permissions errors.
# -- This class of errors is reported periodically to the monitoring webhook.
"setupErrors": {
"delay": "30 seconds"
}
# -- Configures exponential backoff on errors that are likely to be transient.
# -- Examples include server errors and network errors.
"transientErrors": {
"delay": "1 second"
"attempts": 5
}
}
# -- Schemas that won't be loaded to the lake. Optional, default value []
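# -- A `*` wildcard matches any value for that component of the schema version, as the examples below show.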
"skipSchemas": [
"iglu:com.acme/skipped1/jsonschema/1-0-0"
"iglu:com.acme/skipped2/jsonschema/1-0-*"
"iglu:com.acme/skipped3/jsonschema/1-*-*"
"iglu:com.acme/skipped4/jsonschema/*-*-*"
]
# -- Whether the loader should crash and exit if it fails to resolve an Iglu Schema.
# -- We recommend `true` because Snowplow enriched events have already passed validation, so a missing schema normally
# -- indicates an error that needs addressing.
# -- Change to `false` so that events go to the failed events stream instead of crashing the loader.
"exitOnMissingIgluSchema": true
# -- Whether the output parquet files should declare nested fields as non-nullable according to the Iglu schema.
# -- When true (default), nested fields are nullable only if they are not required fields according to the Iglu schema.
# -- When false, all nested fields are defined as nullable in the output table's schema.
# -- Set this to false if you use a query engine that dislikes non-nullable nested fields of a nullable struct.
"respectIgluNullability": true
# -- Configuration of the internal HTTP client used for the Iglu resolver, alerts and telemetry
"http": {
"client": {
"maxConnectionsPerServer": 4
}
}
"monitoring": {
"metrics": {
# -- Send runtime metrics to a statsd server
# -- `hostname` is the only required field in order to turn on this feature.
"statsd": {
# -- Hostname or IP of a statsd server.
"hostname": "127.0.0.1"
# -- Port of the statsd server.
"port": 8125
# -- Map of key/value pairs to be sent along with the statsd metric.
"tags": {
"myTag": "xyz"
}
# -- How often to report metrics to statsd.
"period": "1 minute"
# -- Prefix used for the metric name when sending to statsd.
"prefix": "snowplow.lakeloader"
}
}
# -- Report unexpected runtime exceptions to Sentry
"sentry": {
"dsn": "https://[email protected]/1"
# -- Map of key/value pairs to be included as tags
"tags": {
"myTag": "xyz"
}
}
# -- Report alerts and heartbeats to the webhook
"webhook": {
# -- An actual HTTP endpoint
"endpoint": "https://webhook.acme.com"
# -- Set of arbitrary key-value pairs attached to the payload
"tags": {
"pipeline": "production"
}
# -- How often to send the heartbeat event
"heartbeat": "60.minutes"
}
# -- Open an HTTP server that returns OK only if the app is healthy
"healthProbe": {
"port": 8000
# -- The health probe becomes unhealthy if any received event has still not been fully processed
# -- within this cutoff time
"unhealthyLatency": "15 minutes"
}
}
# -- Optional, configure telemetry
# -- All the fields are optional
"telemetry": {
# -- Set to true to disable telemetry
"disable": false
# -- Identifier intended to tie events together across modules,
# -- infrastructure and apps when used consistently
"userProvidedId": "my_company"
}
}