config.pubsub.reference.hocon
{
# -- Full license text available in LICENSE.md
"license" {
"accept": true
}
"input": {
# -- pubsub subscription for the source of enriched events
"subscription": "projects/myproject/subscriptions/snowplow-enriched"
# -- Controls how many threads are used internally by the pubsub client library for fetching events.
# -- The number of threads is equal to this factor multiplied by the number of available cpu cores
"parallelPullFactor": 0.5
# -- How many bytes can be buffered by the loader app before blocking the pubsub client library
# -- from fetching more events.
# -- This is a trade-off between memory usage and how efficiently the app can operate. The default value works well.
"bufferMaxBytes": 1000000
# -- For how long the pubsub client library will continue to re-extend the ack deadline of an unprocessed event.
"maxAckExtensionPeriod": "1 hour"
# -- Sets min/max boundaries on the value by which an ack deadline is extended.
# -- The actual value used is guided by runtime statistics collected by the pubsub client library.
"minDurationPerAckExtension": "60 seconds"
"maxDurationPerAckExtension": "600 seconds"
# -- The maximum number of streaming pulls we allow on a single GRPC transport channel before opening another channel.
# -- This advanced setting is only relevant on extremely large VMs, or with a high value of `parallelPullCount`.
"maxPullsPerTransportChannel": 16
}
"output": {
"good": {
# -- URL of the snowflake account
"url": "https://orgname.accountname.snowflakecomputing.com"
# -- snowflake user who has necessary privileges
"user": "snowplow"
# -- snowflake private key, used to connect to the account
"privateKey": ${SNOWFLAKE_PRIVATE_KEY}
# -- optional, passphrase for the private key
"privateKeyPassphrase": ${?SNOWFLAKE_PRIVATE_KEY_PASSPHRASE}
# -- optional, snowflake role which the snowflake user should assume
"role": "snowplow_loader"
# -- name of the snowflake database containing the events table
"database": "snowplow"
# -- name of the snowflake schema containing the events table
"schema": "atomic"
# -- name to use for the events table.
"table": "events"
# -- Prefix to use for the snowflake channels.
# -- The full name will be suffixed with a number, e.g. `snowplow-1`
# -- The prefix must be unique per loader VM
"channel": "snowplow"
# -- Timeouts used for JDBC operations
"jdbcLoginTimeout": "60 seconds"
"jdbcNetworkTimeout": "60 seconds"
"jdbcQueryTimeout": "60 seconds"
}
"bad": {
# -- output pubsub topic for emitting failed events that could not be processed
"topic": "projects/myproject/topics/snowplow-bad"
# -- Failed events are sent to pubsub in batches not exceeding this number of events.
"batchSize": 100
# -- Failed events are sent to pubsub in batches not exceeding this number of bytes.
"requestByteThreshold": 1000000
}
}
"batching": {
# -- Events are emitted to Snowflake when the batch reaches this size in bytes
"maxBytes": 16000000
# -- Events are emitted to Snowflake after at most this duration, even if the `maxBytes` size has not been reached
"maxDelay": "1 second"
# -- Controls how many batches we can send simultaneously over the network to Snowflake.
# -- E.g. If there are 4 available processors, and uploadParallelismFactor = 2.5, then we send up to 10 batches in parallel
# -- Adjusting this value can cause the app to use more or less of the available CPU.
"uploadParallelismFactor": 2.5
}
# -- Controls how the app splits the workload into concurrent batches which can be run in parallel.
# -- E.g. If there are 4 available processors, and cpuParallelismFactor = 0.75, then we process 3 batches concurrently.
# -- Adjusting this value can cause the app to use more or less of the available CPU.
"cpuParallelismFactor": 0.75
# -- Retry configuration for Snowflake operation failures
"retries": {
# -- Configures exponential backoff on errors related to how Snowflake is set up for this loader.
# -- Examples include authentication errors and permissions errors.
# -- This class of errors is reported periodically to the monitoring webhook.
"setupErrors": {
"delay": "30 seconds"
}
# -- Configures exponential backoff on errors that are likely to be transient.
# -- Examples include server errors and network errors
"transientErrors": {
"delay": "1 second"
"attempts": 5
}
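# -- For illustration only, assuming delays roughly double between attempts: with the values above,
# -- the app would wait roughly 1s, 2s, 4s and 8s between the 5 attempts before giving up.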
}
# -- Schemas that won't be loaded to Snowflake. Optional, default value []
"skipSchemas": [
"iglu:com.acme/skipped1/jsonschema/1-0-0",
"iglu:com.acme/skipped2/jsonschema/1-0-*",
"iglu:com.acme/skipped3/jsonschema/1-*-*",
"iglu:com.acme/skipped4/jsonschema/*-*-*"
]
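# -- Note (not a setting): the three numbers in each version are MODEL-REVISION-ADDITION, so `1-0-*`
# -- above skips every addition of model 1 revision 0, `1-*-*` skips all of model 1, and `*-*-*`
# -- skips every version of that schema.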
"monitoring": {
"metrics": {
# -- Send runtime metrics to a statsd server
"statsd": {
"hostname": "127.0.0.1"
"port": 8125
# -- Map of key/value pairs to be sent along with every metric
"tags": {
"myTag": "xyz"
}
# -- How often to report metrics
"period": "1 minute"
# -- Prefix used for the metric name when sending to statsd
"prefix": "snowplow.snowflake.loader"
}
}
# -- Report unexpected runtime exceptions to Sentry
"sentry": {
"dsn": "https://[email protected]/1"
# -- Map of key/value pairs to be included as tags
"tags": {
"myTag": "xyz"
}
}
# -- Report alerts and heartbeats to the webhook
"webhook": {
# -- HTTP endpoint where alerts and heartbeats are sent
"endpoint": "https://webhook.acme.com"
# -- Set of arbitrary key-value pairs attached to the payload
"tags": {
"pipeline": "production"
}
# -- How often to send the heartbeat event
"heartbeat": "60.minutes"
}
}
# -- Configuration of internal http client used for alerts and telemetry
"http": {
"client": {
"maxConnectionsPerServer": 4
}
}
# -- Optional, configure telemetry
# -- All the fields are optional
"telemetry": {
# -- Set to true to disable telemetry
"disable": false
# -- Interval for the heartbeat event
"interval": 15 minutes
# -- HTTP method used to send the heartbeat event
"method": POST
# -- URI of the collector receiving the heartbeat event
"collectorUri": collector-g.snowplowanalytics.com
# -- Port of the collector receiving the heartbeat event
"collectorPort": 443
# -- Whether to use https or not
"secure": true
# -- Identifier intended to tie events together across modules,
# -- infrastructure and apps when used consistently
"userProvidedId": my_pipeline
# -- ID automatically generated upon running a module's deployment script
# -- Intended to identify each independent module, and the infrastructure it controls
"autoGeneratedId": hfy67e5ydhtrd
# -- Unique identifier for the VM instance
# -- Unique for each instance of the app running within a module
"instanceId": 665bhft5u6udjf
# -- Name of the terraform module that deployed the app
"moduleName": snowflake-loader-vmss
# -- Version of the terraform module that deployed the app
"moduleVersion": 1.0.0
}
}