Add option to exit on missing Iglu schemas (#79)

Before this PR, the loader would generate a failed event if it failed to fetch a required schema from Iglu. However, all events have already passed validation in Enrich, so it is completely unexpected to have an Iglu failure. An Iglu error _probably_ means some type of configuration error or service outage. After this PR, the loader will crash and exit on an Iglu error, instead of creating a failed event. This is probably the preferred behaviour, while the pipeline operator addresses the underlying infrastructure problem. If an Iglu schema is genuinely now unavailable, then the pipeline operator can override the default behaviour by setting `exitOnMissingIgluSchema: false` in the configuration file or by listing the missing schema in `skipSchemas`.
snowplow-incubator · Sep 5, 2024 · e186cf5 · e186cf5
1 parent 89b0647
commit e186cf5
Show file tree

Hide file tree

Showing 13 changed files with 303 additions and 148 deletions.
diff --git a/config/config.aws.reference.hocon b/config/config.aws.reference.hocon
@@ -207,6 +207,12 @@
     "iglu:com.acme/skipped4/jsonschema/*-*-*"
   ]
 
+  # -- Whether the loader should crash and exit if it fails to resolve an Iglu Schema.
+  # -- We recommend `true` because Snowplow enriched events have already passed validation, so a missing schema normally
+  # -- indicates an error that needs addressing.
+  # -- Change to `false` so events go to the failed events stream instead of crashing the loader.
+  "exitOnMissingIgluSchema": true
+
   # -- Whether the output parquet files should declare nested fields as non-nullable according to the Iglu schema.
   # -- When true (default), nested fields are nullable only if they are not required fields according to the Iglu schema.
   # -- When false, all nested fields are defined as nullable in the output table's schemas

diff --git a/config/config.azure.reference.hocon b/config/config.azure.reference.hocon
@@ -178,6 +178,12 @@
     "iglu:com.acme/skipped4/jsonschema/*-*-*"
   ]
 
+  # -- Whether the loader should crash and exit if it fails to resolve an Iglu Schema.
+  # -- We recommend `true` because Snowplow enriched events have already passed validation, so a missing schema normally
+  # -- indicates an error that needs addressing.
+  # -- Change to `false` so events go to the failed events stream instead of crashing the loader.
+  "exitOnMissingIgluSchema": true
+
   # -- Whether the output parquet files should declare nested fields as non-nullable according to the Iglu schema.
   # -- When true (default), nested fields are nullable only if they are not required fields according to the Iglu schema.
   # -- When false, all nested fields are defined as nullable in the output table's schemas

diff --git a/config/config.gcp.reference.hocon b/config/config.gcp.reference.hocon
@@ -186,6 +186,12 @@
     "iglu:com.acme/skipped4/jsonschema/*-*-*"
   ]
 
+  # -- Whether the loader should crash and exit if it fails to resolve an Iglu Schema.
+  # -- We recommend `true` because Snowplow enriched events have already passed validation, so a missing schema normally
+  # -- indicates an error that needs addressing.
+  # -- Change to `false` so events go to the failed events stream instead of crashing the loader.
+  "exitOnMissingIgluSchema": true
+
   # -- Whether the output parquet files should declare nested fields as non-nullable according to the Iglu schema.
   # -- When true (default), nested fields are nullable only if they are not required fields according to the Iglu schema.
   # -- When false, all nested fields are defined as nullable in the output table's schemas

diff --git a/modules/core/src/main/resources/reference.conf b/modules/core/src/main/resources/reference.conf
@@ -139,6 +139,7 @@
 
   "skipSchemas": []
   "respectIgluNullability": true
+  "exitOnMissingIgluSchema": true
 
   "monitoring": {
     "metrics": {

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Config.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Config.scala
@@ -39,6 +39,7 @@ case class Config[+Source, +Sink](
   license: AcceptedLicense,
   skipSchemas: List[SchemaCriterion],
   respectIgluNullability: Boolean,
+  exitOnMissingIgluSchema: Boolean,
   retries: Config.Retries
 )
 

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Environment.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Environment.scala
@@ -53,7 +53,8 @@ case class Environment[F[_]](
   windowing: EventProcessingConfig.TimedWindows,
   badRowMaxSize: Int,
   schemasToSkip: List[SchemaCriterion],
-  respectIgluNullability: Boolean
+  respectIgluNullability: Boolean,
+  exitOnMissingIgluSchema: Boolean
 )
 
 object Environment {
@@ -82,20 +83,21 @@ object Environment {
       metrics <- Resource.eval(Metrics.build(config.main.monitoring.metrics))
       cpuParallelism = chooseCpuParallelism(config.main)
     } yield Environment(
-      appInfo                = appInfo,
-      source                 = sourceAndAck,
-      badSink                = badSink,
-      resolver               = resolver,
-      httpClient             = httpClient,
-      lakeWriter             = lakeWriterWrapped,
-      metrics                = metrics,
-      appHealth              = appHealth,
-      cpuParallelism         = cpuParallelism,
-      inMemBatchBytes        = config.main.inMemBatchBytes,
-      windowing              = windowing,
-      badRowMaxSize          = config.main.output.bad.maxRecordSize,
-      schemasToSkip          = config.main.skipSchemas,
-      respectIgluNullability = config.main.respectIgluNullability
+      appInfo                 = appInfo,
+      source                  = sourceAndAck,
+      badSink                 = badSink,
+      resolver                = resolver,
+      httpClient              = httpClient,
+      lakeWriter              = lakeWriterWrapped,
+      metrics                 = metrics,
+      appHealth               = appHealth,
+      cpuParallelism          = cpuParallelism,
+      inMemBatchBytes         = config.main.inMemBatchBytes,
+      windowing               = windowing,
+      badRowMaxSize           = config.main.output.bad.maxRecordSize,
+      schemasToSkip           = config.main.skipSchemas,
+      respectIgluNullability  = config.main.respectIgluNullability,
+      exitOnMissingIgluSchema = config.main.exitOnMissingIgluSchema
     )
 
   private def enableSentry[F[_]: Sync](appInfo: AppInfo, config: Option[Config.Sentry]): Resource[F, Unit] =

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/RuntimeService.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/RuntimeService.scala
@@ -15,9 +15,11 @@ sealed trait RuntimeService
 object RuntimeService {
   case object SparkWriter extends RuntimeService
   case object BadSink extends RuntimeService
+  case object Iglu extends RuntimeService // Iglu schema resolution; reported unhealthy by Processing when schemas cannot be resolved
 
   implicit val show: Show[RuntimeService] = Show.show {
     case SparkWriter => "Spark writer"
     case BadSink     => "Failed events sink"
+    case Iglu        => "Iglu repositories" // text produced by the Show instance for the Iglu service
   }
 }
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/Processing.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/Processing.scala
@@ -16,6 +16,7 @@ import cats.{Applicative, Foldable, Functor}
 import cats.effect.{Async, Deferred, Sync}
 import cats.effect.kernel.{Ref, Unique}
 import fs2.{Chunk, Pipe, Stream}
+import io.circe.syntax._
 import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger
 import org.apache.spark.sql.Row
@@ -134,6 +135,7 @@ object Processing {
       for {
         _ <- Logger[F].debug(s"Processing batch of size ${events.size}")
         nonAtomicFields <- NonAtomicFields.resolveTypes[F](env.resolver, entities, env.schemasToSkip)
+        _ <- possiblyExitOnMissingIgluSchema(env, nonAtomicFields)
         _ <- rememberColumnNames(ref, nonAtomicFields.fields)
         (bad, rows) <- transformToSpark[F](badProcessor, events, nonAtomicFields)
         _ <- sendFailedEvents(env, badProcessor, bad)
@@ -277,4 +279,14 @@ object Processing {
 
       Stream.eval(commit) >> Stream.emits(state.tokens.reverse)
     }
+
+  /**
+   * Fails the effect when Iglu schemas could not be resolved and the loader is configured to exit in that case.
+   *
+   * Does nothing unless `env.exitOnMissingIgluSchema` is set and `nonAtomicFields.igluFailures` is non-empty.
+   * Otherwise it logs a generic error, marks the Iglu runtime service as unhealthy via `env.appHealth`, and then
+   * raises a `RuntimeException` whose message is the generic text followed by one compact-JSON line per failure.
+   *
+   * @param env
+   *   the loader environment, supplying the `exitOnMissingIgluSchema` flag and the app health handle
+   * @param nonAtomicFields
+   *   the result of resolving schema types, including any Iglu failures
+   * @return
+   *   unit when there is nothing to report; a failed effect otherwise
+   */
+  private def possiblyExitOnMissingIgluSchema[F[_]: Sync](env: Environment[F], nonAtomicFields: NonAtomicFields.Result): F[Unit] = {
+    val mustExit = env.exitOnMissingIgluSchema && nonAtomicFields.igluFailures.nonEmpty
+    if (!mustExit) Applicative[F].unit
+    else {
+      val header =
+        "Exiting because failed to resolve Iglu schemas.  Either check the configuration of the Iglu repos, or set the `skipSchemas` config option, or set `exitOnMissingIgluSchema` to false.\n"
+      // One compact JSON document per failure, appended below the generic header.
+      val failureLines = nonAtomicFields.igluFailures.map(_.failure.asJson.noSpaces)
+      val fullMessage  = failureLines.mkString(header, "\n", "")
+      for {
+        // Only the generic header is logged here; the per-schema details travel in the exception message.
+        _ <- Logger[F].error(header)
+        _ <- env.appHealth.beUnhealthyForRuntimeService(RuntimeService.Iglu)
+        _ <- Sync[F].raiseError[Unit](new RuntimeException(fullMessage))
+      } yield ()
+    }
+  }
 }
diff --git a/modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/MockEnvironment.scala b/modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/MockEnvironment.scala
@@ -76,20 +76,21 @@ object MockEnvironment {
       source = testSourceAndAck(windows, state)
     } yield {
       val env = Environment(
-        appInfo                = TestSparkEnvironment.appInfo,
-        source                 = source,
-        badSink                = testSink(state),
-        resolver               = Resolver[IO](Nil, None),
-        httpClient             = testHttpClient,
-        lakeWriter             = testLakeWriter(state),
-        metrics                = testMetrics(state),
-        appHealth              = testAppHealth(state),
-        inMemBatchBytes        = 1000000L,
-        cpuParallelism         = 1,
-        windowing              = EventProcessingConfig.TimedWindows(1.minute, 1.0, 1),
-        badRowMaxSize          = 1000000,
-        schemasToSkip          = List.empty,
-        respectIgluNullability = true
+        appInfo                 = TestSparkEnvironment.appInfo,
+        source                  = source,
+        badSink                 = testSink(state),
+        resolver                = Resolver[IO](Nil, None),
+        httpClient              = testHttpClient,
+        lakeWriter              = testLakeWriter(state),
+        metrics                 = testMetrics(state),
+        appHealth               = testAppHealth(state),
+        inMemBatchBytes         = 1000000L,
+        cpuParallelism          = 1,
+        windowing               = EventProcessingConfig.TimedWindows(1.minute, 1.0, 1),
+        badRowMaxSize           = 1000000,
+        schemasToSkip           = List.empty,
+        respectIgluNullability  = true,
+        exitOnMissingIgluSchema = false
       )
       MockEnvironment(state, env)
     }

diff --git a/modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/TestSparkEnvironment.scala b/modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/TestSparkEnvironment.scala
@@ -36,20 +36,21 @@ object TestSparkEnvironment {
     lakeWriter <- LakeWriter.build[IO](testConfig.spark, testConfig.output.good)
     lakeWriterWrapped = LakeWriter.withHandledErrors(lakeWriter, dummyAppHealth, retriesConfig, PartialFunction.empty)
   } yield Environment(
-    appInfo                = appInfo,
-    source                 = source,
-    badSink                = Sink[IO](_ => IO.unit),
-    resolver               = Resolver[IO](Nil, None),
-    httpClient             = testHttpClient,
-    lakeWriter             = lakeWriterWrapped,
-    metrics                = testMetrics,
-    appHealth              = dummyAppHealth,
-    inMemBatchBytes        = 1000000L,
-    cpuParallelism         = 1,
-    windowing              = EventProcessingConfig.TimedWindows(1.minute, 1.0, 1),
-    badRowMaxSize          = 1000000,
-    schemasToSkip          = List.empty,
-    respectIgluNullability = true
+    appInfo                 = appInfo,
+    source                  = source,
+    badSink                 = Sink[IO](_ => IO.unit),
+    resolver                = Resolver[IO](Nil, None),
+    httpClient              = testHttpClient,
+    lakeWriter              = lakeWriterWrapped,
+    metrics                 = testMetrics,
+    appHealth               = dummyAppHealth,
+    inMemBatchBytes         = 1000000L,
+    cpuParallelism          = 1,
+    windowing               = EventProcessingConfig.TimedWindows(1.minute, 1.0, 1),
+    badRowMaxSize           = 1000000,
+    schemasToSkip           = List.empty,
+    respectIgluNullability  = true,
+    exitOnMissingIgluSchema = false
   )
 
   private val retriesConfig = Config.Retries(

diff --git a/...re/src/test/scala/com.snowplowanalytics.snowplow.lakes/processing/AbstractSparkSpec.scala b/...re/src/test/scala/com.snowplowanalytics.snowplow.lakes/processing/AbstractSparkSpec.scala
@@ -15,21 +15,17 @@ import cats.effect.kernel.Resource
 import cats.implicits._
 import cats.effect.testing.specs2.CatsEffect
 import io.circe.Json
-import fs2.{Chunk, Stream}
 import org.specs2.Specification
 import org.specs2.matcher.MatchResult
 
 import org.apache.spark.sql.{DataFrame, SparkSession}
 
 import scala.concurrent.duration.DurationInt
-import java.nio.charset.StandardCharsets
 import fs2.io.file.{Files, Path}
 
 import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer, SelfDescribingData}
-import com.snowplowanalytics.snowplow.analytics.scalasdk.{Event, SnowplowEvent}
-import com.snowplowanalytics.snowplow.analytics.scalasdk.SnowplowEvent.{Contexts, UnstructEvent}
+import com.snowplowanalytics.snowplow.analytics.scalasdk.SnowplowEvent
 import com.snowplowanalytics.snowplow.lakes.{TestConfig, TestSparkEnvironment}
-import com.snowplowanalytics.snowplow.sources.TokenedEvents
 
 /** Base Spec for testing different output formats of this loader */
 abstract class AbstractSparkSpec extends Specification with CatsEffect {
@@ -62,7 +58,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
 
   def e1 = Files[IO].tempDirectory.use { tmpDir =>
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good()))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good()))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield (inputs, env)
@@ -125,9 +121,9 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs1 <- Resource.eval(inputEvents(2, good(ue = ueGood700)))
+      inputs1 <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ueGood700)))
       tokened1 <- Resource.eval(inputs1.traverse(_.tokened))
-      inputs2 <- Resource.eval(inputEvents(2, good(ue = ueGood701)))
+      inputs2 <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ueGood701)))
       tokened2 <- Resource.eval(inputs2.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened1, tokened2))
     } yield env
@@ -182,9 +178,9 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs1 <- Resource.eval(inputEvents(2, good(ue = ueBadEvolution100)))
+      inputs1 <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ueBadEvolution100)))
       tokened1 <- Resource.eval(inputs1.traverse(_.tokened))
-      inputs2 <- Resource.eval(inputEvents(2, good(ue = ueBadEvolution101)))
+      inputs2 <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ueBadEvolution101)))
       tokened2 <- Resource.eval(inputs2.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened1, tokened2))
     } yield env
@@ -228,7 +224,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good(ue = adBreakEndEvent)))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = adBreakEndEvent)))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield env
@@ -268,7 +264,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good(ue = ue)))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ue)))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield env
@@ -308,7 +304,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good(contexts = adBreakEndEvent)))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(contexts = adBreakEndEvent)))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield env
@@ -348,7 +344,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good(contexts = contexts)))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(contexts = contexts)))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield env
@@ -381,41 +377,6 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
 
 object AbstractSparkSpec {
 
-  case class TestBatch(value: List[Event]) {
-    def tokened: IO[TokenedEvents] = {
-      val serialized = Chunk.from(value).map { e =>
-        StandardCharsets.UTF_8.encode(e.toTsv)
-      }
-      IO.unique.map { ack =>
-        TokenedEvents(serialized, ack, None)
-      }
-    }
-  }
-
-  def inputEvents(count: Long, source: IO[TestBatch]): IO[List[TestBatch]] =
-    Stream
-      .eval(source)
-      .repeat
-      .take(count)
-      .compile
-      .toList
-
-  def good(ue: UnstructEvent = UnstructEvent(None), contexts: Contexts = Contexts(List.empty)): IO[TestBatch] =
-    for {
-      eventId1 <- IO.randomUUID
-      eventId2 <- IO.randomUUID
-      collectorTstamp <- IO.realTimeInstant
-    } yield {
-      val event1 = Event
-        .minimal(eventId1, collectorTstamp, "0.0.0", "0.0.0")
-        .copy(tr_total = Some(1.23))
-        .copy(unstruct_event = ue)
-        .copy(contexts = contexts)
-      val event2 = Event
-        .minimal(eventId2, collectorTstamp, "0.0.0", "0.0.0")
-      TestBatch(List(event1, event2))
-    }
-
   /** A spark session just used for making assertions, not for running the code under test */
   private def sparkForAssertions(config: Map[String, String]): Resource[IO, SparkSession] = {
     val io = IO.blocking {