snowplow-incubator · istreeter · Sep 5, 2024 · Aug 16, 2024 · pondzix · Sep 4, 2024
diff --git a/config/config.aws.reference.hocon b/config/config.aws.reference.hocon
@@ -207,6 +207,12 @@
     "iglu:com.acme/skipped4/jsonschema/*-*-*"
   ]
 
+  # -- Whether the loader should crash and exit if it fails to resolve an Iglu Schema.
+  # -- We recommend `true` because Snowplow enriched events have already passed validation, so a missing schema normally
+  # -- indicates an error that needs addressing.
+  # -- Change to `false` so events go to the failed events stream instead of crashing the loader.
+  "exitOnMissingIgluSchema": true
+
   # -- Whether the output parquet files should declare nested fields as non-nullable according to the Iglu schema.
   # -- When true (default), nested fields are nullable only if they are not required fields according to the Iglu schema.
   # -- When false, all nested fields are defined as nullable in the output table's schemas

diff --git a/config/config.azure.reference.hocon b/config/config.azure.reference.hocon
@@ -178,6 +178,12 @@
     "iglu:com.acme/skipped4/jsonschema/*-*-*"
   ]
 
+  # -- Whether the loader should crash and exit if it fails to resolve an Iglu Schema.
+  # -- We recommend `true` because Snowplow enriched events have already passed validation, so a missing schema normally
+  # -- indicates an error that needs addressing.
+  # -- Change to `false` so events go to the failed events stream instead of crashing the loader.
+  "exitOnMissingIgluSchema": true
+
   # -- Whether the output parquet files should declare nested fields as non-nullable according to the Iglu schema.
   # -- When true (default), nested fields are nullable only if they are not required fields according to the Iglu schema.
   # -- When false, all nested fields are defined as nullable in the output table's schemas

diff --git a/config/config.gcp.reference.hocon b/config/config.gcp.reference.hocon
@@ -186,6 +186,12 @@
     "iglu:com.acme/skipped4/jsonschema/*-*-*"
   ]
 
+  # -- Whether the loader should crash and exit if it fails to resolve an Iglu Schema.
+  # -- We recommend `true` because Snowplow enriched events have already passed validation, so a missing schema normally
+  # -- indicates an error that needs addressing.
+  # -- Change to `false` so events go to the failed events stream instead of crashing the loader.
+  "exitOnMissingIgluSchema": true
+
   # -- Whether the output parquet files should declare nested fields as non-nullable according to the Iglu schema.
   # -- When true (default), nested fields are nullable only if they are not required fields according to the Iglu schema.
   # -- When false, all nested fields are defined as nullable in the output table's schemas

diff --git a/modules/core/src/main/resources/reference.conf b/modules/core/src/main/resources/reference.conf
@@ -139,6 +139,7 @@
 
   "skipSchemas": []
   "respectIgluNullability": true
+  "exitOnMissingIgluSchema": true
 
   "monitoring": {
     "metrics": {

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Config.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Config.scala
@@ -39,6 +39,7 @@ case class Config[+Source, +Sink](
   license: AcceptedLicense,
   skipSchemas: List[SchemaCriterion],
   respectIgluNullability: Boolean,
+  exitOnMissingIgluSchema: Boolean,
   retries: Config.Retries
 )
 

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Environment.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/Environment.scala
@@ -53,7 +53,8 @@ case class Environment[F[_]](
   windowing: EventProcessingConfig.TimedWindows,
   badRowMaxSize: Int,
   schemasToSkip: List[SchemaCriterion],
-  respectIgluNullability: Boolean
+  respectIgluNullability: Boolean,
+  exitOnMissingIgluSchema: Boolean
 )
 
 object Environment {
@@ -82,20 +83,21 @@ object Environment {
       metrics <- Resource.eval(Metrics.build(config.main.monitoring.metrics))
       cpuParallelism = chooseCpuParallelism(config.main)
     } yield Environment(
-      appInfo                = appInfo,
-      source                 = sourceAndAck,
-      badSink                = badSink,
-      resolver               = resolver,
-      httpClient             = httpClient,
-      lakeWriter             = lakeWriterWrapped,
-      metrics                = metrics,
-      appHealth              = appHealth,
-      cpuParallelism         = cpuParallelism,
-      inMemBatchBytes        = config.main.inMemBatchBytes,
-      windowing              = windowing,
-      badRowMaxSize          = config.main.output.bad.maxRecordSize,
-      schemasToSkip          = config.main.skipSchemas,
-      respectIgluNullability = config.main.respectIgluNullability
+      appInfo                 = appInfo,
+      source                  = sourceAndAck,
+      badSink                 = badSink,
+      resolver                = resolver,
+      httpClient              = httpClient,
+      lakeWriter              = lakeWriterWrapped,
+      metrics                 = metrics,
+      appHealth               = appHealth,
+      cpuParallelism          = cpuParallelism,
+      inMemBatchBytes         = config.main.inMemBatchBytes,
+      windowing               = windowing,
+      badRowMaxSize           = config.main.output.bad.maxRecordSize,
+      schemasToSkip           = config.main.skipSchemas,
+      respectIgluNullability  = config.main.respectIgluNullability,
+      exitOnMissingIgluSchema = config.main.exitOnMissingIgluSchema
     )
 
   private def enableSentry[F[_]: Sync](appInfo: AppInfo, config: Option[Config.Sentry]): Resource[F, Unit] =

diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/RuntimeService.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/RuntimeService.scala
@@ -15,9 +15,11 @@ sealed trait RuntimeService
 object RuntimeService {
   case object SparkWriter extends RuntimeService
   case object BadSink extends RuntimeService
+  case object Iglu extends RuntimeService
 
   implicit val show: Show[RuntimeService] = Show.show {
     case SparkWriter => "Spark writer"
     case BadSink     => "Failed events sink"
+    case Iglu        => "Iglu repositories"
   }
 }
diff --git a/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/Processing.scala b/modules/core/src/main/scala/com.snowplowanalytics.snowplow.lakes/processing/Processing.scala
@@ -16,6 +16,7 @@ import cats.{Applicative, Foldable, Functor}
 import cats.effect.{Async, Deferred, Sync}
 import cats.effect.kernel.{Ref, Unique}
 import fs2.{Chunk, Pipe, Stream}
+import io.circe.syntax._
 import org.typelevel.log4cats.Logger
 import org.typelevel.log4cats.slf4j.Slf4jLogger
 import org.apache.spark.sql.Row
@@ -134,6 +135,7 @@ object Processing {
       for {
         _ <- Logger[F].debug(s"Processing batch of size ${events.size}")
         nonAtomicFields <- NonAtomicFields.resolveTypes[F](env.resolver, entities, env.schemasToSkip)
+        _ <- possiblyExitOnMissingIgluSchema(env, nonAtomicFields)
         _ <- rememberColumnNames(ref, nonAtomicFields.fields)
         (bad, rows) <- transformToSpark[F](badProcessor, events, nonAtomicFields)
         _ <- sendFailedEvents(env, badProcessor, bad)
@@ -277,4 +279,14 @@ object Processing {
 
       Stream.eval(commit) >> Stream.emits(state.tokens.reverse)
     }
+
+  /** Logs an error, marks Iglu unhealthy and raises an exception if configured to exit and any Iglu schema failed to resolve */
+  private def possiblyExitOnMissingIgluSchema[F[_]: Sync](env: Environment[F], nonAtomicFields: NonAtomicFields.Result): F[Unit] =
+    if (!env.exitOnMissingIgluSchema || nonAtomicFields.igluFailures.isEmpty) Applicative[F].unit
+    else {
+      val prefix =
+        "Exiting because failed to resolve Iglu schemas.  Either check the configuration of the Iglu repos, or set the `skipSchemas` config option, or set `exitOnMissingIgluSchema` to false.\n"
+      val error = new RuntimeException(nonAtomicFields.igluFailures.map(_.failure.asJson.noSpaces).mkString(prefix, "\n", ""))
+      Logger[F].error(prefix) *> env.appHealth.beUnhealthyForRuntimeService(RuntimeService.Iglu) *> Sync[F].raiseError(error)
+    }
 }
diff --git a/modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/MockEnvironment.scala b/modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/MockEnvironment.scala
@@ -76,20 +76,21 @@ object MockEnvironment {
       source = testSourceAndAck(windows, state)
     } yield {
       val env = Environment(
-        appInfo                = TestSparkEnvironment.appInfo,
-        source                 = source,
-        badSink                = testSink(state),
-        resolver               = Resolver[IO](Nil, None),
-        httpClient             = testHttpClient,
-        lakeWriter             = testLakeWriter(state),
-        metrics                = testMetrics(state),
-        appHealth              = testAppHealth(state),
-        inMemBatchBytes        = 1000000L,
-        cpuParallelism         = 1,
-        windowing              = EventProcessingConfig.TimedWindows(1.minute, 1.0, 1),
-        badRowMaxSize          = 1000000,
-        schemasToSkip          = List.empty,
-        respectIgluNullability = true
+        appInfo                 = TestSparkEnvironment.appInfo,
+        source                  = source,
+        badSink                 = testSink(state),
+        resolver                = Resolver[IO](Nil, None),
+        httpClient              = testHttpClient,
+        lakeWriter              = testLakeWriter(state),
+        metrics                 = testMetrics(state),
+        appHealth               = testAppHealth(state),
+        inMemBatchBytes         = 1000000L,
+        cpuParallelism          = 1,
+        windowing               = EventProcessingConfig.TimedWindows(1.minute, 1.0, 1),
+        badRowMaxSize           = 1000000,
+        schemasToSkip           = List.empty,
+        respectIgluNullability  = true,
+        exitOnMissingIgluSchema = false
       )
       MockEnvironment(state, env)
     }

diff --git a/modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/TestSparkEnvironment.scala b/modules/core/src/test/scala/com.snowplowanalytics.snowplow.lakes/TestSparkEnvironment.scala
@@ -36,20 +36,21 @@ object TestSparkEnvironment {
     lakeWriter <- LakeWriter.build[IO](testConfig.spark, testConfig.output.good)
     lakeWriterWrapped = LakeWriter.withHandledErrors(lakeWriter, dummyAppHealth, retriesConfig, PartialFunction.empty)
   } yield Environment(
-    appInfo                = appInfo,
-    source                 = source,
-    badSink                = Sink[IO](_ => IO.unit),
-    resolver               = Resolver[IO](Nil, None),
-    httpClient             = testHttpClient,
-    lakeWriter             = lakeWriterWrapped,
-    metrics                = testMetrics,
-    appHealth              = dummyAppHealth,
-    inMemBatchBytes        = 1000000L,
-    cpuParallelism         = 1,
-    windowing              = EventProcessingConfig.TimedWindows(1.minute, 1.0, 1),
-    badRowMaxSize          = 1000000,
-    schemasToSkip          = List.empty,
-    respectIgluNullability = true
+    appInfo                 = appInfo,
+    source                  = source,
+    badSink                 = Sink[IO](_ => IO.unit),
+    resolver                = Resolver[IO](Nil, None),
+    httpClient              = testHttpClient,
+    lakeWriter              = lakeWriterWrapped,
+    metrics                 = testMetrics,
+    appHealth               = dummyAppHealth,
+    inMemBatchBytes         = 1000000L,
+    cpuParallelism          = 1,
+    windowing               = EventProcessingConfig.TimedWindows(1.minute, 1.0, 1),
+    badRowMaxSize           = 1000000,
+    schemasToSkip           = List.empty,
+    respectIgluNullability  = true,
+    exitOnMissingIgluSchema = false
   )
 
   private val retriesConfig = Config.Retries(

diff --git a/...re/src/test/scala/com.snowplowanalytics.snowplow.lakes/processing/AbstractSparkSpec.scala b/...re/src/test/scala/com.snowplowanalytics.snowplow.lakes/processing/AbstractSparkSpec.scala
@@ -15,21 +15,17 @@ import cats.effect.kernel.Resource
 import cats.implicits._
 import cats.effect.testing.specs2.CatsEffect
 import io.circe.Json
-import fs2.{Chunk, Stream}
 import org.specs2.Specification
 import org.specs2.matcher.MatchResult
 
 import org.apache.spark.sql.{DataFrame, SparkSession}
 
 import scala.concurrent.duration.DurationInt
-import java.nio.charset.StandardCharsets
 import fs2.io.file.{Files, Path}
 
 import com.snowplowanalytics.iglu.core.{SchemaKey, SchemaVer, SelfDescribingData}
-import com.snowplowanalytics.snowplow.analytics.scalasdk.{Event, SnowplowEvent}
-import com.snowplowanalytics.snowplow.analytics.scalasdk.SnowplowEvent.{Contexts, UnstructEvent}
+import com.snowplowanalytics.snowplow.analytics.scalasdk.SnowplowEvent
 import com.snowplowanalytics.snowplow.lakes.{TestConfig, TestSparkEnvironment}
-import com.snowplowanalytics.snowplow.sources.TokenedEvents
 
 /** Base Spec for testing different output formats of this loader */
 abstract class AbstractSparkSpec extends Specification with CatsEffect {
@@ -62,7 +58,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
 
   def e1 = Files[IO].tempDirectory.use { tmpDir =>
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good()))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good()))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield (inputs, env)
@@ -125,9 +121,9 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs1 <- Resource.eval(inputEvents(2, good(ue = ueGood700)))
+      inputs1 <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ueGood700)))
       tokened1 <- Resource.eval(inputs1.traverse(_.tokened))
-      inputs2 <- Resource.eval(inputEvents(2, good(ue = ueGood701)))
+      inputs2 <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ueGood701)))
       tokened2 <- Resource.eval(inputs2.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened1, tokened2))
     } yield env
@@ -182,9 +178,9 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs1 <- Resource.eval(inputEvents(2, good(ue = ueBadEvolution100)))
+      inputs1 <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ueBadEvolution100)))
       tokened1 <- Resource.eval(inputs1.traverse(_.tokened))
-      inputs2 <- Resource.eval(inputEvents(2, good(ue = ueBadEvolution101)))
+      inputs2 <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ueBadEvolution101)))
       tokened2 <- Resource.eval(inputs2.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened1, tokened2))
     } yield env
@@ -228,7 +224,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good(ue = adBreakEndEvent)))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = adBreakEndEvent)))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield env
@@ -268,7 +264,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good(ue = ue)))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(ue = ue)))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield env
@@ -308,7 +304,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good(contexts = adBreakEndEvent)))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(contexts = adBreakEndEvent)))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield env
@@ -348,7 +344,7 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
     )
 
     val resources = for {
-      inputs <- Resource.eval(inputEvents(2, good(contexts = contexts)))
+      inputs <- Resource.eval(EventUtils.inputEvents(2, EventUtils.good(contexts = contexts)))
       tokened <- Resource.eval(inputs.traverse(_.tokened))
       env <- TestSparkEnvironment.build(target, tmpDir, List(tokened))
     } yield env
@@ -381,41 +377,6 @@ abstract class AbstractSparkSpec extends Specification with CatsEffect {
 
 object AbstractSparkSpec {
 
-  case class TestBatch(value: List[Event]) {
-    def tokened: IO[TokenedEvents] = {
-      val serialized = Chunk.from(value).map { e =>
-        StandardCharsets.UTF_8.encode(e.toTsv)
-      }
-      IO.unique.map { ack =>
-        TokenedEvents(serialized, ack, None)
-      }
-    }
-  }
-
-  def inputEvents(count: Long, source: IO[TestBatch]): IO[List[TestBatch]] =
-    Stream
-      .eval(source)
-      .repeat
-      .take(count)
-      .compile
-      .toList
-
-  def good(ue: UnstructEvent = UnstructEvent(None), contexts: Contexts = Contexts(List.empty)): IO[TestBatch] =
-    for {
-      eventId1 <- IO.randomUUID
-      eventId2 <- IO.randomUUID
-      collectorTstamp <- IO.realTimeInstant
-    } yield {
-      val event1 = Event
-        .minimal(eventId1, collectorTstamp, "0.0.0", "0.0.0")
-        .copy(tr_total = Some(1.23))
-        .copy(unstruct_event = ue)
-        .copy(contexts = contexts)
-      val event2 = Event
-        .minimal(eventId2, collectorTstamp, "0.0.0", "0.0.0")
-      TestBatch(List(event1, event2))
-    }
-
   /** A spark session just used for making assertions, not for running the code under test */
   private def sparkForAssertions(config: Map[String, String]): Resource[IO, SparkSession] = {
     val io = IO.blocking {