From ff178b6bc1a1e243b0405141fc6ac11471f7c952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Poniedzia=C5=82ek?= Date: Wed, 13 Mar 2024 12:48:43 -0700 Subject: [PATCH] Use `Vector` instead of `List` in transform methods. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, the transform functions in common-loaders return a [List](https://github.com/snowplow-incubator/common-streams/blob/0.2.1/modules/loaders-common/src/main/scala/com/snowplowanalytics/snowplow/loaders/transform/Transform.scala#L83). We construct the list by concatenating two lists: 1 for atomic fields and 1 for the self-describing entity fields ([this line](https://github.com/snowplow-incubator/common-streams/blob/0.2.1/modules/loaders-common/src/main/scala/com/snowplowanalytics/snowplow/loaders/transform/Transform.scala#L86)). The ::: operator is relatively expensive, especially for large lists. And this is a “hot” function which we invoke for every single event. If we switch the implementation to use Vector instead of List then it should be much more efficient to concatenate the two lists. --- .../loaders/transform/Transform.scala | 20 +++++++++---------- .../transform/TransformSpec.scala | 10 +++++----- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/modules/loaders-common/src/main/scala/com/snowplowanalytics/snowplow/loaders/transform/Transform.scala b/modules/loaders-common/src/main/scala/com/snowplowanalytics/snowplow/loaders/transform/Transform.scala index a71a9f3..2415162 100644 --- a/modules/loaders-common/src/main/scala/com/snowplowanalytics/snowplow/loaders/transform/Transform.scala +++ b/modules/loaders-common/src/main/scala/com/snowplowanalytics/snowplow/loaders/transform/Transform.scala @@ -45,11 +45,11 @@ object Transform { caster: Caster[A], event: Event, batchInfo: NonAtomicFields.Result - ): Either[BadRow, List[Caster.NamedValue[A]]] = + ): Either[BadRow, Vector[Caster.NamedValue[A]]] = failForResolverErrors(processor, event, batchInfo.igluFailures) *> - (forAtomic(caster, event), forEntities(caster, event, batchInfo.fields)) + (forAtomic(caster, event), forEntities(caster, event, batchInfo.fields.toVector)) .mapN { case (atomic, nonAtomic) => - atomic ::: nonAtomic + atomic ++ nonAtomic } .toEither .leftMap { nel => @@ -80,12 +80,12 @@ object Transform { jsonCaster: Json.Folder[A], event: Event, entitiesToSkip: List[SchemaCriterion] - ): Either[BadRow, List[Caster.NamedValue[A]]] = + ): Either[BadRow, Vector[Caster.NamedValue[A]]] = forAtomic(caster, event).toEither .map { atomic => - atomic ::: extractEntities(event, entitiesToSkip).iterator.map { case (key, json) => + atomic ++ extractEntities(event, entitiesToSkip).iterator.map { case (key, json) => Caster.NamedValue(key, json.foldWith(jsonCaster)) - }.toList + }.toVector } .leftMap { nel => BadRow.LoaderIgluError(processor, BadRowFailure.LoaderIgluErrors(nel), BadPayload.LoaderPayload(event)) @@ -147,8 +147,8 @@ object Transform { private def forEntities[A]( caster: Caster[A], event: Event, - entities: List[TypedTabledEntity] - ): ValidatedNel[FailureDetails.LoaderIgluError, List[Caster.NamedValue[A]]] = + entities: Vector[TypedTabledEntity] + ): ValidatedNel[FailureDetails.LoaderIgluError, Vector[Caster.NamedValue[A]]] = entities.flatMap { case TypedTabledEntity(entity, field, subVersions, recoveries) => val head = forEntity(caster, entity, field, subVersions, event) val tail = recoveries.map { case (recoveryVersion, recoveryField) => @@ -246,7 +246,7 @@ object Transform { * * TODO: implement this using Shapeless to make it less fragile */ - private def forAtomic[A](caster: Caster[A], event: Event): ValidatedNel[FailureDetails.LoaderIgluError, List[Caster.NamedValue[A]]] = + private def forAtomic[A](caster: Caster[A], event: Event): ValidatedNel[FailureDetails.LoaderIgluError, Vector[Caster.NamedValue[A]]] = ( event.tr_total.traverse(forMoney(caster, _)), event.tr_tax.traverse(forMoney(caster, _)), @@ -257,7 +257,7 @@ object Transform { event.tr_shipping_base.traverse(forMoney(caster, _)), event.ti_price_base.traverse(forMoney(caster, _)) ).mapN { case (trTotal, trTax, trShipping, tiPrice, trTotalBase, trTaxBase, trShippingBase, tiPriceBase) => - List[A]( + Vector[A]( event.app_id.fold[A](caster.nullValue)(caster.stringValue(_)), event.platform.fold[A](caster.nullValue)(caster.stringValue(_)), event.etl_tstamp.fold[A](caster.nullValue)(caster.timestampValue(_)), diff --git a/modules/loaders-common/src/test/scala/com.snowplowanalytics.snowplow.loaders/transform/TransformSpec.scala b/modules/loaders-common/src/test/scala/com.snowplowanalytics.snowplow.loaders/transform/TransformSpec.scala index a603f83..67acedf 100644 --- a/modules/loaders-common/src/test/scala/com.snowplowanalytics.snowplow.loaders/transform/TransformSpec.scala +++ b/modules/loaders-common/src/test/scala/com.snowplowanalytics.snowplow.loaders/transform/TransformSpec.scala @@ -53,7 +53,7 @@ class TransformSpec extends Specification { NamedValue("geo_region", Json.Null) ) - result must beRight { namedValues: List[NamedValue[Json]] => + result must beRight { namedValues: Vector[NamedValue[Json]] => expected .map { e => namedValues must contain(e).exactly(1.times) @@ -85,7 +85,7 @@ class TransformSpec extends Specification { NamedValue("tr_total", Json.fromDoubleOrNull(12.34)) ) - result must beRight { namedValues: List[NamedValue[Json]] => + result must beRight { namedValues: Vector[NamedValue[Json]] => expected .map { e => namedValues must contain(e).exactly(1.times) @@ -140,7 +140,7 @@ class TransformSpec extends Specification { val expected = NamedValue("unstruct_event_com_example_my_schema_7", data) - result must beRight { namedValues: List[NamedValue[Json]] => + result must beRight { namedValues: Vector[NamedValue[Json]] => namedValues must contain(expected).exactly(1.times) } } @@ -196,7 +196,7 @@ class TransformSpec extends Specification { ) ) - result must beRight { namedValues: List[NamedValue[Json]] => + result must beRight { namedValues: Vector[NamedValue[Json]] => expected .map { e => namedValues must contain(e).exactly(1.times) @@ -249,7 +249,7 @@ class TransformSpec extends Specification { """ ) - result must beRight { namedValues: List[NamedValue[Json]] => + result must beRight { namedValues: Vector[NamedValue[Json]] => namedValues must contain(expected).exactly(1.times) } }