Skip to content

Commit

Permalink
Use Vector instead of List in transform methods.
Browse files Browse the repository at this point in the history
Currently, the transform functions in common-loaders return a [List](https://github.com/snowplow-incubator/common-streams/blob/0.2.1/modules/loaders-common/src/main/scala/com/snowplowanalytics/snowplow/loaders/transform/Transform.scala#L83).  We construct the list by concatenating two lists: 1 for atomic fields and 1 for the self-describing entity fields ([this line](https://github.com/snowplow-incubator/common-streams/blob/0.2.1/modules/loaders-common/src/main/scala/com/snowplowanalytics/snowplow/loaders/transform/Transform.scala#L86)).

The ::: operator is relatively expensive, especially for large lists.  And this is a “hot” function which we invoke for every single event.

If we switch the implementation to use Vector instead of List then it should be much more efficient to concatenate the two lists.
  • Loading branch information
pondzix committed Mar 13, 2024
1 parent 94d23b3 commit 7bcc80f
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -45,11 +45,11 @@ object Transform {
caster: Caster[A],
event: Event,
batchInfo: NonAtomicFields.Result
): Either[BadRow, List[Caster.NamedValue[A]]] =
): Either[BadRow, Vector[Caster.NamedValue[A]]] =
failForResolverErrors(processor, event, batchInfo.igluFailures) *>
(forAtomic(caster, event), forEntities(caster, event, batchInfo.fields))
(forAtomic(caster, event), forEntities(caster, event, batchInfo.fields.toVector))
.mapN { case (atomic, nonAtomic) =>
atomic ::: nonAtomic
atomic ++ nonAtomic
}
.toEither
.leftMap { nel =>
Expand Down Expand Up @@ -80,12 +80,12 @@ object Transform {
jsonCaster: Json.Folder[A],
event: Event,
entitiesToSkip: List[SchemaCriterion]
): Either[BadRow, List[Caster.NamedValue[A]]] =
): Either[BadRow, Vector[Caster.NamedValue[A]]] =
forAtomic(caster, event).toEither
.map { atomic =>
atomic ::: extractEntities(event, entitiesToSkip).iterator.map { case (key, json) =>
atomic ++ extractEntities(event, entitiesToSkip).iterator.map { case (key, json) =>
Caster.NamedValue(key, json.foldWith(jsonCaster))
}.toList
}.toVector
}
.leftMap { nel =>
BadRow.LoaderIgluError(processor, BadRowFailure.LoaderIgluErrors(nel), BadPayload.LoaderPayload(event))
Expand Down Expand Up @@ -147,8 +147,8 @@ object Transform {
private def forEntities[A](
caster: Caster[A],
event: Event,
entities: List[TypedTabledEntity]
): ValidatedNel[FailureDetails.LoaderIgluError, List[Caster.NamedValue[A]]] =
entities: Vector[TypedTabledEntity]
): ValidatedNel[FailureDetails.LoaderIgluError, Vector[Caster.NamedValue[A]]] =
entities.flatMap { case TypedTabledEntity(entity, field, subVersions, recoveries) =>
val head = forEntity(caster, entity, field, subVersions, event)
val tail = recoveries.map { case (recoveryVersion, recoveryField) =>
Expand Down Expand Up @@ -246,7 +246,7 @@ object Transform {
*
* TODO: implement this using Shapeless to make it less fragile
*/
private def forAtomic[A](caster: Caster[A], event: Event): ValidatedNel[FailureDetails.LoaderIgluError, List[Caster.NamedValue[A]]] =
private def forAtomic[A](caster: Caster[A], event: Event): ValidatedNel[FailureDetails.LoaderIgluError, Vector[Caster.NamedValue[A]]] =
(
event.tr_total.traverse(forMoney(caster, _)),
event.tr_tax.traverse(forMoney(caster, _)),
Expand All @@ -257,7 +257,7 @@ object Transform {
event.tr_shipping_base.traverse(forMoney(caster, _)),
event.ti_price_base.traverse(forMoney(caster, _))
).mapN { case (trTotal, trTax, trShipping, tiPrice, trTotalBase, trTaxBase, trShippingBase, tiPriceBase) =>
List[A](
Vector[A](
event.app_id.fold[A](caster.nullValue)(caster.stringValue(_)),
event.platform.fold[A](caster.nullValue)(caster.stringValue(_)),
event.etl_tstamp.fold[A](caster.nullValue)(caster.timestampValue(_)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ class TransformSpec extends Specification {
NamedValue("geo_region", Json.Null)
)

result must beRight { namedValues: List[NamedValue[Json]] =>
result must beRight { namedValues: Vector[NamedValue[Json]] =>
expected
.map { e =>
namedValues must contain(e).exactly(1.times)
Expand Down Expand Up @@ -85,7 +85,7 @@ class TransformSpec extends Specification {
NamedValue("tr_total", Json.fromDoubleOrNull(12.34))
)

result must beRight { namedValues: List[NamedValue[Json]] =>
result must beRight { namedValues: Vector[NamedValue[Json]] =>
expected
.map { e =>
namedValues must contain(e).exactly(1.times)
Expand Down Expand Up @@ -140,7 +140,7 @@ class TransformSpec extends Specification {

val expected = NamedValue("unstruct_event_com_example_my_schema_7", data)

result must beRight { namedValues: List[NamedValue[Json]] =>
result must beRight { namedValues: Vector[NamedValue[Json]] =>
namedValues must contain(expected).exactly(1.times)
}
}
Expand Down Expand Up @@ -196,7 +196,7 @@ class TransformSpec extends Specification {
)
)

result must beRight { namedValues: List[NamedValue[Json]] =>
result must beRight { namedValues: Vector[NamedValue[Json]] =>
expected
.map { e =>
namedValues must contain(e).exactly(1.times)
Expand Down Expand Up @@ -249,7 +249,7 @@ class TransformSpec extends Specification {
"""
)

result must beRight { namedValues: List[NamedValue[Json]] =>
result must beRight { namedValues: Vector[NamedValue[Json]] =>
namedValues must contain(expected).exactly(1.times)
}
}
Expand Down

0 comments on commit 7bcc80f

Please sign in to comment.