Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support sampling for synthetic event group spec #1425

Merged
merged 14 commits into from
Feb 2, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,11 @@ package org.wfanet.measurement.loadtest.dataprovider
import com.google.protobuf.Descriptors.FieldDescriptor
import com.google.protobuf.Message
import java.time.ZoneOffset
import kotlin.random.Random
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.FieldValue
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SimulatorSyntheticDataSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticEventGroupSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticEventGroupSpec.FrequencySpec.VidRangeSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticPopulationSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticPopulationSpec.SubPopulation
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.VidRange
Expand All @@ -44,16 +46,19 @@ object SyntheticDataGeneration {
fun <T : Message> generateEvents(
messageInstance: T,
populationSpec: SyntheticPopulationSpec,
syntheticEventGroupSpec: SyntheticEventGroupSpec
syntheticEventGroupSpec: SyntheticEventGroupSpec,
randomSeed: Long = 0L,
): Sequence<LabeledEvent<T>> {
val subPopulations = populationSpec.subPopulationsList

return sequence {
for (dateSpec: SyntheticEventGroupSpec.DateSpec in syntheticEventGroupSpec.dateSpecsList) {
val dateProgression = dateSpec.dateRange.toProgression()
for (frequencySpec: SyntheticEventGroupSpec.FrequencySpec in dateSpec.frequencySpecsList) {
for (vidRangeSpec: SyntheticEventGroupSpec.FrequencySpec.VidRangeSpec in
frequencySpec.vidRangeSpecsList) {

check(!frequencySpec.hasOverlaps()) { "The VID ranges should be non-overlapping." }

for (vidRangeSpec: VidRangeSpec in frequencySpec.vidRangeSpecsList) {
val subPopulation: SubPopulation =
vidRangeSpec.vidRange.findSubPopulation(subPopulations)
?: throw IllegalArgumentException()
Expand All @@ -77,9 +82,10 @@ object SyntheticDataGeneration {
@Suppress("UNCHECKED_CAST") // Safe per protobuf API.
val message = builder.build() as T

for (vid in vidRangeSpec.vidRange.start until vidRangeSpec.vidRange.endExclusive) {
for (date in dateProgression) {
for (i in 0 until frequencySpec.frequency) {
for (date in dateProgression) {
for (i in 0 until frequencySpec.frequency) {
val sampledVids = sampleVids(vidRangeSpec, date.toEpochDay() + randomSeed)
for (vid in sampledVids) {
yield(LabeledEvent(date.atStartOfDay().toInstant(ZoneOffset.UTC), vid, message))
}
}
Expand All @@ -90,6 +96,31 @@ object SyntheticDataGeneration {
}
}

/**
 * Selects the VIDs of [vidRangeSpec] that events are generated for.
 *
 * A sample size of 0 disables sampling, so every VID in the range is returned. Otherwise the range
 * is shuffled with a generator seeded from the range bounds, the sample size and [randomSeed], and
 * the first `sampleSize` VIDs of that shuffle are returned. Identical inputs therefore always
 * yield the identical sample.
 */
private fun sampleVids(vidRangeSpec: VidRangeSpec, randomSeed: Long): Sequence<Long> {
  val range = vidRangeSpec.vidRange
  val requestedCount = vidRangeSpec.sampleSize
  val allVids = (range.start until range.endExclusive).asSequence()

  if (requestedCount == 0) {
    return allVids
  }

  // The seed is derived purely from the inputs: the EDP has to read back the same data whenever
  // it answers the same requisition, so the sample must be reproducible.
  val seed = range.start + range.endExclusive + requestedCount.toLong() + randomSeed
  return allVids.shuffled(Random(seed)).take(requestedCount)
}

/**
* Returns the [SubPopulation] from a list of [SubPopulation] that contains the [VidRange] in its
* range.
Expand Down Expand Up @@ -154,3 +185,19 @@ object SyntheticDataGeneration {
/**
 * Converts this date range into a progression of calendar days that begins at `start` and ends on
 * the day before `endExclusive`.
 */
private fun SyntheticEventGroupSpec.DateSpec.DateRange.toProgression(): LocalDateProgression {
  val firstDay = start.toLocalDate()
  val lastDay = endExclusive.toLocalDate().minusDays(1)
  return firstDay..lastDay
}

/**
 * Returns whether any two VID ranges of this frequency spec share at least one VID.
 *
 * Every non-empty range contributes two points on the VID axis: its first VID (a start) and its
 * last VID (`endExclusive - 1`, an end). With the points sorted, disjoint ranges alternate
 * start, end, start, end; two consecutive starts mean a range opened before the previous one
 * closed.
 */
private fun SyntheticEventGroupSpec.FrequencySpec.hasOverlaps(): Boolean {
  return vidRangeSpecsList
    // A range with start >= endExclusive holds no VIDs, so it cannot overlap anything.
    .filter { vidRangeSpec: VidRangeSpec ->
      vidRangeSpec.vidRange.start < vidRangeSpec.vidRange.endExclusive
    }
    .flatMap { vidRangeSpec: VidRangeSpec ->
      listOf(
        RangePoint(vidRangeSpec.vidRange.start, true),
        RangePoint(vidRangeSpec.vidRange.endExclusive - 1, false),
      )
    }
    // Ties must put starts ahead of ends. The end point is the last VID *inside* its range, so a
    // start at the same VID belongs to an overlapping range. Sorting by x alone left the result
    // dependent on input order and missed cases such as [0, 5) followed by [4, 10).
    .sortedWith(compareBy<RangePoint> { it.x }.thenByDescending { it.isStart })
    .zipWithNext { first, second -> first.isStart && second.isStart }
    .any { it }
}

/**
 * One boundary of a VID range placed on the VID axis.
 *
 * @property x the VID value at which the boundary sits
 * @property isStart `true` when the boundary opens a range, `false` when it closes one
 */
private data class RangePoint(
  val x: Long,
  val isStart: Boolean,
)
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,13 @@ message SyntheticEventGroupSpec {
// A range of VIDs within a single `SubPopulation`.
VidRange vid_range = 1;

// Number of vids sampled uniformly without replacement from vid_range.
// If this is 0, no sampling is done and all the vids in range are taken.
int32 sample_size = 2;

// A map of `non_population_fields` from `SyntheticPopulationSpec` to
// their values.
map<string, FieldValue> non_population_field_values = 2;
map<string, FieldValue> non_population_field_values = 3;
}
// The VID ranges should be non-overlapping sub-ranges of SubPopulations.
repeated VidRangeSpec vid_range_specs = 2;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,244 @@ class SyntheticDataGenerationTest {
assertThat(labeledEvents).containsExactlyElementsIn(expectedTestEvents)
}

@Test
fun `generateEvents returns a sequence of sampled events when sample size specified`() {
  // Sample sizes requested for each VID range below; each is smaller than its 25-VID range.
  val frequencyOneSampleSize = 2
  val frequencyTwoFirstSampleSize = 5
  val frequencyTwoSecondSampleSize = 10

  // Population of VIDs [0, 100) split into two sub-populations of 50 VIDs each.
  val populationSpec = syntheticPopulationSpec {
    vidRange = vidRange {
      start = 0L
      endExclusive = 100L
    }

    populationFields += "person.gender"
    populationFields += "person.age_group"

    nonPopulationFields += "banner_ad.viewable"
    nonPopulationFields += "video_ad.viewed_fraction"

    subPopulations +=
      SyntheticPopulationSpecKt.subPopulation {
        vidSubRange = vidRange {
          start = 0L
          endExclusive = 50L
        }

        populationFieldsValues["person.gender"] = fieldValue {
          enumValue = Person.Gender.MALE_VALUE
        }
        populationFieldsValues["person.age_group"] = fieldValue {
          enumValue = Person.AgeGroup.YEARS_18_TO_34_VALUE
        }
      }
    subPopulations +=
      SyntheticPopulationSpecKt.subPopulation {
        vidSubRange = vidRange {
          start = 50L
          endExclusive = 100L
        }

        populationFieldsValues["person.gender"] = fieldValue {
          enumValue = Person.Gender.FEMALE_VALUE
        }
        populationFieldsValues["person.age_group"] = fieldValue {
          enumValue = Person.AgeGroup.YEARS_18_TO_34_VALUE
        }
      }
  }

  // A single day (2023-06-27) with one frequency-2 spec over two ranges and one frequency-1 spec
  // over one range, all of them sampled.
  val eventGroupSpec = syntheticEventGroupSpec {
    description = "event group 1"

    dateSpecs +=
      SyntheticEventGroupSpecKt.dateSpec {
        dateRange =
          SyntheticEventGroupSpecKt.DateSpecKt.dateRange {
            start = date {
              year = 2023
              month = 6
              day = 27
            }
            endExclusive = date {
              year = 2023
              month = 6
              day = 28
            }
          }

        frequencySpecs +=
          SyntheticEventGroupSpecKt.frequencySpec {
            frequency = 2

            vidRangeSpecs +=
              SyntheticEventGroupSpecKt.FrequencySpecKt.vidRangeSpec {
                vidRange = vidRange {
                  start = 0L
                  endExclusive = 25L
                }

                sampleSize = frequencyTwoFirstSampleSize

                nonPopulationFieldValues["banner_ad.viewable"] = fieldValue { boolValue = true }
                nonPopulationFieldValues["video_ad.viewed_fraction"] = fieldValue {
                  doubleValue = 0.5
                }
              }
            vidRangeSpecs +=
              SyntheticEventGroupSpecKt.FrequencySpecKt.vidRangeSpec {
                vidRange = vidRange {
                  start = 25L
                  endExclusive = 50L
                }

                sampleSize = frequencyTwoSecondSampleSize

                nonPopulationFieldValues["banner_ad.viewable"] = fieldValue { boolValue = false }
                nonPopulationFieldValues["video_ad.viewed_fraction"] = fieldValue {
                  doubleValue = 0.7
                }
              }
          }
        frequencySpecs +=
          SyntheticEventGroupSpecKt.frequencySpec {
            frequency = 1

            vidRangeSpecs +=
              SyntheticEventGroupSpecKt.FrequencySpecKt.vidRangeSpec {
                vidRange = vidRange {
                  start = 50L
                  endExclusive = 75L
                }

                sampleSize = frequencyOneSampleSize

                nonPopulationFieldValues["banner_ad.viewable"] = fieldValue { boolValue = true }
                nonPopulationFieldValues["video_ad.viewed_fraction"] = fieldValue {
                  doubleValue = 0.8
                }
              }
          }
      }
  }

  // Each sampled VID produces `frequency` events on the single day covered by the spec.
  val expectedEventCount =
    frequencyOneSampleSize + 2 * (frequencyTwoFirstSampleSize + frequencyTwoSecondSampleSize)

  val generatedEvents: List<LabeledEvent<TestEvent>> =
    SyntheticDataGeneration.generateEvents(
        messageInstance = TestEvent.getDefaultInstance(),
        populationSpec = populationSpec,
        syntheticEventGroupSpec = eventGroupSpec,
      )
      .toList()

  assertThat(generatedEvents.size).isEqualTo(expectedEventCount)
}

// Verifies that a frequency spec whose VID ranges overlap is rejected during event generation.
@Test
fun `generateEvents fails when overlapping vidRanges exist`() {
  // Population of VIDs [0, 100) split into two sub-populations of 50 VIDs each.
  val population = syntheticPopulationSpec {
    vidRange = vidRange {
      start = 0L
      endExclusive = 100L
    }

    populationFields += "person.gender"
    populationFields += "person.age_group"

    nonPopulationFields += "banner_ad.viewable"
    nonPopulationFields += "video_ad.viewed_fraction"

    subPopulations +=
      SyntheticPopulationSpecKt.subPopulation {
        vidSubRange = vidRange {
          start = 0L
          endExclusive = 50L
        }

        populationFieldsValues["person.gender"] = fieldValue {
          enumValue = Person.Gender.MALE_VALUE
        }
        populationFieldsValues["person.age_group"] = fieldValue {
          enumValue = Person.AgeGroup.YEARS_18_TO_34_VALUE
        }
      }
    subPopulations +=
      SyntheticPopulationSpecKt.subPopulation {
        vidSubRange = vidRange {
          start = 50L
          endExclusive = 100L
        }

        populationFieldsValues["person.gender"] = fieldValue {
          enumValue = Person.Gender.FEMALE_VALUE
        }
        populationFieldsValues["person.age_group"] = fieldValue {
          enumValue = Person.AgeGroup.YEARS_18_TO_34_VALUE
        }
      }
  }
  val eventGroupSpec = syntheticEventGroupSpec {
    description = "event group 1"

    dateSpecs +=
      SyntheticEventGroupSpecKt.dateSpec {
        dateRange =
          SyntheticEventGroupSpecKt.DateSpecKt.dateRange {
            start = date {
              year = 2023
              month = 6
              day = 27
            }
            endExclusive = date {
              year = 2023
              month = 6
              day = 28
            }
          }

        frequencySpecs +=
          SyntheticEventGroupSpecKt.frequencySpec {
            frequency = 2

            vidRangeSpecs +=
              SyntheticEventGroupSpecKt.FrequencySpecKt.vidRangeSpec {
                vidRange = vidRange {
                  start = 0L
                  endExclusive = 25L
                }

                nonPopulationFieldValues["banner_ad.viewable"] = fieldValue { boolValue = true }
                nonPopulationFieldValues["video_ad.viewed_fraction"] = fieldValue {
                  doubleValue = 0.5
                }
              }
            vidRangeSpecs +=
              SyntheticEventGroupSpecKt.FrequencySpecKt.vidRangeSpec {
                vidRange = vidRange {
                  // 20 is in between 0 and 25, the previous range.
                  start = 20L
                  endExclusive = 50L
                }

                nonPopulationFieldValues["banner_ad.viewable"] = fieldValue { boolValue = false }
                nonPopulationFieldValues["video_ad.viewed_fraction"] = fieldValue {
                  doubleValue = 0.7
                }
              }
          }
      }
  }

  // generateEvents rejects overlapping ranges through `check`, which throws
  // IllegalStateException. The check sits inside the lazily evaluated sequence, so the sequence
  // has to be consumed (toList) for it to run.
  assertFailsWith<IllegalStateException> {
    SyntheticDataGeneration.generateEvents(
        TestEvent.getDefaultInstance(),
        population,
        eventGroupSpec,
      )
      .toList()
  }
}

@Test
fun `generateEvents returns messages with a Duration field`() {
val populationSpec = syntheticPopulationSpec {
Expand Down
Loading