Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support sampling for synthetic event group spec #1425

Merged
merged 14 commits into from
Feb 2, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,14 @@ package org.wfanet.measurement.loadtest.dataprovider
import com.google.protobuf.Descriptors.FieldDescriptor
import com.google.protobuf.Message
import java.time.ZoneOffset
import java.util.Collections
import java.util.Random
import kotlin.math.max
import kotlin.math.min
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.FieldValue
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SimulatorSyntheticDataSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticEventGroupSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticEventGroupSpec.FrequencySpec.VidRangeSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticPopulationSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticPopulationSpec.SubPopulation
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.VidRange
Expand All @@ -46,14 +51,39 @@ object SyntheticDataGeneration {
populationSpec: SyntheticPopulationSpec,
syntheticEventGroupSpec: SyntheticEventGroupSpec,
): Sequence<LabeledEvent<T>> {
var samplingRequired = false
val vidRangeSpecs =
syntheticEventGroupSpec.dateSpecsList
.flatMap { it.frequencySpecsList }
.flatMap { it.vidRangeSpecsList }

for (vidRangeSpec in vidRangeSpecs) {
val vidRangeWidth = vidRangeSpec.vidRange.endExclusive - vidRangeSpec.vidRange.start
check(vidRangeWidth >= vidRangeSpec.sampleSize) {
"all vidRange widths should be larger than sampleSizes"
}
if (vidRangeSpec.sampleSize > 0) {
samplingRequired = true
}
}

if (samplingRequired) {
check(syntheticEventGroupSpec.rngType == SyntheticEventGroupSpec.RngType.JAVA_UTIL_RANDOM) {
"Expecting JAVA_UTIL_RANDOM rng type, got ${syntheticEventGroupSpec.rngType}"
}
}

val subPopulations = populationSpec.subPopulationsList

return sequence {
for (dateSpec: SyntheticEventGroupSpec.DateSpec in syntheticEventGroupSpec.dateSpecsList) {
val dateProgression = dateSpec.dateRange.toProgression()
for (frequencySpec: SyntheticEventGroupSpec.FrequencySpec in dateSpec.frequencySpecsList) {
for (vidRangeSpec: SyntheticEventGroupSpec.FrequencySpec.VidRangeSpec in
frequencySpec.vidRangeSpecsList) {

check(!frequencySpec.hasOverlaps()) { "The VID ranges should be non-overlapping." }

for (vidRangeSpec: VidRangeSpec in frequencySpec.vidRangeSpecsList) {
val random = Random(vidRangeSpec.randomSeed)
val subPopulation: SubPopulation =
vidRangeSpec.vidRange.findSubPopulation(subPopulations)
?: throw IllegalArgumentException()
Expand All @@ -77,9 +107,10 @@ object SyntheticDataGeneration {
@Suppress("UNCHECKED_CAST") // Safe per protobuf API.
val message = builder.build() as T

for (vid in vidRangeSpec.vidRange.start until vidRangeSpec.vidRange.endExclusive) {
for (date in dateProgression) {
for (i in 0 until frequencySpec.frequency) {
for (date in dateProgression) {
for (i in 0 until frequencySpec.frequency) {
val sampledVids = sampleVids(vidRangeSpec, random)
for (vid in sampledVids) {
yield(LabeledEvent(date.atStartOfDay().toInstant(ZoneOffset.UTC), vid, message))
}
}
Expand All @@ -90,6 +121,21 @@ object SyntheticDataGeneration {
}
}

/**
 * Returns the VIDs selected from the range described by [vidRangeSpec].
 *
 * If the sample size of [vidRangeSpec] is 0, no sampling is done and every VID in the range is
 * returned in ascending order. Otherwise the VIDs of the range are shuffled using [random] and the
 * first `sampleSize` of them are returned.
 *
 * The result is deterministic for a given [vidRangeSpec] and a given state of [random]. Each
 * sampling call advances [random], so repeated calls with the same instance may return different
 * samples.
 *
 * @param vidRangeSpec spec providing the VID range and the sample size
 * @param random source of randomness used to shuffle the VIDs when sampling
 * @return all VIDs of the range, or a sample of them of size `sampleSize`
 */
private fun sampleVids(vidRangeSpec: VidRangeSpec, random: Random): List<Long> {
  val vidRangeList =
    (vidRangeSpec.vidRange.start until vidRangeSpec.vidRange.endExclusive).toMutableList()
  if (vidRangeSpec.sampleSize == 0) {
    // No sampling requested: return the whole range untouched and leave `random` unused.
    return vidRangeList
  }

  // Shuffle with the caller-supplied generator so the sample is reproducible from its seed.
  Collections.shuffle(vidRangeList, random)
  return vidRangeList.take(vidRangeSpec.sampleSize)
}

/**
* Returns the [SubPopulation] from a list of [SubPopulation] that contains the [VidRange] in its
* range.
Expand Down Expand Up @@ -154,3 +200,15 @@ object SyntheticDataGeneration {
/**
 * Converts this date range into a [LocalDateProgression] built with `..` from the `start` date to
 * the day before `endExclusive`.
 */
private fun SyntheticEventGroupSpec.DateSpec.DateRange.toProgression(): LocalDateProgression {
  val firstDate = start.toLocalDate()
  // `endExclusive` is not part of the range, so step back one day for the upper bound.
  val lastDate = endExclusive.toLocalDate().minusDays(1)
  return firstDate..lastDate
}

/**
 * Returns `true` if, after ordering the VID ranges of this spec by their start, any range overlaps
 * the range that immediately follows it.
 */
private fun SyntheticEventGroupSpec.FrequencySpec.hasOverlaps(): Boolean {
  val rangesByStart =
    vidRangeSpecsList.map { spec -> spec.vidRange }.sortedBy { range -> range.start }
  // Only neighbours in start order are compared.
  return rangesByStart.zipWithNext().any { (earlier, later) -> earlier.overlaps(later) }
}

/**
 * Returns `true` if this range and [other] share at least one VID, i.e. the larger of the two
 * starts lies strictly before the smaller of the two exclusive ends.
 */
private fun VidRange.overlaps(other: VidRange): Boolean {
  val intersectionStart = max(start, other.start)
  val intersectionEndExclusive = min(endExclusive, other.endExclusive)
  return intersectionStart < intersectionEndExclusive
}
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@ message SyntheticEventGroupSpec {
// A map of `non_population_fields` from `SyntheticPopulationSpec` to
// their values.
map<string, FieldValue> non_population_field_values = 2;

// Number of vids sampled uniformly without replacement from vid_range.
// If this is 0, no sampling is done and all the vids in range are taken.
int32 sample_size = 3;

// Random seed to be fed into the random number generator to sample vids.
// Required if this VidRangeSpec specifies a sample_size.
int64 random_seed = 4;
}
// The VID ranges should be non-overlapping sub-ranges of SubPopulations.
repeated VidRangeSpec vid_range_specs = 2;
Expand All @@ -130,4 +138,16 @@ message SyntheticEventGroupSpec {
}
// `DateSpec`s should describe non-overlapping date ranges.
repeated DateSpec date_specs = 2;

// Type of random number generator used to sample VIDs.
enum RngType {
// Default value used if the rng type is omitted.
// NOTE(review): the Kotlin generator shown alongside this change rejects any
// value other than JAVA_UTIL_RANDOM once a VidRangeSpec has a non-zero
// sample_size, so this value appears to be usable only when no sampling is
// requested — confirm against other consumers.
RNG_TYPE_UNSPECIFIED = 0;
// Signals java.util.Random should be used for sampling.
JAVA_UTIL_RANDOM = 1;
}

// Random Number Generator type for this `SyntheticEventGroupSpec`.
// Required if any VidRangeSpec specifies a sample_size.
RngType rng_type = 4;
}
Loading
Loading