Skip to content

Commit

Permalink
Support sampling for synthetic event group spec (#1425)
Browse files Browse the repository at this point in the history
Support sampling for synthetic event group spec
  • Loading branch information
uakyol authored and ple13 committed Aug 16, 2024
1 parent 05a3d7c commit e68809e
Show file tree
Hide file tree
Showing 3 changed files with 555 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,15 @@ package org.wfanet.measurement.loadtest.dataprovider
import com.google.protobuf.Descriptors.FieldDescriptor
import com.google.protobuf.Message
import java.time.ZoneOffset
import java.util.Random
import kotlin.math.max
import kotlin.math.min
import kotlin.random.Random as KotlinRandom
import kotlin.random.asKotlinRandom
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.FieldValue
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SimulatorSyntheticDataSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticEventGroupSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticEventGroupSpec.FrequencySpec.VidRangeSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticPopulationSpec
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.SyntheticPopulationSpec.SubPopulation
import org.wfanet.measurement.api.v2alpha.event_group_metadata.testing.VidRange
Expand All @@ -46,14 +52,39 @@ object SyntheticDataGeneration {
populationSpec: SyntheticPopulationSpec,
syntheticEventGroupSpec: SyntheticEventGroupSpec,
): Sequence<LabeledEvent<T>> {
var samplingRequired = false
val vidRangeSpecs =
syntheticEventGroupSpec.dateSpecsList
.flatMap { it.frequencySpecsList }
.flatMap { it.vidRangeSpecsList }

for (vidRangeSpec in vidRangeSpecs) {
val vidRangeWidth = vidRangeSpec.vidRange.endExclusive - vidRangeSpec.vidRange.start
check(vidRangeWidth >= vidRangeSpec.sampleSize) {
"all vidRange widths should be larger than sampleSizes"
}
if (vidRangeSpec.sampleSize > 0) {
samplingRequired = true
}
}

if (samplingRequired) {
check(syntheticEventGroupSpec.rngType == SyntheticEventGroupSpec.RngType.JAVA_UTIL_RANDOM) {
"Expecting JAVA_UTIL_RANDOM rng type, got ${syntheticEventGroupSpec.rngType}"
}
}

val subPopulations = populationSpec.subPopulationsList

return sequence {
for (dateSpec: SyntheticEventGroupSpec.DateSpec in syntheticEventGroupSpec.dateSpecsList) {
val dateProgression = dateSpec.dateRange.toProgression()
for (frequencySpec: SyntheticEventGroupSpec.FrequencySpec in dateSpec.frequencySpecsList) {
for (vidRangeSpec: SyntheticEventGroupSpec.FrequencySpec.VidRangeSpec in
frequencySpec.vidRangeSpecsList) {

check(!frequencySpec.hasOverlaps()) { "The VID ranges should be non-overlapping." }

for (vidRangeSpec: VidRangeSpec in frequencySpec.vidRangeSpecsList) {
val random = Random(vidRangeSpec.randomSeed).asKotlinRandom()
val subPopulation: SubPopulation =
vidRangeSpec.vidRange.findSubPopulation(subPopulations)
?: throw IllegalArgumentException()
Expand All @@ -77,9 +108,10 @@ object SyntheticDataGeneration {
@Suppress("UNCHECKED_CAST") // Safe per protobuf API.
val message = builder.build() as T

for (vid in vidRangeSpec.vidRange.start until vidRangeSpec.vidRange.endExclusive) {
for (date in dateProgression) {
for (i in 0 until frequencySpec.frequency) {
for (date in dateProgression) {
for (i in 0 until frequencySpec.frequency) {
val sampledVids = sampleVids(vidRangeSpec, random)
for (vid in sampledVids) {
yield(LabeledEvent(date.atStartOfDay().toInstant(ZoneOffset.UTC), vid, message))
}
}
Expand All @@ -90,6 +122,19 @@ object SyntheticDataGeneration {
}
}

/**
 * Selects the VIDs to emit for [vidRangeSpec].
 *
 * When the spec's sample size is 0, no sampling takes place and every VID from the range start
 * (inclusive) up to its exclusive end is returned. Otherwise the range is shuffled using [random]
 * and the first `sampleSize` VIDs of that shuffle are returned.
 *
 * The result is reproducible only for the same [vidRangeSpec] and a [random] in the same state,
 * since the shuffle draws its randomness from [random].
 */
private fun sampleVids(vidRangeSpec: VidRangeSpec, random: KotlinRandom): Sequence<Long> {
  val range = vidRangeSpec.vidRange
  val allVids = (range.start until range.endExclusive).asSequence()
  val requestedCount = vidRangeSpec.sampleSize

  // A sample size of 0 means "take the whole range", not "take nothing".
  return if (requestedCount == 0) allVids else allVids.shuffled(random).take(requestedCount)
}

/**
* Returns the [SubPopulation] from a list of [SubPopulation] that contains the [VidRange] in its
* range.
Expand Down Expand Up @@ -154,3 +199,15 @@ object SyntheticDataGeneration {
private fun SyntheticEventGroupSpec.DateSpec.DateRange.toProgression(): LocalDateProgression {
  val firstDate = start.toLocalDate()
  // The range end is exclusive, so the last date covered is the day before it.
  val lastDate = endExclusive.toLocalDate().minusDays(1)
  return firstDate..lastDate
}

// Returns whether any two VID ranges of this FrequencySpec overlap.
//
// Ranges that contain no VIDs (start >= endExclusive) are dropped first: `overlaps` never
// reports them as overlapping anything, but when one of them sorts between two ranges that do
// overlap it separates that pair, and the adjacent-pair check below would miss the overlap
// (e.g. [0, 10), [5, 5), [6, 8)).
//
// The remaining ranges are sorted by their start. Once every range is non-empty, any overlap
// between two ranges implies an overlap between some pair of neighbours in that order, so
// comparing consecutive ranges is sufficient.
private fun SyntheticEventGroupSpec.FrequencySpec.hasOverlaps(): Boolean =
  vidRangeSpecsList
    .map { it.vidRange }
    .filter { it.start < it.endExclusive }
    .sortedBy { it.start }
    .zipWithNext()
    .any { (first, second) -> first.overlaps(second) }

// Two half-open ranges share at least one VID exactly when the later of the two starts lies
// strictly before the earlier of the two exclusive ends.
private fun VidRange.overlaps(other: VidRange): Boolean {
  val latestStart = max(start, other.start)
  val earliestEnd = min(endExclusive, other.endExclusive)
  return latestStart < earliestEnd
}
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,14 @@ message SyntheticEventGroupSpec {
// A map of `non_population_fields` from `SyntheticPopulationSpec` to
// their values.
map<string, FieldValue> non_population_field_values = 2;

// Number of vids sampled uniformly without replacement from vid_range.
// If this is 0, no sampling is done and all the vids in range are taken.
int32 sample_size = 3;

// Random seed to be fed into the random number generator to sample vids.
// Required if this VidRangeSpec specifies a sample_size.
int64 random_seed = 4;
}
// The VID ranges should be non-overlapping sub-ranges of SubPopulations.
repeated VidRangeSpec vid_range_specs = 2;
Expand All @@ -130,4 +138,16 @@ message SyntheticEventGroupSpec {
}
// `DateSpec`s should describe non-overlapping date ranges.
repeated DateSpec date_specs = 2;

// Type of random number generator used to sample VIDs.
enum RngType {
// Default value, used when the RNG type is omitted.
RNG_TYPE_UNSPECIFIED = 0;
// Signals that java.util.Random should be used for sampling.
JAVA_UTIL_RANDOM = 1;
}

// Random Number Generator type for this `SyntheticEventGroupSpec`.
// Required if any VidRangeSpec specifies a sample_size.
RngType rng_type = 4;
}
Loading

0 comments on commit e68809e

Please sign in to comment.