Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

LogsDB data generator - support nested object field #111206

Merged
merged 2 commits into from
Jul 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ public class DataGenerator {
private final FieldDataGenerator topLevelGenerator;

public DataGenerator(DataGeneratorSpecification specification) {
this.topLevelGenerator = new ObjectFieldDataGenerator(specification, 0);
this.topLevelGenerator = new ObjectFieldDataGenerator(specification);
}

public void writeMapping(XContentBuilder mapping) throws IOException {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,59 @@

/**
* Allows configuring behavior of {@link DataGenerator}.
* @param arbitrary provides arbitrary values used during generation
* @param maxFieldCountPerLevel maximum number of fields that an individual object in mapping has.
* Applies to subobjects.
* @param maxObjectDepth maximum depth of nested objects
* @param arbitrary provides arbitrary values used during generation
* @param nestedFieldsLimit how many total nested fields can be present in a produced mapping
*/
public record DataGeneratorSpecification(int maxFieldCountPerLevel, int maxObjectDepth, Arbitrary arbitrary) {
public DataGeneratorSpecification() {
this(50, 3, new RandomBasedArbitrary());
public record DataGeneratorSpecification(Arbitrary arbitrary, int maxFieldCountPerLevel, int maxObjectDepth, int nestedFieldsLimit) {

/**
 * Creates a new {@link Builder} pre-populated with the default values set in its constructor.
 * @return a fresh builder instance
 */
public static Builder builder() {
return new Builder();
}

/**
 * Builds a specification using only the defaults provided by {@link Builder}.
 * @return specification with no customizations applied
 */
public static DataGeneratorSpecification buildDefault() {
    return new Builder().build();
}

/**
 * Fluent builder for {@link DataGeneratorSpecification}.
 * Every setting starts with a default value and can be overridden through the {@code with*} methods.
 */
public static class Builder {
    // Simply sufficiently big numbers to get some permutations
    private int maxFieldCountPerLevel = 50;
    private int maxObjectDepth = 3;
    // Default value of index.mapping.nested_fields.limit
    private int nestedFieldsLimit = 50;
    private Arbitrary arbitrary = new RandomBasedArbitrary();

    public Builder() {}

    /**
     * @param arbitrary source of arbitrary values used during generation
     * @return this builder
     */
    public Builder withArbitrary(Arbitrary arbitrary) {
        this.arbitrary = arbitrary;
        return this;
    }

    /**
     * @param maxFieldCountPerLevel maximum number of fields an individual object may have
     * @return this builder
     */
    public Builder withMaxFieldCountPerLevel(int maxFieldCountPerLevel) {
        this.maxFieldCountPerLevel = maxFieldCountPerLevel;
        return this;
    }

    /**
     * @param maxObjectDepth maximum depth of nested objects
     * @return this builder
     */
    public Builder withMaxObjectDepth(int maxObjectDepth) {
        this.maxObjectDepth = maxObjectDepth;
        return this;
    }

    /**
     * @param nestedFieldsLimit how many total nested fields can be present in a produced mapping
     * @return this builder
     */
    public Builder withNestedFieldsLimit(int nestedFieldsLimit) {
        this.nestedFieldsLimit = nestedFieldsLimit;
        return this;
    }

    /**
     * @return a specification holding the values currently configured on this builder
     */
    public DataGeneratorSpecification build() {
        return new DataGeneratorSpecification(arbitrary, maxFieldCountPerLevel, maxObjectDepth, nestedFieldsLimit);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
public interface Arbitrary {
boolean generateSubObject();

boolean generateNestedObject();

int childFieldCount(int lowerBound, int upperBound);

String fieldName(int lengthLowerBound, int lengthUpperBound);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,12 @@ public boolean generateSubObject() {
return randomDouble() <= 0.1;
}

@Override
public boolean generateNestedObject() {
    // Using a static 10% chance, this is just a chosen value that can be tweaked.
    final double roll = randomDouble();
    return roll <= 0.1;
}

@Override
public int childFieldCount(int lowerBound, int upperBound) {
return randomIntBetween(lowerBound, upperBound);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.logsdb.datageneration.fields;

import org.elasticsearch.logsdb.datageneration.DataGeneratorSpecification;

/**
 * Tracks generation state while the tree of object fields is being built:
 * the specification in use, the depth of the current object and the number
 * of nested fields created so far.
 */
class Context {
    private final DataGeneratorSpecification specification;
    private final int objectDepth;
    // Shared (same instance) by every context derived from one root context, so that
    // the nested fields limit applies to the mapping as a whole. A per-context copy would
    // only count nested ancestors on the current path and sibling branches could
    // together exceed the limit.
    private final java.util.concurrent.atomic.AtomicInteger nestedFieldsCount;

    /**
     * Creates the root context: depth 0 and no nested fields generated yet.
     * @param specification configuration that drives generation
     */
    Context(DataGeneratorSpecification specification) {
        this(specification, 0, new java.util.concurrent.atomic.AtomicInteger(0));
    }

    private Context(
        DataGeneratorSpecification specification,
        int objectDepth,
        java.util.concurrent.atomic.AtomicInteger nestedFieldsCount
    ) {
        this.specification = specification;
        this.objectDepth = objectDepth;
        this.nestedFieldsCount = nestedFieldsCount;
    }

    public DataGeneratorSpecification specification() {
        return specification;
    }

    /**
     * @return context for a child "object" field, one level deeper than this one
     */
    public Context subObject() {
        return new Context(specification, objectDepth + 1, nestedFieldsCount);
    }

    /**
     * Registers one more nested field in the shared counter.
     * @return context for a child "nested" field, one level deeper than this one
     */
    public Context nestedObject() {
        // Count the field before its children are generated so that descendants
        // already see it when they check the limit.
        nestedFieldsCount.incrementAndGet();
        return new Context(specification, objectDepth + 1, nestedFieldsCount);
    }

    /**
     * @return true if the arbitrary source asks for a sub object and the maximum depth is not reached yet
     */
    public boolean shouldAddObjectField() {
        return specification.arbitrary().generateSubObject() && objectDepth < specification.maxObjectDepth();
    }

    /**
     * @return true if the arbitrary source asks for a nested object, the maximum depth is not reached yet
     * and the total number of nested fields generated so far is below the configured limit
     */
    public boolean shouldAddNestedField() {
        return specification.arbitrary().generateNestedObject()
            && objectDepth < specification.maxObjectDepth()
            && nestedFieldsCount.get() < specification.nestedFieldsLimit();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.logsdb.datageneration.fields;

import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.logsdb.datageneration.FieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.FieldType;
import org.elasticsearch.xcontent.XContentBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
* Generic generator for any type of object field (e.g. "object", "nested").
*/
public class GenericSubObjectFieldDataGenerator {
private final Context context;

private final List<ChildField> childFields;

/**
 * Creates the generator and immediately generates its child fields.
 * @param context generation context used to decide which child fields to create
 */
public GenericSubObjectFieldDataGenerator(Context context) {
    this.context = context;
    this.childFields = new ArrayList<>();

    // Both fields above must be assigned before children are generated since generation uses them.
    generateChildFields();
}

/**
 * Produces a writer for the mapping of this object field.
 * The written object contains whatever {@code customMappingParameters} emits,
 * followed by a "properties" object with the mapping of every child field.
 * @param customMappingParameters writes parameters specific to the concrete object type
 * @return consumer that writes the mapping into the supplied builder
 */
public CheckedConsumer<XContentBuilder, IOException> mappingWriter(
    CheckedConsumer<XContentBuilder, IOException> customMappingParameters
) {
    return builder -> {
        builder.startObject();
        customMappingParameters.accept(builder);

        builder.startObject("properties");
        for (ChildField child : childFields) {
            builder.field(child.fieldName());
            child.generator().mappingWriter().accept(builder);
        }
        builder.endObject();

        builder.endObject();
    };
}

/**
 * Produces a writer for a value of this object field.
 * The written object contains one entry per child field, each produced by that child's own value generator.
 * @return consumer that writes the value into the supplied builder
 */
public CheckedConsumer<XContentBuilder, IOException> fieldValueGenerator() {
    return builder -> {
        builder.startObject();

        for (ChildField child : childFields) {
            builder.field(child.fieldName());
            child.generator().fieldValueGenerator().accept(builder);
        }

        builder.endObject();
    };
}

private void generateChildFields() {
var existingFields = new HashSet<String>();
// no child fields is legal
var childFieldsCount = context.specification().arbitrary().childFieldCount(0, context.specification().maxFieldCountPerLevel());

for (int i = 0; i < childFieldsCount; i++) {
var fieldName = generateFieldName(existingFields);

if (context.shouldAddObjectField()) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we give an equal probability of 50% to generating object and nested fields? It currently seems like we prioritize shouldAddObjectField.

Copy link
Contributor Author

@lkts lkts Jul 24, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, this is just based on my impression that nested is less used in reality. I'll do 10% each.

childFields.add(new ChildField(fieldName, new ObjectFieldDataGenerator(context.subObject())));
} else if (context.shouldAddNestedField()) {
childFields.add(new ChildField(fieldName, new NestedFieldDataGenerator(context.nestedObject())));
} else {
var fieldType = context.specification().arbitrary().fieldType();
addLeafField(fieldType, fieldName);
}
}
}

/**
 * Appends a leaf (non-object) child field of the given type.
 * @param type type of the field to add
 * @param fieldName name of the field to add
 */
private void addLeafField(FieldType type, String fieldName) {
    var arbitrary = context.specification().arbitrary();

    final FieldDataGenerator leafGenerator = switch (type) {
        case LONG -> new LongFieldDataGenerator(arbitrary);
        case KEYWORD -> new KeywordFieldDataGenerator(arbitrary);
    };

    childFields.add(new ChildField(fieldName, leafGenerator));
}

/**
 * Generates a field name that is not yet present in {@code existingFields} and records it there.
 * @param existingFields names already used at this level; the new name is added to this set
 * @return the newly generated unique name
 */
private String generateFieldName(Set<String> existingFields) {
    var arbitrary = context.specification().arbitrary();

    String candidate;
    do {
        candidate = arbitrary.fieldName(1, 10);
        // Set#add returns false for a name that is already taken, in which case we retry.
    } while (existingFields.add(candidate) == false);

    return candidate;
}

/**
 * A child field of this object: its name paired with the generator that writes its mapping and values.
 */
private record ChildField(String fieldName, FieldDataGenerator generator) {}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
* or more contributor license agreements. Licensed under the Elastic License
* 2.0 and the Server Side Public License, v 1; you may not use this file except
* in compliance with, at your election, the Elastic License 2.0 or the Server
* Side Public License, v 1.
*/

package org.elasticsearch.logsdb.datageneration.fields;

import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.logsdb.datageneration.FieldDataGenerator;
import org.elasticsearch.xcontent.XContentBuilder;

import java.io.IOException;

/**
 * {@link FieldDataGenerator} for a field mapped with type "nested".
 * All work is delegated to {@link GenericSubObjectFieldDataGenerator}; this class only
 * contributes the "type" mapping parameter.
 */
public class NestedFieldDataGenerator implements FieldDataGenerator {
    private final GenericSubObjectFieldDataGenerator delegate;

    public NestedFieldDataGenerator(Context context) {
        delegate = new GenericSubObjectFieldDataGenerator(context);
    }

    @Override
    public CheckedConsumer<XContentBuilder, IOException> mappingWriter() {
        final CheckedConsumer<XContentBuilder, IOException> nestedTypeParameter = builder -> builder.field("type", "nested");
        return delegate.mappingWriter(nestedTypeParameter);
    }

    @Override
    public CheckedConsumer<XContentBuilder, IOException> fieldValueGenerator() {
        return delegate.fieldValueGenerator();
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,91 +11,28 @@
import org.elasticsearch.core.CheckedConsumer;
import org.elasticsearch.logsdb.datageneration.DataGeneratorSpecification;
import org.elasticsearch.logsdb.datageneration.FieldDataGenerator;
import org.elasticsearch.logsdb.datageneration.FieldType;
import org.elasticsearch.xcontent.XContentBuilder;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

public class ObjectFieldDataGenerator implements FieldDataGenerator {
private final DataGeneratorSpecification specification;
private final int depth;
private final GenericSubObjectFieldDataGenerator delegate;

private final List<ChildField> childFields;
public ObjectFieldDataGenerator(DataGeneratorSpecification specification) {
this(new Context(specification));
}

public ObjectFieldDataGenerator(DataGeneratorSpecification specification, int depth) {
this.specification = specification;
this.depth = depth;
this.childFields = new ArrayList<>();
generateChildFields();
ObjectFieldDataGenerator(Context context) {
this.delegate = new GenericSubObjectFieldDataGenerator(context);
}

@Override
public CheckedConsumer<XContentBuilder, IOException> mappingWriter() {
return b -> {
b.startObject().startObject("properties");

for (var childField : childFields) {
b.field(childField.fieldName);
childField.generator.mappingWriter().accept(b);
}

b.endObject().endObject();
};
return delegate.mappingWriter(b -> {});
}

@Override
public CheckedConsumer<XContentBuilder, IOException> fieldValueGenerator() {
return b -> {
b.startObject();

for (var childField : childFields) {
b.field(childField.fieldName);
childField.generator.fieldValueGenerator().accept(b);
}

b.endObject();
};
}

private void generateChildFields() {
var existingFields = new HashSet<String>();
// no child fields is legal
var childFieldsCount = specification.arbitrary().childFieldCount(0, specification.maxFieldCountPerLevel());

for (int i = 0; i < childFieldsCount; i++) {
var fieldName = generateFieldName(existingFields);

if (specification.arbitrary().generateSubObject() && depth < specification.maxObjectDepth()) {
childFields.add(new ChildField(fieldName, new ObjectFieldDataGenerator(specification, depth + 1)));
} else {
var fieldType = specification.arbitrary().fieldType();
addLeafField(fieldType, fieldName);
}
}
}

private void addLeafField(FieldType type, String fieldName) {
var generator = switch (type) {
case LONG -> new LongFieldDataGenerator(specification.arbitrary());
case KEYWORD -> new KeywordFieldDataGenerator(specification.arbitrary());
};

childFields.add(new ChildField(fieldName, generator));
return delegate.fieldValueGenerator();
}

private String generateFieldName(Set<String> existingFields) {
var fieldName = specification.arbitrary().fieldName(1, 10);
while (existingFields.contains(fieldName)) {
fieldName = specification.arbitrary().fieldName(1, 10);
}
existingFields.add(fieldName);

return fieldName;
}

private record ChildField(String fieldName, FieldDataGenerator generator) {}
}
Loading