Skip to content

Commit

Permalink
[ML] Return both Joda and Java formats from structure finder (#33900)
Browse files Browse the repository at this point in the history
Previously the timestamp_formats field in the response
from the find_file_structure endpoint contained Joda
timestamp formats.  This change makes that clear by
renaming the field to joda_timestamp_formats, and also
adds a java_timestamp_formats field containing the
equivalent Java time format strings.
  • Loading branch information
droberts195 authored Sep 25, 2018
1 parent 612201a commit dfe5af0
Show file tree
Hide file tree
Showing 17 changed files with 1,254 additions and 233 deletions.
792 changes: 790 additions & 2 deletions docs/reference/ml/apis/find-file-structure.asciidoc

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,8 @@ public String toString() {
public static final ParseField SHOULD_TRIM_FIELDS = new ParseField("should_trim_fields");
public static final ParseField GROK_PATTERN = new ParseField("grok_pattern");
public static final ParseField TIMESTAMP_FIELD = new ParseField("timestamp_field");
public static final ParseField TIMESTAMP_FORMATS = new ParseField("timestamp_formats");
public static final ParseField JODA_TIMESTAMP_FORMATS = new ParseField("joda_timestamp_formats");
public static final ParseField JAVA_TIMESTAMP_FORMATS = new ParseField("java_timestamp_formats");
public static final ParseField NEED_CLIENT_TIMEZONE = new ParseField("need_client_timezone");
public static final ParseField MAPPINGS = new ParseField("mappings");
public static final ParseField FIELD_STATS = new ParseField("field_stats");
Expand All @@ -123,7 +124,8 @@ public String toString() {
PARSER.declareBoolean(Builder::setShouldTrimFields, SHOULD_TRIM_FIELDS);
PARSER.declareString(Builder::setGrokPattern, GROK_PATTERN);
PARSER.declareString(Builder::setTimestampField, TIMESTAMP_FIELD);
PARSER.declareStringArray(Builder::setTimestampFormats, TIMESTAMP_FORMATS);
PARSER.declareStringArray(Builder::setJodaTimestampFormats, JODA_TIMESTAMP_FORMATS);
PARSER.declareStringArray(Builder::setJavaTimestampFormats, JAVA_TIMESTAMP_FORMATS);
PARSER.declareBoolean(Builder::setNeedClientTimezone, NEED_CLIENT_TIMEZONE);
PARSER.declareObject(Builder::setMappings, (p, c) -> new TreeMap<>(p.map()), MAPPINGS);
PARSER.declareObject(Builder::setFieldStats, (p, c) -> {
Expand All @@ -150,7 +152,8 @@ public String toString() {
private final Character quote;
private final Boolean shouldTrimFields;
private final String grokPattern;
private final List<String> timestampFormats;
private final List<String> jodaTimestampFormats;
private final List<String> javaTimestampFormats;
private final String timestampField;
private final boolean needClientTimezone;
private final SortedMap<String, Object> mappings;
Expand All @@ -160,8 +163,9 @@ public String toString() {
public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampleStart, String charset, Boolean hasByteOrderMarker,
Format format, String multilineStartPattern, String excludeLinesPattern, List<String> columnNames,
Boolean hasHeaderRow, Character delimiter, Character quote, Boolean shouldTrimFields, String grokPattern,
String timestampField, List<String> timestampFormats, boolean needClientTimezone, Map<String, Object> mappings,
Map<String, FieldStats> fieldStats, List<String> explanation) {
String timestampField, List<String> jodaTimestampFormats, List<String> javaTimestampFormats,
boolean needClientTimezone, Map<String, Object> mappings, Map<String, FieldStats> fieldStats,
List<String> explanation) {

this.numLinesAnalyzed = numLinesAnalyzed;
this.numMessagesAnalyzed = numMessagesAnalyzed;
Expand All @@ -178,7 +182,10 @@ public FileStructure(int numLinesAnalyzed, int numMessagesAnalyzed, String sampl
this.shouldTrimFields = shouldTrimFields;
this.grokPattern = grokPattern;
this.timestampField = timestampField;
this.timestampFormats = (timestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(timestampFormats));
this.jodaTimestampFormats =
(jodaTimestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(jodaTimestampFormats));
this.javaTimestampFormats =
(javaTimestampFormats == null) ? null : Collections.unmodifiableList(new ArrayList<>(javaTimestampFormats));
this.needClientTimezone = needClientTimezone;
this.mappings = Collections.unmodifiableSortedMap(new TreeMap<>(mappings));
this.fieldStats = Collections.unmodifiableSortedMap(new TreeMap<>(fieldStats));
Expand All @@ -200,7 +207,8 @@ public FileStructure(StreamInput in) throws IOException {
quote = in.readBoolean() ? (char) in.readVInt() : null;
shouldTrimFields = in.readOptionalBoolean();
grokPattern = in.readOptionalString();
timestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
jodaTimestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
javaTimestampFormats = in.readBoolean() ? Collections.unmodifiableList(in.readList(StreamInput::readString)) : null;
timestampField = in.readOptionalString();
needClientTimezone = in.readBoolean();
mappings = Collections.unmodifiableSortedMap(new TreeMap<>(in.readMap()));
Expand Down Expand Up @@ -239,11 +247,17 @@ public void writeTo(StreamOutput out) throws IOException {
}
out.writeOptionalBoolean(shouldTrimFields);
out.writeOptionalString(grokPattern);
if (timestampFormats == null) {
if (jodaTimestampFormats == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
out.writeCollection(timestampFormats, StreamOutput::writeString);
out.writeCollection(jodaTimestampFormats, StreamOutput::writeString);
}
if (javaTimestampFormats == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
out.writeCollection(javaTimestampFormats, StreamOutput::writeString);
}
out.writeOptionalString(timestampField);
out.writeBoolean(needClientTimezone);
Expand Down Expand Up @@ -312,8 +326,12 @@ public String getTimestampField() {
return timestampField;
}

public List<String> getTimestampFormats() {
return timestampFormats;
public List<String> getJodaTimestampFormats() {
return jodaTimestampFormats;
}

public List<String> getJavaTimestampFormats() {
return javaTimestampFormats;
}

public boolean needClientTimezone() {
Expand Down Expand Up @@ -371,8 +389,11 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws
if (timestampField != null && timestampField.isEmpty() == false) {
builder.field(TIMESTAMP_FIELD.getPreferredName(), timestampField);
}
if (timestampFormats != null && timestampFormats.isEmpty() == false) {
builder.field(TIMESTAMP_FORMATS.getPreferredName(), timestampFormats);
if (jodaTimestampFormats != null && jodaTimestampFormats.isEmpty() == false) {
builder.field(JODA_TIMESTAMP_FORMATS.getPreferredName(), jodaTimestampFormats);
}
if (javaTimestampFormats != null && javaTimestampFormats.isEmpty() == false) {
builder.field(JAVA_TIMESTAMP_FORMATS.getPreferredName(), javaTimestampFormats);
}
builder.field(NEED_CLIENT_TIMEZONE.getPreferredName(), needClientTimezone);
builder.field(MAPPINGS.getPreferredName(), mappings);
Expand All @@ -396,7 +417,7 @@ public int hashCode() {

return Objects.hash(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern,
timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
timestampField, jodaTimestampFormats, javaTimestampFormats, needClientTimezone, mappings, fieldStats, explanation);
}

@Override
Expand All @@ -413,7 +434,6 @@ public boolean equals(Object other) {
FileStructure that = (FileStructure) other;
return this.numLinesAnalyzed == that.numLinesAnalyzed &&
this.numMessagesAnalyzed == that.numMessagesAnalyzed &&
this.needClientTimezone == that.needClientTimezone &&
Objects.equals(this.sampleStart, that.sampleStart) &&
Objects.equals(this.charset, that.charset) &&
Objects.equals(this.hasByteOrderMarker, that.hasByteOrderMarker) &&
Expand All @@ -427,7 +447,9 @@ public boolean equals(Object other) {
Objects.equals(this.shouldTrimFields, that.shouldTrimFields) &&
Objects.equals(this.grokPattern, that.grokPattern) &&
Objects.equals(this.timestampField, that.timestampField) &&
Objects.equals(this.timestampFormats, that.timestampFormats) &&
Objects.equals(this.jodaTimestampFormats, that.jodaTimestampFormats) &&
Objects.equals(this.javaTimestampFormats, that.javaTimestampFormats) &&
this.needClientTimezone == that.needClientTimezone &&
Objects.equals(this.mappings, that.mappings) &&
Objects.equals(this.fieldStats, that.fieldStats) &&
Objects.equals(this.explanation, that.explanation);
Expand All @@ -450,7 +472,8 @@ public static class Builder {
private Boolean shouldTrimFields;
private String grokPattern;
private String timestampField;
private List<String> timestampFormats;
private List<String> jodaTimestampFormats;
private List<String> javaTimestampFormats;
private boolean needClientTimezone;
private Map<String, Object> mappings;
private Map<String, FieldStats> fieldStats = Collections.emptyMap();
Expand Down Expand Up @@ -539,8 +562,13 @@ public Builder setTimestampField(String timestampField) {
return this;
}

public Builder setTimestampFormats(List<String> timestampFormats) {
this.timestampFormats = timestampFormats;
public Builder setJodaTimestampFormats(List<String> jodaTimestampFormats) {
this.jodaTimestampFormats = jodaTimestampFormats;
return this;
}

public Builder setJavaTimestampFormats(List<String> javaTimestampFormats) {
this.javaTimestampFormats = javaTimestampFormats;
return this;
}

Expand Down Expand Up @@ -652,11 +680,21 @@ public FileStructure build() {
throw new IllegalStateException("enum value [" + format + "] missing from switch.");
}

if ((timestampField == null) != (timestampFormats == null || timestampFormats.isEmpty())) {
throw new IllegalArgumentException("Timestamp field and timestamp formats must both be specified or neither be specified.");
boolean isTimestampFieldSpecified = (timestampField != null);
boolean isJodaTimestampFormatsSpecified = (jodaTimestampFormats != null && jodaTimestampFormats.isEmpty() == false);
boolean isJavaTimestampFormatsSpecified = (javaTimestampFormats != null && javaTimestampFormats.isEmpty() == false);

if (isTimestampFieldSpecified != isJodaTimestampFormatsSpecified) {
throw new IllegalArgumentException(
"Timestamp field and Joda timestamp formats must both be specified or neither be specified.");
}

if (isTimestampFieldSpecified != isJavaTimestampFormatsSpecified) {
throw new IllegalArgumentException(
"Timestamp field and Java timestamp formats must both be specified or neither be specified.");
}

if (needClientTimezone && timestampField == null) {
if (needClientTimezone && isTimestampFieldSpecified == false) {
throw new IllegalArgumentException("Client timezone cannot be needed if there is no timestamp field.");
}

Expand All @@ -670,7 +708,7 @@ public FileStructure build() {

return new FileStructure(numLinesAnalyzed, numMessagesAnalyzed, sampleStart, charset, hasByteOrderMarker, format,
multilineStartPattern, excludeLinesPattern, columnNames, hasHeaderRow, delimiter, quote, shouldTrimFields, grokPattern,
timestampField, timestampFormats, needClientTimezone, mappings, fieldStats, explanation);
timestampField, jodaTimestampFormats, javaTimestampFormats, needClientTimezone, mappings, fieldStats, explanation);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ public static FileStructure createTestFileStructure() {

if (format == FileStructure.Format.SEMI_STRUCTURED_TEXT || randomBoolean()) {
builder.setTimestampField(randomAlphaOfLength(10));
builder.setTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false)));
builder.setJodaTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false)));
builder.setJavaTimestampFormats(Arrays.asList(generateRandomStringArray(3, 20, false, false)));
builder.setNeedClientTimezone(randomBoolean());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
}

structureBuilder.setTimestampField(timeField.v1())
.setTimestampFormats(timeField.v2().dateFormats)
.setJodaTimestampFormats(timeField.v2().jodaTimestampFormats)
.setJavaTimestampFormats(timeField.v2().javaTimestampFormats)
.setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing())
.setMultilineStartPattern(timeLineRegex);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ static Map<String, String> guessScalarMapping(List<String> explanation, String f
Iterator<String> iter = fieldValues.iterator();
TimestampMatch timestampMatch = TimestampFormatFinder.findFirstFullMatch(iter.next());
while (timestampMatch != null && iter.hasNext()) {
// To be mapped as type date all the values must match the same date format - it is
// To be mapped as type date all the values must match the same timestamp format - it is
// not acceptable for all values to be dates, but with different formats
if (timestampMatch.equals(TimestampFormatFinder.findFirstFullMatch(iter.next(), timestampMatch.candidateIndex)) == false) {
timestampMatch = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,8 @@ static JsonFileStructureFinder makeJsonFileStructureFinder(List<String> explanat
Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides);
if (timeField != null) {
structureBuilder.setTimestampField(timeField.v1())
.setTimestampFormats(timeField.v2().dateFormats)
.setJodaTimestampFormats(timeField.v2().jodaTimestampFormats)
.setJavaTimestampFormats(timeField.v2().javaTimestampFormats)
.setNeedClientTimezone(timeField.v2().hasTimezoneDependentParsing());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ static TextLogFileStructureFinder makeTextLogFileStructureFinder(List<String> ex

FileStructure structure = structureBuilder
.setTimestampField(interimTimestampField)
.setTimestampFormats(bestTimestamp.v1().dateFormats)
.setJodaTimestampFormats(bestTimestamp.v1().jodaTimestampFormats)
.setJavaTimestampFormats(bestTimestamp.v1().javaTimestampFormats)
.setNeedClientTimezone(bestTimestamp.v1().hasTimezoneDependentParsing())
.setGrokPattern(grokPattern)
.setMappings(mappings)
Expand Down Expand Up @@ -147,8 +148,8 @@ static Tuple<TimestampMatch, Set<String>> mostLikelyTimestamp(String[] sampleLin
for (String sampleLine : sampleLines) {
TimestampMatch match = TimestampFormatFinder.findFirstMatch(sampleLine, overrides.getTimestampFormat());
if (match != null) {
TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.dateFormats, match.simplePattern,
match.grokPatternName, "");
TimestampMatch pureMatch = new TimestampMatch(match.candidateIndex, "", match.jodaTimestampFormats,
match.javaTimestampFormats, match.simplePattern, match.grokPatternName, "");
timestampMatches.compute(pureMatch, (k, v) -> {
if (v == null) {
return new Tuple<>(weightForMatch(match.preface), new HashSet<>(Collections.singletonList(match.preface)));
Expand Down
Loading

0 comments on commit dfe5af0

Please sign in to comment.