From 49935dd5980b96f3ff8e0107bbc5ebef8695fb42 Mon Sep 17 00:00:00 2001 From: Matteo Piergiovanni <134913285+piergm@users.noreply.github.com> Date: Thu, 7 Nov 2024 09:33:57 +0100 Subject: [PATCH] Better sizing BytesRef for Strings in Queries (#115655) * Better sizing BytesRefs for Strings in Queries * Update docs/changelog/115655.yaml * iter * added test * iter * extracted method * iter --------- Co-authored-by: Elastic Machine (cherry picked from commit 9ebe95a8a806927147f054a211f26034254560f6) --- docs/changelog/115655.yaml | 5 +++++ .../common/lucene/BytesRefs.java | 21 +++++++++++++++++- .../index/query/AbstractQueryBuilder.java | 12 +++++----- .../query/AbstractQueryBuilderTests.java | 22 +++++++++++++++++++ 4 files changed, 53 insertions(+), 7 deletions(-) create mode 100644 docs/changelog/115655.yaml diff --git a/docs/changelog/115655.yaml b/docs/changelog/115655.yaml new file mode 100644 index 0000000000000..7184405867657 --- /dev/null +++ b/docs/changelog/115655.yaml @@ -0,0 +1,5 @@ +pr: 115655 +summary: Better sizing `BytesRef` for Strings in Queries +area: Search +type: enhancement +issues: [] diff --git a/server/src/main/java/org/elasticsearch/common/lucene/BytesRefs.java b/server/src/main/java/org/elasticsearch/common/lucene/BytesRefs.java index ff8af9b80edcc..ed88c3a5a9c91 100644 --- a/server/src/main/java/org/elasticsearch/common/lucene/BytesRefs.java +++ b/server/src/main/java/org/elasticsearch/common/lucene/BytesRefs.java @@ -11,6 +11,7 @@ import org.apache.lucene.index.IndexWriter; import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.UnicodeUtil; public class BytesRefs { @@ -56,6 +57,25 @@ public static BytesRef checkIndexableLength(BytesRef input) { return input; } + /** + * Converts a given string to a {@link BytesRef} object with an exactly sized byte array. + *

+ * This method alternative method to the standard {@link BytesRef} constructor's allocates the + * exact byte array size needed for the string. This is done by parsing the UTF-16 string two + * times the first to estimate the array length and the second to copy the string value inside + * the array. + *

+ * + * @param s the input string to convert + * @return a BytesRef object representing the input string + */ + public static BytesRef toExactSizedBytesRef(String s) { + int l = s.length(); + byte[] b = new byte[UnicodeUtil.calcUTF16toUTF8Length(s, 0, l)]; + UnicodeUtil.UTF16toUTF8(s, 0, l, b); + return new BytesRef(b, 0, b.length); + } + /** * Produces a UTF-string prefix of the input BytesRef. If the prefix cutoff would produce * ill-formed UTF, it falls back to the hexadecimal representation. @@ -70,5 +90,4 @@ private static String safeStringPrefix(BytesRef input, int prefixLength) { return prefix.toString(); } } - } diff --git a/server/src/main/java/org/elasticsearch/index/query/AbstractQueryBuilder.java b/server/src/main/java/org/elasticsearch/index/query/AbstractQueryBuilder.java index 5df63687e1786..f00e6904feac7 100644 --- a/server/src/main/java/org/elasticsearch/index/query/AbstractQueryBuilder.java +++ b/server/src/main/java/org/elasticsearch/index/query/AbstractQueryBuilder.java @@ -216,12 +216,12 @@ public final int hashCode() { * @return the same input object or a {@link BytesRef} representation if input was of type string */ static Object maybeConvertToBytesRef(Object obj) { - if (obj instanceof String) { - return BytesRefs.checkIndexableLength(BytesRefs.toBytesRef(obj)); - } else if (obj instanceof CharBuffer) { - return BytesRefs.checkIndexableLength(new BytesRef((CharBuffer) obj)); - } else if (obj instanceof BigInteger) { - return BytesRefs.toBytesRef(obj); + if (obj instanceof String v) { + return BytesRefs.checkIndexableLength(BytesRefs.toExactSizedBytesRef(v)); + } else if (obj instanceof CharBuffer v) { + return BytesRefs.checkIndexableLength(new BytesRef(v)); + } else if (obj instanceof BigInteger v) { + return BytesRefs.toBytesRef(v); } return obj; } diff --git a/server/src/test/java/org/elasticsearch/index/query/AbstractQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/AbstractQueryBuilderTests.java index 07c8166741e63..a43c1a8ba3395 100644 --- a/server/src/test/java/org/elasticsearch/index/query/AbstractQueryBuilderTests.java +++ b/server/src/test/java/org/elasticsearch/index/query/AbstractQueryBuilderTests.java @@ -10,6 +10,7 @@ package org.elasticsearch.index.query; import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.util.BytesRef; import org.elasticsearch.common.ParsingException; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.search.SearchModule; @@ -93,4 +94,25 @@ public void testMaybeConvertToBytesRefLongTerm() { assertThat(e.getMessage(), containsString("term starting with [aaaaa")); } + public void testMaybeConvertToBytesRefStringCorrectSize() { + int capacity = randomIntBetween(20, 40); + StringBuilder termBuilder = new StringBuilder(capacity); + int correctSize = 0; + for (int i = 0; i < capacity; i++) { + if (i < capacity / 3) { + termBuilder.append((char) randomIntBetween(0, 128)); + ++correctSize; // use only one byte for char < 128 + } else if (i < 2 * capacity / 3) { + termBuilder.append((char) randomIntBetween(128, 2048)); + correctSize += 2; // use two bytes for char < 2048 + } else { + termBuilder.append((char) randomIntBetween(2048, 4092)); + correctSize += 3; // use three bytes for char >= 2048 + } + } + BytesRef bytesRef = (BytesRef) AbstractQueryBuilder.maybeConvertToBytesRef(termBuilder.toString()); + assertEquals(correctSize, bytesRef.bytes.length); + assertEquals(correctSize, bytesRef.length); + } + }