Better sizing BytesRef for Strings in Queries (elastic#115655)

* Better sizing BytesRefs for Strings in Queries * Update docs/changelog/115655.yaml * iter * added test * iter * extracted method * iter --------- Co-authored-by: Elastic Machine <[email protected]> (cherry picked from commit 9ebe95a)
piergm · Nov 7, 2024 · 49935dd · 49935dd
1 parent 7641277
commit 49935dd
Show file tree

Hide file tree

Showing 4 changed files with 53 additions and 7 deletions.
diff --git a/docs/changelog/115655.yaml b/docs/changelog/115655.yaml
@@ -0,0 +1,5 @@
+pr: 115655
+summary: Better sizing `BytesRef` for Strings in Queries
+area: Search
+type: enhancement
+issues: []
diff --git a/server/src/main/java/org/elasticsearch/common/lucene/BytesRefs.java b/server/src/main/java/org/elasticsearch/common/lucene/BytesRefs.java
@@ -11,6 +11,7 @@
 
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.UnicodeUtil;
 
 public class BytesRefs {
 
@@ -56,6 +57,25 @@ public static BytesRef checkIndexableLength(BytesRef input) {
         return input;
     }
 
+    /**
+     * Converts the given string to a {@link BytesRef} backed by an exactly sized byte array.
+     * <p>
+     * This is an alternative to the standard {@link BytesRef} constructor: it allocates only the
+     * bytes that the UTF-8 form of the string needs. To do so it walks the UTF-16 string twice,
+     * first to compute the encoded length and then to copy the encoded value into the array.
+     * </p>
+     *
+     * @param s the input string to convert
+     * @return a BytesRef whose backing array is exactly as long as the encoded input string
+     */
+    public static BytesRef toExactSizedBytesRef(String s) {
+        final int charCount = s.length();
+        final int utf8Length = UnicodeUtil.calcUTF16toUTF8Length(s, 0, charCount);
+        final byte[] utf8 = new byte[utf8Length];
+        UnicodeUtil.UTF16toUTF8(s, 0, charCount, utf8);
+        return new BytesRef(utf8, 0, utf8.length);
+    }
+
     /**
      * Produces a UTF-string prefix of the input BytesRef.  If the prefix cutoff would produce
      * ill-formed UTF, it falls back to the hexadecimal representation.
@@ -70,5 +90,4 @@ private static String safeStringPrefix(BytesRef input, int prefixLength) {
             return prefix.toString();
         }
     }
-
 }
diff --git a/server/src/main/java/org/elasticsearch/index/query/AbstractQueryBuilder.java b/server/src/main/java/org/elasticsearch/index/query/AbstractQueryBuilder.java
@@ -216,12 +216,12 @@ public final int hashCode() {
      * @return the same input object or a {@link BytesRef} representation if input was of type string
      */
     static Object maybeConvertToBytesRef(Object obj) {
-        if (obj instanceof String) {
-            return BytesRefs.checkIndexableLength(BytesRefs.toBytesRef(obj));
-        } else if (obj instanceof CharBuffer) {
-            return BytesRefs.checkIndexableLength(new BytesRef((CharBuffer) obj));
-        } else if (obj instanceof BigInteger) {
-            return BytesRefs.toBytesRef(obj);
+        if (obj instanceof String v) {
+            return BytesRefs.checkIndexableLength(BytesRefs.toExactSizedBytesRef(v)); // exact-sized array, then length check
+        } else if (obj instanceof CharBuffer v) {
+            return BytesRefs.checkIndexableLength(new BytesRef(v)); // CharBuffer still uses the BytesRef constructor
+        } else if (obj instanceof BigInteger v) {
+            return BytesRefs.toBytesRef(v); // converted without the indexable-length check
         }
         return obj; // any other type is handed back unchanged
     }

diff --git a/server/src/test/java/org/elasticsearch/index/query/AbstractQueryBuilderTests.java b/server/src/test/java/org/elasticsearch/index/query/AbstractQueryBuilderTests.java
@@ -10,6 +10,7 @@
 package org.elasticsearch.index.query;
 
 import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.util.BytesRef;
 import org.elasticsearch.common.ParsingException;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.search.SearchModule;
@@ -93,4 +94,25 @@ public void testMaybeConvertToBytesRefLongTerm() {
         assertThat(e.getMessage(), containsString("term starting with [aaaaa"));
     }
 
+    public void testMaybeConvertToBytesRefStringCorrectSize() { // backing array must match the UTF-8 length exactly
+        int capacity = randomIntBetween(20, 40);
+        StringBuilder termBuilder = new StringBuilder(capacity);
+        int correctSize = 0;
+        for (int i = 0; i < capacity; i++) {
+            if (i < capacity / 3) {
+                termBuilder.append((char) randomIntBetween(0, 127)); // bounds are inclusive: U+0080 already needs two bytes
+                ++correctSize; // one UTF-8 byte for chars <= 0x7F
+            } else if (i < 2 * capacity / 3) {
+                termBuilder.append((char) randomIntBetween(128, 2047)); // bounds are inclusive: U+0800 already needs three bytes
+                correctSize += 2; // two UTF-8 bytes for chars in [0x80, 0x7FF]
+            } else {
+                termBuilder.append((char) randomIntBetween(2048, 4092)); // well below the surrogate range, so no pairs
+                correctSize += 3; // three UTF-8 bytes for chars >= 0x800
+            }
+        }
+        BytesRef bytesRef = (BytesRef) AbstractQueryBuilder.maybeConvertToBytesRef(termBuilder.toString());
+        assertEquals(correctSize, bytesRef.bytes.length);
+        assertEquals(correctSize, bytesRef.length);
+    }
+
 }