Skip to content

Commit

Permalink
[8.x] Better sizing BytesRef for Strings in Queries (#115655) (#116381)
Browse files Browse the repository at this point in the history
* Better sizing BytesRef for Strings in Queries (#115655)

* Better sizing BytesRefs for Strings in Queries

* Update docs/changelog/115655.yaml

* iter

* added test

* iter

* extracted method

* iter

---------

Co-authored-by: Elastic Machine <[email protected]>
(cherry picked from commit 9ebe95a)

* iter
  • Loading branch information
piergm authored Nov 7, 2024
1 parent 69df7fb commit 94498b4
Show file tree
Hide file tree
Showing 4 changed files with 53 additions and 7 deletions.
5 changes: 5 additions & 0 deletions docs/changelog/115655.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
pr: 115655
summary: Better sizing `BytesRef` for Strings in Queries
area: Search
type: enhancement
issues: []
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.UnicodeUtil;

public class BytesRefs {

Expand Down Expand Up @@ -56,6 +57,25 @@ public static BytesRef checkIndexableLength(BytesRef input) {
return input;
}

/**
 * Encodes a string as UTF-8 into a {@link BytesRef} whose backing byte array is exactly as
 * long as the encoded value.
 * <p>
 * This is an alternative to the standard {@link BytesRef} constructor: it allocates exactly
 * the byte array size needed for the string. To do so the UTF-16 string is scanned twice,
 * first to compute the UTF-8 length used to size the array, and then to copy the encoded
 * value into that array.
 * </p>
 *
 * @param s the input string to convert
 * @return a BytesRef object representing the input string
 */
public static BytesRef toExactSizedBytesRef(String s) {
    final int charCount = s.length();
    // First pass: measure the UTF-8 encoding so the array can be sized without slack.
    final int utf8Length = UnicodeUtil.calcUTF16toUTF8Length(s, 0, charCount);
    final byte[] utf8 = new byte[utf8Length];
    // Second pass: write the encoded bytes into the exactly sized array.
    UnicodeUtil.UTF16toUTF8(s, 0, charCount, utf8);
    return new BytesRef(utf8);
}

/**
* Produces a UTF-string prefix of the input BytesRef. If the prefix cutoff would produce
* ill-formed UTF, it falls back to the hexadecimal representation.
Expand All @@ -70,5 +90,4 @@ private static String safeStringPrefix(BytesRef input, int prefixLength) {
return prefix.toString();
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -216,12 +216,12 @@ public final int hashCode() {
* @return the same input object or a {@link BytesRef} representation if input was of type string
*/
static Object maybeConvertToBytesRef(Object obj) {
// NOTE(review): this span is a rendered diff hunk (-216,12 +216,12) whose +/- markers were
// stripped, so the removed and the added lines appear back to back. The next six lines
// appear to be the pre-change version that this commit removes.
if (obj instanceof String) {
return BytesRefs.checkIndexableLength(BytesRefs.toBytesRef(obj));
} else if (obj instanceof CharBuffer) {
return BytesRefs.checkIndexableLength(new BytesRef((CharBuffer) obj));
} else if (obj instanceof BigInteger) {
return BytesRefs.toBytesRef(obj);
// NOTE(review): the next six lines appear to be the post-change replacement. They use
// pattern-matching instanceof to bind the typed value to v, and route String inputs through
// BytesRefs.toExactSizedBytesRef instead of BytesRefs.toBytesRef.
if (obj instanceof String v) {
return BytesRefs.checkIndexableLength(BytesRefs.toExactSizedBytesRef(v));
} else if (obj instanceof CharBuffer v) {
return BytesRefs.checkIndexableLength(new BytesRef(v));
} else if (obj instanceof BigInteger v) {
return BytesRefs.toBytesRef(v);
}
// Inputs that are not a String, CharBuffer or BigInteger are handed back unchanged.
return obj;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
package org.elasticsearch.index.query;

import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.util.BytesRef;
import org.elasticsearch.common.ParsingException;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.search.SearchModule;
Expand Down Expand Up @@ -93,4 +94,25 @@ public void testMaybeConvertToBytesRefLongTerm() {
assertThat(e.getMessage(), containsString("term starting with [aaaaa"));
}

/**
 * Builds a random term whose first third uses chars below 128, whose second third uses chars
 * in [128, 2047] and whose remainder uses chars in [2048, 4092], then checks that the
 * converted {@link BytesRef} reports, and is backed by an array of, exactly the expected
 * number of UTF-8 bytes.
 */
public void testMaybeConvertToBytesRefStringCorrectSize() {
    final int charCount = randomIntBetween(20, 40);
    final int oneByteEnd = charCount / 3;
    final int twoByteEnd = 2 * charCount / 3;
    final StringBuilder term = new StringBuilder(charCount);
    int expectedBytes = 0;
    for (int pos = 0; pos < charCount; pos++) {
        final int value;
        final int width;
        if (pos < oneByteEnd) {
            // chars < 128 take one byte
            value = randomIntBetween(0, 127);
            width = 1;
        } else if (pos < twoByteEnd) {
            // chars < 2048 take two bytes
            value = randomIntBetween(128, 2047);
            width = 2;
        } else {
            // chars >= 2048 take three bytes
            value = randomIntBetween(2048, 4092);
            width = 3;
        }
        term.append((char) value);
        expectedBytes += width;
    }
    final Object converted = AbstractQueryBuilder.maybeConvertToBytesRef(term.toString());
    final BytesRef bytesRef = (BytesRef) converted;
    // Both the backing array and the logical length must match: no over-allocation.
    assertEquals(expectedBytes, bytesRef.bytes.length);
    assertEquals(expectedBytes, bytesRef.length);
}

}

0 comments on commit 94498b4

Please sign in to comment.