elastic · benwtrent · Jul 13, 2022 · Jun 15, 2022 · Jul 8, 2022 · Jul 8, 2022
diff --git a/...ava/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java b/...ava/org/elasticsearch/xpack/ml/inference/nlp/tokenizers/PrecompiledCharMapNormalizer.java
@@ -144,7 +144,7 @@ Optional<BytesRef> normalizePart(byte[] strBytes, int offset, int len) {
             secondIndex++;
         }
         if (secondIndex == firstIndex) {
-            return Optional.empty();
+            return Optional.of(new BytesRef(BytesRef.EMPTY_BYTES));
         }
         return Optional.of(new BytesRef(normalizedStrUtf8Bytes, firstIndex, secondIndex - firstIndex));
     }
@@ -155,40 +155,43 @@ String normalize(String str) {
         // break points from there. But, this seemed the easiest way for now
         //
         // Keep in mind, these break points aren't necessarily surrogate pairs, but also codepoints that contain a combining mark
+        byte[] strBytes = str.getBytes(StandardCharsets.UTF_8);
+        char[] strChars = str.toCharArray();
+        int[] strCp = str.codePoints().toArray();
         BreakIterator b = BreakIterator.getCharacterInstance(Locale.ROOT);
         b.setText(str);
         int start = b.first();
-        // If we knew the utf-8 length ahead of time (and iterated over the bytes in the appropriate chunks)
-        // we could pre-populate the known length here.
         BytesRefBuilder strBuilder = new BytesRefBuilder();
+        strBuilder.grow(strBytes.length);
+        int bytePos = 0;
         for (int end = b.next(); end != BreakIterator.DONE; start = end, end = b.next()) {
-            // TODO: It would be awesome if we could translate these starts and ends to byte positions, if we could performance would be
-            // dramatically improved
-            String unicodeStr = str.substring(start, end);
-            byte[] unicode = unicodeStr.getBytes(StandardCharsets.UTF_8);
+            int byteLen = 0;
+            int numCp = str.codePointCount(start, end);
+            for (int i = start; i < numCp + start; i++) {
+                byteLen += numUtf8Bytes(strCp[i]);
+            }
             // The trie only go up to a depth of 5 bytes.
             // So even looking at it for graphemes (with combining, surrogate, etc.) that are 6+ bytes in length is useless.
-            if (unicode.length < 6) {
-                Optional<BytesRef> subStr = normalizePart(unicode, 0, unicode.length);
+            if (byteLen < 6) {
+                Optional<BytesRef> subStr = normalizePart(strBytes, bytePos, byteLen);
                 if (subStr.isPresent()) {
                     strBuilder.append(subStr.get());
+                    bytePos += byteLen;
                     continue;
                 }
             }
-            int charIndex = 0;
             int charByteIndex = 0;
-            char[] unicodeCharArray = unicodeStr.toCharArray();
-            for (char c : unicodeCharArray) {
-                Optional<BytesRef> subStr = normalizePart(unicode, charByteIndex, numUtf8Bytes(c));
+            for (int i = start; i < end; i++) {
+                Optional<BytesRef> subStr = normalizePart(strBytes, charByteIndex + bytePos, numUtf8Bytes(strChars[i]));
                 if (subStr.isPresent()) {
                     strBuilder.append(subStr.get());
                 } else {
-                    int numBytes = UnicodeUtil.UTF16toUTF8(unicodeCharArray, charIndex, 1, reusableCharByteBuffer);
+                    int numBytes = UnicodeUtil.UTF16toUTF8(strChars, i, 1, reusableCharByteBuffer);
                     strBuilder.append(reusableCharByteBuffer, 0, numBytes);
                 }
-                charByteIndex += numUtf8Bytes(c);
-                ++charIndex;
+                charByteIndex += numUtf8Bytes(strChars[i]);
             }
+            bytePos += byteLen;
         }
         return strBuilder.get().utf8ToString();
     }