Merge remote-tracking branch 'elastic/main' into bloom_filter
dnhatn committed Jul 29, 2022
2 parents ab24abc + 9f2b96d commit cfc05d6
Showing 15 changed files with 944 additions and 128 deletions.
@@ -12,6 +12,7 @@
import org.elasticsearch.gradle.VersionProperties;
import org.elasticsearch.gradle.internal.conventions.precommit.PrecommitTaskPlugin;
import org.elasticsearch.gradle.internal.precommit.ValidateYamlAgainstSchemaTask;
import org.gradle.api.Action;
import org.gradle.api.Plugin;
import org.gradle.api.Project;
import org.gradle.api.file.Directory;
@@ -22,6 +23,7 @@
import org.gradle.api.tasks.util.PatternSet;

import java.io.File;
import java.util.function.Function;

import javax.inject.Inject;

@@ -67,10 +69,14 @@ public void apply(Project project) {
task.dependsOn(validateChangelogsAgainstYamlTask);
});

project.getTasks().register("generateReleaseNotes", GenerateReleaseNotesTask.class).configure(task -> {
final Function<Boolean, Action<GenerateReleaseNotesTask>> configureGenerateTask = shouldConfigureYamlFiles -> task -> {
task.setGroup("Documentation");
task.setDescription("Generates release notes from changelog files held in this checkout");
task.setChangelogs(yamlFiles);
if (shouldConfigureYamlFiles) {
task.setChangelogs(yamlFiles);
task.setDescription("Generates release notes from changelog files held in this checkout");
} else {
task.setDescription("Generates stub release notes e.g. after feature freeze");
}

task.setReleaseNotesIndexTemplate(projectDirectory.file(RESOURCES + "templates/release-notes-index.asciidoc"));
task.setReleaseNotesIndexFile(projectDirectory.file("docs/reference/release-notes.asciidoc"));
@@ -100,7 +106,12 @@ public void apply(Project project) {
task.setMigrationIndexFile(projectDirectory.file("docs/reference/migration/index.asciidoc"));

task.dependsOn(validateChangelogsTask);
});
};

project.getTasks().register("generateReleaseNotes", GenerateReleaseNotesTask.class).configure(configureGenerateTask.apply(true));
project.getTasks()
.register("generateStubReleaseNotes", GenerateReleaseNotesTask.class)
.configure(configureGenerateTask.apply(false));

project.getTasks().register("pruneChangelogs", PruneChangelogsTask.class).configure(task -> {
task.setGroup("Documentation");
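Aside: the change above replaces a single hard-coded registration of generateReleaseNotes with a shared configuration function, keyed by a boolean, that both generateReleaseNotes and the new generateStubReleaseNotes tasks reuse. Below is a minimal sketch of that pattern outside Gradle, assuming nothing beyond the JDK; the Task class and Consumer are hypothetical stand-ins for Gradle's GenerateReleaseNotesTask and Action.

import java.util.function.Consumer;
import java.util.function.Function;

class SharedTaskConfigSketch {
    // Hypothetical stand-in for GenerateReleaseNotesTask; not a Gradle type.
    static class Task {
        String description;
        void setDescription(String d) { description = d; }
    }

    public static void main(String[] args) {
        // One function yields both configurations: the shared parts are written
        // once and only the flag-dependent parts diverge.
        Function<Boolean, Consumer<Task>> configure = shouldConfigureYamlFiles -> task -> {
            if (shouldConfigureYamlFiles) {
                task.setDescription("Generates release notes from changelog files held in this checkout");
            } else {
                task.setDescription("Generates stub release notes e.g. after feature freeze");
            }
        };

        Task releaseNotes = new Task();
        Task stubReleaseNotes = new Task();
        configure.apply(true).accept(releaseNotes);      // like registering "generateReleaseNotes"
        configure.apply(false).accept(stubReleaseNotes); // like registering "generateStubReleaseNotes"
        System.out.println(releaseNotes.description);
        System.out.println(stubReleaseNotes.description);
    }
}
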
6 changes: 6 additions & 0 deletions docs/changelog/88907.yaml
@@ -0,0 +1,6 @@
pr: 88907
summary: Fix BERT and MPNet tokenization bug when handling unicode accents
area: Machine Learning
type: bug
issues:
- 88900
2 changes: 2 additions & 0 deletions docs/reference/migration/index.asciidoc
@@ -1,11 +1,13 @@
include::migration_intro.asciidoc[]

* <<migrating-8.5,Migrating to 8.5>>
* <<migrating-8.4,Migrating to 8.4>>
* <<migrating-8.3,Migrating to 8.3>>
* <<migrating-8.2,Migrating to 8.2>>
* <<migrating-8.1,Migrating to 8.1>>
* <<migrating-8.0,Migrating to 8.0>>
include::migrate_8_5.asciidoc[]
include::migrate_8_4.asciidoc[]
include::migrate_8_3.asciidoc[]
include::migrate_8_2.asciidoc[]
22 changes: 22 additions & 0 deletions docs/reference/migration/migrate_8_5.asciidoc
@@ -0,0 +1,22 @@
[[migrating-8.5]]
== Migrating to 8.5
++++
<titleabbrev>8.5</titleabbrev>
++++

This section discusses the changes that you need to be aware of when migrating
your application to {es} 8.5.

See also <<release-highlights>> and <<es-release-notes>>.

coming::[8.5.0]


[discrete]
[[breaking-changes-8.5]]
=== Breaking changes

// tag::notable-breaking-changes[]
There are no breaking changes in {es} 8.5.
// end::notable-breaking-changes[]

@@ -10,20 +10,25 @@

import org.elasticsearch.common.util.CollectionUtils;
import org.elasticsearch.common.xcontent.XContentParserUtils;
import org.elasticsearch.core.Tuple;
import org.elasticsearch.index.snapshots.blobstore.BlobStoreIndexShardSnapshot.FileInfo;
import org.elasticsearch.index.store.StoreFileMetadata;
import org.elasticsearch.snapshots.SnapshotId;
import org.elasticsearch.xcontent.ToXContentFragment;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentParser;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

import static java.util.Collections.unmodifiableMap;
import java.util.Set;
import java.util.stream.Collectors;

/**
* Contains information about all snapshots for the given shard in repository
@@ -33,54 +38,53 @@
*/
public class BlobStoreIndexShardSnapshots implements Iterable<SnapshotFiles>, ToXContentFragment {

public static final BlobStoreIndexShardSnapshots EMPTY = new BlobStoreIndexShardSnapshots(Collections.emptyList());
public static final BlobStoreIndexShardSnapshots EMPTY = new BlobStoreIndexShardSnapshots(Map.of(), List.of());

private final List<SnapshotFiles> shardSnapshots;
private final Map<String, FileInfo> files;
private final Map<String, List<FileInfo>> physicalFiles;

public BlobStoreIndexShardSnapshots(List<SnapshotFiles> shardSnapshots) {
private BlobStoreIndexShardSnapshots(Map<String, FileInfo> files, List<SnapshotFiles> shardSnapshots) {
this.shardSnapshots = List.copyOf(shardSnapshots);
// Map between blob names and file info
this.files = files;
}

public BlobStoreIndexShardSnapshots withRetainedSnapshots(Set<SnapshotId> retainedSnapshots) {
if (retainedSnapshots.isEmpty()) {
return EMPTY;
}
final var survivingSnapshotNames = retainedSnapshots.stream().map(SnapshotId::getName).collect(Collectors.toSet());
final ArrayList<SnapshotFiles> updatedSnapshots = new ArrayList<>(survivingSnapshotNames.size());
Map<String, FileInfo> newFiles = new HashMap<>();
// Map between original physical names and file info
Map<String, List<FileInfo>> physicalFiles = new HashMap<>();
for (SnapshotFiles snapshot : shardSnapshots) {
// First we build map between filenames in the repo and their original file info
// this map will be used in the next loop
if (survivingSnapshotNames.contains(snapshot.snapshot()) == false) {
continue;
}
updatedSnapshots.add(snapshot);
for (FileInfo fileInfo : snapshot.indexFiles()) {
FileInfo oldFile = newFiles.put(fileInfo.name(), fileInfo);
assert oldFile == null || oldFile.isSame(fileInfo);
}
// We are doing it in two loops here so we keep only one copy of the fileInfo per blob
// the first loop de-duplicates fileInfo objects that were loaded from different snapshots but refer to
// the same blob
for (FileInfo fileInfo : snapshot.indexFiles()) {
physicalFiles.computeIfAbsent(fileInfo.physicalName(), k -> new ArrayList<>()).add(newFiles.get(fileInfo.name()));
}
}
Map<String, List<FileInfo>> mapBuilder = new HashMap<>();
for (Map.Entry<String, List<FileInfo>> entry : physicalFiles.entrySet()) {
mapBuilder.put(entry.getKey(), List.copyOf(entry.getValue()));
}
this.physicalFiles = unmodifiableMap(mapBuilder);
this.files = unmodifiableMap(newFiles);
return new BlobStoreIndexShardSnapshots(newFiles, updatedSnapshots);
}

private BlobStoreIndexShardSnapshots(Map<String, FileInfo> files, List<SnapshotFiles> shardSnapshots) {
this.shardSnapshots = shardSnapshots;
this.files = files;
Map<String, List<FileInfo>> physicalFiles = new HashMap<>();
for (SnapshotFiles snapshot : shardSnapshots) {
for (FileInfo fileInfo : snapshot.indexFiles()) {
physicalFiles.computeIfAbsent(fileInfo.physicalName(), k -> new ArrayList<>()).add(files.get(fileInfo.name()));
public BlobStoreIndexShardSnapshots withAddedSnapshot(SnapshotFiles snapshotFiles) {
Map<String, FileInfo> updatedFiles = null;
for (FileInfo fileInfo : snapshotFiles.indexFiles()) {
final FileInfo known = files.get(fileInfo.name());
if (known == null) {
if (updatedFiles == null) {
updatedFiles = new HashMap<>(files);
}
updatedFiles.put(fileInfo.name(), fileInfo);
} else {
assert fileInfo.isSame(known);
}
}
Map<String, List<FileInfo>> mapBuilder = new HashMap<>();
for (Map.Entry<String, List<FileInfo>> entry : physicalFiles.entrySet()) {
mapBuilder.put(entry.getKey(), List.copyOf(entry.getValue()));
}
this.physicalFiles = unmodifiableMap(mapBuilder);
return new BlobStoreIndexShardSnapshots(
updatedFiles == null ? files : updatedFiles,
CollectionUtils.appendToCopyNoNullElements(shardSnapshots, snapshotFiles)
);
}
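Aside: withAddedSnapshot above follows a copy-on-write idiom: the shared files map is copied only when the incoming snapshot actually introduces a new blob name, so the common case where every file is already known allocates no new map. A standalone sketch of that idiom, assuming plain strings stand in for FileInfo:

import java.util.HashMap;
import java.util.Map;

class CopyOnWriteMapSketch {
    static Map<String, String> withAdded(Map<String, String> known, Map<String, String> incoming) {
        Map<String, String> updated = null; // stays null while nothing new appears
        for (Map.Entry<String, String> e : incoming.entrySet()) {
            if (known.containsKey(e.getKey()) == false) {
                if (updated == null) {
                    updated = new HashMap<>(known); // first new entry: copy exactly once
                }
                updated.put(e.getKey(), e.getValue());
            }
        }
        return updated == null ? known : updated; // reuse the original map when possible
    }

    public static void main(String[] args) {
        Map<String, String> base = Map.of("__0.cfs", "blob-1");
        System.out.println(withAdded(base, Map.of("__0.cfs", "blob-1")) == base); // true: no copy made
        System.out.println(withAdded(base, Map.of("__1.si", "blob-2")));          // contains both entries
    }
}
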

/**
Expand All @@ -102,7 +106,10 @@ public BlobStoreIndexShardSnapshots withClone(String source, String target) {
if (sourceFiles == null) {
throw new IllegalArgumentException("unknown source [" + source + "]");
}
return new BlobStoreIndexShardSnapshots(CollectionUtils.appendToCopy(shardSnapshots, sourceFiles.withSnapshotName(target)));
return new BlobStoreIndexShardSnapshots(
files,
CollectionUtils.appendToCopyNoNullElements(shardSnapshots, sourceFiles.withSnapshotName(target))
);
}
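Aside: withClone and withAddedSnapshot both extend the immutable snapshot list through CollectionUtils.appendToCopyNoNullElements. The helper below is a simplified stand-in for it (not the real implementation) showing the append-to-copy idiom: the receiver is never mutated, and a new immutable list with one extra element is returned.

import java.util.ArrayList;
import java.util.List;
import java.util.Objects;

class AppendToCopySketch {
    static <T> List<T> appendToCopy(List<T> source, T element) {
        Objects.requireNonNull(element, "no null elements allowed");
        List<T> copy = new ArrayList<>(source.size() + 1);
        copy.addAll(source);
        copy.add(element);
        return List.copyOf(copy); // immutable result; source is left untouched
    }

    public static void main(String[] args) {
        List<String> snapshots = List.of("snap-1");
        List<String> updated = appendToCopy(snapshots, "snap-2");
        System.out.println(snapshots); // [snap-1]
        System.out.println(updated);   // [snap-1, snap-2]
    }
}
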

/**
Expand All @@ -114,14 +121,40 @@ public List<SnapshotFiles> snapshots() {
return this.shardSnapshots;
}

// index of Lucene file name to collection of file info in the repository
// lazily computed because building this map is rather expensive and only needed for the snapshot create operation
private Map<String, Collection<FileInfo>> physicalFiles;

/**
* Finds reference to a snapshotted file by its original name
* Finds reference to a snapshotted file by its {@link StoreFileMetadata}
*
* @param physicalName original name
* @return a list of file infos that match specified physical file or null if the file is not present in any of snapshots
* @param storeFileMetadata store file metadata to find file info for
* @return the file info that matches the specified physical file or null if the file is not present in any of snapshots
*/
public List<FileInfo> findPhysicalIndexFiles(String physicalName) {
return physicalFiles.get(physicalName);
public FileInfo findPhysicalIndexFile(StoreFileMetadata storeFileMetadata) {
var p = this.physicalFiles;
if (p == null) {
p = new HashMap<>();
for (SnapshotFiles snapshot : shardSnapshots) {
for (FileInfo fileInfo : snapshot.indexFiles()) {
// we use an identity hash set since we look up all instances from the same map and thus equality == instance equality
// and we don't want to add the same file to the map multiple times
p.computeIfAbsent(fileInfo.physicalName(), k -> Collections.newSetFromMap(new IdentityHashMap<>()))
.add(files.get(fileInfo.name()));
}
}
physicalFiles = p;
}
final var found = p.get(storeFileMetadata.name());
if (found == null) {
return null;
}
for (FileInfo fileInfo : found) {
if (fileInfo.isSame(storeFileMetadata)) {
return fileInfo;
}
}
return null;
}
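Aside: findPhysicalIndexFile above combines two idioms: the physical-name index is computed lazily on first use and memoized in a field (it is only needed for the snapshot create operation), and FileInfo instances are deduplicated with an identity-based set, since every candidate is looked up from the same files map. A single-threaded sketch of both, where the hypothetical ShardFile record stands in for FileInfo and a length comparison stands in for FileInfo.isSame():

import java.util.Collections;
import java.util.HashMap;
import java.util.IdentityHashMap;
import java.util.Map;
import java.util.Set;

class LazyIndexSketch {
    record ShardFile(String physicalName, long length) {}

    private final ShardFile[] allFiles;
    private Map<String, Set<ShardFile>> byPhysicalName; // built on first lookup, then reused

    LazyIndexSketch(ShardFile... allFiles) {
        this.allFiles = allFiles;
    }

    ShardFile find(String physicalName, long length) {
        var index = byPhysicalName;
        if (index == null) {
            index = new HashMap<>();
            for (ShardFile f : allFiles) {
                // identity-based set: repeated references to the very same instance collapse
                index.computeIfAbsent(f.physicalName(), k -> Collections.<ShardFile>newSetFromMap(new IdentityHashMap<>()))
                    .add(f);
            }
            byPhysicalName = index; // memoize (this sketch is not thread-safe)
        }
        Set<ShardFile> candidates = index.get(physicalName);
        if (candidates == null) {
            return null;
        }
        for (ShardFile f : candidates) {
            if (f.length() == length) { // stand-in for FileInfo.isSame(storeFileMetadata)
                return f;
            }
        }
        return null;
    }

    public static void main(String[] args) {
        ShardFile seg = new ShardFile("_0.si", 42L);
        LazyIndexSketch snapshots = new LazyIndexSketch(seg, seg); // same instance shared across snapshots
        System.out.println(snapshots.find("_0.si", 42L)); // ShardFile[physicalName=_0.si, length=42]
        System.out.println(snapshots.find("_0.si", 7L));  // null: same name, different metadata
    }
}
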

/**
@@ -228,7 +261,8 @@ public static BlobStoreIndexShardSnapshots fromXContent(XContentParser parser) t
if (token == null) { // New parser
token = parser.nextToken();
}
Map<String, List<String>> snapshotsMap = new HashMap<>();
// list of tuples of snapshot name and file ids in the snapshot
List<Tuple<String, List<String>>> snapshotsAndFiles = new ArrayList<>();
Map<String, String> historyUUIDs = new HashMap<>();
Map<String, FileInfo> files = new HashMap<>();
XContentParserUtils.ensureExpectedToken(XContentParser.Token.START_OBJECT, token, parser);
@@ -256,7 +290,9 @@ public static BlobStoreIndexShardSnapshots fromXContent(XContentParser parser) t
token = parser.nextToken();
if (Fields.FILES.equals(currentFieldName)) {
if (token == XContentParser.Token.START_ARRAY) {
snapshotsMap.put(snapshot, XContentParserUtils.parseList(parser, XContentParser::text));
snapshotsAndFiles.add(
Tuple.tuple(snapshot, XContentParserUtils.parseList(parser, XContentParser::text))
);
}
} else if (Fields.SHARD_STATE_ID.equals(currentFieldName)) {
historyUUIDs.put(snapshot, parser.text());
@@ -268,19 +304,17 @@ public static BlobStoreIndexShardSnapshots fromXContent(XContentParser parser) t
}
}

List<SnapshotFiles> snapshots = new ArrayList<>(snapshotsMap.size());
for (Map.Entry<String, List<String>> entry : snapshotsMap.entrySet()) {
List<SnapshotFiles> snapshots = new ArrayList<>(snapshotsAndFiles.size());
for (Tuple<String, List<String>> entry : snapshotsAndFiles) {
List<FileInfo> fileInfosBuilder = new ArrayList<>();
for (String file : entry.getValue()) {
for (String file : entry.v2()) {
FileInfo fileInfo = files.get(file);
assert fileInfo != null;
fileInfosBuilder.add(fileInfo);
}
snapshots.add(
new SnapshotFiles(entry.getKey(), Collections.unmodifiableList(fileInfosBuilder), historyUUIDs.get(entry.getKey()))
);
snapshots.add(new SnapshotFiles(entry.v1(), Collections.unmodifiableList(fileInfosBuilder), historyUUIDs.get(entry.v1())));
}
return new BlobStoreIndexShardSnapshots(files, Collections.unmodifiableList(snapshots));
return new BlobStoreIndexShardSnapshots(files, snapshots);
}

}
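Aside: fromXContent now accumulates snapshots as a list of (name, file ids) tuples rather than a HashMap keyed by name. A small sketch of the motivation, with a record standing in for org.elasticsearch.core.Tuple: a HashMap iterates in an unspecified order, so rebuilding the snapshot list from one would not preserve the order in which snapshots were serialized, while a list does.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

class OrderedParseSketch {
    record Tuple<A, B>(A v1, B v2) {} // hypothetical stand-in for org.elasticsearch.core.Tuple

    public static void main(String[] args) {
        Map<String, List<String>> asMap = new HashMap<>();
        List<Tuple<String, List<String>>> asList = new ArrayList<>();
        for (String name : new String[] { "snap-3", "snap-1", "snap-2" }) {
            asMap.put(name, List.of());
            asList.add(new Tuple<>(name, List.of()));
        }
        System.out.println(asMap.keySet()); // iteration order unspecified
        asList.forEach(t -> System.out.print(t.v1() + " ")); // snap-3 snap-1 snap-2: input order kept
        System.out.println();
    }
}
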
@@ -2696,18 +2696,7 @@ public void snapshotShard(SnapshotShardContext context) {

logger.trace("[{}] [{}] Processing [{}]", shardId, snapshotId, fileName);
final StoreFileMetadata md = metadataFromStore.get(fileName);
BlobStoreIndexShardSnapshot.FileInfo existingFileInfo = null;
List<BlobStoreIndexShardSnapshot.FileInfo> filesInfo = snapshots.findPhysicalIndexFiles(fileName);
if (filesInfo != null) {
for (BlobStoreIndexShardSnapshot.FileInfo fileInfo : filesInfo) {
if (fileInfo.isSame(md)) {
// a commit point file with the same name, size and checksum was already copied to repository
// we will reuse it for this snapshot
existingFileInfo = fileInfo;
break;
}
}
}
BlobStoreIndexShardSnapshot.FileInfo existingFileInfo = snapshots.findPhysicalIndexFile(md);

// We can skip writing blobs where the metadata hash is equal to the blob's contents because we store the hash/contents
// directly in the shard level metadata in this case
@@ -2733,6 +2722,8 @@ public void snapshotShard(SnapshotShardContext context) {
filesInShardMetadataSize += md.length();
}
} else {
// a commit point file with the same name, size and checksum was already copied to repository
// we will reuse it for this snapshot
indexCommitPointFiles.add(existingFileInfo);
}
}
@@ -2756,12 +2747,9 @@ public void snapshotShard(SnapshotShardContext context) {
final boolean writeShardGens = SnapshotsService.useShardGenerations(context.getRepositoryMetaVersion());
final boolean writeFileInfoWriterUUID = SnapshotsService.includeFileInfoWriterUUID(context.getRepositoryMetaVersion());
// build a new BlobStoreIndexShardSnapshot, that includes this one and all the saved ones
List<SnapshotFiles> newSnapshotsList = new ArrayList<>();
newSnapshotsList.add(new SnapshotFiles(snapshotId.getName(), indexCommitPointFiles, context.stateIdentifier()));
for (SnapshotFiles point : snapshots) {
newSnapshotsList.add(point);
}
final BlobStoreIndexShardSnapshots updatedBlobStoreIndexShardSnapshots = new BlobStoreIndexShardSnapshots(newSnapshotsList);
final BlobStoreIndexShardSnapshots updatedBlobStoreIndexShardSnapshots = snapshots.withAddedSnapshot(
new SnapshotFiles(snapshotId.getName(), indexCommitPointFiles, context.stateIdentifier())
);
final Runnable afterWriteSnapBlob;
if (writeShardGens) {
// When using shard generations we can safely write the index-${uuid} blob before writing out any of the actual data
@@ -3253,19 +3241,12 @@ private ShardSnapshotMetaDeleteResult deleteFromShardSnapshotMeta(
long indexGeneration
) {
// Build a list of snapshots that should be preserved
List<SnapshotFiles> newSnapshotsList = new ArrayList<>();
final Set<String> survivingSnapshotNames = survivingSnapshots.stream().map(SnapshotId::getName).collect(Collectors.toSet());
for (SnapshotFiles point : snapshots) {
if (survivingSnapshotNames.contains(point.snapshot())) {
newSnapshotsList.add(point);
}
}
final BlobStoreIndexShardSnapshots updatedSnapshots = snapshots.withRetainedSnapshots(survivingSnapshots);
ShardGeneration writtenGeneration = null;
try {
if (newSnapshotsList.isEmpty()) {
if (updatedSnapshots.snapshots().isEmpty()) {
return new ShardSnapshotMetaDeleteResult(indexId, snapshotShardId, ShardGenerations.DELETED_SHARD_GEN, blobs);
} else {
final BlobStoreIndexShardSnapshots updatedSnapshots = new BlobStoreIndexShardSnapshots(newSnapshotsList);
if (indexGeneration < 0L) {
writtenGeneration = ShardGeneration.newGeneration();
INDEX_SHARD_SNAPSHOTS_FORMAT.write(updatedSnapshots, shardContainer, writtenGeneration.toBlobNamePart(), compress);
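Aside: deleteFromShardSnapshotMeta above no longer filters the surviving snapshots by hand; it delegates to withRetainedSnapshots, which also short-circuits to the shared EMPTY instance when nothing survives. A sketch of that retention filter, with a hypothetical Snapshot record standing in for SnapshotFiles and plain names standing in for SnapshotId:

import java.util.ArrayList;
import java.util.List;
import java.util.Set;

class RetainSnapshotsSketch {
    record Snapshot(String name) {}

    static final List<Snapshot> EMPTY = List.of(); // shared sentinel, like BlobStoreIndexShardSnapshots.EMPTY

    static List<Snapshot> withRetained(List<Snapshot> all, Set<String> retainedNames) {
        if (retainedNames.isEmpty()) {
            return EMPTY; // nothing survives: no allocation, callers can just test isEmpty()
        }
        List<Snapshot> updated = new ArrayList<>(retainedNames.size());
        for (Snapshot s : all) {
            if (retainedNames.contains(s.name())) {
                updated.add(s);
            }
        }
        return updated;
    }

    public static void main(String[] args) {
        List<Snapshot> all = List.of(new Snapshot("snap-1"), new Snapshot("snap-2"));
        System.out.println(withRetained(all, Set.of("snap-2")));   // [Snapshot[name=snap-2]]
        System.out.println(withRetained(all, Set.of()).isEmpty()); // true: shard metadata can be deleted
    }
}
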
@@ -206,7 +206,7 @@ Vocabulary parseVocabularyDocLeniently(SearchHit hit) throws IOException {
stream
)
) {
return Vocabulary.createParser(true).apply(parser, null);
return Vocabulary.PARSER.apply(parser, null);
} catch (IOException e) {
logger.error(() -> "failed to parse trained model vocabulary [" + hit.getId() + "]", e);
throw e;
@@ -45,6 +45,8 @@ public static ConstructingObjectParser<Vocabulary, Void> createParser(boolean ig
return parser;
}

public static ConstructingObjectParser<Vocabulary, Void> PARSER = createParser(true);

private final List<String> vocab;
private final List<String> merges;
private final String modelId;
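Aside: the Vocabulary change above hoists a parser that is costly to construct but stateless once built into a static PARSER constant, so parseVocabularyDocLeniently stops rebuilding it for every document. A sketch of the hoisting pattern, with a hypothetical string-to-integer parser standing in for ConstructingObjectParser:

import java.util.function.Function;

class SharedParserSketch {
    static Function<String, Integer> createParser(boolean lenient) {
        // imagine substantial one-time construction cost here, as with
        // building a ConstructingObjectParser and declaring its fields
        if (lenient == false) {
            return Integer::parseInt;
        }
        return s -> {
            try {
                return Integer.parseInt(s);
            } catch (NumberFormatException e) {
                return null; // lenient: swallow the failure
            }
        };
    }

    // built exactly once and shared by all callers
    static final Function<String, Integer> PARSER = createParser(true);

    public static void main(String[] args) {
        System.out.println(PARSER.apply("42"));   // 42
        System.out.println(PARSER.apply("oops")); // null
    }
}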