From f5ba77114ef662e91a8ce26838159f383931912c Mon Sep 17 00:00:00 2001 From: Doo Yong Kim <0ctopus13prime@gmail.com> Date: Mon, 12 Aug 2024 12:16:07 -0700 Subject: [PATCH] Disallow invalid characters for physical file name to be included within vector field name. (#1936) * Block a vector field to have invalid characters for a physical file name. Signed-off-by: Dooyong Kim * Block a vector field to have invalid characters for a physical file name. Signed-off-by: Dooyong Kim --------- Signed-off-by: Dooyong Kim Signed-off-by: Doo Yong Kim <0ctopus13prime@gmail.com> Co-authored-by: Dooyong Kim --- CHANGELOG.md | 1 + .../index/mapper/KNNVectorFieldMapper.java | 31 +++++++++++++++++++ .../mapper/KNNVectorFieldMapperTests.java | 18 +++++++++++ 3 files changed, 50 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f1dc5b14d..eb8427b1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), * Add script_fields context to KNNAllowlist [#1917] (https://github.com/opensearch-project/k-NN/pull/1917) * Fix graph merge stats size calculation [#1844](https://github.com/opensearch-project/k-NN/pull/1844) * Integrate Lucene Vector field with native engines to use KNNVectorFormat during segment creation [#1945](https://github.com/opensearch-project/k-NN/pull/1945) +* Disallow a vector field to have an invalid character for a physical file name. [#1936] (https://github.com/opensearch-project/k-NN/pull/1936) ### Infrastructure ### Documentation ### Maintenance diff --git a/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java index 5d4d3ca58..94756f595 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapper.java @@ -13,6 +13,8 @@ import java.util.Map; import java.util.Optional; import java.util.function.Supplier; +import java.util.stream.Collectors; + import lombok.extern.log4j.Log4j2; import org.apache.lucene.document.Field; import org.apache.lucene.document.FieldType; @@ -23,6 +25,7 @@ import org.opensearch.common.Explicit; import org.opensearch.common.ValidationException; import org.opensearch.common.xcontent.support.XContentMapValues; +import org.opensearch.core.common.Strings; import org.opensearch.core.xcontent.ToXContent; import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.core.xcontent.XContentParser; @@ -219,6 +222,8 @@ private void validateFlatMapper() { @Override public KNNVectorFieldMapper build(BuilderContext context) { + validateFullFieldName(context); + final MultiFields multiFieldsBuilder = this.multiFieldsBuilder.build(this, context); final CopyTo copyToBuilder = copyTo.build(); final Explicit ignoreMalformed = ignoreMalformed(context); @@ -413,6 +418,32 @@ private KNNEngine validateDimensions(final KNNMethodContext knnMethodContext, fi } return knnEngine; } + + /** + * Validate whether provided full field name contain any invalid characters for physical file name. + * At the moment, we use a field name as a part of file name while we throw an exception + * if a physical file name contains any invalid characters when creating snapshot. + * To prevent from this happening, we restrict vector field name and make sure generated file to have a valid name. + * + * @param context : Builder context to have field name info. + */ + private void validateFullFieldName(final BuilderContext context) { + final String fullFieldName = buildFullName(context); + for (char ch : fullFieldName.toCharArray()) { + if (Strings.INVALID_FILENAME_CHARS.contains(ch)) { + throw new IllegalArgumentException( + String.format( + Locale.ROOT, + "Vector field name must not include invalid characters of %s. " + + "Provided field name=[%s] had a disallowed character [%c]", + Strings.INVALID_FILENAME_CHARS.stream().map(c -> "'" + c + "'").collect(Collectors.toList()), + fullFieldName, + ch + ) + ); + } + } + } } public static class TypeParser implements Mapper.TypeParser { diff --git a/src/test/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapperTests.java b/src/test/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapperTests.java index e1d842112..b3139fa5c 100644 --- a/src/test/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapperTests.java +++ b/src/test/java/org/opensearch/knn/index/mapper/KNNVectorFieldMapperTests.java @@ -22,6 +22,7 @@ import org.opensearch.common.settings.IndexScopedSettings; import org.opensearch.common.settings.Settings; import org.opensearch.common.xcontent.XContentFactory; +import org.opensearch.core.common.Strings; import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.index.IndexSettings; import org.opensearch.index.mapper.ContentPath; @@ -1171,6 +1172,23 @@ public void testBuilder_whenBinaryWithLegacyKNNEnabled_thenException() { assertTrue(ex.getMessage(), ex.getMessage().contains("is not supported for")); } + public void testBuild_whenInvalidCharsInFieldName_thenThrowException() { + for (char disallowChar : Strings.INVALID_FILENAME_CHARS) { + // When an invalid vector field name was given. + final String invalidVectorFieldName = "fieldname" + disallowChar; + + // Prepare context. + Settings settings = Settings.builder().put(settings(CURRENT).build()).put(KNN_INDEX, true).build(); + Mapper.BuilderContext builderContext = new Mapper.BuilderContext(settings, new ContentPath()); + + // IllegalArgumentException should be thrown. + Exception e = assertThrows(IllegalArgumentException.class, () -> { + new KNNVectorFieldMapper.Builder(invalidVectorFieldName, null, CURRENT, null).build(builderContext); + }); + assertTrue(e.getMessage(), e.getMessage().contains("Vector field name must not include")); + } + } + private LuceneFieldMapper.CreateLuceneFieldMapperInput.CreateLuceneFieldMapperInputBuilder createLuceneFieldMapperInputBuilder( VectorDataType vectorDataType ) {