From e7a0aaa69eefdb9e61bf27682a2cfadbe4b81c4c Mon Sep 17 00:00:00 2001 From: yuye-aws Date: Tue, 27 Feb 2024 15:34:27 +0800 Subject: [PATCH] add unit tests with string, map and nested map type for document chunking processor Signed-off-by: yuye-aws --- .../processor/DocumentChunkingProcessor.java | 2 +- .../chunker/FixedTokenLengthChunker.java | 50 ++--- .../DocumentChunkingProcessorTests.java | 181 ++++++++++++++++-- .../chunker/DelimiterChunkerTests.java | 15 +- .../chunker/FixedTokenLengthChunkerTests.java | 24 +-- 5 files changed, 209 insertions(+), 63 deletions(-) diff --git a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java index a5a5726d4..550c8013f 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessor.java @@ -224,7 +224,7 @@ public IngestDocument execute(IngestDocument document) { IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex()); maxTokenCount = indexService.getIndexSettings().getMaxTokenCount(); } - chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT, maxTokenCount); + chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount); } IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry); document.setFieldValue(outputField, chunk(chunker, content, chunkerParameters)); diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java index 9045f4be6..3079fcf8e 100644 --- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java +++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java @@ -19,12 +19,12 @@ @Log4j2 public class FixedTokenLengthChunker implements IFieldChunker { - public static final String TOKEN_LIMIT = "token_limit"; - public static final String OVERLAP_RATE = "overlap_rate"; + public static final String TOKEN_LIMIT_FIELD = "token_limit"; + public static final String OVERLAP_RATE_FIELD = "overlap_rate"; - public static final String MAX_TOKEN_COUNT = "max_token_count"; + public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count"; - public static final String TOKENIZER = "tokenizer"; + public static final String TOKENIZER_FIELD = "tokenizer"; // default values for each parameter private static final int DEFAULT_TOKEN_LIMIT = 500; @@ -64,17 +64,17 @@ public List chunk(String content, Map parameters) { int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT; String tokenizer = DEFAULT_TOKENIZER; - if (parameters.containsKey(TOKEN_LIMIT)) { - tokenLimit = ((Number) parameters.get(TOKEN_LIMIT)).intValue(); + if (parameters.containsKey(TOKEN_LIMIT_FIELD)) { + tokenLimit = ((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue(); } - if (parameters.containsKey(OVERLAP_RATE)) { - overlapRate = ((Number) parameters.get(OVERLAP_RATE)).doubleValue(); + if (parameters.containsKey(OVERLAP_RATE_FIELD)) { + overlapRate = ((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue(); } - if (parameters.containsKey(MAX_TOKEN_COUNT)) { - maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT)).intValue(); + if (parameters.containsKey(MAX_TOKEN_COUNT_FIELD)) { + maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT_FIELD)).intValue(); } - if 
(parameters.containsKey(TOKENIZER)) { - tokenizer = (String) parameters.get(TOKENIZER); + if (parameters.containsKey(TOKENIZER_FIELD)) { + tokenizer = (String) parameters.get(TOKENIZER_FIELD); } List tokens = tokenize(content, tokenizer, maxTokenCount); @@ -103,34 +103,34 @@ public List chunk(String content, Map parameters) { @Override public void validateParameters(Map parameters) { - if (parameters.containsKey(TOKEN_LIMIT)) { - if (!(parameters.get(TOKEN_LIMIT) instanceof Number)) { + if (parameters.containsKey(TOKEN_LIMIT_FIELD)) { + if (!(parameters.get(TOKEN_LIMIT_FIELD) instanceof Number)) { throw new IllegalArgumentException( - "fixed length parameter [" + TOKEN_LIMIT + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - if (((Number) parameters.get(TOKEN_LIMIT)).intValue() <= 0) { - throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT + "] must be positive"); + if (((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue() <= 0) { + throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive"); } } - if (parameters.containsKey(OVERLAP_RATE)) { - if (!(parameters.get(OVERLAP_RATE) instanceof Number)) { + if (parameters.containsKey(OVERLAP_RATE_FIELD)) { + if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE + "] cannot be cast to [" + Number.class.getName() + "]" + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]" ); } - if (((Number) parameters.get(OVERLAP_RATE)).doubleValue() < 0.0 - || ((Number) parameters.get(OVERLAP_RATE)).doubleValue() >= 1.0) { + if (((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue() < 0.0 + || ((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue() >= 1.0) { throw new IllegalArgumentException( - "fixed length parameter [" + OVERLAP_RATE + "] must be between 0 and 1, 1 is not included." + "fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 1, 1 is not included." 
); } } - if (parameters.containsKey(TOKENIZER) && !(parameters.get(TOKENIZER) instanceof String)) { + if (parameters.containsKey(TOKENIZER_FIELD) && !(parameters.get(TOKENIZER_FIELD) instanceof String)) { throw new IllegalArgumentException( - "fixed length parameter [" + TOKENIZER + "] cannot be cast to [" + String.class.getName() + "]" + "fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]" ); } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java index 17044eb6a..3b0395320 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/DocumentChunkingProcessorTests.java @@ -4,6 +4,7 @@ */ package org.opensearch.neuralsearch.processor; +import com.google.common.collect.ImmutableMap; import lombok.SneakyThrows; import org.apache.lucene.tests.analysis.MockTokenizer; import org.junit.Before; @@ -21,6 +22,7 @@ import org.opensearch.ingest.IngestDocument; import org.opensearch.ingest.Processor; import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory; +import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker; import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker; import org.opensearch.plugins.AnalysisPlugin; import org.opensearch.test.OpenSearchTestCase; @@ -42,6 +44,9 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase { private static final String PROCESSOR_TAG = "mockTag"; private static final String DESCRIPTION = "mockDescription"; + private static final String INPUT_FIELD = "body"; + private static final String OUTPUT_FIELD = "body_chunk"; + private static final String INDEX_NAME = "_index"; @SneakyThrows private AnalysisRegistry getAnalysisRegistry() { @@ -85,7 +90,13 @@ public void testGetType() { private Map createFixedTokenLengthParameters() { Map parameters = new HashMap<>(); - parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT, 10); + parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10); + return parameters; + } + + private Map createDelimiterParameters() { + Map parameters = new HashMap<>(); + parameters.put(DelimiterChunker.DELIMITER_FIELD, "."); return parameters; } @@ -95,30 +106,79 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance() { Map fieldParameters = new HashMap<>(); Map chunkerParameters = new HashMap<>(); chunkerParameters.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()); - chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, "body_chunk"); - fieldParameters.put("body", chunkerParameters); + chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD); + fieldParameters.put(INPUT_FIELD, chunkerParameters); config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters); Map registry = new HashMap<>(); return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); } - private IngestDocument createIngestDocument() { - Map sourceAndMetadata = new HashMap<>(); - sourceAndMetadata.put( - "body", - "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." 
+ @SneakyThrows + private DocumentChunkingProcessor createDelimiterInstance() { + Map config = new HashMap<>(); + Map fieldParameters = new HashMap<>(); + Map chunkerParameters = new HashMap<>(); + chunkerParameters.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters()); + chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD); + fieldParameters.put(INPUT_FIELD, chunkerParameters); + config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters); + Map registry = new HashMap<>(); + return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config); + } + + private String createSourceDataString() { + return "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; + } + + private List createSourceDataList() { + List documents = new ArrayList<>(); + documents.add( + "This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + documents.add( + "This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." ); - sourceAndMetadata.put(IndexFieldMapper.NAME, "_index"); + return documents; + } + + private Map createSourceDataMap() { + Map documents = new HashMap<>(); + documents.put( + "third", + "This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + documents.put( + "fourth", + "This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch." + ); + return documents; + } + + private Map createSourceDataNestedMap() { + String documentString = createSourceDataString(); + List documentList = createSourceDataList(); + Map documentMap = createSourceDataMap(); + Map documents = new HashMap<>(); + documents.put("String", documentString); + documents.put("List", documentList); + documents.put("Map", documentMap); + return documents; + } + + private IngestDocument createIngestDocumentWithSourceData(Object sourceData) { + Map sourceAndMetadata = new HashMap<>(); + sourceAndMetadata.put(INPUT_FIELD, sourceData); + sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME); return new IngestDocument(sourceAndMetadata, new HashMap<>()); } @SneakyThrows - public void testExecute_withFixedTokenLength_successful() { + public void testExecute_withFixedTokenLength_andSourceDataString_successful() { DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); - IngestDocument ingestDocument = createIngestDocument(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); IngestDocument document = processor.execute(ingestDocument); - assert document.getSourceAndMetadata().containsKey("body_chunk"); - Object passages = document.getSourceAndMetadata().get("body_chunk"); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); assert (passages instanceof List); List expectedPassages = new ArrayList<>(); expectedPassages.add("This is an example document to be chunked The document"); @@ -126,4 +186,99 @@ public void testExecute_withFixedTokenLength_successful() { expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); assertEquals(expectedPassages, passages); } + + 
@SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataList_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataList()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is the first document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages.add("This is the second document to be chunked The document"); + expectedPassages.add("The document contains a single paragraph two sentences and 24"); + expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch"); + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataMap_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataMap()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof Map); + + List expectedPassages1 = new ArrayList<>(); + List expectedPassages2 = new ArrayList<>(); + + expectedPassages1.add("This is the third document to be chunked The document"); + expectedPassages1.add("The document contains a single paragraph two sentences and 24"); + expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages2.add("This is the fourth document to be chunked The document"); + expectedPassages2.add("The document contains a single paragraph two sentences and 24"); + expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch"); + + Map expectedPassages = ImmutableMap.of("third", expectedPassages1, "fourth", expectedPassages2); + + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withFixedTokenLength_andSourceDataNestedMap_successful() { + DocumentChunkingProcessor processor = createFixedTokenLengthInstance(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataNestedMap()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof Map); + + Map expectedPassages = new HashMap<>(); + List expectedPassages1 = new ArrayList<>(); + List expectedPassages2 = new ArrayList<>(); + List expectedPassages3 = new ArrayList<>(); + List expectedPassages4 = new ArrayList<>(); + + expectedPassages1.add("This is an example document to be chunked The document"); + expectedPassages1.add("The document contains a single paragraph two sentences and 24"); + expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages2.add("This is the first document to be chunked The document"); + expectedPassages2.add("The document contains a single paragraph two sentences and 24"); + expectedPassages2.add("and 24 tokens by 
standard tokenizer in OpenSearch"); + expectedPassages2.add("This is the second document to be chunked The document"); + expectedPassages2.add("The document contains a single paragraph two sentences and 24"); + expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages3.add("This is the third document to be chunked The document"); + expectedPassages3.add("The document contains a single paragraph two sentences and 24"); + expectedPassages3.add("and 24 tokens by standard tokenizer in OpenSearch"); + expectedPassages4.add("This is the fourth document to be chunked The document"); + expectedPassages4.add("The document contains a single paragraph two sentences and 24"); + expectedPassages4.add("and 24 tokens by standard tokenizer in OpenSearch"); + + expectedPassages.put("String", expectedPassages1); + expectedPassages.put("List", expectedPassages2); + expectedPassages.put("Map", ImmutableMap.of("third", expectedPassages3, "fourth", expectedPassages4)); + + assertEquals(expectedPassages, passages); + } + + @SneakyThrows + public void testExecute_withDelimiter_andSourceDataString_successful() { + DocumentChunkingProcessor processor = createDelimiterInstance(); + IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString()); + IngestDocument document = processor.execute(ingestDocument); + assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD); + Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD); + assert (passages instanceof List); + List expectedPassages = new ArrayList<>(); + expectedPassages.add("This is an example document to be chunked."); + expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."); + assertEquals(expectedPassages, passages); + } } diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java index 8838310f4..d201ab574 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/DelimiterChunkerTests.java @@ -5,7 +5,7 @@ package org.opensearch.neuralsearch.processor.chunker; import org.junit.Assert; -import org.junit.Test; +import org.opensearch.test.OpenSearchTestCase; import java.util.List; import java.util.Map; @@ -14,9 +14,8 @@ import static org.junit.Assert.assertThrows; import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD; -public class DelimiterChunkerTests { +public class DelimiterChunkerTests extends OpenSearchTestCase { - @Test public void testChunkerWithNoDelimiterField() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -25,7 +24,6 @@ public void testChunkerWithNoDelimiterField() { Assert.assertEquals("You must contain field:" + DELIMITER_FIELD + " in your parameter.", exception.getMessage()); } - @Test public void testChunkerWithDelimiterFieldNotString() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -34,7 +32,6 @@ public void testChunkerWithDelimiterFieldNotString() { Assert.assertEquals("delimiter parameters: " + List.of("") + " must be string.", exception.getMessage()); } - @Test public void testChunkerWithDelimiterFieldNoString() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -43,7 +40,6 @@ public void 
testChunkerWithDelimiterFieldNoString() { Assert.assertEquals("delimiter parameters should not be empty.", exception.getMessage()); } - @Test public void testChunker() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd"; @@ -52,7 +48,6 @@ public void testChunker() { assertEquals(List.of("a\n", "b\n", "c\n", "d"), chunkResult); } - @Test public void testChunkerWithDelimiterEnd() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a\nb\nc\nd\n"; @@ -61,7 +56,6 @@ public void testChunkerWithDelimiterEnd() { assertEquals(List.of("a\n", "b\n", "c\n", "d\n"), chunkResult); } - @Test public void testChunkerWithOnlyDelimiter() { DelimiterChunker chunker = new DelimiterChunker(); String content = "\n"; @@ -70,7 +64,6 @@ public void testChunkerWithOnlyDelimiter() { assertEquals(List.of("\n"), chunkResult); } - @Test public void testChunkerWithAllDelimiters() { DelimiterChunker chunker = new DelimiterChunker(); String content = "\n\n\n"; @@ -79,7 +72,6 @@ public void testChunkerWithAllDelimiters() { assertEquals(List.of("\n", "\n", "\n"), chunkResult); } - @Test public void testChunkerWithDifferentDelimiters() { DelimiterChunker chunker = new DelimiterChunker(); String content = "a.b.cc.d."; @@ -88,8 +80,7 @@ public void testChunkerWithDifferentDelimiters() { assertEquals(List.of("a.", "b.", "cc.", "d."), chunkResult); } - @Test - public void testChunkerWithStringDelimter() { + public void testChunkerWithStringDelimiter() { DelimiterChunker chunker = new DelimiterChunker(); String content = "\n\na\n\n\n"; Map inputParameters = Map.of(DELIMITER_FIELD, "\n\n"); diff --git a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java index e9b36682a..4c498d070 100644 --- a/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java +++ b/src/test/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunkerTests.java @@ -23,9 +23,9 @@ import static java.util.Collections.singletonList; import static java.util.Collections.singletonMap; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE; -import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD; +import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD; public class FixedTokenLengthChunkerTests extends OpenSearchTestCase { @@ -61,7 +61,7 @@ public void testValidateParameters_whenNoParams_thenSuccessful() { public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT, "invalid token limit"); + parameters.put(TOKEN_LIMIT_FIELD, "invalid token limit"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -71,7 +71,7 @@ public void testValidateParameters_whenIllegalTokenLimitType_thenFail() { public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT, 
-1); + parameters.put(TOKEN_LIMIT_FIELD, -1); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -81,7 +81,7 @@ public void testValidateParameters_whenIllegalTokenLimitValue_thenFail() { public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { Map parameters = new HashMap<>(); - parameters.put(OVERLAP_RATE, "invalid overlap rate"); + parameters.put(OVERLAP_RATE_FIELD, "invalid overlap rate"); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -91,7 +91,7 @@ public void testValidateParameters_whenIllegalOverlapRateType_thenFail() { public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { Map parameters = new HashMap<>(); - parameters.put(OVERLAP_RATE, 1.0); + parameters.put(OVERLAP_RATE_FIELD, 1.0); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -104,7 +104,7 @@ public void testValidateParameters_whenIllegalOverlapRateValue_thenFail() { public void testValidateParameters_whenIllegalTokenizerType_thenFail() { Map parameters = new HashMap<>(); - parameters.put(TOKENIZER, 111); + parameters.put(TOKENIZER_FIELD, 111); IllegalArgumentException illegalArgumentException = assertThrows( IllegalArgumentException.class, () -> FixedTokenLengthChunker.validateParameters(parameters) @@ -114,7 +114,7 @@ public void testValidateParameters_whenIllegalTokenizerType_thenFail() { public void testChunk_withTokenLimit_10() { Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT, 10); + parameters.put(TOKEN_LIMIT_FIELD, 10); String content = "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = FixedTokenLengthChunker.chunk(content, parameters); @@ -127,7 +127,7 @@ public void testChunk_withTokenLimit_10() { public void testChunk_withTokenLimit_20() { Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT, 20); + parameters.put(TOKEN_LIMIT_FIELD, 20); String content = "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = FixedTokenLengthChunker.chunk(content, parameters); @@ -141,8 +141,8 @@ public void testChunk_withTokenLimit_20() { public void testChunk_withOverlapRate_half() { Map parameters = new HashMap<>(); - parameters.put(TOKEN_LIMIT, 10); - parameters.put(OVERLAP_RATE, 0.5); + parameters.put(TOKEN_LIMIT_FIELD, 10); + parameters.put(OVERLAP_RATE_FIELD, 0.5); String content = "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."; List passages = FixedTokenLengthChunker.chunk(content, parameters);
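
Note for reviewers: the renames in FixedTokenLengthChunker (TOKEN_LIMIT -> TOKEN_LIMIT_FIELD, OVERLAP_RATE -> OVERLAP_RATE_FIELD, MAX_TOKEN_COUNT -> MAX_TOKEN_COUNT_FIELD, TOKENIZER -> TOKENIZER_FIELD) only change the Java identifiers; the underlying parameter keys ("token_limit", "overlap_rate", "max_token_count", "tokenizer") are untouched, so existing pipeline definitions keep working. The sketch below shows how a caller builds and validates such a parameter map with the renamed constants. It is a minimal sketch, not part of this patch: it assumes a FixedTokenLengthChunker instance is supplied the same way the test class's setUp provides one (construction is outside this diff), and the "standard" tokenizer value is an assumed example rather than something this patch pins down.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;

import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.OVERLAP_RATE_FIELD;
import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKENIZER_FIELD;
import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD;

public class FixedTokenLengthChunkerSketch {

    // Assumed: a chunker obtained the same way FixedTokenLengthChunkerTests' setUp does
    // (construction details are outside this patch).
    private final FixedTokenLengthChunker chunker;

    public FixedTokenLengthChunkerSketch(FixedTokenLengthChunker chunker) {
        this.chunker = chunker;
    }

    public List<String> chunkWithExplicitParameters(String content) {
        Map<String, Object> parameters = new HashMap<>();
        parameters.put(TOKEN_LIMIT_FIELD, 10);       // must be a positive Number
        parameters.put(OVERLAP_RATE_FIELD, 0.5);     // must be a Number in [0, 1)
        parameters.put(TOKENIZER_FIELD, "standard"); // must be a String; "standard" is an assumed value

        // Throws IllegalArgumentException for a wrong type or out-of-range value,
        // matching the messages asserted in FixedTokenLengthChunkerTests.
        chunker.validateParameters(parameters);

        return chunker.chunk(content, parameters);
    }
}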
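
The new helpers createFixedTokenLengthInstance() and createDelimiterInstance() both hand the factory a nested configuration map keyed by the input field. For readers wiring the processor up outside the test suite, here is a sketch of that shape under the same assumptions the tests use ("body" as the input field, "body_chunk" as the output field). The class and method names below are illustrative only; the returned map corresponds to the config argument the tests pass to factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config).

import java.util.HashMap;
import java.util.Map;

import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor;
import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;
import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;

public class ChunkingProcessorConfigSketch {

    /**
     * Mirrors createFixedTokenLengthInstance():
     * field_map -> { "body" -> { fixed-length parameters, output_field } }.
     */
    public static Map<String, Object> fixedTokenLengthConfig() {
        Map<String, Object> algorithmParameters = new HashMap<>();
        algorithmParameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10);

        Map<String, Object> perFieldSettings = new HashMap<>();
        perFieldSettings.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, algorithmParameters);
        perFieldSettings.put(DocumentChunkingProcessor.OUTPUT_FIELD, "body_chunk");

        Map<String, Object> fieldMap = new HashMap<>();
        fieldMap.put("body", perFieldSettings);

        Map<String, Object> config = new HashMap<>();
        config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap);
        return config;
    }

    /** Mirrors createDelimiterInstance(), selecting the delimiter algorithm instead. */
    public static Map<String, Object> delimiterConfig() {
        Map<String, Object> algorithmParameters = new HashMap<>();
        algorithmParameters.put(DelimiterChunker.DELIMITER_FIELD, ".");

        Map<String, Object> perFieldSettings = new HashMap<>();
        perFieldSettings.put(ChunkerFactory.DELIMITER_ALGORITHM, algorithmParameters);
        perFieldSettings.put(DocumentChunkingProcessor.OUTPUT_FIELD, "body_chunk");

        Map<String, Object> fieldMap = new HashMap<>();
        fieldMap.put("body", perFieldSettings);

        Map<String, Object> config = new HashMap<>();
        config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldMap);
        return config;
    }
}

Because the per-field map carries both the algorithm selection (ChunkerFactory.FIXED_LENGTH_ALGORITHM or ChunkerFactory.DELIMITER_ALGORITHM) and the DocumentChunkingProcessor.OUTPUT_FIELD entry, each algorithm needs only the single helper shown in the tests.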