From 34348b3be8a91b99ac5c1c29051fabd5a7acc624 Mon Sep 17 00:00:00 2001
From: yuye-aws
Date: Thu, 14 Mar 2024 18:09:41 +0800
Subject: [PATCH] update comment

Signed-off-by: yuye-aws
---
 .../processor/TextChunkingProcessor.java      | 15 +++++++--------
 .../chunker/FixedTokenLengthChunker.java      |  2 +-
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
index b2fe4d7d7..7b45d6d45 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -24,9 +24,9 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.ingest.AbstractProcessor;
 import org.opensearch.ingest.IngestDocument;
-import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.Chunker;
 import org.opensearch.index.mapper.IndexFieldMapper;
+import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
@@ -155,19 +155,18 @@ private int getMaxTokenCount(final Map<String, Object> sourceAndMetadataMap) {
      */
     @Override
     public IngestDocument execute(IngestDocument ingestDocument) {
-        validateFieldsValue(ingestDocument);
+        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+        validateFieldsValue(sourceAndMetadataMap);
+        // fixed token length algorithm needs runtime parameter max_token_count for tokenization
         int chunkCount = 0;
         Map<String, Object> runtimeParameters = new HashMap<>();
-        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
-        // fixed token length algorithm needs max_token_count for tokenization
         int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap);
         runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
         chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, chunkCount);
         return ingestDocument;
     }
 
-    private void validateFieldsValue(final IngestDocument ingestDocument) {
-        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+    private void validateFieldsValue(final Map<String, Object> sourceAndMetadataMap) {
         for (Map.Entry<String, Object> embeddingFieldsEntry : fieldMap.entrySet()) {
             Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey());
             if (sourceValue != null) {
@@ -297,8 +296,8 @@ private int chunkList(
 
     @SuppressWarnings("unchecked")
     private int chunkLeafType(final Object value, List<String> result, final Map<String, Object> runTimeParameters, int chunkCount) {
-        // leaf type means either String or List<String>
-        // the result should be an empty list
+        // leaf type means null, String or List<String>
+        // the result should be an empty list when the input is null
         if (value instanceof String) {
             chunkCount = chunkString(value.toString(), result, runTimeParameters, chunkCount);
         } else if (isListOfString(value)) {
diff --git a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
index 39da2fcd6..145484b79 100644
--- a/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
+++ b/src/main/java/org/opensearch/neuralsearch/processor/chunker/FixedTokenLengthChunker.java
@@ -104,7 +104,7 @@ public void validateParameters(Map<String, Object> parameters) {
      * will throw IllegalArgumentException when parameters are invalid
      *
      * @param parameters a map non-runtime parameters as the following:
-     * 1. tokenizer: the analyzer tokenizer in opensearch
+     * 1. tokenizer: the word tokenizer in opensearch
      * 2. token_limit: the token limit for each chunked passage
      * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage
      */
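
For reference, the overlap_rate parameter documented in the final hunk controls how many tokens each chunked passage repeats from its predecessor, while token_limit caps the passage length. Below is a minimal, self-contained sketch of that fixed-token-length idea, assuming a pre-tokenized input joined by whitespace; the names FixedTokenLengthSketch and chunk are hypothetical, and this is not the plugin's FixedTokenLengthChunker, which tokenizes through an OpenSearch analyzer and enforces the max_token_count runtime parameter at ingest time.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Minimal sketch of fixed-token-length chunking with overlap.
// Hypothetical names; not the plugin's implementation.
public final class FixedTokenLengthSketch {

    // Splits `tokens` into passages of at most `tokenLimit` tokens, where
    // `overlapRate` (assumed to be at most 0.5 so the stride stays positive)
    // decides how many tokens each passage repeats from the previous one.
    static List<String> chunk(final List<String> tokens, final int tokenLimit, final double overlapRate) {
        final int overlapTokens = (int) Math.floor(tokenLimit * overlapRate);
        final int stride = tokenLimit - overlapTokens;
        final List<String> passages = new ArrayList<>();
        for (int start = 0; start < tokens.size(); start += stride) {
            final int end = Math.min(start + tokenLimit, tokens.size());
            passages.add(String.join(" ", tokens.subList(start, end)));
            if (end == tokens.size()) {
                break;
            }
        }
        return passages;
    }

    public static void main(String[] args) {
        final List<String> tokens = Arrays.asList("a", "b", "c", "d", "e", "f", "g", "h");
        // token_limit = 4, overlap_rate = 0.5: each passage repeats 2 tokens
        System.out.println(chunk(tokens, 4, 0.5));
        // prints: [a b c d, c d e f, e f g h]
    }
}

With token_limit = 4 and overlap_rate = 0.5, the stride is 2 tokens, so adjacent passages share half their content; an empty token list yields an empty result, mirroring the "empty list" behavior the updated comment in chunkLeafType describes for null input.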