Commit

update comment
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Mar 14, 2024
1 parent a969a60 commit 34348b3
Showing 2 changed files with 8 additions and 9 deletions.
First changed file:
@@ -24,9 +24,9 @@
 import org.opensearch.index.IndexSettings;
 import org.opensearch.ingest.AbstractProcessor;
 import org.opensearch.ingest.IngestDocument;
-import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.Chunker;
 import org.opensearch.index.mapper.IndexFieldMapper;
+import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
 import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterValidator.validatePositiveIntegerParameter;
 import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;
@@ -155,19 +155,18 @@ private int getMaxTokenCount(final Map<String, Object> sourceAndMetadataMap) {
      */
     @Override
     public IngestDocument execute(IngestDocument ingestDocument) {
-        validateFieldsValue(ingestDocument);
+        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+        validateFieldsValue(sourceAndMetadataMap);
+        // fixed token length algorithm needs runtime parameter max_token_count for tokenization
         int chunkCount = 0;
         Map<String, Object> runtimeParameters = new HashMap<>();
-        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
-        // fixed token length algorithm needs max_token_count for tokenization
         int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap);
         runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
         chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters, chunkCount);
         return ingestDocument;
     }

-    private void validateFieldsValue(final IngestDocument ingestDocument) {
-        Map<String, Object> sourceAndMetadataMap = ingestDocument.getSourceAndMetadata();
+    private void validateFieldsValue(final Map<String, Object> sourceAndMetadataMap) {
         for (Map.Entry<String, Object> embeddingFieldsEntry : fieldMap.entrySet()) {
             Object sourceValue = sourceAndMetadataMap.get(embeddingFieldsEntry.getKey());
             if (sourceValue != null) {
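The refactor above makes execute() extract the source-and-metadata map once and hand it straight to validateFieldsValue. Below is a minimal, self-contained sketch of that flow, not the plugin class: the class name, the FIELD_MAP stand-in for the processor's field_map, and the simplified type check are illustrative assumptions.

```java
import java.util.HashMap;
import java.util.List;
import java.util.Map;

// Self-contained sketch of the refactored flow: the source-and-metadata map is
// extracted once, validated, and then used to build the runtime parameters.
public class ChunkingFlowSketch {

    // hypothetical stand-in for the processor's field_map configuration
    private static final Map<String, Object> FIELD_MAP = Map.of("body", "body_chunk");

    public static void main(String[] args) {
        Map<String, Object> sourceAndMetadataMap = new HashMap<>();
        sourceAndMetadataMap.put("body", "some long passage to be chunked");

        validateFieldsValue(sourceAndMetadataMap);

        Map<String, Object> runtimeParameters = new HashMap<>();
        // fixed token length chunking needs max_token_count at runtime
        runtimeParameters.put("max_token_count", 10000);
        System.out.println("validated, runtime parameters: " + runtimeParameters);
    }

    // validation receives the map directly, mirroring the commit's new signature
    private static void validateFieldsValue(final Map<String, Object> sourceAndMetadataMap) {
        for (Map.Entry<String, Object> entry : FIELD_MAP.entrySet()) {
            Object sourceValue = sourceAndMetadataMap.get(entry.getKey());
            if (sourceValue != null && !(sourceValue instanceof String) && !(sourceValue instanceof List)) {
                throw new IllegalArgumentException("field [" + entry.getKey() + "] must be a String or a List of String");
            }
        }
    }
}
```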
Expand Down Expand Up @@ -297,8 +296,8 @@ private int chunkList(

@SuppressWarnings("unchecked")
private int chunkLeafType(final Object value, List<String> result, final Map<String, Object> runTimeParameters, int chunkCount) {
// leaf type means either String or List<String>
// the result should be an empty list
// leaf type means null, String or List<String>
// the result should be an empty list when the input is null
if (value instanceof String) {
chunkCount = chunkString(value.toString(), result, runTimeParameters, chunkCount);
} else if (isListOfString(value)) {
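Per the updated comments, a leaf value can be null, a String, or a List<String>, and the result list stays empty when the input is null. Here is a standalone sketch of that dispatch, with a hypothetical pass-through in place of the real chunkString call:

```java
import java.util.ArrayList;
import java.util.List;

// Sketch of leaf-type dispatch: a leaf value may be null, a String, or a
// List<String>; a null input simply leaves the result list empty.
public class LeafTypeSketch {

    public static void main(String[] args) {
        System.out.println(chunkLeafType(null));                       // []
        System.out.println(chunkLeafType("hello world"));              // [hello world]
        System.out.println(chunkLeafType(List.of("first", "second"))); // [first, second]
    }

    static List<String> chunkLeafType(Object value) {
        List<String> result = new ArrayList<>();
        if (value instanceof String) {
            // a real chunker would split the string here; the sketch passes it through
            result.add((String) value);
        } else if (isListOfString(value)) {
            for (Object element : (List<?>) value) {
                result.add((String) element);
            }
        }
        // null (or any other type) falls through and result stays empty
        return result;
    }

    static boolean isListOfString(Object value) {
        if (!(value instanceof List)) {
            return false;
        }
        for (Object element : (List<?>) value) {
            if (!(element instanceof String)) {
                return false;
            }
        }
        return true;
    }
}
```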
Second changed file:
@@ -104,7 +104,7 @@ public void validateParameters(Map<String, Object> parameters) {
      * will throw IllegalArgumentException when parameters are invalid
      *
      * @param parameters a map non-runtime parameters as the following:
-     * 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">analyzer tokenizer</a> in opensearch
+     * 1. tokenizer: the <a href="https://opensearch.org/docs/latest/analyzers/tokenizers/index/">word tokenizer</a> in opensearch
      * 2. token_limit: the token limit for each chunked passage
      * 3. overlap_rate: the overlapping degree for each chunked passage, indicating how many token comes from the previous passage
      */
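As background for the three parameters documented above, the sketch below illustrates fixed token length chunking in isolation; the whitespace split standing in for the configured OpenSearch word tokenizer and the floor(token_limit * overlap_rate) overlap are my assumptions, not taken from this diff.

```java
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Illustrative sketch (not the plugin implementation) of fixed token length
// chunking: each passage holds at most tokenLimit tokens, and a fraction of
// tokens (overlapRate of the limit) is carried over from the previous passage.
public class FixedTokenLengthSketch {

    public static void main(String[] args) {
        String text = "one two three four five six seven eight nine ten";
        // assumed example values; the real defaults live in FixedTokenLengthChunker
        System.out.println(chunk(text, 4, 0.25));
    }

    static List<String> chunk(String text, int tokenLimit, double overlapRate) {
        // a whitespace split stands in for the configured OpenSearch word tokenizer
        List<String> tokens = Arrays.asList(text.split("\\s+"));
        int overlapTokens = (int) Math.floor(tokenLimit * overlapRate);
        int step = tokenLimit - overlapTokens;
        List<String> passages = new ArrayList<>();
        for (int start = 0; start < tokens.size(); start += step) {
            int end = Math.min(start + tokenLimit, tokens.size());
            passages.add(String.join(" ", tokens.subList(start, end)));
            if (end == tokens.size()) {
                break;
            }
        }
        return passages;
    }
}
```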
