Skip to content

Commit

Permalink
provide fixed token length as the default algorithm
Browse files Browse the repository at this point in the history
Signed-off-by: yuye-aws <[email protected]>
  • Loading branch information
yuye-aws committed Mar 14, 2024
1 parent 34348b3 commit 4153988
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 39 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -40,15 +40,13 @@
public final class TextChunkingProcessor extends AbstractProcessor {

/** Name of this processor type; it is interpolated into the configuration error messages. */
public static final String TYPE = "text_chunking";

/** Configuration key under which the field map is supplied. */
public static final String FIELD_MAP_FIELD = "field_map";

/** Configuration key under which the algorithm map is supplied. */
public static final String ALGORITHM_FIELD = "algorithm";

// NOTE(review): presumably the configuration key for the chunk limit; its use is not visible here - confirm.
@VisibleForTesting
static final String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";

// NOTE(review): -1 presumably means "no limit"; confirm where maxChunkLimit is read.
private static final int DEFAULT_MAX_CHUNK_LIMIT = -1;
// Algorithm name taken from FixedTokenLengthChunker; assigned as the algorithm key when the algorithm map is empty.
private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;

private int maxChunkLimit;

Expand Down Expand Up @@ -88,29 +86,32 @@ public String getType() {

@SuppressWarnings("unchecked")
private void validateAndParseAlgorithmMap(final Map<String, Object> algorithmMap) {
if (algorithmMap.isEmpty()) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Unable to create %s processor as [%s] does not contain any algorithm", TYPE, ALGORITHM_FIELD)
);
} else if (algorithmMap.size() > 1) {
if (algorithmMap.size() > 1) {
throw new IllegalArgumentException(
String.format(Locale.ROOT, "Unable to create %s processor as [%s] contains multiple algorithms", TYPE, ALGORITHM_FIELD)
);
}

Entry<String, Object> algorithmEntry = algorithmMap.entrySet().iterator().next();
String algorithmKey = algorithmEntry.getKey();
Object algorithmValue = algorithmEntry.getValue();
if (!(algorithmValue instanceof Map)) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Unable to create %s processor as [%s] parameters cannot be cast to [%s]",
TYPE,
algorithmKey,
Map.class.getName()
)
);
String algorithmKey;
Object algorithmValue;
if (algorithmMap.isEmpty()) {
algorithmKey = DEFAULT_ALGORITHM;
algorithmValue = new HashMap<>();
} else {
Entry<String, Object> algorithmEntry = algorithmMap.entrySet().iterator().next();
algorithmKey = algorithmEntry.getKey();
algorithmValue = algorithmEntry.getValue();
if (!(algorithmValue instanceof Map)) {
throw new IllegalArgumentException(
String.format(
Locale.ROOT,
"Unable to create %s processor as [%s] parameters cannot be cast to [%s]",
TYPE,
algorithmKey,
Map.class.getName()
)
);
}
}

Map<String, Object> chunkerParameters = (Map<String, Object>) algorithmValue;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,16 @@ private Map<String, Object> createNestedFieldMap() {
return fieldMap;
}

/**
 * Builds a processor from a configuration whose algorithm map is deliberately left empty.
 *
 * @param fieldMap value stored under {@code FIELD_MAP_FIELD} in the processor configuration
 * @return the processor produced by {@code textChunkingProcessorFactory} for that configuration
 */
@SneakyThrows
private TextChunkingProcessor createDefaultAlgorithmInstance(Map<String, Object> fieldMap) {
    final Map<String, Object> processorConfig = new HashMap<>();
    // No algorithm entry is added on purpose: the factory receives an empty algorithm map.
    final Map<String, Object> emptyAlgorithmMap = new HashMap<>();

    processorConfig.put(FIELD_MAP_FIELD, fieldMap);
    processorConfig.put(ALGORITHM_FIELD, emptyAlgorithmMap);

    final Map<String, Processor.Factory> factoryRegistry = new HashMap<>();
    return textChunkingProcessorFactory.create(factoryRegistry, PROCESSOR_TAG, DESCRIPTION, processorConfig);
}

@SneakyThrows
private TextChunkingProcessor createFixedTokenLengthInstance(Map<String, Object> fieldMap) {
Map<String, Object> config = new HashMap<>();
Expand Down Expand Up @@ -195,24 +205,6 @@ public void testCreate_whenMaxChunkNumInvalidValue_thenFail() {
);
}

/**
 * Creates the processor with an empty algorithm map and expects an
 * {@link IllegalArgumentException} whose message reports that no algorithm was given.
 */
public void testCreate_whenAlgorithmFieldNoAlgorithm_thenFail() {
    final String expectedMessage = String.format(
        Locale.ROOT,
        "Unable to create %s processor as [%s] does not contain any algorithm",
        TYPE,
        ALGORITHM_FIELD
    );

    // Field map with a single input -> output mapping.
    final Map<String, Object> inputToOutput = new HashMap<>();
    inputToOutput.put(INPUT_FIELD, OUTPUT_FIELD);

    // The algorithm map is present in the configuration but has no entries.
    final Map<String, Object> emptyAlgorithmMap = new HashMap<>();

    final Map<String, Object> processorConfig = new HashMap<>();
    processorConfig.put(TextChunkingProcessor.FIELD_MAP_FIELD, inputToOutput);
    processorConfig.put(ALGORITHM_FIELD, emptyAlgorithmMap);

    final Map<String, Processor.Factory> factoryRegistry = new HashMap<>();
    final IllegalArgumentException thrown = assertThrows(
        IllegalArgumentException.class,
        () -> textChunkingProcessorFactory.create(factoryRegistry, PROCESSOR_TAG, DESCRIPTION, processorConfig)
    );

    assertEquals(expectedMessage, thrown.getMessage());
}

public void testCreate_whenAlgorithmFieldMultipleAlgorithm_thenFail() {
Map<String, Object> config = new HashMap<>();
Map<String, Object> fieldMap = new HashMap<>();
Expand Down Expand Up @@ -403,7 +395,21 @@ public void testExecute_withFixedTokenLength_andSourceDataStringWithMaxChunkNumE
),
illegalArgumentException.getMessage()
);
}

/**
 * Executes a processor built with an empty algorithm map on the document created from
 * {@code createSourceDataString()} and checks that the output field holds a list containing
 * exactly the one expected passage.
 */
@SneakyThrows
public void testCreate_withDefaultAlgorithm_andSourceDataString_thenSucceed() {
    final List<String> expected = new ArrayList<>();
    expected.add(
        "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
    );

    final TextChunkingProcessor defaultProcessor = createDefaultAlgorithmInstance(createStringFieldMap());
    final IngestDocument input = createIngestDocumentWithSourceData(createSourceDataString());
    final IngestDocument output = defaultProcessor.execute(input);

    // NOTE(review): the two checks below use the Java `assert` keyword, which is only evaluated
    // when the JVM runs with assertions enabled (-ea); consider the test framework's assertTrue.
    assert output.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
    final Object actual = output.getSourceAndMetadata().get(OUTPUT_FIELD);
    assert (actual instanceof List<?>);

    assertEquals(expected, actual);
}

@SneakyThrows
Expand Down

0 comments on commit 4153988

Please sign in to comment.