add unit tests with string, map and nested map type for document chunking processor

Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Feb 27, 2024
1 parent 8f461e2 commit e7a0aaa
Showing 5 changed files with 209 additions and 63 deletions.
@@ -224,7 +224,7 @@ public IngestDocument execute(IngestDocument document) {
IndexService indexService = indicesService.indexServiceSafe(indexMetadata.getIndex());
maxTokenCount = indexService.getIndexSettings().getMaxTokenCount();
}
chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT, maxTokenCount);
chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
}
IFieldChunker chunker = ChunkerFactory.create(parameterKey, analysisRegistry);
document.setFieldValue(outputField, chunk(chunker, content, chunkerParameters));
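Note on the hunk above: MAX_TOKEN_COUNT_FIELD is the key under which the processor hands the index-level max_token_count setting down to the chunker. A minimal sketch of that call pattern, using only constants and signatures visible in this diff; the helper name and the in-scope values (analysisRegistry, content, maxTokenCount, and the token limit of 10) are assumptions for illustration, not part of the commit:

    // Sketch only: builds chunker parameters with the renamed *_FIELD constants
    // and delegates to a chunker obtained from ChunkerFactory, as execute() does.
    private List<String> chunkWithFixedTokenLength(AnalysisRegistry analysisRegistry, String content, int maxTokenCount) {
        Map<String, Object> chunkerParameters = new HashMap<>();
        chunkerParameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10);
        chunkerParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
        IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_LENGTH_ALGORITHM, analysisRegistry);
        return chunker.chunk(content, chunkerParameters);
    }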
@@ -19,12 +19,12 @@
@Log4j2
public class FixedTokenLengthChunker implements IFieldChunker {

public static final String TOKEN_LIMIT = "token_limit";
public static final String OVERLAP_RATE = "overlap_rate";
public static final String TOKEN_LIMIT_FIELD = "token_limit";
public static final String OVERLAP_RATE_FIELD = "overlap_rate";

public static final String MAX_TOKEN_COUNT = "max_token_count";
public static final String MAX_TOKEN_COUNT_FIELD = "max_token_count";

public static final String TOKENIZER = "tokenizer";
public static final String TOKENIZER_FIELD = "tokenizer";

// default values for each parameter
private static final int DEFAULT_TOKEN_LIMIT = 500;
@@ -64,17 +64,17 @@ public List<String> chunk(String content, Map<String, Object> parameters) {
int maxTokenCount = DEFAULT_MAX_TOKEN_COUNT;
String tokenizer = DEFAULT_TOKENIZER;

if (parameters.containsKey(TOKEN_LIMIT)) {
tokenLimit = ((Number) parameters.get(TOKEN_LIMIT)).intValue();
if (parameters.containsKey(TOKEN_LIMIT_FIELD)) {
tokenLimit = ((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue();
}
if (parameters.containsKey(OVERLAP_RATE)) {
overlapRate = ((Number) parameters.get(OVERLAP_RATE)).doubleValue();
if (parameters.containsKey(OVERLAP_RATE_FIELD)) {
overlapRate = ((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue();
}
if (parameters.containsKey(MAX_TOKEN_COUNT)) {
maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT)).intValue();
if (parameters.containsKey(MAX_TOKEN_COUNT_FIELD)) {
maxTokenCount = ((Number) parameters.get(MAX_TOKEN_COUNT_FIELD)).intValue();
}
if (parameters.containsKey(TOKENIZER)) {
tokenizer = (String) parameters.get(TOKENIZER);
if (parameters.containsKey(TOKENIZER_FIELD)) {
tokenizer = (String) parameters.get(TOKENIZER_FIELD);
}

List<String> tokens = tokenize(content, tokenizer, maxTokenCount);
@@ -103,34 +103,34 @@ public List<String> chunk(String content, Map<String, Object> parameters) {

@Override
public void validateParameters(Map<String, Object> parameters) {
if (parameters.containsKey(TOKEN_LIMIT)) {
if (!(parameters.get(TOKEN_LIMIT) instanceof Number)) {
if (parameters.containsKey(TOKEN_LIMIT_FIELD)) {
if (!(parameters.get(TOKEN_LIMIT_FIELD) instanceof Number)) {
throw new IllegalArgumentException(
"fixed length parameter [" + TOKEN_LIMIT + "] cannot be cast to [" + Number.class.getName() + "]"
"fixed length parameter [" + TOKEN_LIMIT_FIELD + "] cannot be cast to [" + Number.class.getName() + "]"
);
}
if (((Number) parameters.get(TOKEN_LIMIT)).intValue() <= 0) {
throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT + "] must be positive");
if (((Number) parameters.get(TOKEN_LIMIT_FIELD)).intValue() <= 0) {
throw new IllegalArgumentException("fixed length parameter [" + TOKEN_LIMIT_FIELD + "] must be positive");
}
}

if (parameters.containsKey(OVERLAP_RATE)) {
if (!(parameters.get(OVERLAP_RATE) instanceof Number)) {
if (parameters.containsKey(OVERLAP_RATE_FIELD)) {
if (!(parameters.get(OVERLAP_RATE_FIELD) instanceof Number)) {
throw new IllegalArgumentException(
"fixed length parameter [" + OVERLAP_RATE + "] cannot be cast to [" + Number.class.getName() + "]"
"fixed length parameter [" + OVERLAP_RATE_FIELD + "] cannot be cast to [" + Number.class.getName() + "]"
);
}
if (((Number) parameters.get(OVERLAP_RATE)).doubleValue() < 0.0
|| ((Number) parameters.get(OVERLAP_RATE)).doubleValue() >= 1.0) {
if (((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue() < 0.0
|| ((Number) parameters.get(OVERLAP_RATE_FIELD)).doubleValue() >= 1.0) {
throw new IllegalArgumentException(
"fixed length parameter [" + OVERLAP_RATE + "] must be between 0 and 1, 1 is not included."
"fixed length parameter [" + OVERLAP_RATE_FIELD + "] must be between 0 and 1, 1 is not included."
);
}
}

if (parameters.containsKey(TOKENIZER) && !(parameters.get(TOKENIZER) instanceof String)) {
if (parameters.containsKey(TOKENIZER_FIELD) && !(parameters.get(TOKENIZER_FIELD) instanceof String)) {
throw new IllegalArgumentException(
"fixed length parameter [" + TOKENIZER + "] cannot be cast to [" + String.class.getName() + "]"
"fixed length parameter [" + TOKENIZER_FIELD + "] cannot be cast to [" + String.class.getName() + "]"
);
}
}
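The validation above rejects overlap_rate values outside [0, 1). A hypothetical negative test for that bound, not part of this commit, assuming the chunker is obtained through ChunkerFactory and that a getAnalysisRegistry() helper like the one in DocumentChunkingProcessorTests is available:

    // Hypothetical test sketch: overlap_rate == 1.0 should be rejected by validateParameters.
    public void testValidateParameters_whenOverlapRateIsOne_thenFail() {
        IFieldChunker chunker = ChunkerFactory.create(ChunkerFactory.FIXED_LENGTH_ALGORITHM, getAnalysisRegistry());
        Map<String, Object> parameters = new HashMap<>();
        parameters.put(FixedTokenLengthChunker.OVERLAP_RATE_FIELD, 1.0);
        IllegalArgumentException exception = expectThrows(IllegalArgumentException.class, () -> chunker.validateParameters(parameters));
        assertEquals("fixed length parameter [overlap_rate] must be between 0 and 1, 1 is not included.", exception.getMessage());
    }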
@@ -4,6 +4,7 @@
*/
package org.opensearch.neuralsearch.processor;

import com.google.common.collect.ImmutableMap;
import lombok.SneakyThrows;
import org.apache.lucene.tests.analysis.MockTokenizer;
import org.junit.Before;
@@ -21,6 +22,7 @@
import org.opensearch.ingest.IngestDocument;
import org.opensearch.ingest.Processor;
import org.opensearch.neuralsearch.processor.chunker.ChunkerFactory;
import org.opensearch.neuralsearch.processor.chunker.DelimiterChunker;
import org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker;
import org.opensearch.plugins.AnalysisPlugin;
import org.opensearch.test.OpenSearchTestCase;
@@ -42,6 +44,9 @@ public class DocumentChunkingProcessorTests extends OpenSearchTestCase {

private static final String PROCESSOR_TAG = "mockTag";
private static final String DESCRIPTION = "mockDescription";
private static final String INPUT_FIELD = "body";
private static final String OUTPUT_FIELD = "body_chunk";
private static final String INDEX_NAME = "_index";

@SneakyThrows
private AnalysisRegistry getAnalysisRegistry() {
@@ -85,7 +90,13 @@ public void testGetType() {

private Map<String, Object> createFixedTokenLengthParameters() {
Map<String, Object> parameters = new HashMap<>();
parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT, 10);
parameters.put(FixedTokenLengthChunker.TOKEN_LIMIT_FIELD, 10);
return parameters;
}

private Map<String, Object> createDelimiterParameters() {
Map<String, Object> parameters = new HashMap<>();
parameters.put(DelimiterChunker.DELIMITER_FIELD, ".");
return parameters;
}

@@ -95,35 +106,179 @@ private DocumentChunkingProcessor createFixedTokenLengthInstance() {
Map<String, Object> fieldParameters = new HashMap<>();
Map<String, Object> chunkerParameters = new HashMap<>();
chunkerParameters.put(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters());
chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, "body_chunk");
fieldParameters.put("body", chunkerParameters);
chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD);
fieldParameters.put(INPUT_FIELD, chunkerParameters);
config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters);
Map<String, Processor.Factory> registry = new HashMap<>();
return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
}

private IngestDocument createIngestDocument() {
Map<String, Object> sourceAndMetadata = new HashMap<>();
sourceAndMetadata.put(
"body",
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
@SneakyThrows
private DocumentChunkingProcessor createDelimiterInstance() {
Map<String, Object> config = new HashMap<>();
Map<String, Object> fieldParameters = new HashMap<>();
Map<String, Object> chunkerParameters = new HashMap<>();
chunkerParameters.put(ChunkerFactory.DELIMITER_ALGORITHM, createDelimiterParameters());
chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD);
fieldParameters.put(INPUT_FIELD, chunkerParameters);
config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters);
Map<String, Processor.Factory> registry = new HashMap<>();
return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
}

private String createSourceDataString() {
return "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
}

private List<String> createSourceDataList() {
List<String> documents = new ArrayList<>();
documents.add(
"This is the first document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
documents.add(
"This is the second document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
sourceAndMetadata.put(IndexFieldMapper.NAME, "_index");
return documents;
}

private Map<String, String> createSourceDataMap() {
Map<String, String> documents = new HashMap<>();
documents.put(
"third",
"This is the third document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
documents.put(
"fourth",
"This is the fourth document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
return documents;
}

private Map<String, Object> createSourceDataNestedMap() {
String documentString = createSourceDataString();
List<String> documentList = createSourceDataList();
Map<String, String> documentMap = createSourceDataMap();
Map<String, Object> documents = new HashMap<>();
documents.put("String", documentString);
documents.put("List", documentList);
documents.put("Map", documentMap);
return documents;
}

private IngestDocument createIngestDocumentWithSourceData(Object sourceData) {
Map<String, Object> sourceAndMetadata = new HashMap<>();
sourceAndMetadata.put(INPUT_FIELD, sourceData);
sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME);
return new IngestDocument(sourceAndMetadata, new HashMap<>());
}

@SneakyThrows
public void testExecute_withFixedTokenLength_successful() {
public void testExecute_withFixedTokenLength_andSourceDataString_successful() {
DocumentChunkingProcessor processor = createFixedTokenLengthInstance();
IngestDocument ingestDocument = createIngestDocument();
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey("body_chunk");
Object passages = document.getSourceAndMetadata().get("body_chunk");
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
assert (passages instanceof List<?>);
List<String> expectedPassages = new ArrayList<>();
expectedPassages.add("This is an example document to be chunked The document");
expectedPassages.add("The document contains a single paragraph two sentences and 24");
expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
assertEquals(expectedPassages, passages);
}

@SneakyThrows
public void testExecute_withFixedTokenLength_andSourceDataList_successful() {
DocumentChunkingProcessor processor = createFixedTokenLengthInstance();
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataList());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
assert (passages instanceof List<?>);

List<String> expectedPassages = new ArrayList<>();
expectedPassages.add("This is the first document to be chunked The document");
expectedPassages.add("The document contains a single paragraph two sentences and 24");
expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
expectedPassages.add("This is the second document to be chunked The document");
expectedPassages.add("The document contains a single paragraph two sentences and 24");
expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
assertEquals(expectedPassages, passages);
}

@SneakyThrows
public void testExecute_withFixedTokenLength_andSourceDataMap_successful() {
DocumentChunkingProcessor processor = createFixedTokenLengthInstance();
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataMap());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
assert (passages instanceof Map<?, ?>);

List<String> expectedPassages1 = new ArrayList<>();
List<String> expectedPassages2 = new ArrayList<>();

expectedPassages1.add("This is the third document to be chunked The document");
expectedPassages1.add("The document contains a single paragraph two sentences and 24");
expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch");
expectedPassages2.add("This is the fourth document to be chunked The document");
expectedPassages2.add("The document contains a single paragraph two sentences and 24");
expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch");

Map<String, Object> expectedPassages = ImmutableMap.of("third", expectedPassages1, "fourth", expectedPassages2);

assertEquals(expectedPassages, passages);
}

@SneakyThrows
public void testExecute_withFixedTokenLength_andSourceDataNestedMap_successful() {
DocumentChunkingProcessor processor = createFixedTokenLengthInstance();
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataNestedMap());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
assert (passages instanceof Map<?, ?>);

Map<String, Object> expectedPassages = new HashMap<>();
List<String> expectedPassages1 = new ArrayList<>();
List<String> expectedPassages2 = new ArrayList<>();
List<String> expectedPassages3 = new ArrayList<>();
List<String> expectedPassages4 = new ArrayList<>();

expectedPassages1.add("This is an example document to be chunked The document");
expectedPassages1.add("The document contains a single paragraph two sentences and 24");
expectedPassages1.add("and 24 tokens by standard tokenizer in OpenSearch");
expectedPassages2.add("This is the first document to be chunked The document");
expectedPassages2.add("The document contains a single paragraph two sentences and 24");
expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch");
expectedPassages2.add("This is the second document to be chunked The document");
expectedPassages2.add("The document contains a single paragraph two sentences and 24");
expectedPassages2.add("and 24 tokens by standard tokenizer in OpenSearch");
expectedPassages3.add("This is the third document to be chunked The document");
expectedPassages3.add("The document contains a single paragraph two sentences and 24");
expectedPassages3.add("and 24 tokens by standard tokenizer in OpenSearch");
expectedPassages4.add("This is the fourth document to be chunked The document");
expectedPassages4.add("The document contains a single paragraph two sentences and 24");
expectedPassages4.add("and 24 tokens by standard tokenizer in OpenSearch");

expectedPassages.put("String", expectedPassages1);
expectedPassages.put("List", expectedPassages2);
expectedPassages.put("Map", ImmutableMap.of("third", expectedPassages3, "fourth", expectedPassages4));

assertEquals(expectedPassages, passages);
}

@SneakyThrows
public void testExecute_withDelimiter_andSourceDataString_successful() {
DocumentChunkingProcessor processor = createDelimiterInstance();
IngestDocument ingestDocument = createIngestDocumentWithSourceData(createSourceDataString());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(OUTPUT_FIELD);
Object passages = document.getSourceAndMetadata().get(OUTPUT_FIELD);
assert (passages instanceof List<?>);
List<String> expectedPassages = new ArrayList<>();
expectedPassages.add("This is an example document to be chunked.");
expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.");
assertEquals(expectedPassages, passages);
}
}
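Design note on the two factory helpers in this test class: createFixedTokenLengthInstance() and createDelimiterInstance() differ only in the algorithm key and its parameter map, so they could share one parameterized helper. A sketch of that consolidation (not part of the commit; the helper name is an assumption):

    @SneakyThrows
    private DocumentChunkingProcessor createInstance(String algorithmName, Map<String, Object> algorithmParameters) {
        // Same config shape as the two helpers above: field_map -> input field -> {algorithm, output_field}.
        Map<String, Object> config = new HashMap<>();
        Map<String, Object> fieldParameters = new HashMap<>();
        Map<String, Object> chunkerParameters = new HashMap<>();
        chunkerParameters.put(algorithmName, algorithmParameters);
        chunkerParameters.put(DocumentChunkingProcessor.OUTPUT_FIELD, OUTPUT_FIELD);
        fieldParameters.put(INPUT_FIELD, chunkerParameters);
        config.put(DocumentChunkingProcessor.FIELD_MAP_FIELD, fieldParameters);
        Map<String, Processor.Factory> registry = new HashMap<>();
        return factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
    }

Call sites would then read, for example, createInstance(ChunkerFactory.FIXED_LENGTH_ALGORITHM, createFixedTokenLengthParameters()).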
