Commit
update ut for text chunking processor
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Apr 30, 2024
1 parent 84cd362 commit 65f8cf7
Showing 1 changed file with 208 additions and 10 deletions.
@@ -14,6 +14,7 @@
import java.util.Locale;
import java.util.Map;
import java.util.Objects;
import java.util.Set;

import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
@@ -122,12 +123,18 @@ private Map<String, Object> createStringFieldMap() {
return fieldMap;
}

private Map<String, Object> createNestedFieldMap() {
private Map<String, Object> createNestedFieldMapSingleField() {
Map<String, Object> fieldMap = new HashMap<>();
fieldMap.put(INPUT_NESTED_FIELD_KEY, Map.of(INPUT_FIELD, OUTPUT_FIELD));
return fieldMap;
}

private Map<String, Object> createNestedFieldMapMultipleField() {
Map<String, Object> fieldMap = new HashMap<>();
fieldMap.put(INPUT_NESTED_FIELD_KEY, Map.of(INPUT_FIELD + "_1", OUTPUT_FIELD + "_1", INPUT_FIELD + "_2", OUTPUT_FIELD + "_2"));
return fieldMap;
}

@SneakyThrows
private TextChunkingProcessor createDefaultAlgorithmInstance(Map<String, Object> fieldMap) {
Map<String, Object> config = new HashMap<>();
@@ -331,12 +338,19 @@ private List<Object> createSourceDataListWithNull() {
return documents;
}

private Map<String, Object> createSourceDataNestedMap() {
private Map<String, Object> createSourceDataNestedMapSingleField() {
Map<String, Object> documents = new HashMap<>();
documents.put(INPUT_FIELD, createSourceDataString());
return documents;
}

private Map<String, Object> createSourceDataNestedMapMultipleField() {
Map<String, Object> documents = new HashMap<>();
documents.put(INPUT_FIELD + "_1", createSourceDataString());
documents.put(INPUT_FIELD + "_2", createSourceDataString());
return documents;
}

private Map<String, Object> createSourceDataInvalidNestedMap() {
Map<String, Object> documents = new HashMap<>();
documents.put(INPUT_FIELD, Map.of(INPUT_NESTED_FIELD_KEY, 1));
@@ -639,9 +653,9 @@ public void testExecute_withFixedTokenLength_andSourceDataListWithNull_thenFail(

@SuppressWarnings("unchecked")
@SneakyThrows
public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed() {
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap());
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMap());
public void testExecute_withFixedTokenLength_andFieldMapNestedMapSingleField_thenSucceed() {
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapSingleField());
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapSingleField());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);
Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY);
Expand All @@ -657,9 +671,193 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenSucceed()
assertEquals(expectedPassages, passages);
}

@SneakyThrows
@SuppressWarnings("unchecked")
public void testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_thenSucceed() {
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapMultipleField());
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);
Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY);
assert (nestedResult instanceof Map<?, ?>);
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_1");
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_2");
Object passages1 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_1");
Object passages2 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_2");
assert (passages1 instanceof List);
assert (passages2 instanceof List);

List<String> expectedPassages = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by ",
"standard tokenizer in OpenSearch."
);
assertEquals(expectedPassages, passages1);
assertEquals(expectedPassages, passages2);
}

@SneakyThrows
@SuppressWarnings("unchecked")
public
void
testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitFive_thenLastPassageGetConcatenated() {
int maxChunkLimit = 5;
TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(
createNestedFieldMapMultipleField(),
maxChunkLimit
);
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);
Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY);
assert (nestedResult instanceof Map<?, ?>);
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_1");
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_2");
Object passages1 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_1");
Object passages2 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_2");
assert (passages1 instanceof List);
assert (passages2 instanceof List);

List<String> expectedPassages1 = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by ",
"standard tokenizer in OpenSearch."
);
List<String> expectedPassages2 = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
Set<List<String>> passages = Set.of((List<String>) passages1, (List<String>) passages2);
Set<List<String>> expectedPassages = Set.of(expectedPassages1, expectedPassages2);
assertEquals(passages, expectedPassages);
}

@SneakyThrows
@SuppressWarnings("unchecked")
public
void
testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitFour_thenLastPassageGetConcatenated() {
int maxChunkLimit = 4;
TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(
createNestedFieldMapMultipleField(),
maxChunkLimit
);
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);
Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY);
assert (nestedResult instanceof Map<?, ?>);
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_1");
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_2");
Object passages1 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_1");
Object passages2 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_2");
assert (passages1 instanceof List);
assert (passages2 instanceof List);

List<String> expectedPassages1 = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by ",
"standard tokenizer in OpenSearch."
);
List<String> expectedPassages2 = List.of(
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
Set<List<String>> passages = Set.of((List<String>) passages1, (List<String>) passages2);
Set<List<String>> expectedPassages = Set.of(expectedPassages1, expectedPassages2);
assertEquals(passages, expectedPassages);
}

@SneakyThrows
@SuppressWarnings("unchecked")
public
void
testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitThree_thenLastPassageGetConcatenated() {
int maxChunkLimit = 3;
TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(
createNestedFieldMapMultipleField(),
maxChunkLimit
);
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);
Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY);
assert (nestedResult instanceof Map<?, ?>);
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_1");
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_2");
Object passages1 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_1");
Object passages2 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_2");
assert (passages1 instanceof List);
assert (passages2 instanceof List);

List<String> expectedPassages1 = List.of(
"This is an example document to be chunked. The document ",
"contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
List<String> expectedPassages2 = List.of(
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
Set<List<String>> passages = Set.of((List<String>) passages1, (List<String>) passages2);
Set<List<String>> expectedPassages = Set.of(expectedPassages1, expectedPassages2);
assertEquals(passages, expectedPassages);
}

@SneakyThrows
@SuppressWarnings("unchecked")
public void testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitTwo_thenLastPassageGetConcatenated() {
int maxChunkLimit = 2;
TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(
createNestedFieldMapMultipleField(),
maxChunkLimit
);
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);
Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY);
assert (nestedResult instanceof Map<?, ?>);
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_1");
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_2");
Object passages1 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_1");
Object passages2 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_2");
assert (passages1 instanceof List);
assert (passages2 instanceof List);

List<String> expectedPassages = List.of(
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
assertEquals(passages1, expectedPassages);
assertEquals(passages2, expectedPassages);
}

@SneakyThrows
@SuppressWarnings("unchecked")
public void testExecute_withFixedTokenLength_andFieldMapNestedMapMultipleField_exceedMaxChunkLimitOne_thenLastPassageGetConcatenated() {
int maxChunkLimit = 1;
TextChunkingProcessor processor = createFixedTokenLengthInstanceWithMaxChunkLimit(
createNestedFieldMapMultipleField(),
maxChunkLimit
);
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataNestedMapMultipleField());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);
Object nestedResult = document.getSourceAndMetadata().get(INPUT_NESTED_FIELD_KEY);
assert (nestedResult instanceof Map<?, ?>);
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_1");
assert ((Map<String, Object>) nestedResult).containsKey(OUTPUT_FIELD + "_2");
Object passages1 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_1");
Object passages2 = ((Map<String, Object>) nestedResult).get(OUTPUT_FIELD + "_2");
assert (passages1 instanceof List);
assert (passages2 instanceof List);

List<String> expectedPassages = List.of(
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
);
assertEquals(passages1, expectedPassages);
assertEquals(passages2, expectedPassages);
}

@SneakyThrows
public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_thenFail() {
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap());
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapSingleField());
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createMaxDepthLimitExceedMap(0));
IllegalArgumentException illegalArgumentException = assertThrows(
IllegalArgumentException.class,
Expand All @@ -672,8 +870,8 @@ public void testExecute_withFixedTokenLength_andMaxDepthLimitExceedFieldMap_then
}

@SneakyThrows
public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() {
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap());
public void testExecute_withFixedTokenLength_andFieldMapNestedMapSingleField_thenFail() {
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapSingleField());
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataInvalidNestedMap());
IllegalArgumentException illegalArgumentException = assertThrows(
IllegalArgumentException.class,
Expand All @@ -687,8 +885,8 @@ public void testExecute_withFixedTokenLength_andFieldMapNestedMap_thenFail() {

@SneakyThrows
@SuppressWarnings("unchecked")
public void testExecute_withFixedTokenLength_andFieldMapNestedMap_sourceDataList_thenSucceed() {
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMap());
public void testExecute_withFixedTokenLength_andFieldMapNestedMapSingleField_sourceDataList_thenSucceed() {
TextChunkingProcessor processor = createFixedTokenLengthInstance(createNestedFieldMapSingleField());
IngestDocument ingestDocument = createIngestDocumentWithNestedSourceData(createSourceDataListNestedMap());
IngestDocument document = processor.execute(ingestDocument);
assert document.getSourceAndMetadata().containsKey(INPUT_NESTED_FIELD_KEY);