integration tests for document chunking processor
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Feb 29, 2024
1 parent 428ec53 commit 69efb3b
Showing 9 changed files with 241 additions and 30 deletions.
.idea/runConfigurations/Run_Neural_Search.xml (deleted, 0 additions, 23 deletions)

This file was deleted.

DocumentChunkingProcessorIT.java (new file)
@@ -0,0 +1,179 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.neuralsearch.processor;

import com.google.common.collect.ImmutableList;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.message.BasicHeader;
import org.junit.Before;
import org.opensearch.client.Response;
import org.opensearch.common.xcontent.XContentHelper;
import org.opensearch.common.xcontent.XContentType;
import org.opensearch.index.query.MatchAllQueryBuilder;
import org.opensearch.neuralsearch.BaseNeuralSearchIT;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT;

public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT {
    private static final String INDEX_NAME = "document_chunking_index";

    private static final String OUTPUT_FIELD = "body_chunk";

    private static final String FIXED_TOKEN_LENGTH_PIPELINE_NAME = "pipeline-document-chunking-fixed-token-length";

    private static final String DELIMITER_PIPELINE_NAME = "pipeline-document-chunking-delimiter";

    private static final String CASCADE_PIPELINE_NAME = "pipeline-document-chunking-cascade";

    private static final String TEST_DOCUMENT = "processor/chunker/DocumentChunkingTestDocument.json";

    private static final String TEST_LONG_DOCUMENT = "processor/chunker/DocumentChunkingTestLongDocument.json";

    private static final Map<String, String> PIPELINE_CONFIGS_BY_NAME = Map.of(
        FIXED_TOKEN_LENGTH_PIPELINE_NAME,
        "processor/chunker/PipelineForFixedTokenLengthChunker.json",
        DELIMITER_PIPELINE_NAME,
        "processor/chunker/PipelineForDelimiterChunker.json",
        CASCADE_PIPELINE_NAME,
        "processor/chunker/PipelineForCascadedChunker.json"
    );

    @Before
    public void setUp() throws Exception {
        super.setUp();
        updateClusterSettings();
    }

    public void testDocumentChunkingProcessor_withFixedTokenLength_successful() throws Exception {
        try {
            createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_NAME);
            createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_NAME);
            ingestDocument(TEST_DOCUMENT);

            List<String> expectedPassages = new ArrayList<>();
            expectedPassages.add("This is an example document to be chunked The document");
            expectedPassages.add("The document contains a single paragraph two sentences and 24");
            expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
            validateIndexIngestResults(INDEX_NAME, expectedPassages);
        } finally {
            wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null);
        }
    }

    public void testDocumentChunkingProcessor_withFixedTokenLength_fail() throws Exception {
        try {
            createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_NAME);
            createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_NAME);
            Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT));
            // index.analyze.max_token_count is set to 100 in the index settings, so analyzing the long document fails
            assert (exception.getMessage()
                .contains("The number of tokens produced by calling _analyze has exceeded the allowed maximum of [100]."));
            assertEquals(0, getDocCount(INDEX_NAME));
        } finally {
            wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null);
        }
    }

    public void testDocumentChunkingProcessor_withDelimiter_successful() throws Exception {
        try {
            createPipelineProcessor(DELIMITER_PIPELINE_NAME);
            createDocumentChunkingIndex(DELIMITER_PIPELINE_NAME);
            ingestDocument(TEST_DOCUMENT);

            List<String> expectedPassages = new ArrayList<>();
            expectedPassages.add("This is an example document to be chunked.");
            expectedPassages.add(
                " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
            );
            validateIndexIngestResults(INDEX_NAME, expectedPassages);
        } finally {
            wipeOfTestResources(INDEX_NAME, DELIMITER_PIPELINE_NAME, null, null);
        }
    }

    public void testDocumentChunkingProcessor_withCascade_successful() throws Exception {
        try {
            createPipelineProcessor(CASCADE_PIPELINE_NAME);
            createDocumentChunkingIndex(CASCADE_PIPELINE_NAME);
            ingestDocument(TEST_DOCUMENT);

            List<String> expectedPassages = new ArrayList<>();
            // whitespace, "." and "," are dropped by the standard tokenizer, so they do not appear in the fixed token length output
            expectedPassages.add("This is an example document to be chunked");
            expectedPassages.add("The document contains a single paragraph two sentences and 24");
            expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
            validateIndexIngestResults(INDEX_NAME, expectedPassages);
        } finally {
            wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null);
        }
    }

    private void validateIndexIngestResults(String indexName, Object expected) {
        assertEquals(1, getDocCount(indexName));
        MatchAllQueryBuilder query = new MatchAllQueryBuilder();
        Map<String, Object> searchResults = search(indexName, query, 10);
        assertNotNull(searchResults);
        Map<String, Object> document = getFirstInnerHit(searchResults);
        assertNotNull(document);
        Object documentSource = document.get("_source");
        assert (documentSource instanceof Map);
        @SuppressWarnings("unchecked")
        Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
        assert (documentSourceMap).containsKey(OUTPUT_FIELD);
        Object ingestOutputs = documentSourceMap.get(OUTPUT_FIELD);
        assertEquals(expected, ingestOutputs);
    }

    private void createPipelineProcessor(final String pipelineName) throws Exception {
        String requestBody = Files.readString(Path.of(classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName)).toURI()));
        Response pipelineCreateResponse = makeRequest(
            client(),
            "PUT",
            "/_ingest/pipeline/" + pipelineName,
            null,
            toHttpEntity(String.format(LOCALE, requestBody)),
            ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT))
        );
        Map<String, Object> node = XContentHelper.convertToMap(
            XContentType.JSON.xContent(),
            EntityUtils.toString(pipelineCreateResponse.getEntity()),
            false
        );
        assertEquals("true", node.get("acknowledged").toString());
    }

    private void createDocumentChunkingIndex(String pipelineName) throws Exception {
        createIndexWithConfiguration(
            INDEX_NAME,
            Files.readString(Path.of(classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json").toURI())),
            pipelineName
        );
    }

    private void ingestDocument(String documentPath) throws Exception {
        String ingestDocument = Files.readString(Path.of(classLoader.getResource(documentPath).toURI()));
        Response response = makeRequest(
            client(),
            "POST",
            INDEX_NAME + "/_doc?refresh",
            null,
            toHttpEntity(ingestDocument),
            ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, "Kibana"))
        );
        Map<String, Object> map = XContentHelper.convertToMap(
            XContentType.JSON.xContent(),
            EntityUtils.toString(response.getEntity()),
            false
        );
        assertEquals("created", map.get("result"));
    }
}
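
Note: the ingestDocument helper above amounts to a plain index request against the test index with the resource file as the request body. A rough sketch of the equivalent REST call for the short test document (headers and response handling omitted):

POST /document_chunking_index/_doc?refresh
{
  "body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
}

The test then asserts that the response reports "result": "created".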
Existing processor factory unit test (modified)
@@ -199,10 +199,7 @@ public void testCreate_whenFieldMapWithIllegalKey_failure() {
             IllegalArgumentException.class,
             () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
         );
-        assertEquals(
-            "found parameter entry with non-string key",
-            illegalArgumentException.getMessage()
-        );
+        assertEquals("found parameter entry with non-string key", illegalArgumentException.getMessage());
     }
 
     public void testCreate_whenFieldMapWithNoAlgorithm_failure() {
processor/chunker/DocumentChunkingIndexSettings.json (new file)
@@ -0,0 +1,6 @@
{
  "settings": {
    "index.analyze.max_token_count": 100,
    "default_pipeline": "%s"
  }
}
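
The "%s" placeholder is presumably filled with the pipeline name passed to createIndexWithConfiguration, so for the fixed token length tests the index would be created with settings along these lines:

{
  "settings": {
    "index.analyze.max_token_count": 100,
    "default_pipeline": "pipeline-document-chunking-fixed-token-length"
  }
}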
processor/chunker/DocumentChunkingTestDocument.json (new file)
@@ -0,0 +1,3 @@
{
"body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
}
processor/chunker/DocumentChunkingTestLongDocument.json (new file)
@@ -0,0 +1,3 @@
{
"body": "This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch."
}
processor/chunker/PipelineForCascadedChunker.json (new file)
@@ -0,0 +1,29 @@
{
  "description": "An example cascaded pipeline that applies the fixed token length algorithm after the delimiter algorithm",
  "processors": [
    {
      "chunking": {
        "field_map": {
          "body": {
            "delimiter": {
              "delimiter": "."
            },
            "output_field": "body_chunk_intermediate"
          }
        }
      }
    },
    {
      "chunking": {
        "field_map": {
          "body_chunk_intermediate": {
            "fix_length": {
              "token_limit": 10
            },
            "output_field": "body_chunk"
          }
        }
      }
    }
  ]
}
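
Based on the expectations in testDocumentChunkingProcessor_withCascade_successful (and on the delimiter test for the intermediate field, which the cascade test itself does not assert on), a document ingested through this cascaded pipeline should end up with a _source roughly like:

{
  "body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.",
  "body_chunk_intermediate": [
    "This is an example document to be chunked.",
    " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
  ],
  "body_chunk": [
    "This is an example document to be chunked",
    "The document contains a single paragraph two sentences and 24",
    "and 24 tokens by standard tokenizer in OpenSearch"
  ]
}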
processor/chunker/PipelineForDelimiterChunker.json (modified)
@@ -4,11 +4,11 @@
     {
       "chunking": {
         "field_map": {
-          "body_chunk1": {
+          "body": {
             "delimiter": {
-              "delimiter": "\n"
+              "delimiter": "."
             },
-            "output_field": "body_chunk2"
+            "output_field": "body_chunk"
           }
         }
       }
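
Outside the integration test, the delimiter behavior can be spot-checked with the ingest simulate API (a manual sketch, not part of this commit):

POST /_ingest/pipeline/pipeline-document-chunking-delimiter/_simulate
{
  "docs": [
    {
      "_source": {
        "body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
      }
    }
  ]
}

Per testDocumentChunkingProcessor_withDelimiter_successful, the resulting body_chunk should contain the two sentence-level passages, the second keeping its leading space.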
processor/chunker/PipelineForFixedTokenLengthChunker.json (new file)
@@ -0,0 +1,17 @@
{
  "description": "An example fixed token length chunker pipeline",
  "processors": [
    {
      "chunking": {
        "field_map": {
          "body": {
            "fix_length": {
              "token_limit": 10
            },
            "output_field": "body_chunk"
          }
        }
      }
    }
  ]
}
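
With token_limit set to 10, testDocumentChunkingProcessor_withFixedTokenLength_successful expects the short test document to be split into three overlapping passages:

{
  "body_chunk": [
    "This is an example document to be chunked The document",
    "The document contains a single paragraph two sentences and 24",
    "and 24 tokens by standard tokenizer in OpenSearch"
  ]
}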
