integration tests for document chunking processor
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Feb 29, 2024
1 parent 428ec53 commit 69efb3b
Showing 9 changed files with 241 additions and 30 deletions.
.idea/runConfigurations/Run_Neural_Search.xml (deleted, 0 additions, 23 deletions)

This file was deleted.

DocumentChunkingProcessorIT.java (new file)
@@ -0,0 +1,179 @@
/*
 * Copyright OpenSearch Contributors
 * SPDX-License-Identifier: Apache-2.0
 */
package org.opensearch.neuralsearch.processor;

import com.google.common.collect.ImmutableList;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.message.BasicHeader;
import org.junit.Before;
import org.opensearch.client.Response;
import org.opensearch.common.xcontent.XContentHelper;
import org.opensearch.common.xcontent.XContentType;
import org.opensearch.index.query.MatchAllQueryBuilder;
import org.opensearch.neuralsearch.BaseNeuralSearchIT;

import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;

import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT;

public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT {
    private static final String INDEX_NAME = "document_chunking_index";

    private static final String OUTPUT_FIELD = "body_chunk";

    private static final String FIXED_TOKEN_LENGTH_PIPELINE_NAME = "pipeline-document-chunking-fixed-token-length";

    private static final String DELIMITER_PIPELINE_NAME = "pipeline-document-chunking-delimiter";

    private static final String CASCADE_PIPELINE_NAME = "pipeline-document-chunking-cascade";

    private static final String TEST_DOCUMENT = "processor/chunker/DocumentChunkingTestDocument.json";

    private static final String TEST_LONG_DOCUMENT = "processor/chunker/DocumentChunkingTestLongDocument.json";

    private static final Map<String, String> PIPELINE_CONFIGS_BY_NAME = Map.of(
        FIXED_TOKEN_LENGTH_PIPELINE_NAME,
        "processor/chunker/PipelineForFixedTokenLengthChunker.json",
        DELIMITER_PIPELINE_NAME,
        "processor/chunker/PipelineForDelimiterChunker.json",
        CASCADE_PIPELINE_NAME,
        "processor/chunker/PipelineForCascadedChunker.json"
    );

    @Before
    public void setUp() throws Exception {
        super.setUp();
        updateClusterSettings();
    }

    public void testDocumentChunkingProcessor_withFixedTokenLength_successful() throws Exception {
        try {
            createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_NAME);
            createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_NAME);
            ingestDocument(TEST_DOCUMENT);

            List<String> expectedPassages = new ArrayList<>();
            expectedPassages.add("This is an example document to be chunked The document");
            expectedPassages.add("The document contains a single paragraph two sentences and 24");
            expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
            validateIndexIngestResults(INDEX_NAME, expectedPassages);
        } finally {
            wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null);
        }
    }

    public void testDocumentChunkingProcessor_withFixedTokenLength_fail() throws Exception {
        try {
            createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_NAME);
            createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_NAME);
            Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT));
            // index.analyze.max_token_count is set to 100 in the index settings, so analyzing the long document fails
            assert (exception.getMessage()
                .contains("The number of tokens produced by calling _analyze has exceeded the allowed maximum of [100]."));
            assertEquals(0, getDocCount(INDEX_NAME));
        } finally {
            wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_NAME, null, null);
        }
    }

    public void testDocumentChunkingProcessor_withDelimiter_successful() throws Exception {
        try {
            createPipelineProcessor(DELIMITER_PIPELINE_NAME);
            createDocumentChunkingIndex(DELIMITER_PIPELINE_NAME);
            ingestDocument(TEST_DOCUMENT);

            List<String> expectedPassages = new ArrayList<>();
            expectedPassages.add("This is an example document to be chunked.");
            expectedPassages.add(
                " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
            );
            validateIndexIngestResults(INDEX_NAME, expectedPassages);
        } finally {
            wipeOfTestResources(INDEX_NAME, DELIMITER_PIPELINE_NAME, null, null);
        }
    }

    public void testDocumentChunkingProcessor_withCascade_successful() throws Exception {
        try {
            createPipelineProcessor(CASCADE_PIPELINE_NAME);
            createDocumentChunkingIndex(CASCADE_PIPELINE_NAME);
            ingestDocument(TEST_DOCUMENT);

            List<String> expectedPassages = new ArrayList<>();
            // whitespace, "." and "," are dropped by the standard tokenizer, so they do not appear in the fixed token length output
            expectedPassages.add("This is an example document to be chunked");
            expectedPassages.add("The document contains a single paragraph two sentences and 24");
            expectedPassages.add("and 24 tokens by standard tokenizer in OpenSearch");
            validateIndexIngestResults(INDEX_NAME, expectedPassages);
        } finally {
            wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null);
        }
    }

    private void validateIndexIngestResults(String indexName, Object expected) {
        assertEquals(1, getDocCount(indexName));
        MatchAllQueryBuilder query = new MatchAllQueryBuilder();
        Map<String, Object> searchResults = search(indexName, query, 10);
        assertNotNull(searchResults);
        Map<String, Object> document = getFirstInnerHit(searchResults);
        assertNotNull(document);
        Object documentSource = document.get("_source");
        assert (documentSource instanceof Map);
        @SuppressWarnings("unchecked")
        Map<String, Object> documentSourceMap = (Map<String, Object>) documentSource;
        assert (documentSourceMap).containsKey(OUTPUT_FIELD);
        Object ingestOutputs = documentSourceMap.get(OUTPUT_FIELD);
        assertEquals(expected, ingestOutputs);
    }

    private void createPipelineProcessor(final String pipelineName) throws Exception {
        String requestBody = Files.readString(Path.of(classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName)).toURI()));
        Response pipelineCreateResponse = makeRequest(
            client(),
            "PUT",
            "/_ingest/pipeline/" + pipelineName,
            null,
            toHttpEntity(String.format(LOCALE, requestBody)),
            ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, DEFAULT_USER_AGENT))
        );
        Map<String, Object> node = XContentHelper.convertToMap(
            XContentType.JSON.xContent(),
            EntityUtils.toString(pipelineCreateResponse.getEntity()),
            false
        );
        assertEquals("true", node.get("acknowledged").toString());
    }

    private void createDocumentChunkingIndex(String pipelineName) throws Exception {
        createIndexWithConfiguration(
            INDEX_NAME,
            Files.readString(Path.of(classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json").toURI())),
            pipelineName
        );
    }

    private void ingestDocument(String documentPath) throws Exception {
        String ingestDocument = Files.readString(Path.of(classLoader.getResource(documentPath).toURI()));
        Response response = makeRequest(
            client(),
            "POST",
            INDEX_NAME + "/_doc?refresh",
            null,
            toHttpEntity(ingestDocument),
            ImmutableList.of(new BasicHeader(HttpHeaders.USER_AGENT, "Kibana"))
        );
        Map<String, Object> map = XContentHelper.convertToMap(
            XContentType.JSON.xContent(),
            EntityUtils.toString(response.getEntity()),
            false
        );
        assertEquals("created", map.get("result"));
    }
}
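
Note: the ingestDocument helper above amounts to a plain index request against the test index with the resource file as the request body. A rough sketch of the equivalent REST call for the short test document (headers and response handling omitted):

POST /document_chunking_index/_doc?refresh
{
  "body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
}

The test then asserts that the response reports "result": "created".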
Existing processor factory unit test (modified)
@@ -199,10 +199,7 @@ public void testCreate_whenFieldMapWithIllegalKey_failure() {
             IllegalArgumentException.class,
             () -> factory.create(registry, PROCESSOR_TAG, DESCRIPTION, config)
         );
-        assertEquals(
-            "found parameter entry with non-string key",
-            illegalArgumentException.getMessage()
-        );
+        assertEquals("found parameter entry with non-string key", illegalArgumentException.getMessage());
     }
 
     public void testCreate_whenFieldMapWithNoAlgorithm_failure() {
processor/chunker/DocumentChunkingIndexSettings.json (new file)
@@ -0,0 +1,6 @@
{
  "settings": {
    "index.analyze.max_token_count": 100,
    "default_pipeline": "%s"
  }
}
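
The "%s" placeholder is presumably filled with the pipeline name passed to createIndexWithConfiguration, so for the fixed token length tests the index would be created with settings along these lines:

{
  "settings": {
    "index.analyze.max_token_count": 100,
    "default_pipeline": "pipeline-document-chunking-fixed-token-length"
  }
}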
processor/chunker/DocumentChunkingTestDocument.json (new file)
@@ -0,0 +1,3 @@
{
"body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
}
processor/chunker/DocumentChunkingTestLongDocument.json (new file)
@@ -0,0 +1,3 @@
{
"body": "This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch. This is an example long document to be chunked. The document has more than 100 tokens by standard tokenizer in OpenSearch."
}
processor/chunker/PipelineForCascadedChunker.json (new file)
@@ -0,0 +1,29 @@
{
  "description": "An example cascaded pipeline that applies the fixed token length algorithm after the delimiter algorithm",
  "processors": [
    {
      "chunking": {
        "field_map": {
          "body": {
            "delimiter": {
              "delimiter": "."
            },
            "output_field": "body_chunk_intermediate"
          }
        }
      }
    },
    {
      "chunking": {
        "field_map": {
          "body_chunk_intermediate": {
            "fix_length": {
              "token_limit": 10
            },
            "output_field": "body_chunk"
          }
        }
      }
    }
  ]
}
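
Based on the expectations in testDocumentChunkingProcessor_withCascade_successful (and on the delimiter test for the intermediate field, which the cascade test itself does not assert on), a document ingested through this cascaded pipeline should end up with a _source roughly like:

{
  "body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.",
  "body_chunk_intermediate": [
    "This is an example document to be chunked.",
    " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
  ],
  "body_chunk": [
    "This is an example document to be chunked",
    "The document contains a single paragraph two sentences and 24",
    "and 24 tokens by standard tokenizer in OpenSearch"
  ]
}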
processor/chunker/PipelineForDelimiterChunker.json (modified)
@@ -4,11 +4,11 @@
     {
       "chunking": {
         "field_map": {
-          "body_chunk1": {
+          "body": {
             "delimiter": {
-              "delimiter": "\n"
+              "delimiter": "."
             },
-            "output_field": "body_chunk2"
+            "output_field": "body_chunk"
           }
         }
       }
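
Outside the integration test, the delimiter behavior can be spot-checked with the ingest simulate API (a manual sketch, not part of this commit):

POST /_ingest/pipeline/pipeline-document-chunking-delimiter/_simulate
{
  "docs": [
    {
      "_source": {
        "body": "This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
      }
    }
  ]
}

Per testDocumentChunkingProcessor_withDelimiter_successful, the resulting body_chunk should contain the two sentence-level passages, the second keeping its leading space.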
processor/chunker/PipelineForFixedTokenLengthChunker.json (new file)
@@ -0,0 +1,17 @@
{
  "description": "An example fixed token length chunker pipeline",
  "processors": [
    {
      "chunking": {
        "field_map": {
          "body": {
            "fix_length": {
              "token_limit": 10
            },
            "output_field": "body_chunk"
          }
        }
      }
    }
  ]
}
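
With token_limit set to 10, testDocumentChunkingProcessor_withFixedTokenLength_successful expects the short test document to be split into three overlapping passages:

{
  "body_chunk": [
    "This is an example document to be chunked The document",
    "The document contains a single paragraph two sentences and 24",
    "and 24 tokens by standard tokenizer in OpenSearch"
  ]
}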
