add log message for document chunking processor IT
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Mar 12, 2024
1 parent 1ca60a3 commit 5a9b253
Showing 1 changed file with 19 additions and 9 deletions.
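For context on the diff below (an illustrative aside, not part of the commit): the test relies on Lombok's @Log4j2 annotation to supply the log field used by the new log.error calls. A minimal sketch of the approximate equivalent the annotation generates on the annotated class:

import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT {
    // Roughly what Lombok's @Log4j2 generates for this class:
    private static final Logger log = LogManager.getLogger(DocumentChunkingProcessorIT.class);
}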
@@ -5,6 +5,7 @@
package org.opensearch.neuralsearch.processor;

import com.google.common.collect.ImmutableList;
+import lombok.extern.log4j.Log4j2;
import org.apache.hc.core5.http.HttpHeaders;
import org.apache.hc.core5.http.io.entity.EntityUtils;
import org.apache.hc.core5.http.message.BasicHeader;
@@ -24,6 +25,7 @@
import org.opensearch.neuralsearch.BaseNeuralSearchIT;
import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT;

+@Log4j2
public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT {
private static final String INDEX_NAME = "document_chunking_index";

@@ -70,7 +72,7 @@ public void setUp() throws Exception {
public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception {
try {
createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
-createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
+createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
ingestDocument(TEST_DOCUMENT);

List<String> expectedPassages = new ArrayList<>();
@@ -86,7 +88,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT
public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception {
try {
createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
-createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
+createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
ingestDocument(TEST_DOCUMENT);

List<String> expectedPassages = new ArrayList<>();
@@ -102,24 +104,32 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok
public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception {
try {
createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
-createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
+log.error("Successfully created pipeline: " + FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
+createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
+log.error("Successfully created index: " + INDEX_NAME);
ingestDocument(TEST_DOCUMENT);

List<String> expectedPassages = new ArrayList<>();
expectedPassages.add("This is an example document to be chunked. The document");
expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard");
expectedPassages.add("tokenizer in OpenSearch.");
validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
+} catch (Exception e) {
+log.error("LowercaseTokenizer main body encounters exception: " + e);
} finally {
-wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, null, null);
+try {
+wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, null, null);
+} catch (Exception e) {
+log.error("LowercaseTokenizer wipeOfTestResources encounters exception: " + e);
+}
}
}

public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenExceedMaxTokenCount_thenFail()
throws Exception {
try {
createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
-createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
+createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT));
// max_token_count is 100 by index settings
assert (exception.getMessage()
@@ -133,7 +143,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT
public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception {
try {
createPipelineProcessor(DELIMITER_PIPELINE_NAME);
-createDocumentChunkingIndex(DELIMITER_PIPELINE_NAME);
+createDocumentChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME);
ingestDocument(TEST_DOCUMENT);

List<String> expectedPassages = new ArrayList<>();
@@ -150,7 +160,7 @@ public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() th
public void testDocumentChunkingProcessor_withCascadePipeline_successful() throws Exception {
try {
createPipelineProcessor(CASCADE_PIPELINE_NAME);
-createDocumentChunkingIndex(CASCADE_PIPELINE_NAME);
+createDocumentChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME);
ingestDocument(TEST_DOCUMENT);

List<String> expectedPassages = new ArrayList<>();
@@ -207,10 +217,10 @@ private void createPipelineProcessor(String pipelineName) throws Exception {
assertEquals("true", node.get("acknowledged").toString());
}

-private void createDocumentChunkingIndex(String pipelineName) throws Exception {
+private void createDocumentChunkingIndex(String indexName, String pipelineName) throws Exception {
URL indexSettingsURLPath = classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json");
assert indexSettingsURLPath != null;
-createIndexWithConfiguration(INDEX_NAME, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName);
+createIndexWithConfiguration(indexName, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName);
}

private void ingestDocument(String documentPath) throws Exception {
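Purely as a hypothetical usage sketch (the index name below is illustrative and not part of this commit), the extra indexName parameter on the helper would let a test target a differently named index while reusing the same settings file and pipeline constants:

createPipelineProcessor(DELIMITER_PIPELINE_NAME);
// "another_document_chunking_index" is an assumed name for illustration only
createDocumentChunkingIndex("another_document_chunking_index", DELIMITER_PIPELINE_NAME);
ingestDocument(TEST_DOCUMENT);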
