fix document chunking processor IT
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Mar 12, 2024
1 parent a4d90a4 commit 499f2d2
Showing 2 changed files with 2 additions and 15 deletions.
DocumentChunkingProcessorIT.java
@@ -5,7 +5,6 @@
 package org.opensearch.neuralsearch.processor;
 
 import com.google.common.collect.ImmutableList;
-import lombok.extern.log4j.Log4j2;
 import org.apache.hc.core5.http.HttpHeaders;
 import org.apache.hc.core5.http.io.entity.EntityUtils;
 import org.apache.hc.core5.http.message.BasicHeader;
@@ -25,9 +24,8 @@
 import org.opensearch.neuralsearch.BaseNeuralSearchIT;
 import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT;
 
-@Log4j2
 public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT {
-    private static final String INDEX_NAME = "document_chunking_index";
+    private static final String INDEX_NAME = "document_chunking_test_index";
 
     private static final String OUTPUT_FIELD = "body_chunk";

@@ -103,26 +101,17 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok
 
     public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception {
         try {
-            log.error("Creating pipeline: " + FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
-            log.error("Successfully created pipeline: " + FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
             createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
-            log.error("Successfully created index: " + INDEX_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
             expectedPassages.add("This is an example document to be chunked. The document");
             expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard");
             expectedPassages.add("tokenizer in OpenSearch.");
             validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
-        } catch (Exception e) {
-            log.error("LowercaseTokenizer main body encounters exception: " + e);
         } finally {
-            try {
-                wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, null, null);
-            } catch (Exception e) {
-                log.error("LowercaseTokenizer wipeOfTestResources encounters exception: " + e);
-            }
+            wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, null, null);
         }
     }

@@ -176,7 +165,6 @@ public void testDocumentChunkingProcessor_withCascadePipeline_successful() throw
                 " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
             );
             validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, expectedPassages);
-
         } finally {
             wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null);
         }
@@ -199,7 +187,6 @@ private void validateIndexIngestResults(String indexName, String fieldName, Obje
     }
 
     private void createPipelineProcessor(String pipelineName) throws Exception {
-        log.error("createPipelineProcessor with file: " + PIPELINE_CONFIGS_BY_NAME.get(pipelineName));
         URL pipelineURLPath = classLoader.getResource(PIPELINE_CONFIGS_BY_NAME.get(pipelineName));
         assert pipelineURLPath != null;
         String requestBody = Files.readString(Path.of(pipelineURLPath.toURI()));
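Note: reassembled from the hunks above, the cleaned-up lowercase-tokenizer test reads roughly as follows (a sketch, not the verbatim file; createPipelineProcessor, createDocumentChunkingIndex, ingestDocument, validateIndexIngestResults, and wipeOfTestResources are helpers defined in this class or inherited from BaseNeuralSearchIT). With every log.error call gone, the lombok @Log4j2 annotation and its import are unused, which is why they are removed as well.

public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception {
    try {
        createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
        createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
        ingestDocument(TEST_DOCUMENT);

        List<String> expectedPassages = new ArrayList<>();
        expectedPassages.add("This is an example document to be chunked. The document");
        expectedPassages.add("contains a single paragraph, two sentences and 24 tokens by standard");
        expectedPassages.add("tokenizer in OpenSearch.");
        validateIndexIngestResults(INDEX_NAME, OUTPUT_FIELD, expectedPassages);
    } finally {
        // Cleanup is no longer wrapped in catch-and-log: an exception thrown in the
        // test body or during cleanup now propagates and fails the test instead of
        // being swallowed and merely logged.
        wipeOfTestResources(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME, null, null);
    }
}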
