fix document chunking processor IT
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Mar 12, 2024
1 parent 1ca60a3 commit a30711d
Showing 2 changed files with 9 additions and 10 deletions.
@@ -25,7 +25,7 @@
 import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT;
 
 public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT {
-    private static final String INDEX_NAME = "document_chunking_index";
+    private static final String INDEX_NAME = "document_chunking_test_index";
 
     private static final String OUTPUT_FIELD = "body_chunk";
 
@@ -70,7 +70,7 @@ public void setUp() throws Exception {
     public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception {
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
-            createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
+            createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
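The pipeline resource behind FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME is not part of this diff. For orientation only, a fixed-token-length chunking pipeline in the shape OpenSearch later documents for its text_chunking processor would look roughly like the sketch below; the processor type, the source field "body", and the token_limit value are assumptions, and only the body_chunk target field comes from this test class.

    {
      "description": "Illustrative fixed-token-length chunking pipeline (not the file used by this test)",
      "processors": [
        {
          "text_chunking": {
            "algorithm": {
              "fixed_token_length": {
                "token_limit": 10,
                "tokenizer": "standard"
              }
            },
            "field_map": {
              "body": "body_chunk"
            }
          }
        }
      ]
    }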
@@ -86,7 +86,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT
     public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception {
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
-            createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
+            createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -102,7 +102,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok
     public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception {
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
-            createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
+            createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -119,7 +119,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT
         throws Exception {
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
-            createDocumentChunkingIndex(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
+            createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
             Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT));
             // max_token_count is 100 by index settings
             assert (exception.getMessage()
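DocumentChunkingIndexSettings.json, loaded by the createDocumentChunkingIndex helper changed further down, is not shown in this diff. A minimal settings sketch that would cap the analyzer at 100 tokens and make ingesting TEST_LONG_DOCUMENT fail, assuming the comment refers to the standard index.analyze.max_token_count setting:

    {
      "settings": {
        "index": {
          "analyze": {
            "max_token_count": 100
          }
        }
      }
    }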
@@ -133,7 +133,7 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT
     public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception {
         try {
             createPipelineProcessor(DELIMITER_PIPELINE_NAME);
-            createDocumentChunkingIndex(DELIMITER_PIPELINE_NAME);
+            createDocumentChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -150,7 +150,7 @@ public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() th
     public void testDocumentChunkingProcessor_withCascadePipeline_successful() throws Exception {
         try {
             createPipelineProcessor(CASCADE_PIPELINE_NAME);
-            createDocumentChunkingIndex(CASCADE_PIPELINE_NAME);
+            createDocumentChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -165,7 +165,6 @@ public void testDocumentChunkingProcessor_withCascadePipeline_successful() throw
                 " The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch."
             );
             validateIndexIngestResults(INDEX_NAME, INTERMEDIATE_FIELD, expectedPassages);
-
         } finally {
             wipeOfTestResources(INDEX_NAME, CASCADE_PIPELINE_NAME, null, null);
         }
@@ -207,10 +206,10 @@ private void createPipelineProcessor(String pipelineName) throws Exception {
         assertEquals("true", node.get("acknowledged").toString());
     }
 
-    private void createDocumentChunkingIndex(String pipelineName) throws Exception {
+    private void createDocumentChunkingIndex(String indexName, String pipelineName) throws Exception {
         URL indexSettingsURLPath = classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json");
         assert indexSettingsURLPath != null;
-        createIndexWithConfiguration(INDEX_NAME, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName);
+        createIndexWithConfiguration(indexName, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName);
     }
 
     private void ingestDocument(String documentPath) throws Exception {