Commit: rename processor

Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Mar 12, 2024 · 1 parent 586f428 · commit b37e997

Showing 9 changed files with 91 additions and 98 deletions.
@@ -31,11 +31,11 @@
 import org.opensearch.neuralsearch.processor.NormalizationProcessorWorkflow;
 import org.opensearch.neuralsearch.processor.SparseEncodingProcessor;
 import org.opensearch.neuralsearch.processor.TextEmbeddingProcessor;
-import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor;
+import org.opensearch.neuralsearch.processor.TextChunkingProcessor;
 import org.opensearch.neuralsearch.processor.TextImageEmbeddingProcessor;
 import org.opensearch.neuralsearch.processor.combination.ScoreCombinationFactory;
 import org.opensearch.neuralsearch.processor.combination.ScoreCombiner;
-import org.opensearch.neuralsearch.processor.factory.DocumentChunkingProcessorFactory;
+import org.opensearch.neuralsearch.processor.factory.TextChunkingProcessorFactory;
 import org.opensearch.neuralsearch.processor.factory.NormalizationProcessorFactory;
 import org.opensearch.neuralsearch.processor.factory.RerankProcessorFactory;
 import org.opensearch.neuralsearch.processor.factory.SparseEncodingProcessorFactory;
@@ -117,8 +117,8 @@ public Map<String, Processor.Factory> getProcessors(Processor.Parameters paramet
             new SparseEncodingProcessorFactory(clientAccessor, parameters.env),
             TextImageEmbeddingProcessor.TYPE,
             new TextImageEmbeddingProcessorFactory(clientAccessor, parameters.env, parameters.ingestService.getClusterService()),
-            DocumentChunkingProcessor.TYPE,
-            new DocumentChunkingProcessorFactory(
+            TextChunkingProcessor.TYPE,
+            new TextChunkingProcessorFactory(
                 parameters.env,
                 parameters.ingestService.getClusterService(),
                 parameters.indicesService,
@@ -13,7 +13,6 @@
 import java.util.Objects;
 
 import com.google.common.annotations.VisibleForTesting;
-import lombok.extern.log4j.Log4j2;
 import org.apache.commons.lang3.math.NumberUtils;
 
 import org.opensearch.cluster.metadata.IndexMetadata;
@@ -38,10 +37,9 @@
  * algorithm can be used to indicate chunking algorithm and parameters,
  * and field_map can be used to indicate which fields needs chunking and the corresponding keys for the chunking results.
  */
-@Log4j2
-public final class DocumentChunkingProcessor extends AbstractProcessor {
+public final class TextChunkingProcessor extends AbstractProcessor {
 
-    public static final String TYPE = "chunking";
+    public static final String TYPE = "text_chunking";
 
     public static final String FIELD_MAP_FIELD = "field_map";
 
@@ -65,7 +63,7 @@ public final class DocumentChunkingProcessor extends AbstractProcessor {
 
     private final Environment environment;
 
-    public DocumentChunkingProcessor(
+    public TextChunkingProcessor(
         String tag,
         String description,
         Map<String, Object> fieldMap,
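The substantive change in this file is the processor type string: ingest pipelines now register the processor as "text_chunking" instead of "chunking". A minimal pipeline sketch follows, using the type and the field_map/algorithm keys confirmed by this diff; the fixed_token_length algorithm and standard tokenizer are inferred from the integration tests below (the snake_case spelling is assumed), and the token_limit name and value and the "body" source field are illustrative assumptions:

PUT /_ingest/pipeline/pipeline-text-chunking-fixed-token-length-standard-tokenizer
{
  "description": "Hypothetical sketch of a text_chunking ingest pipeline",
  "processors": [
    {
      "text_chunking": {
        "algorithm": {
          "fixed_token_length": {
            "token_limit": 10,
            "tokenizer": "standard"
          }
        },
        "field_map": {
          "body": "body_chunk"
        }
      }
    }
  ]
}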
@@ -11,17 +11,17 @@
 import org.opensearch.index.analysis.AnalysisRegistry;
 import org.opensearch.indices.IndicesService;
 import org.opensearch.ingest.Processor;
-import org.opensearch.neuralsearch.processor.DocumentChunkingProcessor;
-import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.TYPE;
-import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.FIELD_MAP_FIELD;
-import static org.opensearch.neuralsearch.processor.DocumentChunkingProcessor.ALGORITHM_FIELD;
+import org.opensearch.neuralsearch.processor.TextChunkingProcessor;
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
 import static org.opensearch.ingest.ConfigurationUtils.readMap;
 
 /**
  * Factory for chunking ingest processor for ingestion pipeline.
  * Instantiates processor based on user provided input.
  */
-public class DocumentChunkingProcessorFactory implements Processor.Factory {
+public class TextChunkingProcessorFactory implements Processor.Factory {
 
     private final Environment environment;
 
@@ -31,7 +31,7 @@ public class DocumentChunkingProcessorFactory implements Processor.Factory {
 
     private final AnalysisRegistry analysisRegistry;
 
-    public DocumentChunkingProcessorFactory(
+    public TextChunkingProcessorFactory(
         Environment environment,
         ClusterService clusterService,
         IndicesService indicesService,
@@ -44,15 +44,15 @@ public DocumentChunkingProcessorFactory(
     }
 
     @Override
-    public DocumentChunkingProcessor create(
+    public TextChunkingProcessor create(
         Map<String, Processor.Factory> registry,
         String processorTag,
         String description,
         Map<String, Object> config
     ) throws Exception {
         Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
         Map<String, Object> algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
-        return new DocumentChunkingProcessor(
+        return new TextChunkingProcessor(
             processorTag,
             description,
             fieldMap,
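For reference, create() above reads exactly two sub-objects out of the pipeline config, field_map and algorithm, before constructing the processor. A sketch of that config shape, assuming the delimiter algorithm exercised by the integration tests below; the inner "delimiter" parameter name, its "." value, and the "body" source field are illustrative assumptions:

{
  "field_map": {
    "body": "body_chunk"
  },
  "algorithm": {
    "delimiter": {
      "delimiter": "."
    }
  }
}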
@@ -24,29 +24,29 @@
 import org.opensearch.neuralsearch.BaseNeuralSearchIT;
 import static org.opensearch.neuralsearch.TestUtils.DEFAULT_USER_AGENT;
 
-public class DocumentChunkingProcessorIT extends BaseNeuralSearchIT {
-    private static final String INDEX_NAME = "document_chunking_test_index";
+public class TextChunkingProcessorIT extends BaseNeuralSearchIT {
+    private static final String INDEX_NAME = "text_chunking_test_index";
 
     private static final String OUTPUT_FIELD = "body_chunk";
 
     private static final String INTERMEDIATE_FIELD = "body_chunk_intermediate";
 
     private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME =
-        "pipeline-document-chunking-fixed-token-length-standard-tokenizer";
+        "pipeline-text-chunking-fixed-token-length-standard-tokenizer";
 
     private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME =
-        "pipeline-document-chunking-fixed-token-length-letter-tokenizer";
+        "pipeline-text-chunking-fixed-token-length-letter-tokenizer";
 
     private static final String FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME =
-        "pipeline-document-chunking-fixed-token-length-lowercase-tokenizer";
+        "pipeline-text-chunking-fixed-token-length-lowercase-tokenizer";
 
-    private static final String DELIMITER_PIPELINE_NAME = "pipeline-document-chunking-delimiter";
+    private static final String DELIMITER_PIPELINE_NAME = "pipeline-text-chunking-delimiter";
 
-    private static final String CASCADE_PIPELINE_NAME = "pipeline-document-chunking-cascade";
+    private static final String CASCADE_PIPELINE_NAME = "pipeline-text-chunking-cascade";
 
-    private static final String TEST_DOCUMENT = "processor/chunker/DocumentChunkingTestDocument.json";
+    private static final String TEST_DOCUMENT = "processor/chunker/TextChunkingTestDocument.json";
 
-    private static final String TEST_LONG_DOCUMENT = "processor/chunker/DocumentChunkingTestLongDocument.json";
+    private static final String TEST_LONG_DOCUMENT = "processor/chunker/TextChunkingTestLongDocument.json";
 
     private static final Map<String, String> PIPELINE_CONFIGS_BY_NAME = Map.of(
         FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME,
@@ -67,10 +67,10 @@ public void setUp() throws Exception {
         updateClusterSettings();
     }
 
-    public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception {
+    public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_thenSucceed() throws Exception {
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
-            createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
+            createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -83,10 +83,10 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT
         }
     }
 
-    public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception {
+    public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLetterTokenizer_thenSucceed() throws Exception {
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
-            createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
+            createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LETTER_TOKENIZER_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -99,10 +99,10 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLetterTok
         }
     }
 
-    public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception {
+    public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmLowercaseTokenizer_thenSucceed() throws Exception {
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
-            createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
+            createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_LOWERCASE_TOKENIZER_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -115,11 +115,11 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmLowercase
         }
     }
 
-    public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenExceedMaxTokenCount_thenFail()
+    public void testTextChunkingProcessor_withFixedTokenLengthAlgorithmStandardTokenizer_whenExceedMaxTokenCount_thenFail()
         throws Exception {
         try {
             createPipelineProcessor(FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
-            createDocumentChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
+            createTextChunkingIndex(INDEX_NAME, FIXED_TOKEN_LENGTH_PIPELINE_WITH_STANDARD_TOKENIZER_NAME);
             Exception exception = assertThrows(Exception.class, () -> ingestDocument(TEST_LONG_DOCUMENT));
             // max_token_count is 100 by index settings
             assert (exception.getMessage()
@@ -130,10 +130,10 @@ public void testDocumentChunkingProcessor_withFixedTokenLengthAlgorithmStandardT
         }
     }
 
-    public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception {
+    public void testTextChunkingProcessor_withDelimiterAlgorithm_successful() throws Exception {
         try {
             createPipelineProcessor(DELIMITER_PIPELINE_NAME);
-            createDocumentChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME);
+            createTextChunkingIndex(INDEX_NAME, DELIMITER_PIPELINE_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -147,10 +147,10 @@ public void testDocumentChunkingProcessor_withDelimiterAlgorithm_successful() th
         }
     }
 
-    public void testDocumentChunkingProcessor_withCascadePipeline_successful() throws Exception {
+    public void testTextChunkingProcessor_withCascadePipeline_successful() throws Exception {
         try {
             createPipelineProcessor(CASCADE_PIPELINE_NAME);
-            createDocumentChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME);
+            createTextChunkingIndex(INDEX_NAME, CASCADE_PIPELINE_NAME);
             ingestDocument(TEST_DOCUMENT);
 
             List<String> expectedPassages = new ArrayList<>();
@@ -206,8 +206,8 @@ private void createPipelineProcessor(String pipelineName) throws Exception {
         assertEquals("true", node.get("acknowledged").toString());
     }
 
-    private void createDocumentChunkingIndex(String indexName, String pipelineName) throws Exception {
-        URL indexSettingsURLPath = classLoader.getResource("processor/chunker/DocumentChunkingIndexSettings.json");
+    private void createTextChunkingIndex(String indexName, String pipelineName) throws Exception {
+        URL indexSettingsURLPath = classLoader.getResource("processor/chunker/TextChunkingIndexSettings.json");
         assert indexSettingsURLPath != null;
         createIndexWithConfiguration(indexName, Files.readString(Path.of(indexSettingsURLPath.toURI())), pipelineName);
     }
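The helper above reads TextChunkingIndexSettings.json and creates the index with the given pipeline. A plausible shape for that settings file, assuming createIndexWithConfiguration substitutes the pipeline name into a default_pipeline placeholder; the only value the tests pin down is the analyzer token cap of 100 noted in the exceed-max-token-count test:

{
  "settings": {
    "index": {
      "default_pipeline": "%s",
      "analyze": {
        "max_token_count": 100
      }
    }
  }
}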
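The cascade test above chains two chunking steps through INTERMEDIATE_FIELD ("body_chunk_intermediate") before producing OUTPUT_FIELD ("body_chunk"). A hypothetical sketch of such a cascade pipeline, with two text_chunking processors in sequence; the diff confirms only the pipeline and field names, so the choice of delimiter-then-fixed-token-length and all parameter values are assumptions:

PUT /_ingest/pipeline/pipeline-text-chunking-cascade
{
  "processors": [
    {
      "text_chunking": {
        "algorithm": { "delimiter": { "delimiter": "." } },
        "field_map": { "body": "body_chunk_intermediate" }
      }
    },
    {
      "text_chunking": {
        "algorithm": { "fixed_token_length": { "token_limit": 10, "tokenizer": "standard" } },
        "field_map": { "body_chunk_intermediate": "body_chunk" }
      }
    }
  ]
}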