Commit 18b7936
update parameter name to chunk_string_count
Signed-off-by: yuye-aws <[email protected]>
yuye-aws committed Apr 30, 2024
1 parent 65f8cf7 commit 18b7936
Showing 6 changed files with 29 additions and 29 deletions.

TextChunkingProcessor.java
@@ -31,7 +31,7 @@
import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
import static org.opensearch.neuralsearch.processor.chunker.Chunker.DEFAULT_MAX_CHUNK_LIMIT;
import static org.opensearch.neuralsearch.processor.chunker.Chunker.DISABLED_MAX_CHUNK_LIMIT;
-import static org.opensearch.neuralsearch.processor.chunker.Chunker.STRING_TOBE_CHUNKED_FIELD;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD;
import static org.opensearch.neuralsearch.processor.chunker.ChunkerParameterParser.parseIntegerParameter;

/**
@@ -172,10 +172,10 @@ public IngestDocument execute(final IngestDocument ingestDocument) {
// fixed token length algorithm needs runtime parameter max_token_count for tokenization
Map<String, Object> runtimeParameters = new HashMap<>();
int maxTokenCount = getMaxTokenCount(sourceAndMetadataMap);
-        int stringTobeChunkedCount = getStringTobeChunkedCountFromMap(sourceAndMetadataMap, fieldMap);
+        int chunkStringCount = getChunkStringCountFromMap(sourceAndMetadataMap, fieldMap);
runtimeParameters.put(FixedTokenLengthChunker.MAX_TOKEN_COUNT_FIELD, maxTokenCount);
runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
-        runtimeParameters.put(STRING_TOBE_CHUNKED_FIELD, stringTobeChunkedCount);
+        runtimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount);
chunkMapType(sourceAndMetadataMap, fieldMap, runtimeParameters);
return ingestDocument;
}
@@ -234,8 +234,8 @@ private void validateListTypeValue(final String sourceKey, final Object sourceVa
}

@SuppressWarnings("unchecked")
-    private int getStringTobeChunkedCountFromMap(Map<String, Object> sourceAndMetadataMap, final Map<String, Object> fieldMap) {
-        int stringTobeChunkedCount = 0;
+    private int getChunkStringCountFromMap(Map<String, Object> sourceAndMetadataMap, final Map<String, Object> fieldMap) {
+        int chunkStringCount = 0;
for (Map.Entry<String, Object> fieldMapEntry : fieldMap.entrySet()) {
String originalKey = fieldMapEntry.getKey();
Object targetKey = fieldMapEntry.getValue();
@@ -246,29 +246,29 @@ private int getStringTobeChunkedCountFromMap(Map<String, Object> sourceAndMetada
List<Object> sourceObjectList = (List<Object>) sourceObject;
for (Object source : sourceObjectList) {
if (source instanceof Map) {
-                        stringTobeChunkedCount += getStringTobeChunkedCountFromMap(
+                        chunkStringCount += getChunkStringCountFromMap(
(Map<String, Object>) source,
(Map<String, Object>) targetKey
);
}
}
} else if (sourceObject instanceof Map) {
-                    stringTobeChunkedCount += getStringTobeChunkedCountFromMap(
+                    chunkStringCount += getChunkStringCountFromMap(
(Map<String, Object>) sourceObject,
(Map<String, Object>) targetKey
);
}
} else {
// chunk the object when target key is of leaf type (null, string and list of string)
Object chunkObject = sourceAndMetadataMap.get(originalKey);
-                stringTobeChunkedCount += getStringTobeChunkedCountFromLeafType(chunkObject);
+                chunkStringCount += getChunkStringCountFromLeafType(chunkObject);
}
}
-        return stringTobeChunkedCount;
+        return chunkStringCount;
}

@SuppressWarnings("unchecked")
-    private int getStringTobeChunkedCountFromLeafType(final Object value) {
+    private int getChunkStringCountFromLeafType(final Object value) {
// leaf type means null, String or List<String>
// the result should be an empty list when the input is null
if (value instanceof String) {
@@ -319,9 +319,9 @@ private List<String> chunkString(final String content, final Map<String, Object>
return List.of();
}
List<String> contentResult = chunker.chunk(content, runTimeParameters);
-        // update string_tobe_chunked_count for each string
-        int stringTobeChunkedCount = parseIntegerParameter(runTimeParameters, STRING_TOBE_CHUNKED_FIELD, 1);
-        runTimeParameters.put(STRING_TOBE_CHUNKED_FIELD, stringTobeChunkedCount - 1);
+        // update chunk_string_count for each string
+        int chunkStringCount = parseIntegerParameter(runTimeParameters, CHUNK_STRING_COUNT_FIELD, 1);
+        runTimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount - 1);
// update runtime max_chunk_limit if not disabled
int runtimeMaxChunkLimit = parseIntegerParameter(runTimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
if (runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT) {
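
Note: most of getChunkStringCountFromLeafType is collapsed in the hunk above. Below is a minimal standalone sketch of the leaf-counting rule implied by the javadoc ("number of non-empty strings"); the class and method names are illustrative, not from the repository.

import java.util.List;

public class ChunkStringCountSketch {
    // Approximates getChunkStringCountFromLeafType: a leaf is null, a String,
    // or a List<String>, and only non-empty strings are counted.
    static int countLeaf(Object value) {
        if (value instanceof String) {
            return ((String) value).isEmpty() ? 0 : 1;
        }
        if (value instanceof List) {
            int count = 0;
            for (Object element : (List<?>) value) {
                if (element instanceof String && !((String) element).isEmpty()) {
                    count++;
                }
            }
            return count;
        }
        return 0; // null contributes nothing
    }

    public static void main(String[] args) {
        // Two non-empty strings across two leaf fields -> chunk_string_count of 2.
        System.out.println(countLeaf("body text") + countLeaf(List.of("title", "")));
    }
}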

Chunker.java
@@ -14,7 +14,7 @@
public interface Chunker {

String MAX_CHUNK_LIMIT_FIELD = "max_chunk_limit";
-    String STRING_TOBE_CHUNKED_FIELD = "string_tobe_chunked_count";
+    String CHUNK_STRING_COUNT_FIELD = "chunk_string_count";
int DEFAULT_MAX_CHUNK_LIMIT = 100;
int DISABLED_MAX_CHUNK_LIMIT = -1;

@@ -41,9 +41,9 @@ public interface Chunker {
*
* @param chunkResultSize the size of chunking result
* @param runtimeMaxChunkLimit runtime max_chunk_limit, used to check with chunkResultSize
-     * @param stringTobeChunkedCount runtime string_tobe_chunked_count, used to check with chunkResultSize
+     * @param chunkStringCount runtime chunk_string_count, used to check with chunkResultSize
*/
-    static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int stringTobeChunkedCount) {
-        return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + stringTobeChunkedCount >= runtimeMaxChunkLimit;
+    static boolean checkRunTimeMaxChunkLimit(int chunkResultSize, int runtimeMaxChunkLimit, int chunkStringCount) {
+        return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT && chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit;
}
}
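
Note: since every string still awaiting chunking will emit at least one chunk, the predicate above stops the current string from splitting further once produced chunks plus pending strings reach the limit. A standalone trace of that arithmetic (the demo class is illustrative, not from the repository):

public class MaxChunkLimitCheckDemo {
    static final int DISABLED_MAX_CHUNK_LIMIT = -1;

    // Same logic as Chunker.checkRunTimeMaxChunkLimit in this commit.
    static boolean check(int chunkResultSize, int runtimeMaxChunkLimit, int chunkStringCount) {
        return runtimeMaxChunkLimit != DISABLED_MAX_CHUNK_LIMIT
            && chunkResultSize + chunkStringCount >= runtimeMaxChunkLimit;
    }

    public static void main(String[] args) {
        // One chunk produced, limit 3, two strings still pending (including
        // the current one): stop splitting so each pending string still fits.
        System.out.println(check(1, 3, 2));                         // true
        // A disabled limit (-1) never triggers the check.
        System.out.println(check(99, DISABLED_MAX_CHUNK_LIMIT, 5)); // false
    }
}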

DelimiterChunker.java
@@ -49,19 +49,19 @@ public void parseParameters(Map<String, Object> parameters) {
* @param content input string
* @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
* 1. max_chunk_limit field level max chunk limit
-     * 2. string_tobe_chunked_count number of non-empty strings (including itself) which need to be chunked later
+     * 2. chunk_string_count number of non-empty strings (including itself) which need to be chunked later
*/
@Override
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, maxChunkLimit);
-        int stringTobeChunkedCount = parseIntegerParameter(runtimeParameters, STRING_TOBE_CHUNKED_FIELD, 1);
+        int chunkStringCount = parseIntegerParameter(runtimeParameters, CHUNK_STRING_COUNT_FIELD, 1);

List<String> chunkResult = new ArrayList<>();
int start = 0, end;
int nextDelimiterPosition = content.indexOf(delimiter);

while (nextDelimiterPosition != -1) {
-            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, stringTobeChunkedCount)) {
+            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, chunkStringCount)) {
break;
}
end = nextDelimiterPosition + delimiter.length();
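
Note: a hedged usage sketch of chunk(), mirroring the constructor and runtime-parameter conventions of the tests later in this commit; it assumes the plugin's classes and constants are on the classpath, and the expected output follows from the splitting loop above (each delimiter stays attached to the preceding passage).

DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, 5));
List<String> passages = chunker.chunk(
    "para one\n\npara two\n\npara three",
    Map.of(MAX_CHUNK_LIMIT_FIELD, 5, CHUNK_STRING_COUNT_FIELD, 1)
);
// The limit is far from being hit, so the content splits at every delimiter:
// ["para one\n\n", "para two\n\n", "para three"]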

FixedTokenLengthChunker.java
@@ -117,13 +117,13 @@ public void parseParameters(Map<String, Object> parameters) {
* @param runtimeParameters a map for runtime parameters, containing the following runtime parameters:
* 1. max_token_count the max token limit for the tokenizer
* 2. max_chunk_limit field level max chunk limit
-     * 3. string_tobe_chunked_count number of non-empty strings (including itself) which need to be chunked later
+     * 3. chunk_string_count number of non-empty strings (including itself) which need to be chunked later
*/
@Override
public List<String> chunk(final String content, final Map<String, Object> runtimeParameters) {
int maxTokenCount = parsePositiveIntegerParameter(runtimeParameters, MAX_TOKEN_COUNT_FIELD, DEFAULT_MAX_TOKEN_COUNT);
int runtimeMaxChunkLimit = parseIntegerParameter(runtimeParameters, MAX_CHUNK_LIMIT_FIELD, this.maxChunkLimit);
-        int stringTobeChunkedCount = parseIntegerParameter(runtimeParameters, STRING_TOBE_CHUNKED_FIELD, 1);
+        int chunkStringCount = parseIntegerParameter(runtimeParameters, CHUNK_STRING_COUNT_FIELD, 1);

List<AnalyzeToken> tokens = tokenize(content, tokenizer, maxTokenCount);
List<String> chunkResult = new ArrayList<>();
@@ -139,7 +139,7 @@ public List<String> chunk(final String content, final Map<String, Object> runtim
} else {
startContentPosition = tokens.get(startTokenIndex).getStartOffset();
}
-            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, stringTobeChunkedCount)) {
+            if (Chunker.checkRunTimeMaxChunkLimit(chunkResult.size(), runtimeMaxChunkLimit, chunkStringCount)) {
// include all characters till the end if exceeds max chunk limit
chunkResult.add(content.substring(startContentPosition));
break;

DelimiterChunkerTests.java
@@ -13,7 +13,7 @@
import org.opensearch.test.OpenSearchTestCase;

import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
-import static org.opensearch.neuralsearch.processor.chunker.Chunker.STRING_TOBE_CHUNKED_FIELD;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD;
import static org.opensearch.neuralsearch.processor.chunker.DelimiterChunker.DELIMITER_FIELD;

public class DelimiterChunkerTests extends OpenSearchTestCase {
@@ -112,10 +112,10 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_withTwoStringsTobeChunked_t
int maxChunkLimit = 3;
DelimiterChunker chunker = new DelimiterChunker(Map.of(DELIMITER_FIELD, "\n\n", MAX_CHUNK_LIMIT_FIELD, maxChunkLimit));
String content = "\n\na\n\n\n";
-        int runtimeMaxChunkLimit = 2, stringTobeChunkedCount = 2;
+        int runtimeMaxChunkLimit = 2, chunkStringCount = 2;
List<String> passages = chunker.chunk(
content,
-            Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit, STRING_TOBE_CHUNKED_FIELD, stringTobeChunkedCount)
+            Map.of(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit, CHUNK_STRING_COUNT_FIELD, chunkStringCount)
);
List<String> expectedPassages = List.of("\n\na\n\n\n");
assertEquals(expectedPassages, passages);
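
Note: the expected single passage follows directly from the renamed predicate: with two strings pending and a runtime limit of 2, the very first check in the loop fires, so nothing is split.

// First limit check of the loop above: 0 + 2 >= 2 -> break before any split,
// so the whole "\n\na\n\n\n" comes back as one passage.
assert Chunker.checkRunTimeMaxChunkLimit(0, 2, 2);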

FixedTokenLengthChunkerTests.java
@@ -24,7 +24,7 @@
import static java.util.Collections.singletonList;
import static java.util.Collections.singletonMap;
import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
-import static org.opensearch.neuralsearch.processor.chunker.Chunker.STRING_TOBE_CHUNKED_FIELD;
+import static org.opensearch.neuralsearch.processor.chunker.Chunker.CHUNK_STRING_COUNT_FIELD;
import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ALGORITHM_NAME;
import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.ANALYSIS_REGISTRY_FIELD;
import static org.opensearch.neuralsearch.processor.chunker.FixedTokenLengthChunker.TOKEN_LIMIT_FIELD;
@@ -292,7 +292,7 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_thenLastPassageGetConcatena
}

public void testChunk_whenExceedRuntimeMaxChunkLimit_withOneStringTobeChunked_thenLastPassageGetConcatenated() {
-        int maxChunkLimit = 3, runtimeMaxChunkLimit = 2, stringTobeChunkedCount = 1;
+        int maxChunkLimit = 3, runtimeMaxChunkLimit = 2, chunkStringCount = 1;
Map<String, Object> parameters = new HashMap<>();
parameters.put(TOKEN_LIMIT_FIELD, 10);
parameters.put(TOKENIZER_FIELD, "standard");
@@ -301,7 +301,7 @@ public void testChunk_whenExceedRuntimeMaxChunkLimit_withOneStringTobeChunked_th
Map<String, Object> runtimeParameters = new HashMap<>();
runtimeParameters.put(MAX_TOKEN_COUNT_FIELD, 10000);
runtimeParameters.put(MAX_CHUNK_LIMIT_FIELD, runtimeMaxChunkLimit);
-        runtimeParameters.put(STRING_TOBE_CHUNKED_FIELD, stringTobeChunkedCount);
+        runtimeParameters.put(CHUNK_STRING_COUNT_FIELD, chunkStringCount);
String content =
"This is an example document to be chunked. The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.";
List<String> passages = fixedTokenLengthChunker.chunk(content, runtimeParameters);
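
Note: a worked trace of the test above (a 24-token content, token limit 10, runtime limit 2, chunk_string_count 1), expressed with the predicate from Chunker in this commit:

assert !Chunker.checkRunTimeMaxChunkLimit(0, 2, 1); // 0 + 1 < 2: first 10 tokens -> passage one
assert Chunker.checkRunTimeMaxChunkLimit(1, 2, 1);  // 1 + 1 >= 2: the remaining tokens are
// concatenated into a second, final passage instead of being split again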
