From cd2755a45cf9fc7b6f4aff434f86464ac248b3eb Mon Sep 17 00:00:00 2001 From: Tommaso Bolis Date: Thu, 7 Nov 2024 17:01:46 +0100 Subject: [PATCH 1/2] Use 0 vector to query all chunks --- pom.xml | 2 +- .../internal/operation/EmbeddingOperations.java | 16 +++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/pom.xml b/pom.xml index b2aa35a..afe8424 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 com.mule.mulechain mulechain-vectors - 0.1.78-SNAPSHOT + 0.1.79-SNAPSHOT mule-extension MAC Vectors diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java index 2c4ebf2..4a2327d 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java @@ -5,6 +5,7 @@ import static org.mule.runtime.extension.api.annotation.param.MediaType.APPLICATION_JSON; import java.io.InputStream; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.nio.charset.StandardCharsets; @@ -367,18 +368,22 @@ public InputStream listSourcesFromStore(String storeName, EmbeddingModel embeddingModel = EmbeddingModelFactory.createModel(configuration, modelParams); EmbeddingStore store = EmbeddingStoreFactory.createStore(configuration, storeName, embeddingModel.dimension()); - Embedding queryEmbedding = embeddingModel.embed(".").content(); + // Create a general query vector (e.g., zero vector). Zero vector is often used when you need to retrieve all + // embeddings without any specific bias. + float[] queryVector = new float[embeddingModel.dimension()]; + for (int i = 0; i < embeddingModel.dimension(); i++) { + queryVector[i]=0.0f; // Zero vector + } + + Embedding queryEmbedding = new Embedding(queryVector); EmbeddingSearchRequest searchRequest = EmbeddingSearchRequest.builder() .queryEmbedding(queryEmbedding) - .maxResults(16384) + .maxResults(10000) .minScore(0.0) .build(); EmbeddingSearchResult searchResult = store.search(searchRequest); List> embeddingMatches = searchResult.matches(); - String information = embeddingMatches.stream() - .map(match -> match.embedded().text()) - .collect(joining("\n\n")); JSONObject jsonObject = new JSONObject(); jsonObject.put("storeName", storeName); @@ -421,6 +426,7 @@ public InputStream listSourcesFromStore(String storeName, } jsonObject.put("sources", JsonUtils.jsonObjectCollectionToJsonArray(sourcesJSONObjectHashMap.values())); + jsonObject.put("sourceCount", sourcesJSONObjectHashMap.size()); return toInputStream(jsonObject.toString(), StandardCharsets.UTF_8); } From 132afbc0825890f01c7b1a855791e94b89cb6380 Mon Sep 17 00:00:00 2001 From: Tommaso Bolis Date: Thu, 7 Nov 2024 19:07:39 +0100 Subject: [PATCH 2/2] Introduce pagination for list sources. --- pom.xml | 2 +- .../helper/parameter/QueryParameters.java | 38 ++++++ .../operation/EmbeddingOperations.java | 128 +++++++++++------- 3 files changed, 116 insertions(+), 52 deletions(-) create mode 100644 src/main/java/org/mule/extension/mulechain/vectors/internal/helper/parameter/QueryParameters.java diff --git a/pom.xml b/pom.xml index afe8424..a209835 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 com.mule.mulechain mulechain-vectors - 0.1.79-SNAPSHOT + 0.1.83-SNAPSHOT mule-extension MAC Vectors diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/helper/parameter/QueryParameters.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/helper/parameter/QueryParameters.java new file mode 100644 index 0000000..12f6371 --- /dev/null +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/helper/parameter/QueryParameters.java @@ -0,0 +1,38 @@ +package org.mule.extension.mulechain.vectors.internal.helper.parameter; + +import org.mule.runtime.api.meta.ExpressionSupport; +import org.mule.runtime.extension.api.annotation.Expression; +import org.mule.runtime.extension.api.annotation.param.Optional; +import org.mule.runtime.extension.api.annotation.param.Parameter; +import org.mule.runtime.extension.api.annotation.param.display.Summary; + +public class QueryParameters { + + @Parameter + @Expression(ExpressionSupport.SUPPORTED) + @Summary("The embedding page size used when querying the vector store. Defaults to 5000 embeddings.") + @Optional(defaultValue = "5000") + private Number embeddingPageSize; + +// @Parameter +// @Expression(ExpressionSupport.SUPPORTED) +// @Summary("The offset used when querying the vector store") +// @Optional(defaultValue = "0") +// private Number offset; + +// @Parameter +// @Expression(ExpressionSupport.SUPPORTED) +// @Summary("The limit applied used when querying the vector store") +// @Optional +// private Number limit; + + public int embeddingPageSize() {return embeddingPageSize != null ? embeddingPageSize.intValue() : 5000;} + +// public int offset() { +// return offset.intValue(); +// } + +// public int limit() { +// return limit.intValue(); +// } +} diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java index 4a2327d..74461da 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java @@ -1,11 +1,11 @@ package org.mule.extension.mulechain.vectors.internal.operation; +import static dev.langchain4j.store.embedding.filter.MetadataFilterBuilder.metadataKey; import static org.apache.commons.io.IOUtils.toInputStream; import static org.mule.extension.mulechain.vectors.internal.util.JsonUtils.readConfigFile; import static org.mule.runtime.extension.api.annotation.param.MediaType.APPLICATION_JSON; import java.io.InputStream; -import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.nio.charset.StandardCharsets; @@ -15,18 +15,16 @@ import org.mule.extension.mulechain.vectors.internal.helper.EmbeddingStoreIngestorHelper; import org.mule.extension.mulechain.vectors.internal.helper.factory.EmbeddingModelFactory; import org.mule.extension.mulechain.vectors.internal.helper.factory.EmbeddingStoreFactory; -import org.mule.extension.mulechain.vectors.internal.helper.parameter.FileTypeParameters; +import org.mule.extension.mulechain.vectors.internal.helper.parameter.*; import org.mule.extension.mulechain.vectors.internal.config.Configuration; -import org.mule.extension.mulechain.vectors.internal.helper.parameter.MetadataFilterParameters; -import org.mule.extension.mulechain.vectors.internal.helper.parameter.EmbeddingModelNameParameters; import dev.langchain4j.store.embedding.*; import dev.langchain4j.store.embedding.filter.Filter; import org.json.JSONArray; import org.json.JSONObject; import org.mule.extension.mulechain.vectors.internal.util.JsonUtils; import org.mule.runtime.extension.api.annotation.Alias; -import org.mule.runtime.extension.api.annotation.param.MediaType; -import org.mule.runtime.extension.api.annotation.param.ParameterGroup; +import org.mule.runtime.extension.api.annotation.param.*; + import static java.util.stream.Collectors.joining; import dev.langchain4j.data.document.Metadata; import dev.langchain4j.data.document.splitter.DocumentSplitters; @@ -34,9 +32,6 @@ import dev.langchain4j.data.segment.TextSegment; import dev.langchain4j.model.embedding.EmbeddingModel; -import org.mule.runtime.extension.api.annotation.param.Config; -import org.mule.extension.mulechain.vectors.internal.helper.parameter.StorageTypeParameters; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -360,7 +355,9 @@ public InputStream queryByFilterFromEmbedding(String storeName, String question, @Alias("EMBEDDING-list-sources") public InputStream listSourcesFromStore(String storeName, @Config Configuration configuration, - @ParameterGroup(name = "Additional Properties") EmbeddingModelNameParameters modelParams) { + @ParameterGroup(name = "Querying Strategy") QueryParameters queryParams, + @ParameterGroup(name = "Additional Properties") EmbeddingModelNameParameters modelParams + ) { EmbeddingOperationValidator.validateOperationType( Constants.EMBEDDING_OPERATION_TYPE_FILTER_BY_METADATA,configuration.getVectorStore()); @@ -376,54 +373,83 @@ public InputStream listSourcesFromStore(String storeName, } Embedding queryEmbedding = new Embedding(queryVector); - EmbeddingSearchRequest searchRequest = EmbeddingSearchRequest.builder() - .queryEmbedding(queryEmbedding) - .maxResults(10000) - .minScore(0.0) - .build(); - - EmbeddingSearchResult searchResult = store.search(searchRequest); - List> embeddingMatches = searchResult.matches(); JSONObject jsonObject = new JSONObject(); jsonObject.put("storeName", storeName); JSONArray sources = new JSONArray(); - String absoluteDirectoryPath; - String fileName; - String url; - String ingestionDatetime; - - JSONObject contentObject; - String fullPath; - - HashMap sourcesJSONObjectHashMap = new HashMap(); - for (EmbeddingMatch match : embeddingMatches) { - - Metadata matchMetadata = match.embedded().metadata(); - fileName = matchMetadata.getString(Constants.METADATA_KEY_FILE_NAME); - url = matchMetadata.getString(Constants.METADATA_KEY_URL); - fullPath = matchMetadata.getString(Constants.METADATA_KEY_FULL_PATH); - absoluteDirectoryPath = matchMetadata.getString(Constants.METADATA_KEY_ABSOLUTE_DIRECTORY_PATH); - ingestionDatetime = matchMetadata.getString(Constants.METADATA_KEY_INGESTION_DATETIME); - contentObject = new JSONObject(); - contentObject.put(Constants.METADATA_KEY_ABSOLUTE_DIRECTORY_PATH, absoluteDirectoryPath); - contentObject.put(Constants.METADATA_KEY_FULL_PATH, fullPath); - contentObject.put(Constants.METADATA_KEY_FILE_NAME, fileName); - contentObject.put(Constants.METADATA_KEY_URL, url); - contentObject.put(Constants.METADATA_KEY_INGESTION_DATETIME, ingestionDatetime); - - String key = - ((fullPath != null &&!fullPath.isEmpty()) ? fullPath : - (url != null && !url.isEmpty()) ? url : "") + - ((ingestionDatetime != null && !ingestionDatetime.isEmpty()) ? ingestionDatetime : ""); - - // Add contentObject to sources only if it has at least one key-value pair - if (!contentObject.isEmpty() && !key.isEmpty()) { + List> embeddingMatches = null; + HashMap sourcesJSONObjectHashMap = new HashMap(); + String lowerBoundaryIngestionDateTime = "0000-00-00T00:00:00.000Z"; + int lowerBoundaryIndex = -1; + + LOGGER.debug("Embedding page size: " + queryParams.embeddingPageSize()); + String previousPageEmbeddingId = ""; + do { + + EmbeddingSearchRequest searchRequest = EmbeddingSearchRequest.builder() + .queryEmbedding(queryEmbedding) + .maxResults(queryParams.embeddingPageSize()) + .minScore(0.0) + .filter( + metadataKey(Constants.METADATA_KEY_INGESTION_DATETIME).isGreaterThan(lowerBoundaryIngestionDateTime).or( + metadataKey(Constants.METADATA_KEY_INGESTION_DATETIME).isGreaterThanOrEqualTo(lowerBoundaryIngestionDateTime).and( + metadataKey("index").isGreaterThan(lowerBoundaryIndex)))) + .build(); + + EmbeddingSearchResult searchResult = store.search(searchRequest); + embeddingMatches = searchResult.matches(); + + String currentPageEmbeddingId = ""; + for (EmbeddingMatch match : embeddingMatches) { + + Metadata matchMetadata = match.embedded().metadata(); + String index = matchMetadata.getString("index"); + String fileName = matchMetadata.getString(Constants.METADATA_KEY_FILE_NAME); + String url = matchMetadata.getString(Constants.METADATA_KEY_URL); + String fullPath = matchMetadata.getString(Constants.METADATA_KEY_FULL_PATH); + String absoluteDirectoryPath = matchMetadata.getString(Constants.METADATA_KEY_ABSOLUTE_DIRECTORY_PATH); + String ingestionDatetime = matchMetadata.getString(Constants.METADATA_KEY_INGESTION_DATETIME); + + if(lowerBoundaryIngestionDateTime.compareTo(ingestionDatetime) < 0) { + + lowerBoundaryIngestionDateTime = ingestionDatetime; + lowerBoundaryIndex = -1; + } else if(lowerBoundaryIngestionDateTime.compareTo(ingestionDatetime) == 0) { + + if(Integer.parseInt(index) > lowerBoundaryIndex) { + lowerBoundaryIndex = Integer.parseInt(index); + } + } + + JSONObject contentObject = new JSONObject(); + contentObject.put(Constants.METADATA_KEY_ABSOLUTE_DIRECTORY_PATH, absoluteDirectoryPath); + contentObject.put(Constants.METADATA_KEY_FULL_PATH, fullPath); + contentObject.put(Constants.METADATA_KEY_FILE_NAME, fileName); + contentObject.put(Constants.METADATA_KEY_URL, url); + contentObject.put(Constants.METADATA_KEY_INGESTION_DATETIME, ingestionDatetime); + + String key = + ((fullPath != null && !fullPath.isEmpty()) ? fullPath : + (url != null && !url.isEmpty()) ? url : "") + + ((ingestionDatetime != null && !ingestionDatetime.isEmpty()) ? ingestionDatetime : ""); + + // Add contentObject to sources only if it has at least one key-value pair + if (!contentObject.isEmpty() && !key.isEmpty()) { + + sourcesJSONObjectHashMap.put(key, contentObject); + } + currentPageEmbeddingId = match.embeddingId(); + } - sourcesJSONObjectHashMap.put(key, contentObject); + LOGGER.debug("previousPageEmbeddingId: " + previousPageEmbeddingId + ", currentPageEmbeddingId: " + currentPageEmbeddingId); + if(previousPageEmbeddingId.compareTo(currentPageEmbeddingId) == 0) { + break; + } else { + previousPageEmbeddingId = currentPageEmbeddingId; } - } + + } while(embeddingMatches.size() == queryParams.embeddingPageSize()); jsonObject.put("sources", JsonUtils.jsonObjectCollectionToJsonArray(sourcesJSONObjectHashMap.values())); jsonObject.put("sourceCount", sourcesJSONObjectHashMap.size());