From bdcf2e4901f9c0e7055423234bb8d24fa93831e8 Mon Sep 17 00:00:00 2001 From: Tommaso Bolis Date: Mon, 18 Nov 2024 14:19:31 +0100 Subject: [PATCH 1/6] Generic Refactoring. --- pom.xml | 2 +- .../vectors/internal/constant/Constants.java | 20 +++- .../internal/error/MuleVectorsErrorType.java | 2 +- .../provider/EmbeddingErrorTypeProvider.java | 5 +- .../vectors/internal/model/BaseModel.java | 8 +- .../model/azureopenai/AzureOpenAIModel.java | 3 +- .../operation/DocumentOperations.java | 26 ++--- .../operation/EmbeddingOperations.java | 110 ++++++++++++------ .../vectors/internal/store/BaseStore.java | 11 +- .../store/aisearch/AISearchStore.java | 10 +- .../internal/store/chroma/ChromaStore.java | 6 +- .../internal/store/milvus/MilvusStore.java | 6 +- .../store/pgvector/PGVectorStore.java | 6 +- .../vectors/internal/util/JsonUtils.java | 6 +- 14 files changed, 135 insertions(+), 86 deletions(-) diff --git a/pom.xml b/pom.xml index 1438097..e228c9c 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 com.mulesoft.connectors mule4-vectors-connector - 0.1.117-SNAPSHOT + 0.1.119-SNAPSHOT mule-extension MuleSoft Vectors Connector - Mule 4 MuleSoft Vectors Connector provides access to a broad number of external Vector Stores. diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/constant/Constants.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/constant/Constants.java index c57ed2b..1e21330 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/constant/Constants.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/constant/Constants.java @@ -17,10 +17,10 @@ private Constants() {} public static final String STORAGE_TYPE_AZURE_BLOB = "AZURE_BLOB"; public static final String EMBEDDING_MODEL_SERVICE_OPENAI = "OPENAI"; + public static final String EMBEDDING_MODEL_SERVICE_AZURE_OPENAI = "AZURE_OPENAI"; public static final String EMBEDDING_MODEL_SERVICE_MISTRAL_AI = "MISTRAL_AI"; public static final String EMBEDDING_MODEL_SERVICE_NOMIC = "NOMIC"; public static final String EMBEDDING_MODEL_SERVICE_HUGGING_FACE = "HUGGING_FACE"; - public static final String EMBEDDING_MODEL_SERVICE_AZURE_OPENAI = "AZURE_OPENAI"; public static final String EMBEDDING_MODEL_NAME_TEXT_EMBEDDING_3_SMALL = "text-embedding-3-small"; public static final String EMBEDDING_MODEL_NAME_TEXT_EMBEDDING_3_LARGE = "text-embedding-3-large"; @@ -66,6 +66,24 @@ private Constants() {} public static final String EMBEDDING_OPERATION_TYPE_REMOVE_EMBEDDINGS = "REMOVE_EMBEDDINGS"; public static final String EMBEDDING_OPERATION_TYPE_QUERY_ALL = "QUERY_ALL"; + public static final String JSON_KEY_SOURCES = "sources"; + public static final String JSON_KEY_SEGMENTS = "segments"; + public static final String JSON_KEY_SEGMENT_COUNT = "segmentCount"; + public static final String JSON_KEY_SOURCE_COUNT = "sourceCount"; + public static final String JSON_KEY_STORE_NAME = "storeName"; + public static final String JSON_KEY_TEXT = "text"; + public static final String JSON_KEY_STATUS = "status"; + public static final String JSON_KEY_EMBEDDING = "embedding"; + public static final String JSON_KEY_DIMENSIONS = "dimensions"; + public static final String JSON_KEY_RESPONSE = "response"; + public static final String JSON_KEY_QUESTION = "question"; + public static final String JSON_KEY_MAX_RESULTS = "maxResults"; + public static final String JSON_KEY_MIN_SCORE = "minScore"; + public static final String JSON_KEY_EMBEDDING_ID = "embeddingId"; + public static final String JSON_KEY_SCORE = "score"; + public static final String JSON_KEY_METADATA = "metadata"; + public static final String JSON_KEY_INDEX = "index"; + public static final String OPERATION_STATUS_ADDED = "added"; public static final String OPERATION_STATUS_UPDATED = "updated"; public static final String OPERATION_STATUS_DELETED = "deleted"; diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/error/MuleVectorsErrorType.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/error/MuleVectorsErrorType.java index e762988..b31fa55 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/error/MuleVectorsErrorType.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/error/MuleVectorsErrorType.java @@ -7,5 +7,5 @@ */ public enum MuleVectorsErrorType implements ErrorTypeDefinition { - DOCUMENT_OPERATIONS_FAILURE, EMBEDDING_OPERATIONS_FAILURE + DOCUMENT_OPERATIONS_FAILURE, EMBEDDING_OPERATIONS_FAILURE, AI_SERVICES_FAILURE } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/error/provider/EmbeddingErrorTypeProvider.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/error/provider/EmbeddingErrorTypeProvider.java index 7c4979c..947d5c5 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/error/provider/EmbeddingErrorTypeProvider.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/error/provider/EmbeddingErrorTypeProvider.java @@ -11,6 +11,7 @@ import static java.util.Arrays.asList; import static java.util.Collections.unmodifiableSet; +import static org.mule.extension.mulechain.vectors.internal.error.MuleVectorsErrorType.AI_SERVICES_FAILURE; import static org.mule.extension.mulechain.vectors.internal.error.MuleVectorsErrorType.EMBEDDING_OPERATIONS_FAILURE; public class EmbeddingErrorTypeProvider implements ErrorTypeProvider { @@ -18,6 +19,8 @@ public class EmbeddingErrorTypeProvider implements ErrorTypeProvider { @SuppressWarnings("rawtypes") @Override public Set getErrorTypes() { - return unmodifiableSet(new HashSet<>(asList(EMBEDDING_OPERATIONS_FAILURE))); + return unmodifiableSet(new HashSet<>(asList( + EMBEDDING_OPERATIONS_FAILURE, + AI_SERVICES_FAILURE))); } } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/model/BaseModel.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/model/BaseModel.java index ae7c7bf..e1bb16f 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/model/BaseModel.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/model/BaseModel.java @@ -3,12 +3,15 @@ import dev.langchain4j.model.embedding.EmbeddingModel; import org.mule.extension.mulechain.vectors.internal.config.Configuration; import org.mule.extension.mulechain.vectors.internal.constant.Constants; +import org.mule.extension.mulechain.vectors.internal.error.MuleVectorsErrorType; import org.mule.extension.mulechain.vectors.internal.helper.parameter.EmbeddingModelParameters; import org.mule.extension.mulechain.vectors.internal.model.azureopenai.AzureOpenAIModel; import org.mule.extension.mulechain.vectors.internal.model.huggingface.HuggingFaceModel; import org.mule.extension.mulechain.vectors.internal.model.mistralai.MistralAIModel; import org.mule.extension.mulechain.vectors.internal.model.nomic.NomicModel; import org.mule.extension.mulechain.vectors.internal.model.openai.OpenAIModel; +import org.mule.runtime.extension.api.exception.ModuleException; +import org.mule.runtime.module.extension.internal.runtime.operation.IllegalOperationException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -82,8 +85,9 @@ public BaseModel build() { break; default: - //throw new IllegalOperationException("Unsupported Vector Store: " + configuration.getVectorStore()); - baseModel = null; + throw new ModuleException( + String.format("Error while initializing embedding model service. \"%s\" is not supported.", configuration.getEmbeddingModelService()), + MuleVectorsErrorType.AI_SERVICES_FAILURE); } return baseModel; } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/model/azureopenai/AzureOpenAIModel.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/model/azureopenai/AzureOpenAIModel.java index 5750bbe..2625110 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/model/azureopenai/AzureOpenAIModel.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/model/azureopenai/AzureOpenAIModel.java @@ -4,6 +4,7 @@ import dev.langchain4j.model.embedding.EmbeddingModel; import org.json.JSONObject; import org.mule.extension.mulechain.vectors.internal.config.Configuration; +import org.mule.extension.mulechain.vectors.internal.constant.Constants; import org.mule.extension.mulechain.vectors.internal.helper.parameter.EmbeddingModelParameters; import org.mule.extension.mulechain.vectors.internal.model.BaseModel; @@ -19,7 +20,7 @@ public AzureOpenAIModel(Configuration configuration, EmbeddingModelParameters em super(configuration,embeddingModelParameters); JSONObject config = readConfigFile(configuration.getConfigFilePath()); assert config != null; - JSONObject modelConfig = config.getJSONObject("AZURE_OPENAI"); + JSONObject modelConfig = config.getJSONObject(Constants.EMBEDDING_MODEL_SERVICE_AZURE_OPENAI); this.apiKey = modelConfig.getString("AZURE_OPENAI_KEY"); this.endpoint = modelConfig.getString("AZURE_OPENAI_ENDPOINT"); } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/DocumentOperations.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/DocumentOperations.java index ea93cfb..3187a7d 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/DocumentOperations.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/DocumentOperations.java @@ -2,11 +2,7 @@ import dev.langchain4j.data.document.Document; import dev.langchain4j.data.document.DocumentSplitter; -import dev.langchain4j.data.document.loader.UrlDocumentLoader; -import dev.langchain4j.data.document.parser.TextDocumentParser; -import dev.langchain4j.data.document.parser.apache.tika.ApacheTikaDocumentParser; import dev.langchain4j.data.document.splitter.DocumentSplitters; -import dev.langchain4j.data.document.transformer.jsoup.HtmlToTextDocumentTransformer; import dev.langchain4j.data.segment.TextSegment; import org.json.JSONArray; import org.json.JSONObject; @@ -25,23 +21,15 @@ import org.mule.runtime.extension.api.annotation.param.Config; import org.mule.runtime.extension.api.annotation.param.MediaType; import org.mule.runtime.extension.api.annotation.param.ParameterGroup; -import org.mule.runtime.extension.api.annotation.param.display.DisplayName; -import org.mule.runtime.extension.api.annotation.param.display.Summary; import org.mule.runtime.extension.api.exception.ModuleException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.InputStream; -import java.net.MalformedURLException; -import java.net.URL; -import java.nio.charset.StandardCharsets; -import java.util.Arrays; import java.util.HashMap; import java.util.List; -import java.util.stream.Collectors; import java.util.stream.IntStream; -import static dev.langchain4j.data.document.loader.FileSystemDocumentLoader.loadDocument; import static org.mule.extension.mulechain.vectors.internal.helper.ResponseHelper.createDocumentResponse; import static org.mule.runtime.extension.api.annotation.param.MediaType.APPLICATION_JSON; @@ -81,17 +69,20 @@ public class DocumentOperations { JSONArray jsonSegments = IntStream.range(0, segments.size()) .mapToObj(i -> { JSONObject jsonObject = new JSONObject(); - jsonObject.put("text", segments.get(i).text()); // Replace getText with the actual method - jsonObject.put("index", i); + jsonObject.put(Constants.JSON_KEY_TEXT, segments.get(i).text()); // Replace getText with the actual method + jsonObject.put(Constants.JSON_KEY_INDEX, i); return jsonObject; }) .collect(JSONArray::new, JSONArray::put, JSONArray::putAll); JSONObject jsonObject = new JSONObject(); - jsonObject.put("segments", jsonSegments); + jsonObject.put(Constants.JSON_KEY_SEGMENTS, jsonSegments); return createDocumentResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( @@ -123,10 +114,13 @@ public class DocumentOperations { Document document = baseStorage.getSingleDocument(); JSONObject jsonObject = new JSONObject(); - jsonObject.put("text",document.text()); + jsonObject.put(Constants.JSON_KEY_TEXT,document.text()); return createDocumentResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java index 100959c..2642169 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java @@ -62,6 +62,8 @@ public class EmbeddingOperations { try { + LOGGER.debug(String.format("Adding text %s to store %s", text, storeName)); + BaseModel baseModel = BaseModel.builder() .configuration(configuration) .embeddingModelParameters(embeddingModelParameters) @@ -82,17 +84,20 @@ public class EmbeddingOperations { embeddingStore.add(textEmbedding, textSegment); JSONObject jsonObject = new JSONObject(); - jsonObject.put("status", Constants.OPERATION_STATUS_ADDED); - jsonObject.put("textSegment", textSegment.toString()); - jsonObject.put("textEmbedding", textEmbedding.toString()); - jsonObject.put("storeName", storeName); + jsonObject.put(Constants.JSON_KEY_STATUS, Constants.OPERATION_STATUS_ADDED); + jsonObject.put(Constants.JSON_KEY_TEXT, textSegment.toString()); + jsonObject.put(Constants.JSON_KEY_EMBEDDING, textEmbedding.toString()); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( - String.format("Error while adding text %s into the store %s", text, storeName), + String.format("Error while adding text \"%s\" into the store %s", text, storeName), MuleVectorsErrorType.EMBEDDING_OPERATIONS_FAILURE, e); } @@ -119,19 +124,33 @@ public class EmbeddingOperations { EmbeddingModel embeddingModel = baseModel.buildEmbeddingModel(); TextSegment textSegment = TextSegment.from(text); - Embedding textEmbedding = embeddingModel.embed(textSegment).content(); + Embedding embedding = null; + try { + + embedding = embeddingModel.embed(textSegment).content(); + + } catch(Exception e) { + + throw new ModuleException( + String.format("Error while generating embedding from text \"%s\"", text), + MuleVectorsErrorType.AI_SERVICES_FAILURE, + e); + } JSONObject jsonObject = new JSONObject(); - jsonObject.put("Segment", textSegment.toString()); - jsonObject.put("Embedding", textEmbedding.toString()); - jsonObject.put("Dimension", textEmbedding.dimension()); + jsonObject.put(Constants.JSON_KEY_TEXT, textSegment.toString()); + jsonObject.put(Constants.JSON_KEY_EMBEDDING, embedding.toString()); + jsonObject.put(Constants.JSON_KEY_DIMENSIONS, embedding.dimension()); return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( - String.format("Error while generating embedding from text %s", text), + String.format("Error while generating embedding from text \"%s\"", text), MuleVectorsErrorType.EMBEDDING_OPERATIONS_FAILURE, e); } @@ -195,6 +214,9 @@ public class EmbeddingOperations { return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( @@ -259,6 +281,9 @@ public class EmbeddingOperations { return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( @@ -320,9 +345,12 @@ public class EmbeddingOperations { .collect(joining("\n\n")); JSONObject jsonObject = new JSONObject(); - jsonObject.put("response", information); - jsonObject.put("storeName", storeName); - jsonObject.put("question", question); + jsonObject.put(Constants.JSON_KEY_RESPONSE, information); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); + jsonObject.put(Constants.JSON_KEY_QUESTION, question); + jsonObject.put(Constants.JSON_KEY_MAX_RESULTS, maxResults); + jsonObject.put(Constants.JSON_KEY_MIN_SCORE, minScore); + JSONArray sources = new JSONArray(); JSONObject contentObject; @@ -330,26 +358,25 @@ public class EmbeddingOperations { Metadata matchMetadata = match.embedded().metadata(); contentObject = new JSONObject(); - contentObject.put("embeddingId", match.embeddingId()); - contentObject.put("text", match.embedded().text()); - contentObject.put("score", match.score()); + contentObject.put(Constants.JSON_KEY_EMBEDDING_ID, match.embeddingId()); + contentObject.put(Constants.JSON_KEY_TEXT, match.embedded().text()); + contentObject.put(Constants.JSON_KEY_SCORE, match.score()); JSONObject metadataObject = new JSONObject(matchMetadata.toMap()); - contentObject.put("metadata", metadataObject); + contentObject.put(Constants.JSON_KEY_METADATA, metadataObject); sources.put(contentObject); } - jsonObject.put("sources", sources); + jsonObject.put(Constants.JSON_KEY_SOURCES, sources); - jsonObject.put("maxResults", maxResults); - jsonObject.put("minScore", minScore); - jsonObject.put("question", question); - jsonObject.put("storeName", storeName); return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( @@ -427,9 +454,11 @@ public class EmbeddingOperations { .map(match -> match.embedded().text()) .collect(joining("\n\n")); - jsonObject.put("response", information); - jsonObject.put("storeName", storeName); - jsonObject.put("question", question); + jsonObject.put(Constants.JSON_KEY_RESPONSE, information); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); + jsonObject.put(Constants.JSON_KEY_QUESTION, question); + jsonObject.put(Constants.JSON_KEY_MAX_RESULTS, maxResults); + jsonObject.put(Constants.JSON_KEY_MIN_SCORE, minScore); JSONArray sources = new JSONArray(); @@ -440,25 +469,23 @@ public class EmbeddingOperations { contentObject = new JSONObject(); - contentObject.put("embeddingId", match.embeddingId()); - contentObject.put("text", match.embedded().text()); - contentObject.put("score", match.score()); + contentObject.put(Constants.JSON_KEY_EMBEDDING_ID, match.embeddingId()); + contentObject.put(Constants.JSON_KEY_TEXT, match.embedded().text()); + contentObject.put(Constants.JSON_KEY_SCORE, match.score()); JSONObject metadataObject = new JSONObject(matchMetadata.toMap()); - contentObject.put("metadata", metadataObject); + contentObject.put(Constants.JSON_KEY_METADATA, metadataObject); sources.put(contentObject); } - jsonObject.put("sources", sources); - - jsonObject.put("maxResults", maxResults); - jsonObject.put("minScore", minScore); - jsonObject.put("question", question); - jsonObject.put("storeName", storeName); + jsonObject.put(Constants.JSON_KEY_SOURCES, sources); return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( @@ -508,6 +535,9 @@ public class EmbeddingOperations { return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( @@ -557,12 +587,16 @@ public class EmbeddingOperations { embeddingStore.removeAll(filter); JSONObject jsonObject = new JSONObject(); - jsonObject.put("storeName", storeName); - jsonObject.put("filter", removeFilterParams.getFilterJSONObject()); - jsonObject.put("status", Constants.OPERATION_STATUS_DELETED); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); + jsonObject.put(Constants.JSON_KEY_STATUS, Constants.OPERATION_STATUS_DELETED); + + //jsonObject.put("filter", removeFilterParams.getFilterJSONObject()); return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + } catch (ModuleException me) { + throw me; + } catch (Exception e) { throw new ModuleException( String.format("Error while removing embeddings from the store %s", storeName), diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/BaseStore.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/BaseStore.java index 5c11d2d..21215ac 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/BaseStore.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/BaseStore.java @@ -30,11 +30,6 @@ public class BaseStore { protected static final Logger LOGGER = LoggerFactory.getLogger(BaseStore.class); - protected static final String JSON_KEY_SOURCES = "sources"; - protected static final String JSON_KEY_SEGMENT_COUNT = "segmentCount"; - protected static final String JSON_KEY_SOURCE_COUNT = "sourceCount"; - protected static final String JSON_KEY_STORE_NAME = "storeName"; - protected String storeName; protected Configuration configuration; protected QueryParameters queryParams; @@ -127,9 +122,9 @@ protected void addOrUpdateSourceObjectIntoSourceObjectMap(HashMap storedSegmentCount) { sourceObjectMap.put(sourceUniqueKey, sourceObject); @@ -163,7 +158,7 @@ protected JSONObject getSourceObject(JSONObject metadataObject) { Long ingestionTimestamp = metadataObject.has(Constants.METADATA_KEY_INGESTION_TIMESTAMP) ? metadataObject.getLong(Constants.METADATA_KEY_INGESTION_TIMESTAMP) : null; JSONObject sourceObject = new JSONObject(); - sourceObject.put(JSON_KEY_SEGMENT_COUNT, Integer.parseInt(index) + 1); + sourceObject.put(Constants.JSON_KEY_SEGMENT_COUNT, Integer.parseInt(index) + 1); sourceObject.put(Constants.METADATA_KEY_SOURCE_ID, sourceId); sourceObject.put(Constants.METADATA_KEY_ABSOLUTE_DIRECTORY_PATH, absoluteDirectoryPath); sourceObject.put(Constants.METADATA_KEY_SOURCE, source); diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/aisearch/AISearchStore.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/aisearch/AISearchStore.java index f172617..fd3f1eb 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/aisearch/AISearchStore.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/aisearch/AISearchStore.java @@ -5,8 +5,8 @@ import dev.langchain4j.store.embedding.azure.search.AzureAiSearchEmbeddingStore; import org.json.JSONArray; import org.json.JSONObject; -import org.mule.extension.mulechain.vectors.internal.config.Configuration; import org.mule.extension.mulechain.vectors.internal.constant.Constants; +import org.mule.extension.mulechain.vectors.internal.config.Configuration; import org.mule.extension.mulechain.vectors.internal.helper.parameter.QueryParameters; import org.mule.extension.mulechain.vectors.internal.store.BaseStore; import org.mule.extension.mulechain.vectors.internal.util.JsonUtils; @@ -51,7 +51,7 @@ public JSONObject listSources() { HashMap sourceObjectMap = new HashMap(); JSONObject jsonObject = new JSONObject(); - jsonObject.put(JSON_KEY_STORE_NAME, storeName); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); int segmentCount = 0; // Counter to track the number of segments processed int offset = 0; // Initialize offset for pagination @@ -144,7 +144,7 @@ public JSONObject listSources() { } while (hasMore); // Continue if more pages are available // Output total count of processed documents - LOGGER.debug(JSON_KEY_SEGMENT_COUNT + ": " + segmentCount); + LOGGER.debug(Constants.JSON_KEY_SEGMENT_COUNT + ": " + segmentCount); } catch (Exception e) { @@ -152,8 +152,8 @@ public JSONObject listSources() { LOGGER.error("Error while listing sources", e); } - jsonObject.put(JSON_KEY_SOURCES, JsonUtils.jsonObjectCollectionToJsonArray(sourceObjectMap.values())); - jsonObject.put(JSON_KEY_SOURCE_COUNT, sourceObjectMap.size()); + jsonObject.put(Constants.JSON_KEY_SOURCES, JsonUtils.jsonObjectCollectionToJsonArray(sourceObjectMap.values())); + jsonObject.put(Constants.JSON_KEY_SOURCE_COUNT, sourceObjectMap.size()); return jsonObject; } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/chroma/ChromaStore.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/chroma/ChromaStore.java index d074bcd..720281a 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/chroma/ChromaStore.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/chroma/ChromaStore.java @@ -61,7 +61,7 @@ public JSONObject listSources() { HashMap sourceObjectMap = new HashMap(); JSONObject jsonObject = new JSONObject(); - jsonObject.put(JSON_KEY_STORE_NAME, storeName); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); long segmentCount = 0; // Counter to track the number of segments processed long offset = 0; // Initialize offset for pagination @@ -89,8 +89,8 @@ public JSONObject listSources() { LOGGER.error("Error while listing sources", e); } - jsonObject.put(JSON_KEY_SOURCES, JsonUtils.jsonObjectCollectionToJsonArray(sourceObjectMap.values())); - jsonObject.put(JSON_KEY_SOURCE_COUNT, sourceObjectMap.size()); + jsonObject.put(Constants.JSON_KEY_SOURCES, JsonUtils.jsonObjectCollectionToJsonArray(sourceObjectMap.values())); + jsonObject.put(Constants.JSON_KEY_SOURCE_COUNT, sourceObjectMap.size()); return jsonObject; } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/milvus/MilvusStore.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/milvus/MilvusStore.java index 074efed..d4b29a0 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/milvus/MilvusStore.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/milvus/MilvusStore.java @@ -51,7 +51,7 @@ public JSONObject listSources() { HashMap sourceObjectMap = new HashMap(); JSONObject jsonObject = new JSONObject(); - jsonObject.put(JSON_KEY_STORE_NAME, storeName); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); // Specify the host and port for the Milvus server ConnectParam connectParam = ConnectParam.newBuilder() @@ -103,8 +103,8 @@ public JSONObject listSources() { client.close(); } - jsonObject.put(JSON_KEY_SOURCES, JsonUtils.jsonObjectCollectionToJsonArray(sourceObjectMap.values())); - jsonObject.put(JSON_KEY_SOURCE_COUNT, sourceObjectMap.size()); + jsonObject.put(Constants.JSON_KEY_SOURCES, JsonUtils.jsonObjectCollectionToJsonArray(sourceObjectMap.values())); + jsonObject.put(Constants.JSON_KEY_SOURCE_COUNT, sourceObjectMap.size()); return jsonObject; } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/pgvector/PGVectorStore.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/pgvector/PGVectorStore.java index 3f10247..438aa9a 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/pgvector/PGVectorStore.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/pgvector/PGVectorStore.java @@ -80,7 +80,7 @@ public JSONObject listSources() { HashMap sourceObjectMap = new HashMap<>(); JSONObject jsonObject = new JSONObject(); - jsonObject.put(JSON_KEY_STORE_NAME, storeName); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); try (PgVectorMetadataIterator iterator = new PgVectorMetadataIterator(userName, password, host, port, database, storeName, (int)queryParams.embeddingPageSize())) { while (iterator.hasNext()) { @@ -93,8 +93,8 @@ public JSONObject listSources() { LOGGER.error("Error while listing sources", e); } - jsonObject.put(JSON_KEY_SOURCES, JsonUtils.jsonObjectCollectionToJsonArray(sourceObjectMap.values())); - jsonObject.put(JSON_KEY_SOURCE_COUNT, sourceObjectMap.size()); + jsonObject.put(Constants.JSON_KEY_SOURCES, JsonUtils.jsonObjectCollectionToJsonArray(sourceObjectMap.values())); + jsonObject.put(Constants.JSON_KEY_SOURCE_COUNT, sourceObjectMap.size()); return jsonObject; } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/util/JsonUtils.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/util/JsonUtils.java index 8b651ee..6f49f52 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/util/JsonUtils.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/util/JsonUtils.java @@ -66,7 +66,7 @@ public static JSONObject createFileIngestionStatusObject(String storeName, Strin jsonObject.put("fileType", fileType); jsonObject.put("filePath", contextPath); jsonObject.put("storeName", storeName); - jsonObject.put("status", Constants.OPERATION_STATUS_UPDATED); + jsonObject.put(Constants.JSON_KEY_STATUS, Constants.OPERATION_STATUS_UPDATED); return jsonObject; } @@ -82,8 +82,8 @@ public static JSONObject createFolderIngestionStatusObject(String storeName, Lon JSONObject jsonObject = new JSONObject(); jsonObject.put("filesCount", totalFiles); jsonObject.put("folderPath", contextPath); - jsonObject.put("storeName", storeName); - jsonObject.put("status", Constants.OPERATION_STATUS_UPDATED); + jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); + jsonObject.put(Constants.JSON_KEY_STATUS, Constants.OPERATION_STATUS_UPDATED); return jsonObject; } } From 53d23762a3335780afd2e6fdaf62a87df3369d3e Mon Sep 17 00:00:00 2001 From: Tommaso Bolis Date: Tue, 19 Nov 2024 00:00:48 +0100 Subject: [PATCH 2/6] Generic Refactoring. Add attributes in the response for ingestion methods. Add metadata when ingesting text. --- README.md | 8 +-- pom.xml | 2 +- .../metadata/DocumentResponseAttributes.java | 11 +++- .../metadata/EmbeddingResponseAttributes.java | 11 +++- .../internal/helper/ResponseHelper.java | 8 +-- .../operation/EmbeddingOperations.java | 58 +++++++++++++------ .../storage/azureblob/AzureBlobStorage.java | 19 +----- .../internal/storage/local/LocalStorage.java | 17 ++---- .../internal/storage/s3/AWSS3Storage.java | 8 +-- .../vectors/internal/util/JsonUtils.java | 24 +------- ...DocumentUtils.java => MetadatatUtils.java} | 22 +++---- .../EmbeddingAddDocumentToStoreResponse.json | 10 ++++ .../EmbeddingAddFolderToStoreResponse.json | 10 ++++ .../EmbeddingAddTextToStoreResponse.json | 10 ++++ .../EmbeddingGenerateFromTextResponse.json | 23 ++++++++ 15 files changed, 142 insertions(+), 99 deletions(-) rename src/main/java/org/mule/extension/mulechain/vectors/internal/util/{DocumentUtils.java => MetadatatUtils.java} (68%) create mode 100644 src/main/resources/api/response/EmbeddingAddDocumentToStoreResponse.json create mode 100644 src/main/resources/api/response/EmbeddingAddFolderToStoreResponse.json create mode 100644 src/main/resources/api/response/EmbeddingAddTextToStoreResponse.json create mode 100644 src/main/resources/api/response/EmbeddingGenerateFromTextResponse.json diff --git a/README.md b/README.md index fe56792..f40359f 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ MAC Vectors provides access to a broad number of external Vector Stores and Data cloud.anypoint mule-vectors-connector - 0.1.25 + 0.2.0 mule-plugin ``` @@ -34,9 +34,9 @@ Then add the following dependency to your application's `pom.xml`: ```xml - com.mule.mulechain - mulechain-vectors - 0.1.25 + com.mulesoft.connectors + mule4-vectors-connector + 0.2.0 mule-plugin ``` diff --git a/pom.xml b/pom.xml index e228c9c..b8fa290 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 com.mulesoft.connectors mule4-vectors-connector - 0.1.119-SNAPSHOT + 0.1.121-SNAPSHOT mule-extension MuleSoft Vectors Connector - Mule 4 MuleSoft Vectors Connector provides access to a broad number of external Vector Stores. diff --git a/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/DocumentResponseAttributes.java b/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/DocumentResponseAttributes.java index 3e60821..eb8a1c2 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/DocumentResponseAttributes.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/DocumentResponseAttributes.java @@ -2,13 +2,18 @@ import java.io.Serializable; import java.util.HashMap; +import java.util.Map; public class DocumentResponseAttributes implements Serializable { - private final HashMap documentAttributes; + private final HashMap requestAttributes; - public DocumentResponseAttributes(HashMap documentAttributes) { + public DocumentResponseAttributes(HashMap requestAttributes) { - this.documentAttributes = documentAttributes; + this.requestAttributes = requestAttributes; + } + + public Map getRequestAttributes() { + return requestAttributes; } } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/EmbeddingResponseAttributes.java b/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/EmbeddingResponseAttributes.java index 8b54397..00d3800 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/EmbeddingResponseAttributes.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/EmbeddingResponseAttributes.java @@ -2,13 +2,18 @@ import java.io.Serializable; import java.util.HashMap; +import java.util.Map; public class EmbeddingResponseAttributes implements Serializable { - private final HashMap embeddingAttributes; + private final HashMap requestAttributes; - public EmbeddingResponseAttributes(HashMap embeddingAttributes) { + public EmbeddingResponseAttributes(HashMap requestAttributes) { - this.embeddingAttributes = embeddingAttributes; + this.requestAttributes = requestAttributes; + } + + public Map getRequestAttributes() { + return requestAttributes; } } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/helper/ResponseHelper.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/helper/ResponseHelper.java index 38a4119..e8dfc08 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/helper/ResponseHelper.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/helper/ResponseHelper.java @@ -18,10 +18,10 @@ private ResponseHelper() { public static Result createEmbeddingResponse( String response, - Map embeddingAttributes) { + Map embeddingAttributes) { return Result.builder() - .attributes(new EmbeddingResponseAttributes((HashMap) embeddingAttributes)) + .attributes(new EmbeddingResponseAttributes((HashMap) embeddingAttributes)) .attributesMediaType(org.mule.runtime.api.metadata.MediaType.APPLICATION_JAVA) .output(toInputStream(response, StandardCharsets.UTF_8)) .mediaType(org.mule.runtime.api.metadata.MediaType.APPLICATION_JSON) @@ -30,10 +30,10 @@ public static Result createEmbeddingRe public static Result createDocumentResponse( String response, - Map documentAttributes) { + Map documentAttributes) { return Result.builder() - .attributes(new DocumentResponseAttributes((HashMap) documentAttributes)) + .attributes(new DocumentResponseAttributes((HashMap) documentAttributes)) .attributesMediaType(org.mule.runtime.api.metadata.MediaType.APPLICATION_JAVA) .output(toInputStream(response, StandardCharsets.UTF_8)) .mediaType(org.mule.runtime.api.metadata.MediaType.APPLICATION_JSON) diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java index 2642169..7b51649 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java @@ -4,6 +4,7 @@ import static org.mule.runtime.extension.api.annotation.param.MediaType.APPLICATION_JSON; import java.io.InputStream; +import java.util.Arrays; import java.util.HashMap; import java.util.List; @@ -22,6 +23,7 @@ import org.mule.extension.mulechain.vectors.internal.model.BaseModel; import org.mule.extension.mulechain.vectors.internal.storage.BaseStorage; import org.mule.extension.mulechain.vectors.internal.store.BaseStore; +import org.mule.extension.mulechain.vectors.internal.util.MetadatatUtils; import org.mule.extension.mulechain.vectors.internal.util.JsonUtils; import org.mule.runtime.extension.api.annotation.Alias; import org.mule.runtime.extension.api.annotation.error.Throws; @@ -53,7 +55,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("Embedding-add-text-to-store") @Throws(EmbeddingErrorTypeProvider.class) - + @OutputJsonType(schema = "api/response/EmbeddingAddTextToStoreResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result addTextToStore( @Config Configuration configuration, @Alias("text") @DisplayName("Text") String text, @@ -79,17 +81,20 @@ public class EmbeddingOperations { EmbeddingStore embeddingStore = baseStore.buildEmbeddingStore(); - TextSegment textSegment = TextSegment.from(text); + Metadata metadata = new Metadata(); + MetadatatUtils.setBaseMetadata(metadata); + + TextSegment textSegment = TextSegment.from(text, metadata); Embedding textEmbedding = embeddingModel.embed(textSegment).content(); embeddingStore.add(textEmbedding, textSegment); - JSONObject jsonObject = new JSONObject(); - jsonObject.put(Constants.JSON_KEY_STATUS, Constants.OPERATION_STATUS_ADDED); - jsonObject.put(Constants.JSON_KEY_TEXT, textSegment.toString()); - jsonObject.put(Constants.JSON_KEY_EMBEDDING, textEmbedding.toString()); - jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); + JSONObject jsonObject = JsonUtils.createIngestionStatusObject(storeName); - return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + return createEmbeddingResponse( + jsonObject.toString(), + new HashMap() {{ + put("storeName", storeName); + }}); } catch (ModuleException me) { throw me; @@ -109,6 +114,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("Embedding-generate-from-text") @Throws(EmbeddingErrorTypeProvider.class) + @OutputJsonType(schema = "api/response/EmbeddingGenerateFromTextResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result generateEmbedding(@Config Configuration configuration, @Alias("text") @DisplayName("Text") String text, @@ -138,8 +144,8 @@ public class EmbeddingOperations { } JSONObject jsonObject = new JSONObject(); - jsonObject.put(Constants.JSON_KEY_TEXT, textSegment.toString()); - jsonObject.put(Constants.JSON_KEY_EMBEDDING, embedding.toString()); + jsonObject.put(Constants.JSON_KEY_TEXT, textSegment.text()); + jsonObject.put(Constants.JSON_KEY_EMBEDDING, Arrays.toString(embedding.vector())); jsonObject.put(Constants.JSON_KEY_DIMENSIONS, embedding.dimension()); return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); @@ -162,7 +168,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("Embedding-add-folder-to-store") @Throws(EmbeddingErrorTypeProvider.class) - + @OutputJsonType(schema = "api/response/EmbeddingAddFolderToStoreResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result addFolderToStore( @Config Configuration configuration, @Alias("storeName") @DisplayName("Store Name") String storeName, @@ -203,16 +209,25 @@ public class EmbeddingOperations { .fileType(documentParameters.getFileType()) .build(); - long documentNumber = 0; + long documentNumber = 0L; while(baseStorage.hasNext()) { Document document = baseStorage.next(); embeddingStoreIngestor.ingest(document); documentNumber ++; } - JSONObject jsonObject = JsonUtils.createFolderIngestionStatusObject(storeName, documentNumber, documentParameters.getFileType()); - - return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + JSONObject jsonObject = JsonUtils.createIngestionStatusObject(storeName); + + long finalDocumentNumber = documentNumber; + return createEmbeddingResponse( + jsonObject.toString(), + new HashMap() {{ + put("documentCount", finalDocumentNumber); + put("storeName", storeName); + put("storageType", documentParameters.getStorageType()); + put("fileType", documentParameters.getFileType()); + put("contextPath", documentParameters.getContextPath()); + }}); } catch (ModuleException me) { throw me; @@ -233,7 +248,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("EMBEDDING-add-document-to-store") @Throws(EmbeddingErrorTypeProvider.class) - + @OutputJsonType(schema = "api/response/EmbeddingAddDocumentToStoreResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result addFileEmbedding( @Config Configuration configuration, @Alias("storeName") @DisplayName("Store Name") String storeName, @@ -277,9 +292,16 @@ public class EmbeddingOperations { embeddingStoreIngestor.ingest(document); - JSONObject jsonObject = JsonUtils.createFileIngestionStatusObject(storeName, documentParameters.getFileType(), documentParameters.getContextPath()); + JSONObject jsonObject = JsonUtils.createIngestionStatusObject(storeName); - return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + return createEmbeddingResponse( + jsonObject.toString(), + new HashMap() {{ + put("storeName", storeName); + put("storageType", documentParameters.getStorageType()); + put("fileType", documentParameters.getFileType()); + put("contextPath", documentParameters.getContextPath()); + }}); } catch (ModuleException me) { throw me; diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/azureblob/AzureBlobStorage.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/azureblob/AzureBlobStorage.java index 523b948..40583e9 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/azureblob/AzureBlobStorage.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/azureblob/AzureBlobStorage.java @@ -7,26 +7,13 @@ import com.azure.storage.common.StorageSharedKeyCredential; import dev.langchain4j.data.document.loader.azure.storage.blob.AzureBlobStorageDocumentLoader; -import dev.langchain4j.store.embedding.EmbeddingStoreIngestor; -import dev.langchain4j.data.document.DocumentParser; - -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; import java.util.Iterator; -import java.util.List; -import java.util.stream.Stream; import org.json.JSONObject; import org.mule.extension.mulechain.vectors.internal.config.Configuration; -import org.mule.extension.mulechain.vectors.internal.constant.Constants; -import dev.langchain4j.data.document.parser.TextDocumentParser; -import dev.langchain4j.data.document.parser.apache.tika.ApacheTikaDocumentParser; import dev.langchain4j.data.document.Document; import org.mule.extension.mulechain.vectors.internal.storage.BaseStorage; -import org.mule.extension.mulechain.vectors.internal.util.DocumentUtils; -import org.mule.extension.mulechain.vectors.internal.util.JsonUtils; +import org.mule.extension.mulechain.vectors.internal.util.MetadatatUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -105,7 +92,7 @@ public Document next() { BlobItem blobItem = blobIterator.next(); LOGGER.debug("Blob name: " + blobItem.getName()); Document document = getLoader().loadDocument(contextPath, blobItem.getName(), documentParser); - DocumentUtils.addMetadataToDocument(document, fileType, blobItem.getName()); + MetadatatUtils.addMetadataToDocument(document, fileType, blobItem.getName()); return document; } @@ -116,7 +103,7 @@ public Document getSingleDocument() { String blobName = parts[1]; LOGGER.debug("Blob name: " + blobName); Document document = getLoader().loadDocument(containerName, blobName, documentParser); - DocumentUtils.addMetadataToDocument(document, fileType, blobName); + MetadatatUtils.addMetadataToDocument(document, fileType, blobName); return document; } } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/local/LocalStorage.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/local/LocalStorage.java index f083f52..43401a3 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/local/LocalStorage.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/local/LocalStorage.java @@ -3,17 +3,11 @@ import dev.langchain4j.data.document.Document; import dev.langchain4j.data.document.DocumentParser; import dev.langchain4j.data.document.loader.UrlDocumentLoader; -import dev.langchain4j.data.document.parser.TextDocumentParser; -import dev.langchain4j.data.document.parser.apache.tika.ApacheTikaDocumentParser; import dev.langchain4j.data.document.transformer.jsoup.HtmlToTextDocumentTransformer; -import dev.langchain4j.store.embedding.EmbeddingStoreIngestor; -import org.json.JSONObject; import org.mule.extension.mulechain.vectors.internal.config.Configuration; import org.mule.extension.mulechain.vectors.internal.constant.Constants; import org.mule.extension.mulechain.vectors.internal.storage.BaseStorage; -import org.mule.extension.mulechain.vectors.internal.storage.s3.AWSS3Storage; -import org.mule.extension.mulechain.vectors.internal.util.DocumentUtils; -import org.mule.extension.mulechain.vectors.internal.util.JsonUtils; +import org.mule.extension.mulechain.vectors.internal.util.MetadatatUtils; import org.mule.extension.mulechain.vectors.internal.util.Utils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -24,15 +18,12 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.util.Arrays; import java.util.Iterator; import java.util.List; -import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; import java.util.stream.Stream; import static dev.langchain4j.data.document.loader.FileSystemDocumentLoader.loadDocument; -import static org.mule.extension.mulechain.vectors.internal.util.JsonUtils.readConfigFile; public class LocalStorage extends BaseStorage { @@ -73,7 +64,7 @@ public Document next() { Path path = getPathIterator().next(); LOGGER.debug("File: " + path.getFileName().toString()); Document document = loadDocument(path.toString(), documentParser); - DocumentUtils.addMetadataToDocument(document, fileType, path.getFileName().toString()); + MetadatatUtils.addMetadataToDocument(document, fileType, path.getFileName().toString()); return document; } throw new IllegalStateException("No more files to iterate"); @@ -91,7 +82,7 @@ public Document getSingleDocument() { case Constants.FILE_TYPE_TEXT: case Constants.FILE_TYPE_ANY: document = loadDocument(path.toString(), documentParser); - DocumentUtils.addMetadataToDocument(document, fileType, Utils.getFileNameFromPath(contextPath)); + MetadatatUtils.addMetadataToDocument(document, fileType, Utils.getFileNameFromPath(contextPath)); break; case Constants.FILE_TYPE_URL: document = loadUrlDocument(contextPath); @@ -111,7 +102,7 @@ private Document loadUrlDocument(String contextPath) { HtmlToTextDocumentTransformer transformer = new HtmlToTextDocumentTransformer(null, null, true); document = transformer.transform(htmlDocument); document.metadata().put(Constants.METADATA_KEY_URL, contextPath); - DocumentUtils.addMetadataToDocument(document, Constants.FILE_TYPE_URL, ""); + MetadatatUtils.addMetadataToDocument(document, Constants.FILE_TYPE_URL, ""); } catch (MalformedURLException e) { throw new RuntimeException("Invalid URL: " + contextPath, e); } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/s3/AWSS3Storage.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/s3/AWSS3Storage.java index 70feb1c..430c615 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/s3/AWSS3Storage.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/storage/s3/AWSS3Storage.java @@ -1,8 +1,6 @@ package org.mule.extension.mulechain.vectors.internal.storage.s3; -import org.mule.extension.mulechain.vectors.internal.util.JsonUtils; import software.amazon.awssdk.regions.Region; -import dev.langchain4j.store.embedding.EmbeddingStoreIngestor; import dev.langchain4j.data.document.loader.amazon.s3.AmazonS3DocumentLoader; import dev.langchain4j.data.document.loader.amazon.s3.AwsCredentials; @@ -12,7 +10,7 @@ import org.mule.extension.mulechain.vectors.internal.config.Configuration; import dev.langchain4j.data.document.Document; import org.mule.extension.mulechain.vectors.internal.storage.BaseStorage; -import org.mule.extension.mulechain.vectors.internal.util.DocumentUtils; +import org.mule.extension.mulechain.vectors.internal.util.MetadatatUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import software.amazon.awssdk.auth.credentials.AwsBasicCredentials; @@ -122,7 +120,7 @@ public Document next() { S3Object object = getS3ObjectIterator().next(); LOGGER.debug("AWS S3 Key: " + object.key()); Document document = getLoader().loadDocument(awsS3Bucket, object.key(), documentParser); - DocumentUtils.addMetadataToDocument(document, fileType, object.key()); + MetadatatUtils.addMetadataToDocument(document, fileType, object.key()); return document; } @@ -130,7 +128,7 @@ public Document getSingleDocument() { LOGGER.debug("AWS S3 Key: " + contextPath); Document document = getLoader().loadDocument(awsS3Bucket, contextPath, documentParser); - DocumentUtils.addMetadataToDocument(document, fileType, contextPath); + MetadatatUtils.addMetadataToDocument(document, fileType, contextPath); return document; } } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/util/JsonUtils.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/util/JsonUtils.java index 6f49f52..e544385 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/util/JsonUtils.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/util/JsonUtils.java @@ -6,6 +6,7 @@ import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import opennlp.tools.parser.Cons; import org.json.JSONArray; import org.json.JSONObject; @@ -53,35 +54,14 @@ public static JSONArray jsonObjectCollectionToJsonArray(Collection j return jsonArray; } - /** - * Creates a JSONObject representing the ingestion status. - * - * @param fileType the type of the ingested file. - * @param contextPath the path of the ingested file or folder. - * @return a JSONObject containing ingestion status metadata. - */ - public static JSONObject createFileIngestionStatusObject(String storeName, String fileType, String contextPath) { - - JSONObject jsonObject = new JSONObject(); - jsonObject.put("fileType", fileType); - jsonObject.put("filePath", contextPath); - jsonObject.put("storeName", storeName); - jsonObject.put(Constants.JSON_KEY_STATUS, Constants.OPERATION_STATUS_UPDATED); - return jsonObject; - } - /** * Creates a JSONObject representing the ingestion status of a folder or set of files. * - * @param totalFiles the total number of files processed. - * @param contextPath the path of the processed folder. * @return a JSONObject containing the ingestion status with file count, folder path, store name, and status. */ - public static JSONObject createFolderIngestionStatusObject(String storeName, Long totalFiles, String contextPath) { + public static JSONObject createIngestionStatusObject(String storeName) { JSONObject jsonObject = new JSONObject(); - jsonObject.put("filesCount", totalFiles); - jsonObject.put("folderPath", contextPath); jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); jsonObject.put(Constants.JSON_KEY_STATUS, Constants.OPERATION_STATUS_UPDATED); return jsonObject; diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/util/DocumentUtils.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/util/MetadatatUtils.java similarity index 68% rename from src/main/java/org/mule/extension/mulechain/vectors/internal/util/DocumentUtils.java rename to src/main/java/org/mule/extension/mulechain/vectors/internal/util/MetadatatUtils.java index 1972076..5693a9f 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/util/DocumentUtils.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/util/MetadatatUtils.java @@ -1,24 +1,28 @@ package org.mule.extension.mulechain.vectors.internal.util; -import com.azure.core.implementation.logging.DefaultLogger; import com.fasterxml.jackson.databind.JsonNode; import dev.langchain4j.data.document.Document; +import dev.langchain4j.data.document.Metadata; import org.mule.extension.mulechain.vectors.internal.constant.Constants; -import org.mule.extension.mulechain.vectors.internal.storage.s3.AWSS3Storage; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.Arrays; /** - * Utility class for adding metadata to Document instances. + * Utility class for setting metadata. */ -public class DocumentUtils { +public class MetadatatUtils { - private static final Logger LOGGER = LoggerFactory.getLogger(DocumentUtils.class); + private static final Logger LOGGER = LoggerFactory.getLogger(MetadatatUtils.class); + + public static void setBaseMetadata(Metadata metadata) { + + metadata.put(Constants.METADATA_KEY_SOURCE_ID, dev.langchain4j.internal.Utils.randomUUID()); + metadata.put(Constants.METADATA_KEY_INGESTION_DATETIME, Utils.getCurrentISO8601Timestamp()); + metadata.put(Constants.METADATA_KEY_INGESTION_TIMESTAMP, Utils.getCurrentTimeMillis()); + } /** * Adds metadata to a Document with specified file type, file name, and file path. @@ -29,9 +33,7 @@ public class DocumentUtils { */ public static void addMetadataToDocument(Document document, String fileType, String fileName) { - document.metadata().put(Constants.METADATA_KEY_SOURCE_ID, dev.langchain4j.internal.Utils.randomUUID()); - document.metadata().put(Constants.METADATA_KEY_INGESTION_DATETIME, Utils.getCurrentISO8601Timestamp()); - document.metadata().put(Constants.METADATA_KEY_INGESTION_TIMESTAMP, Utils.getCurrentTimeMillis()); + setBaseMetadata(document.metadata()); if(!fileType.isEmpty()) document.metadata().put(Constants.METADATA_KEY_FILE_TYPE, fileType); if(!fileName.isEmpty()) document.metadata().put(Constants.METADATA_KEY_FILE_NAME, fileName); diff --git a/src/main/resources/api/response/EmbeddingAddDocumentToStoreResponse.json b/src/main/resources/api/response/EmbeddingAddDocumentToStoreResponse.json new file mode 100644 index 0000000..6f9dbf5 --- /dev/null +++ b/src/main/resources/api/response/EmbeddingAddDocumentToStoreResponse.json @@ -0,0 +1,10 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "status": { + "type": "string" + } + }, + "required": ["status"] +} diff --git a/src/main/resources/api/response/EmbeddingAddFolderToStoreResponse.json b/src/main/resources/api/response/EmbeddingAddFolderToStoreResponse.json new file mode 100644 index 0000000..6f9dbf5 --- /dev/null +++ b/src/main/resources/api/response/EmbeddingAddFolderToStoreResponse.json @@ -0,0 +1,10 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "status": { + "type": "string" + } + }, + "required": ["status"] +} diff --git a/src/main/resources/api/response/EmbeddingAddTextToStoreResponse.json b/src/main/resources/api/response/EmbeddingAddTextToStoreResponse.json new file mode 100644 index 0000000..6f9dbf5 --- /dev/null +++ b/src/main/resources/api/response/EmbeddingAddTextToStoreResponse.json @@ -0,0 +1,10 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "status": { + "type": "string" + } + }, + "required": ["status"] +} diff --git a/src/main/resources/api/response/EmbeddingGenerateFromTextResponse.json b/src/main/resources/api/response/EmbeddingGenerateFromTextResponse.json new file mode 100644 index 0000000..2ed1eef --- /dev/null +++ b/src/main/resources/api/response/EmbeddingGenerateFromTextResponse.json @@ -0,0 +1,23 @@ +{ + "$schema": "http://json-schema.org/draft-04/schema#", + "type": "object", + "properties": { + "text": { + "type": "string", + "description": "The original text being represented by the embedding." + }, + "embedding": { + "type": "array", + "items": { + "type": "number", + "format": "float" + }, + "description": "An array of floats representing the embedding vector." + }, + "dimensions": { + "type": "integer", + "description": "The number of dimensions in the embedding vector." + } + }, + "required": ["text", "embedding", "dimensions"] +} From c168d3b43e413d0fcbf1e1ddc7bcd664ca7ed85d Mon Sep 17 00:00:00 2001 From: Tommaso Bolis Date: Tue, 19 Nov 2024 00:02:18 +0100 Subject: [PATCH 3/6] Add attributes in the response for ingestion methods. --- .../internal/operation/EmbeddingOperations.java | 6 +++--- .../response/EmbeddingAddFolderToStoreResponse.json | 10 ---------- .../api/response/EmbeddingAddTextToStoreResponse.json | 10 ---------- ...eResponse.json => EmbeddingAddToStoreResponse.json} | 0 4 files changed, 3 insertions(+), 23 deletions(-) delete mode 100644 src/main/resources/api/response/EmbeddingAddFolderToStoreResponse.json delete mode 100644 src/main/resources/api/response/EmbeddingAddTextToStoreResponse.json rename src/main/resources/api/response/{EmbeddingAddDocumentToStoreResponse.json => EmbeddingAddToStoreResponse.json} (100%) diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java index 7b51649..885a91d 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java @@ -55,7 +55,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("Embedding-add-text-to-store") @Throws(EmbeddingErrorTypeProvider.class) - @OutputJsonType(schema = "api/response/EmbeddingAddTextToStoreResponse.json") + @OutputJsonType(schema = "api/response/EmbeddingAddToStoreResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result addTextToStore( @Config Configuration configuration, @Alias("text") @DisplayName("Text") String text, @@ -168,7 +168,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("Embedding-add-folder-to-store") @Throws(EmbeddingErrorTypeProvider.class) - @OutputJsonType(schema = "api/response/EmbeddingAddFolderToStoreResponse.json") + @OutputJsonType(schema = "api/response/EmbeddingAddToStoreResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result addFolderToStore( @Config Configuration configuration, @Alias("storeName") @DisplayName("Store Name") String storeName, @@ -248,7 +248,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("EMBEDDING-add-document-to-store") @Throws(EmbeddingErrorTypeProvider.class) - @OutputJsonType(schema = "api/response/EmbeddingAddDocumentToStoreResponse.json") + @OutputJsonType(schema = "api/response/EmbeddingAddToStoreResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result addFileEmbedding( @Config Configuration configuration, @Alias("storeName") @DisplayName("Store Name") String storeName, diff --git a/src/main/resources/api/response/EmbeddingAddFolderToStoreResponse.json b/src/main/resources/api/response/EmbeddingAddFolderToStoreResponse.json deleted file mode 100644 index 6f9dbf5..0000000 --- a/src/main/resources/api/response/EmbeddingAddFolderToStoreResponse.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "status": { - "type": "string" - } - }, - "required": ["status"] -} diff --git a/src/main/resources/api/response/EmbeddingAddTextToStoreResponse.json b/src/main/resources/api/response/EmbeddingAddTextToStoreResponse.json deleted file mode 100644 index 6f9dbf5..0000000 --- a/src/main/resources/api/response/EmbeddingAddTextToStoreResponse.json +++ /dev/null @@ -1,10 +0,0 @@ -{ - "$schema": "http://json-schema.org/draft-07/schema#", - "type": "object", - "properties": { - "status": { - "type": "string" - } - }, - "required": ["status"] -} diff --git a/src/main/resources/api/response/EmbeddingAddDocumentToStoreResponse.json b/src/main/resources/api/response/EmbeddingAddToStoreResponse.json similarity index 100% rename from src/main/resources/api/response/EmbeddingAddDocumentToStoreResponse.json rename to src/main/resources/api/response/EmbeddingAddToStoreResponse.json From daea025e876214a587d34ecdc30a62a812b8d3af Mon Sep 17 00:00:00 2001 From: Tommaso Bolis Date: Tue, 19 Nov 2024 18:58:36 +0100 Subject: [PATCH 4/6] Add text to store now implements segmentation --- pom.xml | 2 +- .../metadata/DocumentResponseAttributes.java | 5 +++ .../metadata/EmbeddingResponseAttributes.java | 5 +++ .../operation/EmbeddingOperations.java | 15 +++++--- .../vectors/internal/util/MetadatatUtils.java | 34 +++++++++---------- 5 files changed, 38 insertions(+), 23 deletions(-) diff --git a/pom.xml b/pom.xml index b8fa290..9fd7a06 100644 --- a/pom.xml +++ b/pom.xml @@ -6,7 +6,7 @@ 4.0.0 com.mulesoft.connectors mule4-vectors-connector - 0.1.121-SNAPSHOT + 0.1.122-SNAPSHOT mule-extension MuleSoft Vectors Connector - Mule 4 MuleSoft Vectors Connector provides access to a broad number of external Vector Stores. diff --git a/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/DocumentResponseAttributes.java b/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/DocumentResponseAttributes.java index eb8a1c2..b121253 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/DocumentResponseAttributes.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/DocumentResponseAttributes.java @@ -1,9 +1,13 @@ package org.mule.extension.mulechain.vectors.api.metadata; +import org.mule.runtime.extension.api.annotation.param.MediaType; + import java.io.Serializable; import java.util.HashMap; import java.util.Map; +import static org.mule.runtime.extension.api.annotation.param.MediaType.APPLICATION_JSON; + public class DocumentResponseAttributes implements Serializable { private final HashMap requestAttributes; @@ -13,6 +17,7 @@ public DocumentResponseAttributes(HashMap requestAttributes) { this.requestAttributes = requestAttributes; } + @MediaType(value = APPLICATION_JSON, strict = false) public Map getRequestAttributes() { return requestAttributes; } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/EmbeddingResponseAttributes.java b/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/EmbeddingResponseAttributes.java index 00d3800..0074086 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/EmbeddingResponseAttributes.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/api/metadata/EmbeddingResponseAttributes.java @@ -1,9 +1,13 @@ package org.mule.extension.mulechain.vectors.api.metadata; +import org.mule.runtime.extension.api.annotation.param.MediaType; + import java.io.Serializable; import java.util.HashMap; import java.util.Map; +import static org.mule.runtime.extension.api.annotation.param.MediaType.APPLICATION_JSON; + public class EmbeddingResponseAttributes implements Serializable { private final HashMap requestAttributes; @@ -13,6 +17,7 @@ public EmbeddingResponseAttributes(HashMap requestAttributes) { this.requestAttributes = requestAttributes; } + @MediaType(value = APPLICATION_JSON, strict = false) public Map getRequestAttributes() { return requestAttributes; } diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java index 885a91d..d637302 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java @@ -60,6 +60,7 @@ public class EmbeddingOperations { addTextToStore( @Config Configuration configuration, @Alias("text") @DisplayName("Text") String text, @Alias("storeName") @DisplayName("Store Name") String storeName, + @ParameterGroup(name = "Segmentation") SegmentationParameters segmentationParameters, @ParameterGroup(name = "Embedding Model") EmbeddingModelParameters embeddingModelParameters){ try { @@ -81,12 +82,16 @@ public class EmbeddingOperations { EmbeddingStore embeddingStore = baseStore.buildEmbeddingStore(); - Metadata metadata = new Metadata(); - MetadatatUtils.setBaseMetadata(metadata); + EmbeddingStoreIngestor embeddingStoreIngestor = EmbeddingStoreIngestor.builder() + .documentSplitter(DocumentSplitters.recursive(segmentationParameters.getMaxSegmentSizeInChar(), segmentationParameters.getMaxOverlapSizeInChars())) + .embeddingModel(embeddingModel) + .embeddingStore(embeddingStore) + .build(); + + Document document = new Document(text); + MetadatatUtils.addMetadataToDocument(document, Constants.FILE_TYPE_TEXT); - TextSegment textSegment = TextSegment.from(text, metadata); - Embedding textEmbedding = embeddingModel.embed(textSegment).content(); - embeddingStore.add(textEmbedding, textSegment); + embeddingStoreIngestor.ingest(document); JSONObject jsonObject = JsonUtils.createIngestionStatusObject(storeName); diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/util/MetadatatUtils.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/util/MetadatatUtils.java index 5693a9f..af4312a 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/util/MetadatatUtils.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/util/MetadatatUtils.java @@ -17,26 +17,13 @@ public class MetadatatUtils { private static final Logger LOGGER = LoggerFactory.getLogger(MetadatatUtils.class); - public static void setBaseMetadata(Metadata metadata) { + public static void addMetadataToDocument(Document document, String fileType) { - metadata.put(Constants.METADATA_KEY_SOURCE_ID, dev.langchain4j.internal.Utils.randomUUID()); - metadata.put(Constants.METADATA_KEY_INGESTION_DATETIME, Utils.getCurrentISO8601Timestamp()); - metadata.put(Constants.METADATA_KEY_INGESTION_TIMESTAMP, Utils.getCurrentTimeMillis()); - } - - /** - * Adds metadata to a Document with specified file type, file name, and file path. - * - * @param document the Document to which metadata is added. - * @param fileType the type of the file (e.g., text, any). - * @param fileName the name of the file. - */ - public static void addMetadataToDocument(Document document, String fileType, String fileName) { - - setBaseMetadata(document.metadata()); + document.metadata().put(Constants.METADATA_KEY_SOURCE_ID, dev.langchain4j.internal.Utils.randomUUID()); + document.metadata().put(Constants.METADATA_KEY_INGESTION_DATETIME, Utils.getCurrentISO8601Timestamp()); + document.metadata().put(Constants.METADATA_KEY_INGESTION_TIMESTAMP, Utils.getCurrentTimeMillis()); if(!fileType.isEmpty()) document.metadata().put(Constants.METADATA_KEY_FILE_TYPE, fileType); - if(!fileName.isEmpty()) document.metadata().put(Constants.METADATA_KEY_FILE_NAME, fileName); if (fileType.equals(Constants.FILE_TYPE_CRAWL)) { @@ -53,4 +40,17 @@ public static void addMetadataToDocument(Document document, String fileType, Str } } } + + /** + * Adds metadata to a Document with specified file type, file name, and file path. + * + * @param document the Document to which metadata is added. + * @param fileType the type of the file (e.g., text, any). + * @param fileName the name of the file. + */ + public static void addMetadataToDocument(Document document, String fileType, String fileName) { + + addMetadataToDocument(document, fileType); + if(!fileName.isEmpty()) document.metadata().put(Constants.METADATA_KEY_FILE_NAME, fileName); + } } From dbf7e11e8eff096ca5cfdc6433561d450fd8f3a6 Mon Sep 17 00:00:00 2001 From: Tommaso Bolis Date: Tue, 19 Nov 2024 19:05:09 +0100 Subject: [PATCH 5/6] Fix bug when index metadata key is empty. --- .../extension/mulechain/vectors/internal/store/BaseStore.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/BaseStore.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/BaseStore.java index 21215ac..8d842a6 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/store/BaseStore.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/store/BaseStore.java @@ -158,7 +158,7 @@ protected JSONObject getSourceObject(JSONObject metadataObject) { Long ingestionTimestamp = metadataObject.has(Constants.METADATA_KEY_INGESTION_TIMESTAMP) ? metadataObject.getLong(Constants.METADATA_KEY_INGESTION_TIMESTAMP) : null; JSONObject sourceObject = new JSONObject(); - sourceObject.put(Constants.JSON_KEY_SEGMENT_COUNT, Integer.parseInt(index) + 1); + if(index != null && !index.isEmpty()) sourceObject.put(Constants.JSON_KEY_SEGMENT_COUNT, Integer.parseInt(index) + 1); sourceObject.put(Constants.METADATA_KEY_SOURCE_ID, sourceId); sourceObject.put(Constants.METADATA_KEY_ABSOLUTE_DIRECTORY_PATH, absoluteDirectoryPath); sourceObject.put(Constants.METADATA_KEY_SOURCE, source); From 7207c41c88d50e856de71ec5590de2594adf684c Mon Sep 17 00:00:00 2001 From: Tommaso Bolis Date: Tue, 19 Nov 2024 19:21:36 +0100 Subject: [PATCH 6/6] Adding json output schema --- .../operation/EmbeddingOperations.java | 37 ++++++++----- .../EmbeddingListSourcesResponse.json | 54 +++++++++++++++++++ .../EmbeddingRemoveFromStoreResponse.json | 10 ++++ 3 files changed, 89 insertions(+), 12 deletions(-) create mode 100644 src/main/resources/api/response/EmbeddingListSourcesResponse.json create mode 100644 src/main/resources/api/response/EmbeddingRemoveFromStoreResponse.json diff --git a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java index d637302..c12e07e 100644 --- a/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java +++ b/src/main/java/org/mule/extension/mulechain/vectors/internal/operation/EmbeddingOperations.java @@ -397,9 +397,11 @@ public class EmbeddingOperations { jsonObject.put(Constants.JSON_KEY_SOURCES, sources); - - - return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + return createEmbeddingResponse( + jsonObject.toString(), + new HashMap() {{ + put("storeName", storeName); + }}); } catch (ModuleException me) { throw me; @@ -490,7 +492,6 @@ public class EmbeddingOperations { JSONArray sources = new JSONArray(); JSONObject contentObject; - String fullPath; for (EmbeddingMatch match : embeddingMatches) { Metadata matchMetadata = match.embedded().metadata(); @@ -508,7 +509,13 @@ public class EmbeddingOperations { jsonObject.put(Constants.JSON_KEY_SOURCES, sources); - return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + + return createEmbeddingResponse( + jsonObject.toString(), + new HashMap() {{ + put("storeName", storeName); + put("filter", searchFilterParams.getFilterJSONObject()); + }}); } catch (ModuleException me) { throw me; @@ -522,7 +529,6 @@ public class EmbeddingOperations { } } - /** * Retrieves and lists sources from the specified embedding store. * @@ -539,6 +545,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("EMBEDDING-list-sources") @Throws(EmbeddingErrorTypeProvider.class) + @OutputJsonType(schema = "api/response/EmbeddingListSourcesResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result listSourcesFromStore( String storeName, @Config Configuration configuration, @@ -560,7 +567,11 @@ public class EmbeddingOperations { JSONObject jsonObject = baseStore.listSources(); - return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + return createEmbeddingResponse( + jsonObject.toString(), + new HashMap() {{ + put("storeName", storeName); + }}); } catch (ModuleException me) { throw me; @@ -581,7 +592,7 @@ public class EmbeddingOperations { @MediaType(value = APPLICATION_JSON, strict = false) @Alias("EMBEDDING-remove-from-store-by-filter") @Throws(EmbeddingErrorTypeProvider.class) - + @OutputJsonType(schema = "api/response/EmbeddingRemoveFromStoreResponse.json") public org.mule.runtime.extension.api.runtime.operation.Result removeEmbeddingsByFilter( String storeName, @Config Configuration configuration, @@ -614,12 +625,14 @@ public class EmbeddingOperations { embeddingStore.removeAll(filter); JSONObject jsonObject = new JSONObject(); - jsonObject.put(Constants.JSON_KEY_STORE_NAME, storeName); jsonObject.put(Constants.JSON_KEY_STATUS, Constants.OPERATION_STATUS_DELETED); - //jsonObject.put("filter", removeFilterParams.getFilterJSONObject()); - - return createEmbeddingResponse(jsonObject.toString(), new HashMap<>()); + return createEmbeddingResponse( + jsonObject.toString(), + new HashMap() {{ + put("storeName", storeName); + put("filter", removeFilterParams.getFilterJSONObject()); + }}); } catch (ModuleException me) { throw me; diff --git a/src/main/resources/api/response/EmbeddingListSourcesResponse.json b/src/main/resources/api/response/EmbeddingListSourcesResponse.json new file mode 100644 index 0000000..b55c666 --- /dev/null +++ b/src/main/resources/api/response/EmbeddingListSourcesResponse.json @@ -0,0 +1,54 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "sourceCount": { + "type": "integer" + }, + "sources": { + "type": "array", + "items": { + "type": "object", + "properties": { + "ingestion_timestamp": { + "type": "integer", + "format": "int64" + }, + "source_id": { + "type": "string", + "format": "uuid" + }, + "source": { + "type": "string" + }, + "url": { + "type": "string" + }, + "title": { + "type": "string" + }, + "segmentCount": { + "type": "integer" + }, + "ingestion_datetime": { + "type": "string", + "format": "date-time" + }, + "absolute_directory_path": { + "type": "string", + "optional": true + }, + "file_name": { + "type": "string", + "optional": true + }, + "file_type": { + "type": "string" + } + } + }, + "minItems": 0 + } + }, + "required": ["sourceCount", "sources"] +} diff --git a/src/main/resources/api/response/EmbeddingRemoveFromStoreResponse.json b/src/main/resources/api/response/EmbeddingRemoveFromStoreResponse.json new file mode 100644 index 0000000..6f9dbf5 --- /dev/null +++ b/src/main/resources/api/response/EmbeddingRemoveFromStoreResponse.json @@ -0,0 +1,10 @@ +{ + "$schema": "http://json-schema.org/draft-07/schema#", + "type": "object", + "properties": { + "status": { + "type": "string" + } + }, + "required": ["status"] +}