From 900becab581555ac2be8fa79e8eb9b82ef9bf36c Mon Sep 17 00:00:00 2001 From: westey <164392973+westey-m@users.noreply.github.com> Date: Thu, 10 Oct 2024 16:49:55 +0100 Subject: [PATCH] .Net: Vector search rag sample (#9174) ### Motivation and Context We need an end to end RAG sample to show consumers how to ingest data and use an LLM to answer questions about the data. #7350 ### Description - Add a Demo app that shows how to load a PDF into a vector store of your choice and ask questions about the data. ### Contribution Checklist - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone :smile: --------- Co-authored-by: Mark Wallace <127216156+markwallace-microsoft@users.noreply.github.com> Co-authored-by: Dmytro Struk <13853051+dmytrostruk@users.noreply.github.com> Co-authored-by: Weihan Li <7604648+WeihanLi@users.noreply.github.com> Co-authored-by: Roger Barreto <19890735+RogerBarreto@users.noreply.github.com> --- dotnet/Directory.Packages.props | 1 + dotnet/SK-dotnet.sln | 11 +- .../Demos/VectorStoreRAG/DataLoader.cs | 86 +++++++++ .../Demos/VectorStoreRAG/IDataLoader.cs | 17 ++ .../Options/ApplicationConfig.cs | 71 ++++++++ .../Options/AzureAISearchConfig.cs | 19 ++ .../Options/AzureCosmosDBConfig.cs | 20 +++ .../Options/AzureOpenAIConfig.cs | 19 ++ .../Options/AzureOpenAIEmbeddingsConfig.cs | 19 ++ .../VectorStoreRAG/Options/QdrantConfig.cs | 22 +++ .../Demos/VectorStoreRAG/Options/RagConfig.cs | 25 +++ .../VectorStoreRAG/Options/RedisConfig.cs | 16 ++ .../VectorStoreRAG/Options/WeaviateConfig.cs | 16 ++ .../samples/Demos/VectorStoreRAG/Program.cs | 124 +++++++++++++ 
.../Demos/VectorStoreRAG/RAGChatService.cs | 170 ++++++++++++++++++ dotnet/samples/Demos/VectorStoreRAG/README.md | 141 +++++++++++++++ .../Demos/VectorStoreRAG/TextSnippet.cs | 27 +++ .../VectorStoreRAG/UniqueKeyGenerator.cs | 18 ++ .../VectorStoreRAG/VectorStoreRAG.csproj | 36 ++++ .../Demos/VectorStoreRAG/appsettings.json | 49 +++++ ...ureAISearchKernelBuilderExtensionsTests.cs | 4 - ...ISearchServiceCollectionExtensionsTests.cs | 4 - ...zureAISearchServiceCollectionExtensions.cs | 20 --- 23 files changed, 906 insertions(+), 29 deletions(-) create mode 100644 dotnet/samples/Demos/VectorStoreRAG/DataLoader.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/IDataLoader.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/ApplicationConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/AzureAISearchConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/AzureCosmosDBConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIEmbeddingsConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/QdrantConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/RagConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/RedisConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Options/WeaviateConfig.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/Program.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/RAGChatService.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/README.md create mode 100644 dotnet/samples/Demos/VectorStoreRAG/TextSnippet.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/UniqueKeyGenerator.cs create mode 100644 dotnet/samples/Demos/VectorStoreRAG/VectorStoreRAG.csproj create mode 100644 dotnet/samples/Demos/VectorStoreRAG/appsettings.json diff --git a/dotnet/Directory.Packages.props 
b/dotnet/Directory.Packages.props index 1f9410ac40af..3013d99d5f34 100644 --- a/dotnet/Directory.Packages.props +++ b/dotnet/Directory.Packages.props @@ -32,6 +32,7 @@ + diff --git a/dotnet/SK-dotnet.sln b/dotnet/SK-dotnet.sln index ea92dd3b4cad..56bb42eca153 100644 --- a/dotnet/SK-dotnet.sln +++ b/dotnet/SK-dotnet.sln @@ -359,7 +359,9 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Process.LocalRuntime", "src EndProject Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Process.UnitTests", "src\Experimental\Process.UnitTests\Process.UnitTests.csproj", "{21A32285-8443-4A75-B2E8-27E6090EC562}" EndProject -Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "GettingStartedWithProcesses", "samples\GettingStartedWithProcesses\GettingStartedWithProcesses.csproj", "{C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}" +Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "GettingStartedWithProcesses", "samples\GettingStartedWithProcesses\GettingStartedWithProcesses.csproj", "{C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VectorStoreRAG", "samples\Demos\VectorStoreRAG\VectorStoreRAG.csproj", "{28DFAF27-8FF3-4373-AAA4-2A6969C86246}" EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution @@ -926,6 +928,12 @@ Global {C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}.Publish|Any CPU.Build.0 = Debug|Any CPU {C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}.Release|Any CPU.ActiveCfg = Release|Any CPU {C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}.Release|Any CPU.Build.0 = Release|Any CPU + {28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Debug|Any CPU.Build.0 = Debug|Any CPU + {28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Publish|Any CPU.ActiveCfg = Debug|Any CPU + {28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Publish|Any CPU.Build.0 = Debug|Any CPU + {28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Release|Any CPU.ActiveCfg = Release|Any CPU + 
{28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Release|Any CPU.Build.0 = Release|Any CPU EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE @@ -1054,6 +1062,7 @@ Global {27AF60D6-86F5-4591-A700-4F8C93F41B11} = {0D8C6358-5DAA-4EA6-A924-C268A9A21BC9} {21A32285-8443-4A75-B2E8-27E6090EC562} = {0D8C6358-5DAA-4EA6-A924-C268A9A21BC9} {C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7} = {FA3720F1-C99A-49B2-9577-A940257098BF} + {28DFAF27-8FF3-4373-AAA4-2A6969C86246} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263} EndGlobalSection GlobalSection(ExtensibilityGlobals) = postSolution SolutionGuid = {FBDC56A3-86AD-4323-AA0F-201E59123B83} diff --git a/dotnet/samples/Demos/VectorStoreRAG/DataLoader.cs b/dotnet/samples/Demos/VectorStoreRAG/DataLoader.cs new file mode 100644 index 000000000000..a652295849a0 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/DataLoader.cs @@ -0,0 +1,86 @@ +// Copyright (c) Microsoft. All rights reserved. + +using Microsoft.SemanticKernel.Data; +using Microsoft.SemanticKernel.Embeddings; +using UglyToad.PdfPig; +using UglyToad.PdfPig.Content; +using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; + +namespace VectorStoreRAG; + +/// +/// Class that loads text from a PDF file into a vector store. +/// +/// The type of the data model key. +/// A function to generate unique keys with. +/// The collection to load the data into. +/// The service to use for generating embeddings from the text. +internal sealed class DataLoader( + UniqueKeyGenerator uniqueKeyGenerator, + IVectorStoreRecordCollection> vectorStoreRecordCollection, + ITextEmbeddingGenerationService textEmbeddingGenerationService) : IDataLoader where TKey : notnull +{ + /// + public async Task LoadPdf(string pdfPath, CancellationToken cancellationToken) + { + // Create the collection if it doesn't exist. 
+ await vectorStoreRecordCollection.CreateCollectionIfNotExistsAsync(cancellationToken).ConfigureAwait(false); + + // Load the paragraphs from the PDF file and split them into batches. + var sections = LoadParagraphs(pdfPath, cancellationToken); + var batches = sections.Chunk(10); + + // Process each batch of paragraphs. + foreach (var batch in batches) + { + // Map each paragraph to a TextSnippet and generate an embedding for it. + var recordTasks = batch.Select(async section => new TextSnippet + { + Key = uniqueKeyGenerator.GenerateKey(), + Text = section.ParagraphText, + ReferenceDescription = $"{new FileInfo(pdfPath).Name}#page={section.PageNumber}", + ReferenceLink = $"{new Uri(new FileInfo(pdfPath).FullName).AbsoluteUri}#page={section.PageNumber}", + TextEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(section.ParagraphText, cancellationToken: cancellationToken).ConfigureAwait(false) + }); + + // Upsert the records into the vector store. + var records = await Task.WhenAll(recordTasks).ConfigureAwait(false); + var upsertedKeys = vectorStoreRecordCollection.UpsertBatchAsync(records, cancellationToken: cancellationToken); + await foreach (var key in upsertedKeys.ConfigureAwait(false)) + { + Console.WriteLine($"Upserted record '{key}' into VectorDB"); + } + } + } + + /// + /// Read the text from each paragraph in the provided PDF file. + /// + /// The pdf file to read the paragraphs from. + /// The to monitor for cancellation requests. + /// The paragraphs from the pdf file, plus the page that they are on. 
+ private static IEnumerable<(string ParagraphText, int PageNumber)> LoadParagraphs(string pdfPath, CancellationToken cancellationToken) + { + using (PdfDocument document = PdfDocument.Open(pdfPath)) + { + foreach (Page page in document.GetPages()) + { + if (cancellationToken.IsCancellationRequested) + { + break; + } + + var blocks = DefaultPageSegmenter.Instance.GetBlocks(page.GetWords()); + foreach (var block in blocks) + { + if (cancellationToken.IsCancellationRequested) + { + break; + } + + yield return (ParagraphText: block.Text, PageNumber: page.Number); + } + } + } + } +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/IDataLoader.cs b/dotnet/samples/Demos/VectorStoreRAG/IDataLoader.cs new file mode 100644 index 000000000000..abd2eed7e8ef --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/IDataLoader.cs @@ -0,0 +1,17 @@ +// Copyright (c) Microsoft. All rights reserved. + +namespace VectorStoreRAG; + +/// +/// Interface for loading data into a data store. +/// +internal interface IDataLoader +{ + /// + /// Load the text from a PDF file into the data store. + /// + /// The pdf file to load. + /// The to monitor for cancellation requests. + /// An async task that completes when the loading is complete. + Task LoadPdf(string pdfPath, CancellationToken cancellationToken); +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/ApplicationConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/ApplicationConfig.cs new file mode 100644 index 000000000000..470fbce5008a --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/ApplicationConfig.cs @@ -0,0 +1,71 @@ +// Copyright (c) Microsoft. All rights reserved. + +using Microsoft.Extensions.Configuration; + +namespace VectorStoreRAG.Options; + +/// +/// Helper class to load all configuration settings for the VectorStoreRAG project. 
+/// +internal sealed class ApplicationConfig +{ + private readonly AzureOpenAIConfig _azureOpenAIConfig; + private readonly AzureOpenAIEmbeddingsConfig _azureOpenAIEmbeddingsConfig = new(); + private readonly RagConfig _ragConfig = new(); + private readonly AzureAISearchConfig _azureAISearchConfig = new(); + private readonly AzureCosmosDBConfig _azureCosmosDBMongoDBConfig = new(); + private readonly AzureCosmosDBConfig _azureCosmosDBNoSQLConfig = new(); + private readonly QdrantConfig _qdrantConfig = new(); + private readonly RedisConfig _redisConfig = new(); + private readonly WeaviateConfig _weaviateConfig = new(); + + public ApplicationConfig(ConfigurationManager configurationManager) + { + this._azureOpenAIConfig = new(); + configurationManager + .GetRequiredSection($"AIServices:{AzureOpenAIConfig.ConfigSectionName}") + .Bind(this._azureOpenAIConfig); + configurationManager + .GetRequiredSection($"AIServices:{AzureOpenAIEmbeddingsConfig.ConfigSectionName}") + .Bind(this._azureOpenAIEmbeddingsConfig); + configurationManager + .GetRequiredSection(RagConfig.ConfigSectionName) + .Bind(this._ragConfig); + configurationManager + .GetRequiredSection($"VectorStores:{AzureAISearchConfig.ConfigSectionName}") + .Bind(this._azureAISearchConfig); + configurationManager + .GetRequiredSection($"VectorStores:{AzureCosmosDBConfig.MongoDBConfigSectionName}") + .Bind(this._azureCosmosDBMongoDBConfig); + configurationManager + .GetRequiredSection($"VectorStores:{AzureCosmosDBConfig.NoSQLConfigSectionName}") + .Bind(this._azureCosmosDBNoSQLConfig); + configurationManager + .GetRequiredSection($"VectorStores:{QdrantConfig.ConfigSectionName}") + .Bind(this._qdrantConfig); + configurationManager + .GetRequiredSection($"VectorStores:{RedisConfig.ConfigSectionName}") + .Bind(this._redisConfig); + configurationManager + .GetRequiredSection($"VectorStores:{WeaviateConfig.ConfigSectionName}") + .Bind(this._weaviateConfig); + } + + public AzureOpenAIConfig AzureOpenAIConfig => 
this._azureOpenAIConfig; + + public AzureOpenAIEmbeddingsConfig AzureOpenAIEmbeddingsConfig => this._azureOpenAIEmbeddingsConfig; + + public RagConfig RagConfig => this._ragConfig; + + public AzureAISearchConfig AzureAISearchConfig => this._azureAISearchConfig; + + public AzureCosmosDBConfig AzureCosmosDBMongoDBConfig => this._azureCosmosDBMongoDBConfig; + + public AzureCosmosDBConfig AzureCosmosDBNoSQLConfig => this._azureCosmosDBNoSQLConfig; + + public QdrantConfig QdrantConfig => this._qdrantConfig; + + public RedisConfig RedisConfig => this._redisConfig; + + public WeaviateConfig WeaviateConfig => this._weaviateConfig; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/AzureAISearchConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/AzureAISearchConfig.cs new file mode 100644 index 000000000000..4d721593ce5a --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/AzureAISearchConfig.cs @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.ComponentModel.DataAnnotations; + +namespace VectorStoreRAG.Options; + +/// +/// Azure AI Search service settings. +/// +internal sealed class AzureAISearchConfig +{ + public const string ConfigSectionName = "AzureAISearch"; + + [Required] + public string Endpoint { get; set; } = string.Empty; + + [Required] + public string ApiKey { get; set; } = string.Empty; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/AzureCosmosDBConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/AzureCosmosDBConfig.cs new file mode 100644 index 000000000000..bf95e5219657 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/AzureCosmosDBConfig.cs @@ -0,0 +1,20 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.ComponentModel.DataAnnotations; + +namespace VectorStoreRAG.Options; + +/// +/// Azure CosmosDB service settings for use with AzureCosmosDBMongoDB and AzureCosmosDBNoSQL. 
+/// +internal sealed class AzureCosmosDBConfig +{ + public const string MongoDBConfigSectionName = "AzureCosmosDBMongoDB"; + public const string NoSQLConfigSectionName = "AzureCosmosDBNoSQL"; + + [Required] + public string ConnectionString { get; set; } = string.Empty; + + [Required] + public string DatabaseName { get; set; } = string.Empty; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIConfig.cs new file mode 100644 index 000000000000..a4422aca50f0 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIConfig.cs @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.ComponentModel.DataAnnotations; + +namespace VectorStoreRAG.Options; + +/// +/// Azure OpenAI service settings. +/// +internal sealed class AzureOpenAIConfig +{ + public const string ConfigSectionName = "AzureOpenAI"; + + [Required] + public string ChatDeploymentName { get; set; } = string.Empty; + + [Required] + public string Endpoint { get; set; } = string.Empty; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIEmbeddingsConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIEmbeddingsConfig.cs new file mode 100644 index 000000000000..f968a7b6827a --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIEmbeddingsConfig.cs @@ -0,0 +1,19 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.ComponentModel.DataAnnotations; + +namespace VectorStoreRAG.Options; + +/// +/// Azure OpenAI Embeddings service settings. 
+/// +internal sealed class AzureOpenAIEmbeddingsConfig +{ + public const string ConfigSectionName = "AzureOpenAIEmbeddings"; + + [Required] + public string DeploymentName { get; set; } = string.Empty; + + [Required] + public string Endpoint { get; set; } = string.Empty; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/QdrantConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/QdrantConfig.cs new file mode 100644 index 000000000000..013937f19023 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/QdrantConfig.cs @@ -0,0 +1,22 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.ComponentModel.DataAnnotations; + +namespace VectorStoreRAG.Options; + +/// +/// Qdrant service settings. +/// +internal sealed class QdrantConfig +{ + public const string ConfigSectionName = "Qdrant"; + + [Required] + public string Host { get; set; } = string.Empty; + + public int Port { get; set; } = 6334; + + public bool Https { get; set; } = false; + + public string ApiKey { get; set; } = string.Empty; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/RagConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/RagConfig.cs new file mode 100644 index 000000000000..b7a919d890bc --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/RagConfig.cs @@ -0,0 +1,25 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.ComponentModel.DataAnnotations; + +namespace VectorStoreRAG.Options; + +/// +/// Contains settings to control the RAG experience. +/// +internal sealed class RagConfig +{ + public const string ConfigSectionName = "Rag"; + + [Required] + public bool BuildCollection { get; set; } = true; + + [Required] + public string[]? 
PdfFilePaths { get; set; } + + [Required] + public string VectorStoreType { get; set; } = string.Empty; + + [Required] + public string CollectionName { get; set; } = string.Empty; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/RedisConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/RedisConfig.cs new file mode 100644 index 000000000000..ca7e481aa62f --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/RedisConfig.cs @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.ComponentModel.DataAnnotations; + +namespace VectorStoreRAG.Options; + +/// +/// Redis service settings. +/// +internal sealed class RedisConfig +{ + public const string ConfigSectionName = "Redis"; + + [Required] + public string ConnectionConfiguration { get; set; } = string.Empty; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Options/WeaviateConfig.cs b/dotnet/samples/Demos/VectorStoreRAG/Options/WeaviateConfig.cs new file mode 100644 index 000000000000..8c943313ab88 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Options/WeaviateConfig.cs @@ -0,0 +1,16 @@ +// Copyright (c) Microsoft. All rights reserved. + +using System.ComponentModel.DataAnnotations; + +namespace VectorStoreRAG.Options; + +/// +/// Weaviate service settings. +/// +internal sealed class WeaviateConfig +{ + public const string ConfigSectionName = "Weaviate"; + + [Required] + public string Endpoint { get; set; } = string.Empty; +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/Program.cs b/dotnet/samples/Demos/VectorStoreRAG/Program.cs new file mode 100644 index 000000000000..39fc7522f506 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/Program.cs @@ -0,0 +1,124 @@ +// Copyright (c) Microsoft. All rights reserved. 
+
+using System.Globalization;
+using Azure;
+using Azure.Identity;
+using Microsoft.Extensions.Configuration;
+using Microsoft.Extensions.DependencyInjection;
+using Microsoft.Extensions.Hosting;
+using Microsoft.SemanticKernel;
+using Microsoft.SemanticKernel.Data;
+using VectorStoreRAG;
+using VectorStoreRAG.Options;
+
+HostApplicationBuilder builder = Host.CreateApplicationBuilder(args);
+
+builder.Configuration
+    .AddUserSecrets<Program>(); // Values stored with `dotnet user-secrets` (see README) supply endpoints and keys.
+
+builder.Services.Configure<RagConfig>(builder.Configuration.GetSection(RagConfig.ConfigSectionName)); // Lets services request IOptions<RagConfig>.
+
+var appConfig = new ApplicationConfig(builder.Configuration); // Binds every settings section up front via GetRequiredSection.
+
+// Register the kernel with the dependency injection container and add the Azure OpenAI Chat Completion
+// and Text Embedding Generation services. Both authenticate with AzureCliCredential, so no API key is read.
+var kernelBuilder = builder.Services.AddKernel()
+    .AddAzureOpenAIChatCompletion(
+        appConfig.AzureOpenAIConfig.ChatDeploymentName,
+        appConfig.AzureOpenAIConfig.Endpoint,
+        new AzureCliCredential())
+    .AddAzureOpenAITextEmbeddingGeneration(
+        appConfig.AzureOpenAIEmbeddingsConfig.DeploymentName,
+        appConfig.AzureOpenAIEmbeddingsConfig.Endpoint,
+        new AzureCliCredential());
+
+// Add the configured vector store record collection type to the
+// dependency injection container.
+switch (appConfig.RagConfig.VectorStoreType) // One of the store names listed in the README; anything else is rejected below.
+{
+    case "AzureAISearch": // string record keys
+        kernelBuilder.AddAzureAISearchVectorStoreRecordCollection<TextSnippet<string>>(
+            appConfig.RagConfig.CollectionName,
+            new Uri(appConfig.AzureAISearchConfig.Endpoint),
+            new AzureKeyCredential(appConfig.AzureAISearchConfig.ApiKey));
+        break;
+    case "AzureCosmosDBMongoDB": // string record keys
+        kernelBuilder.AddAzureCosmosDBMongoDBVectorStoreRecordCollection<TextSnippet<string>>(
+            appConfig.RagConfig.CollectionName,
+            appConfig.AzureCosmosDBMongoDBConfig.ConnectionString,
+            appConfig.AzureCosmosDBMongoDBConfig.DatabaseName);
+        break;
+    case "AzureCosmosDBNoSQL": // string record keys
+        kernelBuilder.AddAzureCosmosDBNoSQLVectorStoreRecordCollection<TextSnippet<string>>(
+            appConfig.RagConfig.CollectionName,
+            appConfig.AzureCosmosDBNoSQLConfig.ConnectionString,
+            appConfig.AzureCosmosDBNoSQLConfig.DatabaseName);
+        break;
+    case "Qdrant": // Guid record keys
+        kernelBuilder.AddQdrantVectorStoreRecordCollection<Guid, TextSnippet<Guid>>(
+            appConfig.RagConfig.CollectionName,
+            appConfig.QdrantConfig.Host,
+            appConfig.QdrantConfig.Port,
+            appConfig.QdrantConfig.Https,
+            appConfig.QdrantConfig.ApiKey);
+        break;
+    case "Redis": // string record keys
+        kernelBuilder.AddRedisJsonVectorStoreRecordCollection<TextSnippet<string>>(
+            appConfig.RagConfig.CollectionName,
+            appConfig.RedisConfig.ConnectionConfiguration);
+        break;
+    case "Weaviate": // Guid record keys
+        kernelBuilder.AddWeaviateVectorStoreRecordCollection<TextSnippet<Guid>>(
+            // Weaviate collection names must start with an upper case letter.
+            char.ToUpper(appConfig.RagConfig.CollectionName[0], CultureInfo.InvariantCulture) + appConfig.RagConfig.CollectionName.Substring(1),
+            null,
+            new() { Endpoint = new Uri(appConfig.WeaviateConfig.Endpoint) });
+        break;
+    default:
+        throw new NotSupportedException($"Vector store type '{appConfig.RagConfig.VectorStoreType}' is not supported.");
+}
+
+// Register all the other required services.
+switch (appConfig.RagConfig.VectorStoreType) +{ + case "AzureAISearch": + case "AzureCosmosDBMongoDB": + case "AzureCosmosDBNoSQL": + case "Redis": + RegisterServices(builder, kernelBuilder, appConfig); + break; + case "Qdrant": + case "Weaviate": + RegisterServices(builder, kernelBuilder, appConfig); + break; + default: + throw new NotSupportedException($"Vector store type '{appConfig.RagConfig.VectorStoreType}' is not supported."); +} + +// Build and run the host. +using IHost host = builder.Build(); +await host.RunAsync().ConfigureAwait(false); + +static void RegisterServices(HostApplicationBuilder builder, IKernelBuilder kernelBuilder, ApplicationConfig vectorStoreRagConfig) + where TKey : notnull +{ + // Add a text search implementation that uses the registered vector store record collection for search. + kernelBuilder.AddVectorStoreTextSearch>( + new TextSearchStringMapper((result) => (result as TextSnippet)!.Text!), + new TextSearchResultMapper((result) => + { + // Create a mapping from the Vector Store data type to the data type returned by the Text Search. + // This text search will ultimately be used in a plugin and this TextSearchResult will be returned to the prompt template + // when the plugin is invoked from the prompt template. + var castResult = result as TextSnippet; + return new TextSearchResult(value: castResult!.Text!) { Name = castResult.ReferenceDescription, Link = castResult.ReferenceLink }; + })); + + // Add the key generator and data loader to the dependency injection container. + builder.Services.AddSingleton>(new UniqueKeyGenerator(() => Guid.NewGuid())); + builder.Services.AddSingleton>(new UniqueKeyGenerator(() => Guid.NewGuid().ToString())); + builder.Services.AddSingleton>(); + + // Add the main service for this application. 
+ builder.Services.AddHostedService>(); +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/RAGChatService.cs b/dotnet/samples/Demos/VectorStoreRAG/RAGChatService.cs new file mode 100644 index 000000000000..0b0bf875c1c0 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/RAGChatService.cs @@ -0,0 +1,170 @@ +// Copyright (c) Microsoft. All rights reserved. + +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Options; +using Microsoft.SemanticKernel; +using Microsoft.SemanticKernel.Data; +using Microsoft.SemanticKernel.PromptTemplates.Handlebars; +using VectorStoreRAG.Options; + +namespace VectorStoreRAG; + +/// +/// Main service class for the application. +/// +/// The type of the data model key. +/// Used to load data into the vector store. +/// Used to search the vector store. +/// Used to make requests to the LLM. +/// The configuration options for the application. +internal sealed class RAGChatService( + IDataLoader dataLoader, + VectorStoreTextSearch> vectorStoreTextSearch, + Kernel kernel, + IOptions ragConfigOptions) : IHostedService +{ + private Task? _dataLoaded; + private Task? _chatLoop; + + /// + /// Start the service. + /// + /// The to monitor for cancellation requests. + /// An async task that completes when the service is started. + public Task StartAsync(CancellationToken cancellationToken) + { + // Start to load all the configured PDFs into the vector store. + if (ragConfigOptions.Value.BuildCollection) + { + this._dataLoaded = this.LoadDataAsync(cancellationToken); + } + else + { + this._dataLoaded = Task.CompletedTask; + } + + // Start the chat loop. + this._chatLoop = this.ChatLoopAsync(cancellationToken); + + return Task.CompletedTask; + } + + /// + /// Stop the service. + /// + /// The to monitor for cancellation requests. + /// An async task that completes when the service is stopped. 
+ public Task StopAsync(CancellationToken cancellationToken) + { + return Task.CompletedTask; + } + + /// + /// Contains the main chat loop for the application. + /// + /// The to monitor for cancellation requests. + /// An async task that completes when the chat loop is shut down. + private async Task ChatLoopAsync(CancellationToken cancellationToken) + { + var pdfFiles = string.Join(", ", ragConfigOptions.Value.PdfFilePaths ?? []); + + // Wait for the data to be loaded before starting the chat loop. + while (this._dataLoaded != null && !this._dataLoaded.IsCompleted && !cancellationToken.IsCancellationRequested) + { + await Task.Delay(1_000, cancellationToken).ConfigureAwait(false); + } + + // If data loading failed, don't start the chat loop. + if (this._dataLoaded != null && this._dataLoaded.IsFaulted) + { + Console.WriteLine("Failed to load data"); + return; + } + + Console.WriteLine("PDF loading complete\n"); + + // Add a search plugin to the kernel which we will use in the template below + // to do a vector search for related information to the user query. + kernel.Plugins.Add(vectorStoreTextSearch.CreateWithGetTextSearchResults("SearchPlugin")); + + // Start the chat loop. + while (!cancellationToken.IsCancellationRequested) + { + // Prompt the user for a question. + Console.ForegroundColor = ConsoleColor.Green; + Console.WriteLine($"Assistant > What would you like to know from the loaded PDFs: ({pdfFiles})?"); + + // Read the user question. + Console.ForegroundColor = ConsoleColor.White; + Console.Write("User > "); + var question = Console.ReadLine(); + + // Invoke the LLM with a template that uses the search plugin to + // 1. get related information to the user query from the vector store + // 2. add the information to the LLM prompt. 
+ var response = kernel.InvokePromptStreamingAsync( + promptTemplate: """ + Please use this information to answer the question: + {{#with (SearchPlugin-GetTextSearchResults question)}} + {{#each this}} + Name: {{Name}} + Value: {{Value}} + Link: {{Link}} + ----------------- + {{/each}} + {{/with}} + + Include citations to the relevant information where it is referenced in the response. + + Question: {{question}} + """, + arguments: new KernelArguments() + { + { "question", question }, + }, + templateFormat: "handlebars", + promptTemplateFactory: new HandlebarsPromptTemplateFactory(), + cancellationToken: cancellationToken); + + // Stream the LLM response to the console with error handling. + Console.ForegroundColor = ConsoleColor.Green; + Console.Write("\nAssistant > "); + + try + { + await foreach (var message in response.ConfigureAwait(false)) + { + Console.Write(message); + } + Console.WriteLine(); + } + catch (Exception ex) + { + Console.ForegroundColor = ConsoleColor.Red; + Console.WriteLine($"Call to LLM failed with error: {ex}"); + } + } + } + + /// + /// Load all configured PDFs into the vector store. + /// + /// The to monitor for cancellation requests. + /// An async task that completes when the loading is complete. + private async Task LoadDataAsync(CancellationToken cancellationToken) + { + try + { + foreach (var pdfFilePath in ragConfigOptions.Value.PdfFilePaths ?? 
[]) + { + Console.WriteLine($"Loading PDF into vector store: {pdfFilePath}"); + await dataLoader.LoadPdf(pdfFilePath, cancellationToken).ConfigureAwait(false); + } + } + catch (Exception ex) + { + Console.WriteLine($"Failed to load PDFs: {ex}"); + throw; + } + } +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/README.md b/dotnet/samples/Demos/VectorStoreRAG/README.md new file mode 100644 index 000000000000..bd7abc41ae30 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/README.md @@ -0,0 +1,141 @@ +# Vector Store RAG Demo + +This sample demonstrates how to ingest text from pdf files into a vector store and ask questions about the content +using an LLM while using RAG to supplement the LLM with additional information from the vector store. + +## Configuring the Sample + +The sample can be configured in various ways: + +1. You can choose your preferred vector store by setting the `Rag:VectorStoreType` configuration setting in the `appsettings.json` file to one of the following values: + 1. AzureAISearch + 1. AzureCosmosDBMongoDB + 1. AzureCosmosDBNoSQL + 1. Qdrant + 1. Redis + 1. Weaviate +1. You can choose whether to load data into the vector store by setting the `Rag:BuildCollection` configuration setting in the `appsettings.json` file to `true`. If you set this to `false`, the sample will assume that data was already loaded previously and it will go straight into the chat experience. +1. You can choose the name of the collection to use by setting the `Rag:CollectionName` configuration setting in the `appsettings.json` file. +1. You can choose the pdf file to load into the vector store by setting the `Rag:PdfFilePaths` array in the `appsettings.json` file. + +## Dependency Setup + +To run this sample, you need to setup your source data, setup your vector store and AI services, and setup secrets for these. + +### Source PDF File + +You will need to supply some source pdf files to load into the vector store. 
+Once you have a file ready, update the `PdfFilePaths` array in the `appsettings.json` file with the path to the file. + +```json +{ + "Rag": { + "PdfFilePaths": [ "sourcedocument.pdf" ] + } +} +``` + +Why not try the Semantic Kernel documentation as your input? +You can download it as a PDF from the https://learn.microsoft.com/en-us/semantic-kernel/overview/ page. +See the Download PDF button at the bottom of the page. + +### Azure OpenAI + +For Azure OpenAI, you need to add the following secret: + +```cli +dotnet user-secrets set "AIServices:AzureOpenAI:Endpoint" "https://<your-resource-name>.openai.azure.com" +``` + +Note that the code doesn't use an API key to communicate with Azure OpenAI, but rather an `AzureCliCredential`, so no API key secret is required. + +### Azure OpenAI Embeddings + +For Azure OpenAI Embeddings, you need to add the following secret: + +```cli +dotnet user-secrets set "AIServices:AzureOpenAIEmbeddings:Endpoint" "https://<your-resource-name>.openai.azure.com" +``` + +Note that the code doesn't use an API key to communicate with Azure OpenAI, but rather an `AzureCliCredential`, so no API key secret is required. 
+ +### Azure AI Search + +If you want to use Azure AI Search as your vector store, you will need to create an instance of Azure AI Search and add +the following secrets here: + +```cli +dotnet user-secrets set "VectorStores:AzureAISearch:Endpoint" "https://<your-search-service-name>.search.windows.net" +dotnet user-secrets set "VectorStores:AzureAISearch:ApiKey" "<your-api-key>" +``` + +### Azure CosmosDB MongoDB + +If you want to use Azure CosmosDB MongoDB as your vector store, you will need to create an instance of Azure CosmosDB MongoDB and add +the following secrets here: + +```cli +dotnet user-secrets set "VectorStores:AzureCosmosDBMongoDB:ConnectionString" "<your-connection-string>" +dotnet user-secrets set "VectorStores:AzureCosmosDBMongoDB:DatabaseName" "<your-database-name>" +``` + +### Azure CosmosDB NoSQL + +If you want to use Azure CosmosDB NoSQL as your vector store, you will need to create an instance of Azure CosmosDB NoSQL and add +the following secrets here: + +```cli +dotnet user-secrets set "VectorStores:AzureCosmosDBNoSQL:ConnectionString" "<your-connection-string>" +dotnet user-secrets set "VectorStores:AzureCosmosDBNoSQL:DatabaseName" "<your-database-name>" +``` + +### Qdrant + +If you want to use Qdrant as your vector store, you will need to have an instance of Qdrant available. + +You can use the following command to start a Qdrant instance in docker, and this will work with the default configured settings: + +```cli +docker run -d --name qdrant -p 6333:6333 -p 6334:6334 qdrant/qdrant:latest +``` + +If you want to use a different instance of Qdrant, you can update the appsettings.json file or add the following secrets to reconfigure: + +```cli +dotnet user-secrets set "VectorStores:Qdrant:Host" "<your-qdrant-host>" +dotnet user-secrets set "VectorStores:Qdrant:Port" "6334" +dotnet user-secrets set "VectorStores:Qdrant:Https" "true" +dotnet user-secrets set "VectorStores:Qdrant:ApiKey" "<your-api-key>" +``` + +### Redis + +If you want to use Redis as your vector store, you will need to have an instance of Redis available.
+ +You can use the following command to start a Redis instance in docker, and this will work with the default configured settings: + +```cli +docker run -d --name redis-stack -p 6379:6379 -p 8001:8001 redis/redis-stack:latest +``` + +If you want to use a different instance of Redis, you can update the appsettings.json file or add the following secret to reconfigure: + +```cli +dotnet user-secrets set "VectorStores:Redis:ConnectionConfiguration" "" +``` + +### Weaviate + +If you want to use Weaviate as your vector store, you will need to have an instance of Weaviate available. + +You can use the following command to start a Weaviate instance in docker, and this will work with the default configured settings: + +```cli +docker run -d --name weaviate -p 8080:8080 -p 50051:50051 cr.weaviate.io/semitechnologies/weaviate:1.26.4 +``` + +If you want to use a different instance of Weaviate, you can update the appsettings.json file or add the following secret to reconfigure: + +```cli +dotnet user-secrets set "VectorStores:Weaviate:Endpoint" "" +``` diff --git a/dotnet/samples/Demos/VectorStoreRAG/TextSnippet.cs b/dotnet/samples/Demos/VectorStoreRAG/TextSnippet.cs new file mode 100644 index 000000000000..d1c83041d67b --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/TextSnippet.cs @@ -0,0 +1,27 @@ +// Copyright (c) Microsoft. All rights reserved. + +using Microsoft.SemanticKernel.Data; + +namespace VectorStoreRAG; + +/// <summary> +/// Data model for storing a section of text with an embedding and an optional reference link. +/// </summary> +/// <typeparam name="TKey">The type of the data model key.</typeparam> +internal sealed class TextSnippet<TKey> +{ + [VectorStoreRecordKey] + public required TKey Key { get; set; } + + [VectorStoreRecordData] + public string? Text { get; set; } + + [VectorStoreRecordData] + public string? ReferenceDescription { get; set; } + + [VectorStoreRecordData] + public string?
ReferenceLink { get; set; } + + [VectorStoreRecordVector(1536)] + public ReadOnlyMemory<float> TextEmbedding { get; set; } +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/UniqueKeyGenerator.cs b/dotnet/samples/Demos/VectorStoreRAG/UniqueKeyGenerator.cs new file mode 100644 index 000000000000..7572eb8d7825 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/UniqueKeyGenerator.cs @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft. All rights reserved. + +namespace VectorStoreRAG; + +/// <summary> +/// Class for generating unique keys via a provided function. +/// </summary> +/// <typeparam name="TKey">The type of key to generate.</typeparam> +/// <param name="generator">The function to generate the key with.</param> +internal sealed class UniqueKeyGenerator<TKey>(Func<TKey> generator) + where TKey : notnull +{ + /// <summary> + /// Generate a unique key. + /// </summary> + /// <returns>The unique key that was generated.</returns> + public TKey GenerateKey() => generator(); +} diff --git a/dotnet/samples/Demos/VectorStoreRAG/VectorStoreRAG.csproj b/dotnet/samples/Demos/VectorStoreRAG/VectorStoreRAG.csproj new file mode 100644 index 000000000000..fd80cbe8c047 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/VectorStoreRAG.csproj @@ -0,0 +1,36 @@ + + + + Exe + net8.0 + enable + enable + $(NoWarn);SKEXP0001;SKEXP0010;SKEXP0020 + c4203b00-7179-47c1-8701-ee352e381412 + + + + + + + + + + + + + + + + + + + + + + + + PreserveNewest + + + diff --git a/dotnet/samples/Demos/VectorStoreRAG/appsettings.json b/dotnet/samples/Demos/VectorStoreRAG/appsettings.json new file mode 100644 index 000000000000..47a1daa9d4d4 --- /dev/null +++ b/dotnet/samples/Demos/VectorStoreRAG/appsettings.json @@ -0,0 +1,49 @@ +{ + "Logging": { + "LogLevel": { + "Default": "None" + } + }, + "AIServices": { + "AzureOpenAI": { + "Endpoint": "", + "ChatDeploymentName": "gpt-4" + }, + "AzureOpenAIEmbeddings": { + "Endpoint": "", + "DeploymentName": "text-embedding-ada-002" + } + }, + "VectorStores": { + "AzureAISearch": { + "Endpoint": "", + "ApiKey": "" + }, + "AzureCosmosDBMongoDB": { + "ConnectionString": "", + "DatabaseName": "" + }, +
"AzureCosmosDBNoSQL": { + "ConnectionString": "", + "DatabaseName": "" + }, + "Qdrant": { + "Host": "localhost", + "Port": 6334, + "Https": false, + "ApiKey": "" + }, + "Redis": { + "ConnectionConfiguration": "localhost:6379" + }, + "Weaviate": { + "Endpoint": "http://localhost:8080/v1/" + } + }, + "Rag": { + "BuildCollection": true, + "PdfFilePaths": [ "sourcedocument.pdf" ], + "VectorStoreType": "Qdrant", + "CollectionName": "pdfcontent" + } +} diff --git a/dotnet/src/Connectors/Connectors.AzureAISearch.UnitTests/AzureAISearchKernelBuilderExtensionsTests.cs b/dotnet/src/Connectors/Connectors.AzureAISearch.UnitTests/AzureAISearchKernelBuilderExtensionsTests.cs index 1f1308a227d1..57194a256cc8 100644 --- a/dotnet/src/Connectors/Connectors.AzureAISearch.UnitTests/AzureAISearchKernelBuilderExtensionsTests.cs +++ b/dotnet/src/Connectors/Connectors.AzureAISearch.UnitTests/AzureAISearchKernelBuilderExtensionsTests.cs @@ -110,10 +110,6 @@ private void AssertVectorStoreRecordCollectionCreated() var vectorizedSearch = kernel.Services.GetRequiredService>(); Assert.NotNull(vectorizedSearch); Assert.IsType>(vectorizedSearch); - - var vectorizableSearch = kernel.Services.GetRequiredService>(); - Assert.NotNull(vectorizableSearch); - Assert.IsType>(vectorizableSearch); } #pragma warning disable CA1812 // Avoid uninstantiated internal classes diff --git a/dotnet/src/Connectors/Connectors.AzureAISearch.UnitTests/AzureAISearchServiceCollectionExtensionsTests.cs b/dotnet/src/Connectors/Connectors.AzureAISearch.UnitTests/AzureAISearchServiceCollectionExtensionsTests.cs index 5739b307a9ac..0ea5403151bc 100644 --- a/dotnet/src/Connectors/Connectors.AzureAISearch.UnitTests/AzureAISearchServiceCollectionExtensionsTests.cs +++ b/dotnet/src/Connectors/Connectors.AzureAISearch.UnitTests/AzureAISearchServiceCollectionExtensionsTests.cs @@ -110,10 +110,6 @@ private void AssertVectorStoreRecordCollectionCreated() var vectorizedSearch = serviceProvider.GetRequiredService>(); 
Assert.NotNull(vectorizedSearch); Assert.IsType>(vectorizedSearch); - - var vectorizableSearch = serviceProvider.GetRequiredService>(); - Assert.NotNull(vectorizableSearch); - Assert.IsType>(vectorizableSearch); } #pragma warning disable CA1812 // Avoid uninstantiated internal classes diff --git a/dotnet/src/Connectors/Connectors.Memory.AzureAISearch/AzureAISearchServiceCollectionExtensions.cs b/dotnet/src/Connectors/Connectors.Memory.AzureAISearch/AzureAISearchServiceCollectionExtensions.cs index f5d418317fd9..cae20446716d 100644 --- a/dotnet/src/Connectors/Connectors.Memory.AzureAISearch/AzureAISearchServiceCollectionExtensions.cs +++ b/dotnet/src/Connectors/Connectors.Memory.AzureAISearch/AzureAISearchServiceCollectionExtensions.cs @@ -140,7 +140,6 @@ public static IServiceCollection AddAzureAISearchVectorStoreRecordCollection(services, serviceId); - AddVectorizableTextSearch(services, serviceId); return services; } @@ -185,7 +184,6 @@ public static IServiceCollection AddAzureAISearchVectorStoreRecordCollection(services, serviceId); - AddVectorizableTextSearch(services, serviceId); return services; } @@ -230,7 +228,6 @@ public static IServiceCollection AddAzureAISearchVectorStoreRecordCollection(services, serviceId); - AddVectorizableTextSearch(services, serviceId); return services; } @@ -252,23 +249,6 @@ private static void AddVectorizedSearch(IServiceCollection services, st }); } - /// - /// Also register the with the given as a . - /// - /// The type of the data model that the collection should contain. - /// The service collection to register on. - /// The service id that the registrations should use. - private static void AddVectorizableTextSearch(IServiceCollection services, string? 
serviceId) - where TRecord : class - { - services.AddKeyedTransient>( - serviceId, - (sp, obj) => - { - return (sp.GetRequiredKeyedService>(serviceId) as IVectorizableTextSearch)!; - }); - } - /// /// Build a instance, using the provided if it's not null and add the SK user agent string. ///