-
Notifications
You must be signed in to change notification settings - Fork 3.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
.Net: Vector search rag sample (#9174)
### Motivation and Context We need an end to end RAG sample to show consumers how to ingest data and use an LLM to answer questions about the data. #7350 ### Description - Add a Demo app that shows how to load a PDF into a vector store of your choice and ask questions about the data. ### Contribution Checklist <!-- Before submitting this PR, please make sure: --> - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone 😄 --------- Co-authored-by: Mark Wallace <[email protected]> Co-authored-by: Dmytro Struk <[email protected]> Co-authored-by: Weihan Li <[email protected]> Co-authored-by: Roger Barreto <[email protected]>
- Loading branch information
1 parent
cd40e2e
commit 900beca
Showing
23 changed files
with
906 additions
and
29 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,86 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.SemanticKernel.Data; | ||
using Microsoft.SemanticKernel.Embeddings; | ||
using UglyToad.PdfPig; | ||
using UglyToad.PdfPig.Content; | ||
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter; | ||
|
||
namespace VectorStoreRAG; | ||
|
||
/// <summary>
/// Loads the text of a PDF file into a vector store, one record per paragraph.
/// </summary>
/// <typeparam name="TKey">The type of the data model key.</typeparam>
/// <param name="uniqueKeyGenerator">A function to generate unique keys with.</param>
/// <param name="vectorStoreRecordCollection">The collection to load the data into.</param>
/// <param name="textEmbeddingGenerationService">The service to use for generating embeddings from the text.</param>
internal sealed class DataLoader<TKey>(
    UniqueKeyGenerator<TKey> uniqueKeyGenerator,
    IVectorStoreRecordCollection<TKey, TextSnippet<TKey>> vectorStoreRecordCollection,
    ITextEmbeddingGenerationService textEmbeddingGenerationService) : IDataLoader where TKey : notnull
{
    /// <summary>Number of paragraphs that are embedded and upserted together.</summary>
    private const int BatchSize = 10;

    /// <inheritdoc/>
    /// <exception cref="OperationCanceledException">
    /// Cancellation was requested through <paramref name="cancellationToken"/> while the PDF was being read.
    /// </exception>
    public async Task LoadPdf(string pdfPath, CancellationToken cancellationToken)
    {
        // Create the collection if it doesn't exist.
        await vectorStoreRecordCollection.CreateCollectionIfNotExistsAsync(cancellationToken).ConfigureAwait(false);

        // The file name and file URI are identical for every paragraph of the document,
        // so resolve them once instead of building a FileInfo twice per paragraph.
        var pdfFile = new FileInfo(pdfPath);
        var fileName = pdfFile.Name;
        var fileUri = new Uri(pdfFile.FullName).AbsoluteUri;

        // Read the paragraphs lazily and process them in fixed-size batches.
        foreach (var batch in LoadParagraphs(pdfPath, cancellationToken).Chunk(BatchSize))
        {
            // Map each paragraph to a TextSnippet and generate an embedding for it.
            // The embeddings of one batch are requested concurrently.
            var recordTasks = batch.Select(async section => new TextSnippet<TKey>
            {
                Key = uniqueKeyGenerator.GenerateKey(),
                Text = section.ParagraphText,
                ReferenceDescription = $"{fileName}#page={section.PageNumber}",
                ReferenceLink = $"{fileUri}#page={section.PageNumber}",
                TextEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(section.ParagraphText, cancellationToken: cancellationToken).ConfigureAwait(false)
            });

            // Upsert the records into the vector store and report each key that was written.
            var records = await Task.WhenAll(recordTasks).ConfigureAwait(false);
            var upsertedKeys = vectorStoreRecordCollection.UpsertBatchAsync(records, cancellationToken: cancellationToken);
            await foreach (var key in upsertedKeys.ConfigureAwait(false))
            {
                Console.WriteLine($"Upserted record '{key}' into VectorDB");
            }
        }
    }

    /// <summary>
    /// Read the text from each paragraph in the provided PDF file.
    /// </summary>
    /// <remarks>
    /// This is a lazy iterator: the document is opened when enumeration starts and disposed when
    /// enumeration ends, whether it finishes normally, is abandoned, or fails with an exception.
    /// </remarks>
    /// <param name="pdfPath">The pdf file to read the paragraphs from.</param>
    /// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests.</param>
    /// <returns>The paragraphs from the pdf file, plus the page that they are on.</returns>
    /// <exception cref="OperationCanceledException">Cancellation was requested during enumeration.</exception>
    private static IEnumerable<(string ParagraphText, int PageNumber)> LoadParagraphs(string pdfPath, CancellationToken cancellationToken)
    {
        using PdfDocument document = PdfDocument.Open(pdfPath);

        foreach (Page page in document.GetPages())
        {
            // Throw rather than silently stopping: ending the sequence early would make a
            // cancelled load look like a completed one to the caller.
            cancellationToken.ThrowIfCancellationRequested();

            // Each block returned by the default page segmenter is treated as one paragraph.
            foreach (var block in DefaultPageSegmenter.Instance.GetBlocks(page.GetWords()))
            {
                cancellationToken.ThrowIfCancellationRequested();

                yield return (ParagraphText: block.Text, PageNumber: page.Number);
            }
        }
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,17 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
namespace VectorStoreRAG; | ||
|
||
/// <summary>
/// Contract for components that ingest documents into a data store.
/// </summary>
internal interface IDataLoader
{
    /// <summary>
    /// Reads the text of the given PDF file and loads it into the data store.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF file to load.</param>
    /// <param name="cancellationToken">A <see cref="CancellationToken"/> used to observe cancellation requests.</param>
    /// <returns>A task that completes once loading has finished.</returns>
    Task LoadPdf(string pdfPath, CancellationToken cancellationToken);
}
71 changes: 71 additions & 0 deletions
71
dotnet/samples/Demos/VectorStoreRAG/Options/ApplicationConfig.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using Microsoft.Extensions.Configuration; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Helper class to load all configuration settings for the VectorStoreRAG project.
/// </summary>
/// <remarks>
/// Every settings object is created up front and then populated by the constructor.
/// All sections are read with <c>GetRequiredSection</c>, so each one must be present in the
/// configuration regardless of which vector store is actually used.
/// </remarks>
internal sealed class ApplicationConfig
{
    /// <summary>
    /// Binds every configuration section used by the application to its settings object.
    /// </summary>
    /// <param name="configurationManager">The configuration source to read the sections from.</param>
    /// <exception cref="InvalidOperationException">A required configuration section is missing.</exception>
    public ApplicationConfig(ConfigurationManager configurationManager)
    {
        // AI services live under the "AIServices" parent section.
        configurationManager
            .GetRequiredSection($"AIServices:{AzureOpenAIConfig.ConfigSectionName}")
            .Bind(this.AzureOpenAIConfig);
        configurationManager
            .GetRequiredSection($"AIServices:{AzureOpenAIEmbeddingsConfig.ConfigSectionName}")
            .Bind(this.AzureOpenAIEmbeddingsConfig);

        // RAG settings are a top-level section.
        configurationManager
            .GetRequiredSection(RagConfig.ConfigSectionName)
            .Bind(this.RagConfig);

        // Vector store connections live under the "VectorStores" parent section.
        configurationManager
            .GetRequiredSection($"VectorStores:{AzureAISearchConfig.ConfigSectionName}")
            .Bind(this.AzureAISearchConfig);
        configurationManager
            .GetRequiredSection($"VectorStores:{AzureCosmosDBConfig.MongoDBConfigSectionName}")
            .Bind(this.AzureCosmosDBMongoDBConfig);
        configurationManager
            .GetRequiredSection($"VectorStores:{AzureCosmosDBConfig.NoSQLConfigSectionName}")
            .Bind(this.AzureCosmosDBNoSQLConfig);
        configurationManager
            .GetRequiredSection($"VectorStores:{QdrantConfig.ConfigSectionName}")
            .Bind(this.QdrantConfig);
        configurationManager
            .GetRequiredSection($"VectorStores:{RedisConfig.ConfigSectionName}")
            .Bind(this.RedisConfig);
        configurationManager
            .GetRequiredSection($"VectorStores:{WeaviateConfig.ConfigSectionName}")
            .Bind(this.WeaviateConfig);
    }

    /// <summary>Gets the Azure OpenAI settings bound from the "AIServices" section.</summary>
    public AzureOpenAIConfig AzureOpenAIConfig { get; } = new();

    /// <summary>Gets the Azure OpenAI embeddings settings bound from the "AIServices" section.</summary>
    public AzureOpenAIEmbeddingsConfig AzureOpenAIEmbeddingsConfig { get; } = new();

    /// <summary>Gets the RAG settings bound from the top-level RAG section.</summary>
    public RagConfig RagConfig { get; } = new();

    /// <summary>Gets the Azure AI Search settings bound from the "VectorStores" section.</summary>
    public AzureAISearchConfig AzureAISearchConfig { get; } = new();

    /// <summary>Gets the Azure CosmosDB settings bound from the MongoDB entry of the "VectorStores" section.</summary>
    public AzureCosmosDBConfig AzureCosmosDBMongoDBConfig { get; } = new();

    /// <summary>Gets the Azure CosmosDB settings bound from the NoSQL entry of the "VectorStores" section.</summary>
    public AzureCosmosDBConfig AzureCosmosDBNoSQLConfig { get; } = new();

    /// <summary>Gets the Qdrant settings bound from the "VectorStores" section.</summary>
    public QdrantConfig QdrantConfig { get; } = new();

    /// <summary>Gets the Redis settings bound from the "VectorStores" section.</summary>
    public RedisConfig RedisConfig { get; } = new();

    /// <summary>Gets the Weaviate settings bound from the "VectorStores" section.</summary>
    public WeaviateConfig WeaviateConfig { get; } = new();
}
19 changes: 19 additions & 0 deletions
19
dotnet/samples/Demos/VectorStoreRAG/Options/AzureAISearchConfig.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.ComponentModel.DataAnnotations; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Settings for connecting to the Azure AI Search service.
/// </summary>
internal sealed class AzureAISearchConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "AzureAISearch";

    /// <summary>Gets or sets the service endpoint. Defaults to an empty string.</summary>
    [Required]
    public string Endpoint { get; set; } = string.Empty;

    /// <summary>Gets or sets the API key. Defaults to an empty string.</summary>
    [Required]
    public string ApiKey { get; set; } = string.Empty;
}
20 changes: 20 additions & 0 deletions
20
dotnet/samples/Demos/VectorStoreRAG/Options/AzureCosmosDBConfig.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.ComponentModel.DataAnnotations; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Azure CosmosDB connection settings, shared by the AzureCosmosDBMongoDB and AzureCosmosDBNoSQL sections.
/// </summary>
internal sealed class AzureCosmosDBConfig
{
    /// <summary>Name of the configuration section that holds the MongoDB variant of these settings.</summary>
    public const string MongoDBConfigSectionName = "AzureCosmosDBMongoDB";

    /// <summary>Name of the configuration section that holds the NoSQL variant of these settings.</summary>
    public const string NoSQLConfigSectionName = "AzureCosmosDBNoSQL";

    /// <summary>Gets or sets the connection string. Defaults to an empty string.</summary>
    [Required]
    public string ConnectionString { get; set; } = string.Empty;

    /// <summary>Gets or sets the database name. Defaults to an empty string.</summary>
    [Required]
    public string DatabaseName { get; set; } = string.Empty;
}
19 changes: 19 additions & 0 deletions
19
dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIConfig.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.ComponentModel.DataAnnotations; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Settings for the Azure OpenAI service.
/// </summary>
internal sealed class AzureOpenAIConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "AzureOpenAI";

    /// <summary>Gets or sets the chat deployment name. Defaults to an empty string.</summary>
    [Required]
    public string ChatDeploymentName { get; set; } = string.Empty;

    /// <summary>Gets or sets the service endpoint. Defaults to an empty string.</summary>
    [Required]
    public string Endpoint { get; set; } = string.Empty;
}
19 changes: 19 additions & 0 deletions
19
dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIEmbeddingsConfig.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.ComponentModel.DataAnnotations; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Settings for the Azure OpenAI Embeddings service.
/// </summary>
internal sealed class AzureOpenAIEmbeddingsConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "AzureOpenAIEmbeddings";

    /// <summary>Gets or sets the embeddings deployment name. Defaults to an empty string.</summary>
    [Required]
    public string DeploymentName { get; set; } = string.Empty;

    /// <summary>Gets or sets the service endpoint. Defaults to an empty string.</summary>
    [Required]
    public string Endpoint { get; set; } = string.Empty;
}
22 changes: 22 additions & 0 deletions
22
dotnet/samples/Demos/VectorStoreRAG/Options/QdrantConfig.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.ComponentModel.DataAnnotations; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Settings for connecting to a Qdrant service.
/// </summary>
internal sealed class QdrantConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "Qdrant";

    /// <summary>Gets or sets the host name. Defaults to an empty string.</summary>
    [Required]
    public string Host { get; set; } = string.Empty;

    /// <summary>Gets or sets the port. Defaults to 6334.</summary>
    public int Port { get; set; } = 6334;

    /// <summary>Gets or sets a value indicating whether HTTPS is used. Defaults to <see langword="false"/>.</summary>
    public bool Https { get; set; }

    /// <summary>Gets or sets the API key. Optional; defaults to an empty string.</summary>
    public string ApiKey { get; set; } = string.Empty;
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.ComponentModel.DataAnnotations; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Settings that control the RAG experience.
/// </summary>
internal sealed class RagConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "Rag";

    /// <summary>
    /// Gets or sets the build-collection flag. Defaults to <see langword="true"/>.
    /// </summary>
    /// <remarks>NOTE(review): presumably controls whether the PDFs are ingested into the collection — confirm against the caller.</remarks>
    [Required]
    public bool BuildCollection { get; set; } = true;

    /// <summary>Gets or sets the paths of the PDF files. <see langword="null"/> until a value is assigned.</summary>
    [Required]
    public string[]? PdfFilePaths { get; set; }

    /// <summary>Gets or sets the vector store type. Defaults to an empty string.</summary>
    [Required]
    public string VectorStoreType { get; set; } = string.Empty;

    /// <summary>Gets or sets the collection name. Defaults to an empty string.</summary>
    [Required]
    public string CollectionName { get; set; } = string.Empty;
}
16 changes: 16 additions & 0 deletions
16
dotnet/samples/Demos/VectorStoreRAG/Options/RedisConfig.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.ComponentModel.DataAnnotations; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Settings for connecting to a Redis service.
/// </summary>
internal sealed class RedisConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "Redis";

    /// <summary>Gets or sets the connection configuration string. Defaults to an empty string.</summary>
    [Required]
    public string ConnectionConfiguration { get; set; } = string.Empty;
}
16 changes: 16 additions & 0 deletions
16
dotnet/samples/Demos/VectorStoreRAG/Options/WeaviateConfig.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
// Copyright (c) Microsoft. All rights reserved. | ||
|
||
using System.ComponentModel.DataAnnotations; | ||
|
||
namespace VectorStoreRAG.Options; | ||
|
||
/// <summary>
/// Settings for connecting to a Weaviate service.
/// </summary>
internal sealed class WeaviateConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "Weaviate";

    /// <summary>Gets or sets the service endpoint. Defaults to an empty string.</summary>
    [Required]
    public string Endpoint { get; set; } = string.Empty;
}
Oops, something went wrong.