Skip to content

Commit

Permalink
.Net: Vector search rag sample (#9174)
Browse files Browse the repository at this point in the history
### Motivation and Context

We need an end-to-end RAG sample to show consumers how to ingest data
and use an LLM to answer questions about that data.

#7350

### Description

- Add a demo app that shows how to load a PDF into a vector store of
your choice and ask questions about the data.

### Contribution Checklist

<!-- Before submitting this PR, please make sure: -->

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄

---------

Co-authored-by: Mark Wallace <[email protected]>
Co-authored-by: Dmytro Struk <[email protected]>
Co-authored-by: Weihan Li <[email protected]>
Co-authored-by: Roger Barreto <[email protected]>
  • Loading branch information
5 people authored Oct 10, 2024
1 parent cd40e2e commit 900beca
Show file tree
Hide file tree
Showing 23 changed files with 906 additions and 29 deletions.
1 change: 1 addition & 0 deletions dotnet/Directory.Packages.props
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
<PackageVersion Include="Microsoft.Identity.Client" Version="4.65.0" />
<PackageVersion Include="Microsoft.ML.OnnxRuntime" Version="1.19.2" />
<PackageVersion Include="FastBertTokenizer" Version="1.0.28" />
<PackageVersion Include="PdfPig" Version="0.1.9" />
<PackageVersion Include="Pinecone.NET" Version="2.1.1" />
<PackageVersion Include="System.Diagnostics.DiagnosticSource" Version="8.0.1" />
<PackageVersion Include="System.Linq.Async" Version="6.0.1" />
Expand Down
11 changes: 10 additions & 1 deletion dotnet/SK-dotnet.sln
Original file line number Diff line number Diff line change
Expand Up @@ -359,7 +359,9 @@ Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Process.LocalRuntime", "src
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Process.UnitTests", "src\Experimental\Process.UnitTests\Process.UnitTests.csproj", "{21A32285-8443-4A75-B2E8-27E6090EC562}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "GettingStartedWithProcesses", "samples\GettingStartedWithProcesses\GettingStartedWithProcesses.csproj", "{C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "GettingStartedWithProcesses", "samples\GettingStartedWithProcesses\GettingStartedWithProcesses.csproj", "{C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "VectorStoreRAG", "samples\Demos\VectorStoreRAG\VectorStoreRAG.csproj", "{28DFAF27-8FF3-4373-AAA4-2A6969C86246}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Expand Down Expand Up @@ -926,6 +928,12 @@ Global
{C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}.Publish|Any CPU.Build.0 = Debug|Any CPU
{C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}.Release|Any CPU.ActiveCfg = Release|Any CPU
{C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7}.Release|Any CPU.Build.0 = Release|Any CPU
{28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Debug|Any CPU.Build.0 = Debug|Any CPU
{28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Publish|Any CPU.ActiveCfg = Debug|Any CPU
{28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Publish|Any CPU.Build.0 = Debug|Any CPU
{28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Release|Any CPU.ActiveCfg = Release|Any CPU
{28DFAF27-8FF3-4373-AAA4-2A6969C86246}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -1054,6 +1062,7 @@ Global
{27AF60D6-86F5-4591-A700-4F8C93F41B11} = {0D8C6358-5DAA-4EA6-A924-C268A9A21BC9}
{21A32285-8443-4A75-B2E8-27E6090EC562} = {0D8C6358-5DAA-4EA6-A924-C268A9A21BC9}
{C057ACDF-DDD8-496B-BAF9-1C6E4E1248D7} = {FA3720F1-C99A-49B2-9577-A940257098BF}
{28DFAF27-8FF3-4373-AAA4-2A6969C86246} = {5D4C0700-BBB5-418F-A7B2-F392B9A18263}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {FBDC56A3-86AD-4323-AA0F-201E59123B83}
Expand Down
86 changes: 86 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/DataLoader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.SemanticKernel.Data;
using Microsoft.SemanticKernel.Embeddings;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using UglyToad.PdfPig.DocumentLayoutAnalysis.PageSegmenter;

namespace VectorStoreRAG;

/// <summary>
/// Loads the text of a PDF file into a vector store collection, creating one record per paragraph.
/// </summary>
/// <typeparam name="TKey">The type of the data model key.</typeparam>
/// <param name="uniqueKeyGenerator">Generates the unique key assigned to each record.</param>
/// <param name="vectorStoreRecordCollection">The collection that the records are upserted into.</param>
/// <param name="textEmbeddingGenerationService">The service used to generate an embedding for each paragraph's text.</param>
internal sealed class DataLoader<TKey>(
    UniqueKeyGenerator<TKey> uniqueKeyGenerator,
    IVectorStoreRecordCollection<TKey, TextSnippet<TKey>> vectorStoreRecordCollection,
    ITextEmbeddingGenerationService textEmbeddingGenerationService) : IDataLoader where TKey : notnull
{
    /// <summary>The number of paragraphs that are embedded and upserted together as one batch.</summary>
    private const int BatchSize = 10;

    /// <inheritdoc/>
    public async Task LoadPdf(string pdfPath, CancellationToken cancellationToken)
    {
        // Create the collection if it doesn't exist.
        await vectorStoreRecordCollection.CreateCollectionIfNotExistsAsync(cancellationToken).ConfigureAwait(false);

        // The file name and file URI are identical for every paragraph of the document, so they are
        // resolved once here rather than constructing new FileInfo/Uri instances for each record.
        var pdfFile = new FileInfo(pdfPath);
        var fileName = pdfFile.Name;
        var fileUri = new Uri(pdfFile.FullName).AbsoluteUri;

        // Paragraphs are read lazily from the PDF and processed in fixed-size batches.
        foreach (var batch in LoadParagraphs(pdfPath, cancellationToken).Chunk(BatchSize))
        {
            // Map each paragraph to a TextSnippet. The embedding requests of a batch are all started
            // when Task.WhenAll enumerates the projection, and are then awaited together.
            var recordTasks = batch.Select(async section => new TextSnippet<TKey>
            {
                Key = uniqueKeyGenerator.GenerateKey(),
                Text = section.ParagraphText,
                ReferenceDescription = $"{fileName}#page={section.PageNumber}",
                ReferenceLink = $"{fileUri}#page={section.PageNumber}",
                TextEmbedding = await textEmbeddingGenerationService.GenerateEmbeddingAsync(section.ParagraphText, cancellationToken: cancellationToken).ConfigureAwait(false)
            });
            var records = await Task.WhenAll(recordTasks).ConfigureAwait(false);

            // Upsert the records into the vector store and report each key that comes back.
            var upsertedKeys = vectorStoreRecordCollection.UpsertBatchAsync(records, cancellationToken: cancellationToken);
            await foreach (var key in upsertedKeys.ConfigureAwait(false))
            {
                Console.WriteLine($"Upserted record '{key}' into VectorDB");
            }
        }
    }

    /// <summary>
    /// Read the text from each paragraph in the provided PDF file.
    /// </summary>
    /// <remarks>
    /// The document is opened when enumeration starts and disposed when it ends. If cancellation is
    /// requested, the enumeration stops without throwing and yields no further paragraphs.
    /// </remarks>
    /// <param name="pdfPath">The pdf file to read the paragraphs from.</param>
    /// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests.</param>
    /// <returns>The paragraphs from the pdf file, plus the page that they are on.</returns>
    private static IEnumerable<(string ParagraphText, int PageNumber)> LoadParagraphs(string pdfPath, CancellationToken cancellationToken)
    {
        using PdfDocument document = PdfDocument.Open(pdfPath);

        foreach (Page page in document.GetPages())
        {
            if (cancellationToken.IsCancellationRequested)
            {
                yield break;
            }

            // Group the words on the page into text blocks; each block is treated as one paragraph.
            var blocks = DefaultPageSegmenter.Instance.GetBlocks(page.GetWords());
            foreach (var block in blocks)
            {
                if (cancellationToken.IsCancellationRequested)
                {
                    yield break;
                }

                yield return (ParagraphText: block.Text, PageNumber: page.Number);
            }
        }
    }
}
17 changes: 17 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/IDataLoader.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
// Copyright (c) Microsoft. All rights reserved.

namespace VectorStoreRAG;

/// <summary>
/// Contract for components that load document content into a data store.
/// </summary>
/// <remarks>
/// The interface is not generic, so callers can use a loader without knowing the key type of the
/// underlying store (the implementation in this sample, <c>DataLoader&lt;TKey&gt;</c>, is generic over the key).
/// </remarks>
internal interface IDataLoader
{
/// <summary>
/// Load the text from a PDF file into the data store.
/// </summary>
/// <param name="pdfPath">The path of the pdf file to load.</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests.</param>
/// <returns>An async task that completes when the loading is complete.</returns>
Task LoadPdf(string pdfPath, CancellationToken cancellationToken);
}
71 changes: 71 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/Options/ApplicationConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
// Copyright (c) Microsoft. All rights reserved.

using Microsoft.Extensions.Configuration;

namespace VectorStoreRAG.Options;

/// <summary>
/// Binds every configuration section used by the VectorStoreRAG project to a strongly typed settings object.
/// </summary>
internal sealed class ApplicationConfig
{
    /// <summary>
    /// Creates the settings objects and populates each one from its configuration section.
    /// </summary>
    /// <remarks>
    /// Every section is fetched with <c>GetRequiredSection</c>, so all of the sections listed here
    /// are looked up regardless of which vector store is actually selected.
    /// </remarks>
    /// <param name="configurationManager">The configuration to read the sections from.</param>
    public ApplicationConfig(ConfigurationManager configurationManager)
    {
        // AI services.
        configurationManager.GetRequiredSection($"AIServices:{AzureOpenAIConfig.ConfigSectionName}").Bind(this.AzureOpenAIConfig);
        configurationManager.GetRequiredSection($"AIServices:{AzureOpenAIEmbeddingsConfig.ConfigSectionName}").Bind(this.AzureOpenAIEmbeddingsConfig);

        // RAG behaviour.
        configurationManager.GetRequiredSection(RagConfig.ConfigSectionName).Bind(this.RagConfig);

        // Vector stores.
        configurationManager.GetRequiredSection($"VectorStores:{AzureAISearchConfig.ConfigSectionName}").Bind(this.AzureAISearchConfig);
        configurationManager.GetRequiredSection($"VectorStores:{AzureCosmosDBConfig.MongoDBConfigSectionName}").Bind(this.AzureCosmosDBMongoDBConfig);
        configurationManager.GetRequiredSection($"VectorStores:{AzureCosmosDBConfig.NoSQLConfigSectionName}").Bind(this.AzureCosmosDBNoSQLConfig);
        configurationManager.GetRequiredSection($"VectorStores:{QdrantConfig.ConfigSectionName}").Bind(this.QdrantConfig);
        configurationManager.GetRequiredSection($"VectorStores:{RedisConfig.ConfigSectionName}").Bind(this.RedisConfig);
        configurationManager.GetRequiredSection($"VectorStores:{WeaviateConfig.ConfigSectionName}").Bind(this.WeaviateConfig);
    }

    /// <summary>Settings bound from the "AIServices" Azure OpenAI section.</summary>
    public AzureOpenAIConfig AzureOpenAIConfig { get; } = new();

    /// <summary>Settings bound from the "AIServices" Azure OpenAI embeddings section.</summary>
    public AzureOpenAIEmbeddingsConfig AzureOpenAIEmbeddingsConfig { get; } = new();

    /// <summary>Settings bound from the RAG section.</summary>
    public RagConfig RagConfig { get; } = new();

    /// <summary>Settings bound from the "VectorStores" Azure AI Search section.</summary>
    public AzureAISearchConfig AzureAISearchConfig { get; } = new();

    /// <summary>Settings bound from the "VectorStores" Azure CosmosDB MongoDB section.</summary>
    public AzureCosmosDBConfig AzureCosmosDBMongoDBConfig { get; } = new();

    /// <summary>Settings bound from the "VectorStores" Azure CosmosDB NoSQL section.</summary>
    public AzureCosmosDBConfig AzureCosmosDBNoSQLConfig { get; } = new();

    /// <summary>Settings bound from the "VectorStores" Qdrant section.</summary>
    public QdrantConfig QdrantConfig { get; } = new();

    /// <summary>Settings bound from the "VectorStores" Redis section.</summary>
    public RedisConfig RedisConfig { get; } = new();

    /// <summary>Settings bound from the "VectorStores" Weaviate section.</summary>
    public WeaviateConfig WeaviateConfig { get; } = new();
}
19 changes: 19 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/Options/AzureAISearchConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;

namespace VectorStoreRAG.Options;

/// <summary>
/// Settings for connecting to the Azure AI Search service.
/// </summary>
internal sealed class AzureAISearchConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "AzureAISearch";

    /// <summary>The Azure AI Search endpoint. Defaults to an empty string.</summary>
    [Required]
    public string Endpoint { get; set; } = "";

    /// <summary>The Azure AI Search API key. Defaults to an empty string.</summary>
    [Required]
    public string ApiKey { get; set; } = "";
}
20 changes: 20 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/Options/AzureCosmosDBConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;

namespace VectorStoreRAG.Options;

/// <summary>
/// Settings for connecting to Azure CosmosDB; the same shape is used for both the
/// AzureCosmosDBMongoDB and the AzureCosmosDBNoSQL configuration sections.
/// </summary>
internal sealed class AzureCosmosDBConfig
{
    /// <summary>Name of the configuration section that holds the MongoDB flavoured settings.</summary>
    public const string MongoDBConfigSectionName = "AzureCosmosDBMongoDB";

    /// <summary>Name of the configuration section that holds the NoSQL flavoured settings.</summary>
    public const string NoSQLConfigSectionName = "AzureCosmosDBNoSQL";

    /// <summary>The connection string for the database account. Defaults to an empty string.</summary>
    [Required]
    public string ConnectionString { get; set; } = "";

    /// <summary>The name of the database to use. Defaults to an empty string.</summary>
    [Required]
    public string DatabaseName { get; set; } = "";
}
19 changes: 19 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/Options/AzureOpenAIConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;

namespace VectorStoreRAG.Options;

/// <summary>
/// Settings for the Azure OpenAI chat service.
/// </summary>
internal sealed class AzureOpenAIConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "AzureOpenAI";

    /// <summary>The name of the chat deployment. Defaults to an empty string.</summary>
    [Required]
    public string ChatDeploymentName { get; set; } = "";

    /// <summary>The Azure OpenAI endpoint. Defaults to an empty string.</summary>
    [Required]
    public string Endpoint { get; set; } = "";
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;

namespace VectorStoreRAG.Options;

/// <summary>
/// Settings for the Azure OpenAI embeddings service.
/// </summary>
internal sealed class AzureOpenAIEmbeddingsConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "AzureOpenAIEmbeddings";

    /// <summary>The name of the embeddings deployment. Defaults to an empty string.</summary>
    [Required]
    public string DeploymentName { get; set; } = "";

    /// <summary>The Azure OpenAI endpoint used for embeddings. Defaults to an empty string.</summary>
    [Required]
    public string Endpoint { get; set; } = "";
}
22 changes: 22 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/Options/QdrantConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;

namespace VectorStoreRAG.Options;

/// <summary>
/// Settings for connecting to a Qdrant service.
/// </summary>
internal sealed class QdrantConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "Qdrant";

    /// <summary>The Qdrant host. Defaults to an empty string.</summary>
    [Required]
    public string Host { get; set; } = "";

    /// <summary>The Qdrant port. Defaults to 6334.</summary>
    public int Port { get; set; } = 6334;

    /// <summary>The HTTPS flag for the connection. Defaults to <see langword="false"/>.</summary>
    public bool Https { get; set; }

    /// <summary>The Qdrant API key. Optional; defaults to an empty string.</summary>
    public string ApiKey { get; set; } = "";
}
25 changes: 25 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/Options/RagConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;

namespace VectorStoreRAG.Options;

/// <summary>
/// Settings that control the RAG experience.
/// </summary>
internal sealed class RagConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "Rag";

    /// <summary>
    /// Flag for building the collection. Defaults to <see langword="true"/>.
    /// </summary>
    // NOTE(review): presumably controls whether the PDF files are loaded into the collection on
    // startup - confirm against the consumer. [Required] on a non-nullable bool can never fail.
    [Required]
    public bool BuildCollection { get; set; } = true;

    /// <summary>The paths of the PDF files. <see langword="null"/> unless configured.</summary>
    [Required]
    public string[]? PdfFilePaths { get; set; }

    /// <summary>The type of vector store to use. Defaults to an empty string.</summary>
    [Required]
    public string VectorStoreType { get; set; } = "";

    /// <summary>The name of the collection. Defaults to an empty string.</summary>
    [Required]
    public string CollectionName { get; set; } = "";
}
16 changes: 16 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/Options/RedisConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;

namespace VectorStoreRAG.Options;

/// <summary>
/// Settings for connecting to a Redis service.
/// </summary>
internal sealed class RedisConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "Redis";

    /// <summary>The Redis connection configuration string. Defaults to an empty string.</summary>
    [Required]
    public string ConnectionConfiguration { get; set; } = "";
}
16 changes: 16 additions & 0 deletions dotnet/samples/Demos/VectorStoreRAG/Options/WeaviateConfig.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Copyright (c) Microsoft. All rights reserved.

using System.ComponentModel.DataAnnotations;

namespace VectorStoreRAG.Options;

/// <summary>
/// Settings for connecting to a Weaviate service.
/// </summary>
internal sealed class WeaviateConfig
{
    /// <summary>Name of the configuration section that holds these settings.</summary>
    public const string ConfigSectionName = "Weaviate";

    /// <summary>The Weaviate endpoint. Defaults to an empty string.</summary>
    [Required]
    public string Endpoint { get; set; } = "";
}
Loading

0 comments on commit 900beca

Please sign in to comment.