Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PostgreSQL and Generate vector for Mongo DB #111

Merged
merged 7 commits into from
Apr 18, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/actions/build-with-plugins/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,21 @@ runs:
-p:PublishReadyToRun=false \
-p:PublishTrimmed=false \
-p:Version=${{ inputs.build-version }}
- name: Build PostgreSQL Extension
shell: bash
run: |
dotnet publish \
Extensions/PostgreSQL/Cosmos.DataTransfer.PostgresqlExtension.csproj \
--configuration Release \
--output ${{ inputs.platform-short }}/Extensions \
--self-contained false \
--runtime ${{ inputs.runtime }} \
-p:PublishSingleFile=false \
-p:DebugType=embedded \
-p:EnableCompressionInSingleFile=true \
-p:PublishReadyToRun=false \
-p:PublishTrimmed=false \
-p:Version=${{ inputs.build-version }}
- name: Upload package
uses: actions/upload-artifact@v3
with:
Expand Down
6 changes: 3 additions & 3 deletions Core/Cosmos.DataTransfer.Core/Cosmos.DataTransfer.Core.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,15 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Azure.Core" Version="1.31.0" />
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.0.0" />
<PackageReference Include="Azure.Core" Version="1.36.0" />
<PackageReference Include="Microsoft.Data.SqlClient" Version="5.2.0" />
<PackageReference Include="Microsoft.Extensions.Configuration.UserSecrets" Version="6.0.1" />
<PackageReference Include="Microsoft.Extensions.Hosting" Version="6.0.1" />
<PackageReference Include="Microsoft.Extensions.Logging.Console" Version="6.0.0" />
<PackageReference Include="System.CommandLine" Version="2.0.0-beta4.22272.1" />
<PackageReference Include="System.CommandLine.Hosting" Version="0.4.0-alpha.22272.1" />
<PackageReference Include="System.ComponentModel.Composition" Version="6.0.0" />
<PackageReference Include="System.Configuration.ConfigurationManager" Version="6.0.0" />
<PackageReference Include="System.Configuration.ConfigurationManager" Version="8.0.0" />
</ItemGroup>

<ItemGroup>
Expand Down
10 changes: 6 additions & 4 deletions Core/Cosmos.DataTransfer.Core/migrationsettings.json
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
{
"Source": null,
"Sink": null,
"SourceSettings": {
"Source": "",
"Sink": "",
"SourceSettings": {

},
"SinkSettings": {
"SinkSettings": {

},
"Operations": [
//{
Expand Down
25 changes: 23 additions & 2 deletions CosmosDbDataMigrationTool.sln
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Mongo", "Mongo", "{F18E789A
Extensions\Mongo\README.md = Extensions\Mongo\README.md
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj", "{F6EAC33B-9F7D-433B-9328-622FB8938C24}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoVectorExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoVectorExtension\Cosmos.DataTransfer.MongoVectorExtension.csproj", "{F6EAC33B-9F7D-433B-9328-622FB8938C24}"
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.JsonExtension.UnitTests", "Extensions\Json\Cosmos.DataTransfer.JsonExtension.UnitTests\Cosmos.DataTransfer.JsonExtension.UnitTests.csproj", "{ED1E375E-A5A3-47EA-A7D5-07344C7E152F}"
EndProject
Expand Down Expand Up @@ -87,14 +87,24 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Csv", "Csv", "{39930280-DA2
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.CsvExtension", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension\Cosmos.DataTransfer.CsvExtension.csproj", "{6A3FB90C-B837-4724-A406-214D4CEA686F}"
EndProject
Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "Cosmos.DataTransfer.CsvExtension.UnitTests", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension.UnitTests\Cosmos.DataTransfer.CsvExtension.UnitTests.csproj", "{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}"
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.CsvExtension.UnitTests", "Extensions\Csv\Cosmos.DataTransfer.CsvExtension.UnitTests\Cosmos.DataTransfer.CsvExtension.UnitTests.csproj", "{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution Items", "{BCBBAF22-0CB5-416B-8C80-03AB2FC4D0A0}"
ProjectSection(SolutionItems) = preProject
Contributing.md = Contributing.md
ExampleConfigs.md = ExampleConfigs.md
README.md = README.md
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.PostgresqlExtension", "Extensions\PostgreSQL\Cosmos.DataTransfer.PostgresqlExtension.csproj", "{85820167-DB94-458B-B09B-9E823996C692}"
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "PostgreSQL", "PostgreSQL", "{1B927C5F-50FC-42A6-BAF6-B00E6D760543}"
ProjectSection(SolutionItems) = preProject
Extensions\PostgreSQL\README.md = Extensions\PostgreSQL\README.md
EndProjectSection
EndProject
Project("{9A19103F-16F7-4668-BE54-9A1E7A4F7556}") = "Cosmos.DataTransfer.MongoExtension", "Extensions\Mongo\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj", "{31BC84E1-55E5-45AA-BFAC-90732F20588B}"
EndProject
Global
GlobalSection(SolutionConfigurationPlatforms) = preSolution
Debug|Any CPU = Debug|Any CPU
Expand Down Expand Up @@ -181,6 +191,14 @@ Global
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Debug|Any CPU.Build.0 = Debug|Any CPU
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Release|Any CPU.ActiveCfg = Release|Any CPU
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E}.Release|Any CPU.Build.0 = Release|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Debug|Any CPU.Build.0 = Debug|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Release|Any CPU.ActiveCfg = Release|Any CPU
{85820167-DB94-458B-B09B-9E823996C692}.Release|Any CPU.Build.0 = Release|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Debug|Any CPU.Build.0 = Debug|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Release|Any CPU.ActiveCfg = Release|Any CPU
{31BC84E1-55E5-45AA-BFAC-90732F20588B}.Release|Any CPU.Build.0 = Release|Any CPU
EndGlobalSection
GlobalSection(SolutionProperties) = preSolution
HideSolutionNode = FALSE
Expand Down Expand Up @@ -212,6 +230,9 @@ Global
{39930280-DA29-4814-837B-FA7F252EB3EC} = {A8A1CEAB-2D82-460C-9B86-74ABD17CD201}
{6A3FB90C-B837-4724-A406-214D4CEA686F} = {39930280-DA29-4814-837B-FA7F252EB3EC}
{40AD8890-BD78-48F5-AE76-2C2FC6F15B7E} = {39930280-DA29-4814-837B-FA7F252EB3EC}
{85820167-DB94-458B-B09B-9E823996C692} = {1B927C5F-50FC-42A6-BAF6-B00E6D760543}
{1B927C5F-50FC-42A6-BAF6-B00E6D760543} = {A8A1CEAB-2D82-460C-9B86-74ABD17CD201}
{31BC84E1-55E5-45AA-BFAC-90732F20588B} = {F18E789A-D32D-48D3-B75F-1196D7215F74}
EndGlobalSection
GlobalSection(ExtensibilityGlobals) = postSolution
SolutionGuid = {662B3F27-70D8-45E6-A1C0-1438A9C8A542}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Azure.Identity" Version="1.6.0" />
<PackageReference Include="Azure.Identity" Version="1.10.3" />
<PackageReference Include="Microsoft.Azure.Cosmos" Version="3.34.0" />
<PackageReference Include="Microsoft.Extensions.Configuration" Version="6.0.1" />
<PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="6.0.0" />
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Project file for the MongoDB vector sink extension. It builds on the base Mongo extension project and the shared Interfaces project referenced below. -->

<PropertyGroup>
<TargetFramework>net6.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<!-- Built as an executable rather than a class library. -->
<OutputType>Exe</OutputType>
</PropertyGroup>

<ItemGroup>
<!-- Azure.AI.OpenAI is pinned to a pre-release (beta) version. -->
<PackageReference Include="Azure.AI.OpenAI" Version="1.0.0-beta.12" />
<PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="6.0.0" />
<PackageReference Include="MongoDB.Driver" Version="2.19.1" />
<PackageReference Include="System.ComponentModel.Composition" Version="6.0.0" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\..\Interfaces\Cosmos.DataTransfer.Interfaces\Cosmos.DataTransfer.Interfaces.csproj" />
<ProjectReference Include="..\Cosmos.DataTransfer.MongoExtension\Cosmos.DataTransfer.MongoExtension.csproj" />
</ItemGroup>

<!-- After every Debug build, publish the already-built output using the PublishToExtensionsFolder publish profile. -->
<Target Name="PublishToExtensionsFolder" AfterTargets="Build" Condition=" '$(Configuration)' == 'Debug' ">
<Exec Command="dotnet publish --configuration $(Configuration) --no-build -p:PublishProfile=PublishToExtensionsFolder" />
</Target>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
using System.ComponentModel.Composition;
using Azure;
using Azure.AI.OpenAI;
using Cosmos.DataTransfer.Interfaces;
using Cosmos.DataTransfer.MongoExtension;
using Cosmos.DataTransfer.MongoVectorExtension.Settings;
using Microsoft.Extensions.Configuration;
using Microsoft.Extensions.Logging;
using MongoDB.Bson;

namespace Cosmos.DataTransfer.MongoVectorExtension;
/// <summary>
/// Sink extension that writes data items to a MongoDB collection in batches and, when
/// configured, attaches an Azure OpenAI embedding vector to each document before it is written.
/// </summary>
[Export(typeof(IDataSinkExtension))]
public class MongoVectorDataSinkExtension : IDataSinkExtensionWithSettings
{
    /// <summary>Number of documents written per batch when <c>BatchSize</c> is not configured.</summary>
    private const int DefaultBatchSize = 1000;

    /// <summary>Source values with this many characters or more are written without an embedding.</summary>
    private const int MaxEmbeddingInputLength = 8192;

    public string DisplayName => $"MongoDB-Vector{ExtensionExtensions.BetaExtensionTag}";

    /// <summary>
    /// Streams <paramref name="dataItems"/> into the configured MongoDB collection.
    /// </summary>
    /// <param name="dataItems">Items to write; enumerated once, honoring <paramref name="cancellationToken"/>.</param>
    /// <param name="config">Configuration section bound to <see cref="MongoVectorSinkSettings"/>.</param>
    /// <param name="dataSource">The source extension supplying the items (not used by this sink).</param>
    /// <param name="logger">Logger for progress and warnings.</param>
    /// <param name="cancellationToken">Cancels enumeration and embedding requests.</param>
    public async Task WriteAsync(IAsyncEnumerable<IDataItem> dataItems, IConfiguration config, IDataSourceExtension dataSource, ILogger logger, CancellationToken cancellationToken = default)
    {
        var settings = config.Get<MongoVectorSinkSettings>();
        settings.Validate();

        // Nothing can be written without a target; mirrors the original guard around the whole body.
        if (string.IsNullOrEmpty(settings.ConnectionString) || string.IsNullOrEmpty(settings.DatabaseName) || string.IsNullOrEmpty(settings.Collection))
        {
            return;
        }

        // Null means "do not generate embeddings"; the client is only constructed when it will be used.
        var embeddingClient = CreateEmbeddingClient(settings, logger);

        var context = new Context(settings.ConnectionString, settings.DatabaseName);
        var repo = context.GetRepository<BsonDocument>(settings.Collection);
        var batchSize = settings.BatchSize ?? DefaultBatchSize;
        var objects = new List<BsonDocument>();
        int itemCount = 0;

        await foreach (var item in dataItems.WithCancellation(cancellationToken))
        {
            var dict = item.BuildDynamicObjectTree();

            if (embeddingClient != null)
            {
                var valtoemb = item.GetValue(settings.SourcePropEmbedding)?.ToString();
                if (!string.IsNullOrEmpty(valtoemb) && valtoemb.Length < MaxEmbeddingInputLength)
                {
                    var options = new EmbeddingsOptions()
                    {
                        DeploymentName = settings.OpenAIDeploymentName,
                        Input = { valtoemb }
                    };
                    var response = await embeddingClient.GetEmbeddingsAsync(options, cancellationToken);

                    // Guard against an empty result list instead of indexing blindly.
                    var data = response?.Value?.Data;
                    if (data != null && data.Count > 0)
                    {
                        // TryAdd never overwrites: an existing property with the same name wins.
                        if (dict?.TryAdd(settings.DestPropEmbedding, data[0].Embedding.ToArray()) == false)
                        {
                            logger.LogWarning("Property '{DestPropEmbedding}' already exists on the item; the generated embedding was not added.", settings.DestPropEmbedding);
                        }
                    }
                }
            }

            objects.Add(new BsonDocument(dict));
            itemCount++;

            if (objects.Count == batchSize)
            {
                await repo.AddRange(objects);
                logger.LogInformation("Added {ItemCount} items to collection '{Collection}'", itemCount, settings.Collection);
                objects.Clear();
            }
        }

        // Flush the final, partially filled batch.
        if (objects.Count > 0)
        {
            await repo.AddRange(objects);
        }

        if (itemCount > 0)
            logger.LogInformation("Added {ItemCount} total items to collection '{Collection}'", itemCount, settings.Collection);
        else
            logger.LogWarning("No items added to collection '{Collection}'", settings.Collection);
    }

    public IEnumerable<IDataExtensionSettings> GetSettings()
    {
        yield return new MongoVectorSinkSettings();
    }

    /// <summary>
    /// Builds the Azure OpenAI client used for embeddings, or returns <c>null</c> when embedding
    /// generation is disabled or its settings are incomplete (a warning is logged in the latter case).
    /// </summary>
    private static OpenAIClient? CreateEmbeddingClient(MongoVectorSinkSettings settings, ILogger logger)
    {
        if (settings.GenerateEmbedding != true)
        {
            return null;
        }

        var settingsComplete =
            !string.IsNullOrEmpty(settings.SourcePropEmbedding) &&
            !string.IsNullOrEmpty(settings.DestPropEmbedding) &&
            !string.IsNullOrEmpty(settings.OpenAIUrl) &&
            !string.IsNullOrEmpty(settings.OpenAIKey) &&
            !string.IsNullOrEmpty(settings.OpenAIDeploymentName);

        if (!settingsComplete)
        {
            // Previously this case was silent: documents were written with no embeddings and no hint why.
            logger.LogWarning("GenerateEmbedding is enabled but SourcePropEmbedding, DestPropEmbedding, OpenAIUrl, OpenAIKey and OpenAIDeploymentName are not all set; items will be written without embeddings.");
            return null;
        }

        logger.LogInformation("OpenAI Embedding settings are valid.");
        return new OpenAIClient(new Uri(settings.OpenAIUrl!), new AzureKeyCredential(settings.OpenAIKey!));
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
// Top-level program: its only action is printing a startup message to the console.
// NOTE(review): presumably exists only to satisfy an executable output type for the extension project - confirm.
Console.WriteLine("Starting Mongo extension");
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
  Folder publish profile: publishes into the Extensions directory under the Core
  application's build output for the active configuration.
  https://go.microsoft.com/fwlink/?LinkID=208121.
-->
<Project>
  <!-- Values that are the same for every configuration. -->
  <PropertyGroup>
    <Platform>Any CPU</Platform>
    <PublishProtocol>FileSystem</PublishProtocol>
    <_TargetId>Folder</_TargetId>
    <TargetFramework>net6.0</TargetFramework>
    <SelfContained>false</SelfContained>
  </PropertyGroup>
  <!-- Debug builds publish next to the Debug build of the Core application. -->
  <PropertyGroup Condition=" '$(Configuration)' == 'Debug' ">
    <Configuration>Debug</Configuration>
    <PublishDir>..\..\..\Core\Cosmos.DataTransfer.Core\bin\Debug\net6.0\Extensions</PublishDir>
  </PropertyGroup>
  <!-- Any other configuration is treated as Release. -->
  <PropertyGroup Condition=" '$(Configuration)' != 'Debug' ">
    <Configuration>Release</Configuration>
    <PublishDir>..\..\..\Core\Cosmos.DataTransfer.Core\bin\Release\net6.0\Extensions</PublishDir>
  </PropertyGroup>
</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
using System.ComponentModel.DataAnnotations;
using Cosmos.DataTransfer.MongoExtension.Settings;

namespace Cosmos.DataTransfer.MongoVectorExtension.Settings;
/// <summary>
/// Settings for the MongoDB vector sink. Extends <see cref="MongoBaseSettings"/> with the target
/// collection, batching, and the optional Azure OpenAI embedding configuration.
/// </summary>
public class MongoVectorSinkSettings : MongoBaseSettings
{
/// <summary>Name of the collection that documents are written to.</summary>
[Required]
public string? Collection { get; set; }

/// <summary>Number of documents written per batch; the sink falls back to 1000 when this is not set.</summary>
public int? BatchSize { get; set; }

/// <summary>
/// When true, and all of the OpenAI and property-name settings below are provided, the sink
/// requests an embedding for each item before writing it.
/// </summary>
public bool? GenerateEmbedding { get; set; }

/// <summary>Endpoint URL passed to the OpenAI client.</summary>
public string? OpenAIUrl { get; set; }
/// <summary>API key passed to the OpenAI client.</summary>
public string? OpenAIKey { get; set; }

// Name of the deployment used for embedding requests (e.g. a text-embedding-ada-002 deployment).
public string? OpenAIDeploymentName { get; set; }
/// <summary>
/// Name of the source property whose value is embedded. Empty values, and values of 8192
/// characters or more, are written without an embedding.
/// </summary>
public string? SourcePropEmbedding { get; set; }
/// <summary>Name of the property added to each written document to hold the generated embedding.</summary>
public string? DestPropEmbedding { get; set; }
}
38 changes: 37 additions & 1 deletion Extensions/Mongo/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,47 @@ Source and sink settings require both `ConnectionString` and `DatabaseName` para

### Sink

```json
{
"ConnectionString": "",
"DatabaseName": "",
"Collection": ""
}
```

# MongoDB Vector Extension (Beta)

The MongoDB Vector extension is a Sink only extension that builds on the MongoDB extension by providing additional capabilities for generating embeddings using Azure OpenAI APIs.

> **Note**: When specifying the MongoDB Vector extension as the Sink property in configuration, utilize the name **MongoDB-Vector(beta)**.

## Settings

The settings are based on the MongoDB extension settings with additional parameters for generating embeddings.

### Additional Sink Settings

The sink settings require the following additional parameters:

- `GenerateEmbedding`: If set to true, the sink will generate embeddings for the records before writing them to the database. The sink requires the `OpenAIUrl`, `OpenAIKey`, and `OpenAIDeploymentName` parameters to be set. The following parameters are required if this is true:
- `OpenAIUrl`: The URL of the OpenAI API
- `OpenAIKey`: The API key for the OpenAI API
- `OpenAIDeploymentName`: The name of the embedding model deployment to use for the OpenAI API
- `SourcePropEmbedding`: The property in the source data that should be used to generate the embeddings
- `DestPropEmbedding`: New property name that will be added to the source data with the generated embeddings

```json
{
"ConnectionString": "",
"DatabaseName": "",
"Collection": "",
"BatchSize": 100
"BatchSize": 100,
"GenerateEmbedding": true | false,
"OpenAIUrl": "",
"OpenAIKey": "",
"OpenAIDeploymentName": "",
"SourcePropEmbedding": "",
"DestPropEmbedding": ""
}
```

Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<Project Sdk="Microsoft.NET.Sdk">
  <!-- Project file for the PostgreSQL extension; depends on Npgsql and the shared Interfaces project. -->

  <PropertyGroup>
    <!-- Built as an executable rather than a class library. -->
    <OutputType>Exe</OutputType>
    <TargetFramework>net6.0</TargetFramework>
    <ImplicitUsings>enable</ImplicitUsings>
    <Nullable>enable</Nullable>
  </PropertyGroup>

  <ItemGroup>
    <PackageReference Include="Microsoft.Extensions.Configuration.Binder" Version="6.0.0" />
    <PackageReference Include="Npgsql" Version="7.0.6" />
    <!--
      Kept at 6.0.0 to match the host application and the other extensions, so the MEF
      attribute types used by this plugin resolve to the same assembly version as the host's.
    -->
    <PackageReference Include="System.ComponentModel.Composition" Version="6.0.0" />
  </ItemGroup>

  <ItemGroup>
    <ProjectReference Include="..\..\Interfaces\Cosmos.DataTransfer.Interfaces\Cosmos.DataTransfer.Interfaces.csproj" />
  </ItemGroup>

  <!--
    After every Debug build, publish the already-built output using the FolderProfile publish profile.
    NOTE(review): the profile file itself is not visible here; confirm FolderProfile.pubxml exists for this project.
  -->
  <Target Name="PublishToExtensionsFolder" AfterTargets="Build" Condition=" '$(Configuration)' == 'Debug' ">
    <Exec Command="dotnet publish --configuration $(Configuration) --no-build -p:PublishProfile=FolderProfile" />
  </Target>
</Project>
Loading
Loading