diff --git a/app.js b/app.js index 644f17b..a0e7bdc 100755 --- a/app.js +++ b/app.js @@ -27,6 +27,8 @@ if (process.env.NODE_ENV == 'production') { config.sessionSecret = process.env.sessionSecret; config.ghToken = process.env.ghToken; config.pineconeAPIKey = process.env.pineconeAPIKey; + config.azureOpenAIAPIKey = process.env.azureOpenAIAPIKey; + config.azureEndpointURL = process.env.azureEndpointURL; hostPort = process.env.PORT ? process.env.PORT : 8080; } else { mongooseConnectionString = config.devMongoDBConnectionString; @@ -1035,7 +1037,6 @@ app.post('/api/getrepoissuegraph', authenticateToken, async function (req, res) app.post('/api/getsimilarissues', async function (req, res) { try { - req.body.username = req.user.id; var returnData = await dataHandler.getSimilarIssues(req.body); return res.json({ success: true, similarIssues: returnData }); @@ -1044,20 +1045,6 @@ app.post('/api/getsimilarissues', async function (req, res) { } }); -// app.get with /api//getsimilarissues/:repoName/:issueNumber -app.get('/api/getsimilarissues/:organizationName/:repoName/*', async function (req, res) { - try { - let issueTitle = decodeURIComponent(req.params[0]); - req.params.issueTitle = issueTitle; - - var returnData = await dataHandler.getSimilarIssues(req.params); - - return res.json({ success: true, similarIssues: returnData }); - } catch (error) { - return res.json(returnFailure(error)); - } -}); - // Interval jobs // Refresh all user data every hour diff --git a/backendsrc/embeddingsHandler.js b/backendsrc/embeddingsHandler.js index 4c7fbe8..2e95a76 100755 --- a/backendsrc/embeddingsHandler.js +++ b/backendsrc/embeddingsHandler.js @@ -1,57 +1,56 @@ -const pythonWorkerHandler = require('./pythonWorkerHandler'); -const zmq = require('zeromq'); const { Pinecone } = require("@pinecone-database/pinecone"); +const { OpenAIClient, AzureKeyCredential } = require("@azure/openai"); +const { Semaphore } = require("async-mutex"); +const { GetDescription } = require('./helpers'); class embeddingsHandler { - static embeddingDimensions = 384; + static embeddingDimensions = 3072; static indexName = "gitgudissues"; constructor(inConfigObject) { - // Set up Python Worker - this.sock = new zmq.Request; - this.pythonWorker = new pythonWorkerHandler(this.sock); + // Set up azureClient and Pinecone + this.azureClient = new OpenAIClient(inConfigObject.azureEndpointURL, new AzureKeyCredential(inConfigObject.azureOpenAIAPIKey), {apiVersion: "2023-05-15"}); this.pinecone = new Pinecone({ environment: "gcp-starter", apiKey: inConfigObject.pineconeAPIKey, }); this.index = this.pinecone.Index(embeddingsHandler.indexName); + this.maxConcurrentRequests = 1; + this.pineconeSemaphore = new Semaphore(this.maxConcurrentRequests); + this.azureSemaphore = new Semaphore(this.maxConcurrentRequests); } - async addMultipleEmbeddings(inputIssues) { - // Get embeddings from Python Worker - - if (inputIssues.length != 0) { - const titles = inputIssues.map(issue => issue.title); - const embeddings = await this.pythonWorker.getMultipleEmbeddings(titles); - - // Get list of issues grouped by repoRef with embeddings added - let issuesByRepo = {}; - for (let i = 0; i < inputIssues.length; i++) { - let issue = inputIssues[i]; - let embedding = embeddings[i]; - if (!issuesByRepo[issue.repoRef.toString()]) { - issuesByRepo[issue.repoRef.toString()] = []; - } - issuesByRepo[issue.repoRef.toString()].push({ - id: issue._id.toString(), - values: embedding, - }); - } - - // Upsert embeddings into Pinecone - for (const [repoRef, issues] of Object.entries(issuesByRepo)) { - await this.index.namespace(repoRef).upsert(issues); - } - - return true; + async addEmbedding(inputIssue) { + + // Get embeddings from Azure OpenAI Embeddings model + const description = [GetDescription(inputIssue)]; + + let embeddingObject = null ; + + try { + await this.azureSemaphore.runExclusive(async () => { + embeddingObject = await this.azureClient.getEmbeddings("issue-body-embeddings-model", description); + }); + } catch (error) { + console.log(error); } - else { - return true; + + let embedding = embeddingObject.data[0].embedding; + + let payload = { + id: inputIssue._id.toString(), + values: embedding, } + console.log("Upserting embeddings for issue number: " + inputIssue.number); + return await this.pineconeSemaphore.runExclusive(async () => { + console.log("Semaphore acquired for issue number: " + inputIssue.number); + await this.index.namespace(inputIssue.repoRef.toString()).upsert([payload]); + }); } + async removeEmbedding(inputIssue) { await this.index.namespace(inputIssue.repoRef.toString()).deleteOne(inputIssue._id.toString()); @@ -65,8 +64,12 @@ class embeddingsHandler { return true; } - async getSimilarIssueIDs(repo, issueTitle, issue) { - const inputVector = await this.pythonWorker.getEmbedding(issueTitle); + async getSimilarIssueIDs(repo, issueDescription, issue) { + // Create title + body description + const description = [issueDescription]; + + // Query azure for embeddings + const inputVector = await this.azureClient.getEmbeddings("issue-body-embeddings-model", description); let searchFilter = `repo_id eq '${repo._id.toString()}'`; @@ -78,7 +81,7 @@ class embeddingsHandler { let searchResults = await this.index.namespace(repo._id.toString()).query({ topK: numberOfReturnedIssues + 1, - vector: inputVector, + vector: inputVector.data[0].embedding, includeValues: false }); diff --git a/backendsrc/helpers.js b/backendsrc/helpers.js index de26131..6494035 100755 --- a/backendsrc/helpers.js +++ b/backendsrc/helpers.js @@ -1,3 +1,5 @@ +const { getEncoding } = require("js-tiktoken") + module.exports = { PromiseTimeout(delayms) { return new Promise(function (resolve, reject) { @@ -20,6 +22,21 @@ module.exports = { toObject[key] = fromObject[key]; }); }, + GetDescription(issueTitle, issueBody) { + // Generate description and check if token count is too high + const enc = getEncoding("cl100k_base"); + + let description = '# ' + issueTitle + '\n\n' + issueBody; + + const encoding = enc.encode(description); + + if (encoding.length > 8192) { + // Cut description to under 8192 tokens if too long + description = enc.decode(encoding.slice(0, 8100)); + } + + return description; + }, async UpdateIssueRead(inIssueReadDetails, inIssue, inUser, inputDate) { let returnIssueReadList = await inIssueReadDetails.find({ issueRef: inIssue._id, userRef: inUser._id }); diff --git a/backendsrc/oneOffScriptHelpers.js b/backendsrc/oneOffScriptHelpers.js index 50c0840..f54ec68 100755 --- a/backendsrc/oneOffScriptHelpers.js +++ b/backendsrc/oneOffScriptHelpers.js @@ -2,22 +2,22 @@ module.exports = { async AddEmbeddingsToIssuesInRepo(inIssueDetails, inEmbeddingsHandler, inRepo) { - if (inRepo.shortURL == "microsoft/terminal") { + if (inRepo.shortURL == "microsoft/winget-pkgs") { try { let startPeriod = new Date((new Date().getTime() - (20 * 12 * 4 * 7 * 24 * 60 * 60 * 1000))); // 20 years ago let totalIssues = await inIssueDetails.countDocuments({ repoRef: inRepo._id, created_at: { $gte: startPeriod } }); - let pageSize = 100; + console.log(`Total issues for ${inRepo.shortURL}: ${totalIssues}`) + let pageSize = 1; let pages = Math.ceil(totalIssues / pageSize); - for (let i = 0; i < pages; i++) { let issueList = await inIssueDetails.find({ repoRef: inRepo._id, created_at: { $gte: startPeriod } - }).sort({ number: 1 }).skip(i * pageSize).limit(pageSize); - await inEmbeddingsHandler.addMultipleEmbeddings(issueList); + }).sort({ number: 1 }).skip((i * pageSize)).limit(pageSize); + await inEmbeddingsHandler.addEmbedding(issueList[0]); let percentComplete = ((i + 1) / pages) * 100; let beginningNumber = i * pageSize + 1; let endNumber = Math.min((i + 1) * pageSize, totalIssues); diff --git a/backendsrc/refreshRepoHandler.js b/backendsrc/refreshRepoHandler.js index e38a3cb..ae252eb 100755 --- a/backendsrc/refreshRepoHandler.js +++ b/backendsrc/refreshRepoHandler.js @@ -224,7 +224,6 @@ class RefreshRepoTask { async storeInDatabase(data) { var response = 'success'; - var insertedIssueArray = []; await Promise.all(data.map(async (responseItem) => { // for (let i = 0; i < data.length; i++) { @@ -293,7 +292,8 @@ class RefreshRepoTask { // Check if an issue was inserted if (!updateResultRaw.lastErrorObject.updatedExisting) { // Add inserted issue to list - insertedIssueArray.push(updateResult); + let embeddingsPromise = this.embeddingsHandler.addEmbedding(updateResult); + finalAwaitPromiseArray.push(embeddingsPromise); } if (updateResult.closed_by) { @@ -324,7 +324,7 @@ class RefreshRepoTask { // For each name in the mention array, attempt to create a mention if (!updateResultRaw.lastErrorObject.updatedExisting) { finalAwaitPromiseArray.push(helperFunctions.CreateMentionsFromIssueList(mentionsArray, this.IssueCommentMentionDetails, this.UserDetails, this.IssueReadDetails, updateResult)); - } + } await Promise.all(finalAwaitPromiseArray); @@ -337,12 +337,6 @@ class RefreshRepoTask { } })); - // Log to console how many issues we're adding embeddings for and the repo name - console.log("Adding embeddings for " + insertedIssueArray.length + " issues in " + this.shortRepoUrl); - - // Add embeddings for all inserted issues - await this.embeddingsHandler.addMultipleEmbeddings(insertedIssueArray); - return response; } diff --git a/backendsrc/webDataHandler.js b/backendsrc/webDataHandler.js index 124904e..d35f151 100755 --- a/backendsrc/webDataHandler.js +++ b/backendsrc/webDataHandler.js @@ -5,6 +5,7 @@ const axios = require('axios'); const mongoose = require('mongoose'); const ObjectId = mongoose.Types.ObjectId; const oneOffScriptHelpers = require('./oneOffScriptHelpers'); +const { GetDescription } = require('./helpers'); class WebDataHandler { constructor(inRepoDetails, inIssueDetails, inUserDetails, inSiteIssueLabelDetails, inIssueCommentDetails, inIssueCommentMentionDetails, @@ -2140,7 +2141,9 @@ class WebDataHandler { } async getSimilarIssues(queryData) { - const { organizationName, repoName, issueTitle } = queryData; + const { organizationName, repoName, issueTitle, issueBody } = queryData; + + let issueDescription = GetDescription(issueTitle, issueBody) // to do rewrite to take in issue title and body let dbRepoName = (organizationName + "/" + repoName).toLowerCase(); @@ -2152,7 +2155,7 @@ class WebDataHandler { let issue = await this.IssueDetails.findOne({ title: issueTitle, repoRef: repo._id }); - let similarIssueIDArray = await this.embeddingsHandler.getSimilarIssueIDs(repo, issueTitle, issue); + let similarIssueIDArray = await this.embeddingsHandler.getSimilarIssueIDs(repo, issueDescription, issue); // Make a new array that finds each issue with the id specified in the array above let similarIssuesArray = await Promise.all(similarIssueIDArray.map(similarIssueIDObject => this.IssueDetails.findOne({ _id: similarIssueIDObject.id }))); diff --git a/defaultconfig.js b/defaultconfig.js index bebd0ce..7bf62b7 100755 --- a/defaultconfig.js +++ b/defaultconfig.js @@ -6,6 +6,6 @@ module.exports = { 'sessionSecret': 'somesessionsecret', // https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token 'ghToken': null, - 'azureSearchAPIKey': "key", - 'azureSearchURL' : "url", + 'azureOpenAIAPIKey': "key", + 'azureEndpointURL' : "url", }; diff --git a/package-lock.json b/package-lock.json index 39594f1..7770d2f 100644 --- a/package-lock.json +++ b/package-lock.json @@ -5,14 +5,16 @@ "packages": { "": { "dependencies": { + "@azure/openai": "^1.0.0-beta.11", "@pinecone-database/pinecone": "^1.1.2", - "async-mutex": "^0.4.0", + "async-mutex": "^0.4.1", "axios": "^1.6.4", "connect-ensure-login": "^0.1.1", "connect-mongo": "^4.6.0", "es6-promise": "^4.2.8", "express": "^4.17.1", "express-session": "^1.17.2", + "js-tiktoken": "^1.0.10", "jsonwebtoken": "^9.0.2", "mongoose": "^6.0.7", "nodemon": "^3.0.1", @@ -708,6 +710,126 @@ "tslib": "^2.3.1" } }, + "node_modules/@azure-rest/core-client": { + "version": "1.2.0", + "resolved": "https://registry.npmjs.org/@azure-rest/core-client/-/core-client-1.2.0.tgz", + "integrity": "sha512-+3zapvYc+25FpTzCkcYqsh/v8BiSJhMGseWhMgec5RqsH1VovIUHNyNS7hQDECv3QslL6M3TxZnDNmCqoz5IZA==", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "@azure/core-auth": "^1.3.0", + "@azure/core-rest-pipeline": "^1.5.0", + "@azure/core-tracing": "^1.0.1", + "@azure/core-util": "^1.0.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/abort-controller": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@azure/abort-controller/-/abort-controller-2.0.0.tgz", + "integrity": "sha512-RP/mR/WJchR+g+nQFJGOec+nzeN/VvjlwbinccoqfhTsTHbb8X5+mLDp48kHT0ueyum0BNSwGm0kX0UZuIqTGg==", + "dependencies": { + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-auth": { + "version": "1.6.0", + "resolved": "https://registry.npmjs.org/@azure/core-auth/-/core-auth-1.6.0.tgz", + "integrity": "sha512-3X9wzaaGgRaBCwhLQZDtFp5uLIXCPrGbwJNWPPugvL4xbIGgScv77YzzxToKGLAKvG9amDoofMoP+9hsH1vs1w==", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "@azure/core-util": "^1.1.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-rest-pipeline": { + "version": "1.14.0", + "resolved": "https://registry.npmjs.org/@azure/core-rest-pipeline/-/core-rest-pipeline-1.14.0.tgz", + "integrity": "sha512-Tp4M6NsjCmn9L5p7HsW98eSOS7A0ibl3e5ntZglozT0XuD/0y6i36iW829ZbBq0qihlGgfaeFpkLjZ418KDm1Q==", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "@azure/core-auth": "^1.4.0", + "@azure/core-tracing": "^1.0.1", + "@azure/core-util": "^1.3.0", + "@azure/logger": "^1.0.0", + "http-proxy-agent": "^5.0.0", + "https-proxy-agent": "^5.0.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-sse": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@azure/core-sse/-/core-sse-2.0.0.tgz", + "integrity": "sha512-PFmmaUwDmcmtt+q9NLzfhwC5qA2ACDn/5fuy8GVxI+YRv2qRvy1C0rZrwZLvOHe//G4cSRMz1X+CekY/Nlem2w==", + "dependencies": { + "tslib": "^2.4.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/core-tracing": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/@azure/core-tracing/-/core-tracing-1.0.1.tgz", + "integrity": "sha512-I5CGMoLtX+pI17ZdiFJZgxMJApsK6jjfm85hpgp3oazCdq5Wxgh4wMr7ge/TTWW1B5WBuvIOI1fMU/FrOAMKrw==", + "dependencies": { + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/@azure/core-util": { + "version": "1.7.0", + "resolved": "https://registry.npmjs.org/@azure/core-util/-/core-util-1.7.0.tgz", + "integrity": "sha512-Zq2i3QO6k9DA8vnm29mYM4G8IE9u1mhF1GUabVEqPNX8Lj833gdxQ2NAFxt2BZsfAL+e9cT8SyVN7dFVJ/Hf0g==", + "dependencies": { + "@azure/abort-controller": "^2.0.0", + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, + "node_modules/@azure/logger": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/@azure/logger/-/logger-1.0.4.tgz", + "integrity": "sha512-ustrPY8MryhloQj7OWGe+HrYx+aoiOxzbXTtgblbV3xwCqpzUK36phH3XNHQKj3EPonyFUuDTfR3qFhTEAuZEg==", + "dependencies": { + "tslib": "^2.2.0" + }, + "engines": { + "node": ">=14.0.0" + } + }, + "node_modules/@azure/openai": { + "version": "1.0.0-beta.11", + "resolved": "https://registry.npmjs.org/@azure/openai/-/openai-1.0.0-beta.11.tgz", + "integrity": "sha512-OXS27xkG1abiGf5VZUKnkJKr1VCo8+6EUrTGW5aSVjc5COqX8jAUqVAOZsQVCHBdtWYSBULlZkc0ncKMTRQAiQ==", + "dependencies": { + "@azure-rest/core-client": "^1.1.7", + "@azure/core-auth": "^1.4.0", + "@azure/core-rest-pipeline": "^1.13.0", + "@azure/core-sse": "^2.0.0", + "@azure/core-util": "^1.4.0", + "@azure/logger": "^1.0.3", + "tslib": "^2.4.0" + }, + "engines": { + "node": ">=18.0.0" + } + }, "node_modules/@babel/runtime": { "version": "7.23.2", "resolved": "https://registry.npmjs.org/@babel/runtime/-/runtime-7.23.2.tgz", @@ -1318,6 +1440,14 @@ "node": ">=14.0.0" } }, + "node_modules/@tootallnate/once": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/@tootallnate/once/-/once-2.0.0.tgz", + "integrity": "sha512-XCuKFP5PS55gnMVu3dty8KPatLqUoy/ZYzDzAGCQ8JNFCkLXzmI7vNHCR+XpbZaMWQK/vQubr7PkYq8g470J/A==", + "engines": { + "node": ">= 10" + } + }, "node_modules/@types/node": { "version": "20.9.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-20.9.0.tgz", @@ -1357,6 +1487,17 @@ "node": ">= 0.6" } }, + "node_modules/agent-base": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-6.0.2.tgz", + "integrity": "sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==", + "dependencies": { + "debug": "4" + }, + "engines": { + "node": ">= 6.0.0" + } + }, "node_modules/ajv": { "version": "8.12.0", "resolved": "https://registry.npmjs.org/ajv/-/ajv-8.12.0.tgz", @@ -1425,9 +1566,9 @@ } }, "node_modules/async-mutex": { - "version": "0.4.0", - "resolved": "https://registry.npmjs.org/async-mutex/-/async-mutex-0.4.0.tgz", - "integrity": "sha512-eJFZ1YhRR8UN8eBLoNzcDPcy/jqjsg6I1AP+KvWQX80BqOSW1oJPJXDylPUEeMr2ZQvHgnQ//Lp6f3RQ1zI7HA==", + "version": "0.4.1", + "resolved": "https://registry.npmjs.org/async-mutex/-/async-mutex-0.4.1.tgz", + "integrity": "sha512-WfoBo4E/TbCX1G95XTjbWTE3X2XLG0m1Xbv2cwOtuPdyH9CZvnaA5nCt1ucjaKEgW2A5IF71hxrRhr83Je5xjA==", "dependencies": { "tslib": "^2.4.0" } @@ -2343,6 +2484,31 @@ "node": ">= 0.8" } }, + "node_modules/http-proxy-agent": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-5.0.0.tgz", + "integrity": "sha512-n2hY8YdoRE1i7r6M0w9DIw5GgZN0G25P8zLCRQ8rjXtTU3vsNFBI/vWK/UIeE6g5MUUz6avwAPXmL6Fy9D/90w==", + "dependencies": { + "@tootallnate/once": "2", + "agent-base": "6", + "debug": "4" + }, + "engines": { + "node": ">= 6" + } + }, + "node_modules/https-proxy-agent": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-5.0.1.tgz", + "integrity": "sha512-dFcAjpTQFgoLMzC2VwU+C/CbS7uRL0lWmxDITmqm7C+7F0Odmj6s9l6alZc6AELXhrnggM2CeWSXHGOdX2YtwA==", + "dependencies": { + "agent-base": "6", + "debug": "4" + }, + "engines": { + "node": ">= 6" + } + }, "node_modules/iconv-lite": { "version": "0.4.24", "resolved": "https://registry.npmjs.org/iconv-lite/-/iconv-lite-0.4.24.tgz", @@ -2476,6 +2642,14 @@ "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", "integrity": "sha512-RHxMLp9lnKHGHRng9QFhRCMbYAcVpn69smSGcq3f36xjgVVWThj4qqLbTLlq7Ssj8B+fIQ1EuCEGI2lKsyQeIw==" }, + "node_modules/js-tiktoken": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/js-tiktoken/-/js-tiktoken-1.0.10.tgz", + "integrity": "sha512-ZoSxbGjvGyMT13x6ACo9ebhDha/0FHdKA+OsQcMOWcm1Zs7r90Rhk5lhERLzji+3rA7EKpXCgwXcM5fF3DMpdA==", + "dependencies": { + "base64-js": "^1.5.1" + } + }, "node_modules/json-schema-traverse": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/json-schema-traverse/-/json-schema-traverse-1.0.0.tgz", diff --git a/package.json b/package.json index 34899c5..6bdaa95 100755 --- a/package.json +++ b/package.json @@ -1,13 +1,15 @@ { "dependencies": { + "@azure/openai": "^1.0.0-beta.11", "@pinecone-database/pinecone": "^1.1.2", - "async-mutex": "^0.4.0", + "async-mutex": "^0.4.1", "axios": "^1.6.4", "connect-ensure-login": "^0.1.1", "connect-mongo": "^4.6.0", "es6-promise": "^4.2.8", "express": "^4.17.1", "express-session": "^1.17.2", + "js-tiktoken": "^1.0.10", "jsonwebtoken": "^9.0.2", "mongoose": "^6.0.7", "nodemon": "^3.0.1",