Merge pull request #27 from craigloewen-msft/add_body_indexing
Add body indexing
craigloewen-msft authored Mar 20, 2024
2 parents 4a9be53 + 2e61723 commit a3f22c6
Showing 9 changed files with 256 additions and 76 deletions.
17 changes: 2 additions & 15 deletions app.js
@@ -27,6 +27,8 @@ if (process.env.NODE_ENV == 'production') {
config.sessionSecret = process.env.sessionSecret;
config.ghToken = process.env.ghToken;
config.pineconeAPIKey = process.env.pineconeAPIKey;
+config.azureOpenAIAPIKey = process.env.azureOpenAIAPIKey;
+config.azureEndpointURL = process.env.azureEndpointURL;
hostPort = process.env.PORT ? process.env.PORT : 8080;
} else {
mongooseConnectionString = config.devMongoDBConnectionString;
@@ -1035,7 +1037,6 @@ app.post('/api/getrepoissuegraph', authenticateToken, async function (req, res)

app.post('/api/getsimilarissues', async function (req, res) {
try {
-req.body.username = req.user.id;
var returnData = await dataHandler.getSimilarIssues(req.body);

return res.json({ success: true, similarIssues: returnData });
@@ -1044,20 +1045,6 @@ app.post('/api/getsimilarissues', async function (req, res) {
}
});

-// app.get with /api//getsimilarissues/:repoName/:issueNumber
-app.get('/api/getsimilarissues/:organizationName/:repoName/*', async function (req, res) {
-try {
-let issueTitle = decodeURIComponent(req.params[0]);
-req.params.issueTitle = issueTitle;
-
-var returnData = await dataHandler.getSimilarIssues(req.params);
-
-return res.json({ success: true, similarIssues: returnData });
-} catch (error) {
-return res.json(returnFailure(error));
-}
-});

// Interval jobs

// Refresh all user data every hour
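
For context, a minimal sketch of how a client might call the reworked POST /api/getsimilarissues route after this change. The host, port, and issue text are assumptions; the field names mirror the req.body destructuring in backendsrc/webDataHandler.js further down.

// Hypothetical client call; assumes Node 18+ (global fetch) and a server on localhost:8080.
async function fetchSimilarIssues() {
    const response = await fetch("http://localhost:8080/api/getsimilarissues", {
        method: "POST",
        headers: { "Content-Type": "application/json" },
        body: JSON.stringify({
            organizationName: "microsoft",                // placeholder values
            repoName: "winget-pkgs",
            issueTitle: "Installer hash mismatch",
            issueBody: "The manifest hash does not match the downloaded installer.",
        }),
    });
    const data = await response.json();                  // { success: true, similarIssues: [...] } on success
    console.log(data.similarIssues);
}

fetchSimilarIssues().catch(console.error);
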
79 changes: 41 additions & 38 deletions backendsrc/embeddingsHandler.js
@@ -1,57 +1,56 @@
-const pythonWorkerHandler = require('./pythonWorkerHandler');
-const zmq = require('zeromq');
const { Pinecone } = require("@pinecone-database/pinecone");
+const { OpenAIClient, AzureKeyCredential } = require("@azure/openai");
+const { Semaphore } = require("async-mutex");
+const { GetDescription } = require('./helpers');

class embeddingsHandler {

-static embeddingDimensions = 384;
+static embeddingDimensions = 3072;

static indexName = "gitgudissues";

constructor(inConfigObject) {
-// Set up Python Worker
-this.sock = new zmq.Request;
-this.pythonWorker = new pythonWorkerHandler(this.sock);
+// Set up azureClient and Pinecone
+this.azureClient = new OpenAIClient(inConfigObject.azureEndpointURL, new AzureKeyCredential(inConfigObject.azureOpenAIAPIKey), {apiVersion: "2023-05-15"});
this.pinecone = new Pinecone({
environment: "gcp-starter",
apiKey: inConfigObject.pineconeAPIKey,
});
this.index = this.pinecone.Index(embeddingsHandler.indexName);
+this.maxConcurrentRequests = 1;
+this.pineconeSemaphore = new Semaphore(this.maxConcurrentRequests);
+this.azureSemaphore = new Semaphore(this.maxConcurrentRequests);
}

-async addMultipleEmbeddings(inputIssues) {
-// Get embeddings from Python Worker
-
-if (inputIssues.length != 0) {
-const titles = inputIssues.map(issue => issue.title);
-const embeddings = await this.pythonWorker.getMultipleEmbeddings(titles);
-
-// Get list of issues grouped by repoRef with embeddings added
-let issuesByRepo = {};
-for (let i = 0; i < inputIssues.length; i++) {
-let issue = inputIssues[i];
-let embedding = embeddings[i];
-if (!issuesByRepo[issue.repoRef.toString()]) {
-issuesByRepo[issue.repoRef.toString()] = [];
-}
-issuesByRepo[issue.repoRef.toString()].push({
-id: issue._id.toString(),
-values: embedding,
-});
-}
-
-// Upsert embeddings into Pinecone
-for (const [repoRef, issues] of Object.entries(issuesByRepo)) {
-await this.index.namespace(repoRef).upsert(issues);
-}
-
-return true;
-else {
-return true;
+async addEmbedding(inputIssue) {
+
+// Get embeddings from Azure OpenAI Embeddings model
+const description = [GetDescription(inputIssue)];
+
+let embeddingObject = null ;
+
+try {
+await this.azureSemaphore.runExclusive(async () => {
+embeddingObject = await this.azureClient.getEmbeddings("issue-body-embeddings-model", description);
+});
+} catch (error) {
+console.log(error);
+}
+
+let embedding = embeddingObject.data[0].embedding;
+
+let payload = {
+id: inputIssue._id.toString(),
+values: embedding,
+}
+
+console.log("Upserting embeddings for issue number: " + inputIssue.number);
+return await this.pineconeSemaphore.runExclusive(async () => {
+console.log("Semaphore acquired for issue number: " + inputIssue.number);
+await this.index.namespace(inputIssue.repoRef.toString()).upsert([payload]);
+});
}


async removeEmbedding(inputIssue) {
await this.index.namespace(inputIssue.repoRef.toString()).deleteOne(inputIssue._id.toString());
@@ -65,8 +64,12 @@ class embeddingsHandler {
return true;
}

-async getSimilarIssueIDs(repo, issueTitle, issue) {
-const inputVector = await this.pythonWorker.getEmbedding(issueTitle);
+async getSimilarIssueIDs(repo, issueDescription, issue) {
+// Create title + body description
+const description = [issueDescription];
+
+// Query azure for embeddings
+const inputVector = await this.azureClient.getEmbeddings("issue-body-embeddings-model", description);

let searchFilter = `repo_id eq '${repo._id.toString()}'`;

@@ -78,7 +81,7 @@

let searchResults = await this.index.namespace(repo._id.toString()).query({
topK: numberOfReturnedIssues + 1,
-vector: inputVector,
+vector: inputVector.data[0].embedding,
includeValues: false
});

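
A rough usage sketch of the reworked handler, assuming the class is exported from backendsrc/embeddingsHandler.js and that real Azure OpenAI and Pinecone credentials are supplied; the config values and issue document below are placeholders. Note that addEmbedding passes the whole issue document into GetDescription, while the helper added in backendsrc/helpers.js is declared as GetDescription(issueTitle, issueBody).

// Hypothetical wiring; requires valid credentials to actually run end to end.
const EmbeddingsHandler = require('./backendsrc/embeddingsHandler');

const config = {
    azureEndpointURL: "https://example-resource.openai.azure.com/",  // assumed Azure OpenAI endpoint shape
    azureOpenAIAPIKey: "example-azure-openai-key",                   // placeholder
    pineconeAPIKey: "example-pinecone-key",                          // placeholder
};

const handler = new EmbeddingsHandler(config);

// Minimal stand-in for a Mongoose issue document.
const issue = {
    _id: "65fa0c0000000000000000ab",
    number: 123,
    title: "Example issue title",
    body: "Example issue body",
    repoRef: "65fa0c0000000000000000cd",
};

handler.addEmbedding(issue)                      // embeds via Azure OpenAI, then upserts into the repo's Pinecone namespace
    .then(() => handler.removeEmbedding(issue))  // deletes the vector again
    .catch(console.error);
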
17 changes: 17 additions & 0 deletions backendsrc/helpers.js
@@ -1,3 +1,5 @@
+const { getEncoding } = require("js-tiktoken")
+
module.exports = {
PromiseTimeout(delayms) {
return new Promise(function (resolve, reject) {
@@ -20,6 +22,21 @@
toObject[key] = fromObject[key];
});
},
+GetDescription(issueTitle, issueBody) {
+// Generate description and check if token count is too high
+const enc = getEncoding("cl100k_base");
+
+let description = '# ' + issueTitle + '\n\n' + issueBody;
+
+const encoding = enc.encode(description);
+
+if (encoding.length > 8192) {
+// Cut description to under 8192 tokens if too long
+description = enc.decode(encoding.slice(0, 8100));
+}
+
+return description;
+},
async UpdateIssueRead(inIssueReadDetails, inIssue, inUser, inputDate) {
let returnIssueReadList = await inIssueReadDetails.find({ issueRef: inIssue._id, userRef: inUser._id });

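
A small sketch of the new GetDescription helper in isolation, assuming the script is run from the repository root; the sample strings are placeholders.

// Hypothetical check of the "# title\n\nbody" formatting and the 8192-token cap.
const { getEncoding } = require("js-tiktoken");
const { GetDescription } = require('./backendsrc/helpers');

const enc = getEncoding("cl100k_base");

const short = GetDescription("Crash on startup", "The app exits immediately.");
console.log(short);                            // "# Crash on startup" followed by the body

const long = GetDescription("Long issue", "word ".repeat(20000));
console.log(enc.encode(long).length);          // roughly 8100 tokens after truncation
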
10 changes: 5 additions & 5 deletions backendsrc/oneOffScriptHelpers.js
@@ -2,22 +2,22 @@
module.exports = {
async AddEmbeddingsToIssuesInRepo(inIssueDetails, inEmbeddingsHandler, inRepo) {

-if (inRepo.shortURL == "microsoft/terminal") {
+if (inRepo.shortURL == "microsoft/winget-pkgs") {
try {
let startPeriod = new Date((new Date().getTime() - (20 * 12 * 4 * 7 * 24 * 60 * 60 * 1000))); // 20 years ago
let totalIssues = await inIssueDetails.countDocuments({
repoRef: inRepo._id,
created_at: { $gte: startPeriod }
});
-let pageSize = 100;
+console.log(`Total issues for ${inRepo.shortURL}: ${totalIssues}`)
+let pageSize = 1;
let pages = Math.ceil(totalIssues / pageSize);

for (let i = 0; i < pages; i++) {
let issueList = await inIssueDetails.find({
repoRef: inRepo._id,
created_at: { $gte: startPeriod }
-}).sort({ number: 1 }).skip(i * pageSize).limit(pageSize);
-await inEmbeddingsHandler.addMultipleEmbeddings(issueList);
+}).sort({ number: 1 }).skip((i * pageSize)).limit(pageSize);
+await inEmbeddingsHandler.addEmbedding(issueList[0]);
let percentComplete = ((i + 1) / pages) * 100;
let beginningNumber = i * pageSize + 1;
let endNumber = Math.min((i + 1) * pageSize, totalIssues);
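
The backfill above walks the repo's issues with skip/limit paging and, after this change, embeds one issue per page; a compact sketch of that paging arithmetic with placeholder counts, matching the progress math in the loop.

// Hypothetical illustration of the paging used by AddEmbeddingsToIssuesInRepo.
const totalIssues = 2500;                        // placeholder count
const pageSize = 1;                              // value set in this commit
const pages = Math.ceil(totalIssues / pageSize);

for (let i = 0; i < Math.min(pages, 3); i++) {   // only the first few pages, for illustration
    const beginningNumber = i * pageSize + 1;
    const endNumber = Math.min((i + 1) * pageSize, totalIssues);
    const percentComplete = ((i + 1) / pages) * 100;
    console.log(`Issues ${beginningNumber}-${endNumber} (${percentComplete.toFixed(2)}% complete)`);
}
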
12 changes: 3 additions & 9 deletions backendsrc/refreshRepoHandler.js
@@ -224,7 +224,6 @@ class RefreshRepoTask {

async storeInDatabase(data) {
var response = 'success';
-var insertedIssueArray = [];

await Promise.all(data.map(async (responseItem) => {
// for (let i = 0; i < data.length; i++) {
@@ -293,7 +292,8 @@ class RefreshRepoTask {
// Check if an issue was inserted
if (!updateResultRaw.lastErrorObject.updatedExisting) {
// Add inserted issue to list
-insertedIssueArray.push(updateResult);
+let embeddingsPromise = this.embeddingsHandler.addEmbedding(updateResult);
+finalAwaitPromiseArray.push(embeddingsPromise);
}

if (updateResult.closed_by) {
@@ -324,7 +324,7 @@ class RefreshRepoTask {
// For each name in the mention array, attempt to create a mention
if (!updateResultRaw.lastErrorObject.updatedExisting) {
finalAwaitPromiseArray.push(helperFunctions.CreateMentionsFromIssueList(mentionsArray, this.IssueCommentMentionDetails, this.UserDetails, this.IssueReadDetails, updateResult));
}
}

await Promise.all(finalAwaitPromiseArray);

@@ -337,12 +337,6 @@
}
}));

-// Log to console how many issues we're adding embeddings for and the repo name
-console.log("Adding embeddings for " + insertedIssueArray.length + " issues in " + this.shortRepoUrl);
-
-// Add embeddings for all inserted issues
-await this.embeddingsHandler.addMultipleEmbeddings(insertedIssueArray);
-
return response;
}

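
With this change the refresh task no longer batches embeddings at the end of storeInDatabase; each newly inserted issue queues its own addEmbedding promise into finalAwaitPromiseArray, which is then settled in one Promise.all. A minimal sketch of that pattern, with the handler stubbed out:

// Hypothetical stand-in: collect one embedding promise per inserted issue, await them together.
const embeddingsHandlerStub = {
    async addEmbedding(issue) {
        console.log("embedding issue " + issue.number);
    },
};

async function storeBatch(insertedIssues) {
    const finalAwaitPromiseArray = [];
    for (const issue of insertedIssues) {
        finalAwaitPromiseArray.push(embeddingsHandlerStub.addEmbedding(issue));
    }
    await Promise.all(finalAwaitPromiseArray);
    return 'success';
}

storeBatch([{ number: 1 }, { number: 2 }]).then(console.log).catch(console.error);
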
7 changes: 5 additions & 2 deletions backendsrc/webDataHandler.js
@@ -5,6 +5,7 @@ const axios = require('axios');
const mongoose = require('mongoose');
const ObjectId = mongoose.Types.ObjectId;
const oneOffScriptHelpers = require('./oneOffScriptHelpers');
+const { GetDescription } = require('./helpers');

class WebDataHandler {
constructor(inRepoDetails, inIssueDetails, inUserDetails, inSiteIssueLabelDetails, inIssueCommentDetails, inIssueCommentMentionDetails,
@@ -2140,7 +2141,9 @@ class WebDataHandler {
}

async getSimilarIssues(queryData) {
-const { organizationName, repoName, issueTitle } = queryData;
+const { organizationName, repoName, issueTitle, issueBody } = queryData;
+
+let issueDescription = GetDescription(issueTitle, issueBody) // to do rewrite to take in issue title and body

let dbRepoName = (organizationName + "/" + repoName).toLowerCase();

@@ -2152,7 +2155,7 @@

let issue = await this.IssueDetails.findOne({ title: issueTitle, repoRef: repo._id });

-let similarIssueIDArray = await this.embeddingsHandler.getSimilarIssueIDs(repo, issueTitle, issue);
+let similarIssueIDArray = await this.embeddingsHandler.getSimilarIssueIDs(repo, issueDescription, issue);

// Make a new array that finds each issue with the id specified in the array above
let similarIssuesArray = await Promise.all(similarIssueIDArray.map(similarIssueIDObject => this.IssueDetails.findOne({ _id: similarIssueIDObject.id })));
4 changes: 2 additions & 2 deletions defaultconfig.js
@@ -6,6 +6,6 @@ module.exports = {
'sessionSecret': 'somesessionsecret',
// https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
'ghToken': null,
-'azureSearchAPIKey': "key",
-'azureSearchURL' : "url",
+'azureOpenAIAPIKey': "key",
+'azureEndpointURL' : "url",
};
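
A sketch of what a filled-in local config might look like after this rename; the values and the endpoint format (a standard Azure OpenAI resource endpoint) are assumptions, and the remaining keys of defaultconfig.js are omitted here. In production, app.js reads the same two keys from process.env instead.

// Hypothetical local overrides (do not commit real secrets).
module.exports = {
    'sessionSecret': 'somesessionsecret',
    'ghToken': 'ghp_examplePersonalAccessToken',                       // placeholder
    'pineconeAPIKey': 'example-pinecone-key',                          // placeholder
    'azureOpenAIAPIKey': 'example-azure-openai-key',                   // placeholder
    'azureEndpointURL': 'https://example-resource.openai.azure.com/',  // assumed endpoint shape
};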