Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add body indexing #27

Merged
merged 15 commits into from
Mar 20, 2024
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 2 additions & 15 deletions app.js
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ if (process.env.NODE_ENV == 'production') {
config.sessionSecret = process.env.sessionSecret;
config.ghToken = process.env.ghToken;
config.pineconeAPIKey = process.env.pineconeAPIKey;
config.azureOpenAIAPIKey = process.env.azureOpenAIAPIKey;
config.azureEndpointURL = process.env.azureEndpointURL;
hostPort = process.env.PORT ? process.env.PORT : 8080;
} else {
mongooseConnectionString = config.devMongoDBConnectionString;
Expand Down Expand Up @@ -1035,7 +1037,6 @@ app.post('/api/getrepoissuegraph', authenticateToken, async function (req, res)

app.post('/api/getsimilarissues', async function (req, res) {
try {
req.body.username = req.user.id;
var returnData = await dataHandler.getSimilarIssues(req.body);

return res.json({ success: true, similarIssues: returnData });
Expand All @@ -1044,20 +1045,6 @@ app.post('/api/getsimilarissues', async function (req, res) {
}
});

// GET /api/getsimilarissues/:organizationName/:repoName/<issue title>
// Looks up issues similar to the one whose title is given in the wildcard segment.
app.get('/api/getsimilarissues/:organizationName/:repoName/*', async function (req, res) {
    try {
        // The wildcard match is exposed as params[0]; decode it and hand it on
        // under the key that getSimilarIssues reads.
        req.params.issueTitle = decodeURIComponent(req.params[0]);

        const similarIssues = await dataHandler.getSimilarIssues(req.params);

        return res.json({ success: true, similarIssues: similarIssues });
    } catch (error) {
        // Failures are reported in the JSON payload rather than thrown.
        return res.json(returnFailure(error));
    }
});

// Interval jobs

// Refresh all user data every hour
Expand Down
79 changes: 41 additions & 38 deletions backendsrc/embeddingsHandler.js
Original file line number Diff line number Diff line change
@@ -1,57 +1,56 @@
const pythonWorkerHandler = require('./pythonWorkerHandler');
const zmq = require('zeromq');
const { Pinecone } = require("@pinecone-database/pinecone");
const { OpenAIClient, AzureKeyCredential } = require("@azure/openai");
const { Semaphore } = require("async-mutex");
const { GetDescription } = require('./helpers');

class embeddingsHandler {

static embeddingDimensions = 384;
static embeddingDimensions = 3072;

static indexName = "gitgudissues";

constructor(inConfigObject) {
// Set up Python Worker
this.sock = new zmq.Request;
this.pythonWorker = new pythonWorkerHandler(this.sock);
// Set up azureClient and Pinecone
this.azureClient = new OpenAIClient(inConfigObject.azureEndpointURL, new AzureKeyCredential(inConfigObject.azureOpenAIAPIKey), {apiVersion: "2023-05-15"});
this.pinecone = new Pinecone({
environment: "gcp-starter",
apiKey: inConfigObject.pineconeAPIKey,
});
this.index = this.pinecone.Index(embeddingsHandler.indexName);
this.maxConcurrentRequests = 1;
this.pineconeSemaphore = new Semaphore(this.maxConcurrentRequests);
this.azureSemaphore = new Semaphore(this.maxConcurrentRequests);
}

async addMultipleEmbeddings(inputIssues) {
// Get embeddings from Python Worker

if (inputIssues.length != 0) {
const titles = inputIssues.map(issue => issue.title);
const embeddings = await this.pythonWorker.getMultipleEmbeddings(titles);

// Get list of issues grouped by repoRef with embeddings added
let issuesByRepo = {};
for (let i = 0; i < inputIssues.length; i++) {
let issue = inputIssues[i];
let embedding = embeddings[i];
if (!issuesByRepo[issue.repoRef.toString()]) {
issuesByRepo[issue.repoRef.toString()] = [];
}
issuesByRepo[issue.repoRef.toString()].push({
id: issue._id.toString(),
values: embedding,
});
}

// Upsert embeddings into Pinecone
for (const [repoRef, issues] of Object.entries(issuesByRepo)) {
await this.index.namespace(repoRef).upsert(issues);
}

return true;
async addEmbedding(inputIssue) {

// Get embeddings from Azure OpenAI Embeddings model
const description = [GetDescription(inputIssue)];

let embeddingObject = null ;

try {
await this.azureSemaphore.runExclusive(async () => {
embeddingObject = await this.azureClient.getEmbeddings("issue-body-embeddings-model", description);
});
} catch (error) {
console.log(error);
}
else {
return true;

let embedding = embeddingObject.data[0].embedding;

let payload = {
id: inputIssue._id.toString(),
values: embedding,
}

console.log("Upserting embeddings for issue number: " + inputIssue.number);
return await this.pineconeSemaphore.runExclusive(async () => {
console.log("Semaphore acquired for issue number: " + inputIssue.number);
await this.index.namespace(inputIssue.repoRef.toString()).upsert([payload]);
});
}


async removeEmbedding(inputIssue) {
await this.index.namespace(inputIssue.repoRef.toString()).deleteOne(inputIssue._id.toString());
Expand All @@ -65,8 +64,12 @@ class embeddingsHandler {
return true;
}

async getSimilarIssueIDs(repo, issueTitle, issue) {
const inputVector = await this.pythonWorker.getEmbedding(issueTitle);
async getSimilarIssueIDs(repo, issueDescription, issue) {
// Create title + body description
const description = [issueDescription];

// Query azure for embeddings
const inputVector = await this.azureClient.getEmbeddings("issue-body-embeddings-model", description);

let searchFilter = `repo_id eq '${repo._id.toString()}'`;

Expand All @@ -78,7 +81,7 @@ class embeddingsHandler {

let searchResults = await this.index.namespace(repo._id.toString()).query({
topK: numberOfReturnedIssues + 1,
vector: inputVector,
vector: inputVector.data[0].embedding,
includeValues: false
});

Expand Down
17 changes: 17 additions & 0 deletions backendsrc/helpers.js
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
const { getEncoding } = require("js-tiktoken")

module.exports = {
PromiseTimeout(delayms) {
return new Promise(function (resolve, reject) {
Expand All @@ -20,6 +22,21 @@ module.exports = {
toObject[key] = fromObject[key];
});
},
GetDescription(issueTitle, issueBody) {
// Generate description and check if token count is too high
const enc = getEncoding("cl100k_base");

let description = '# ' + issueTitle + '\n\n' + issueBody;

const encoding = enc.encode(description);

if (encoding.length > 8192) {
// Cut description to under 8192 tokens if too long
description = enc.decode(encoding.slice(0, 8100));
}

return description;
},
async UpdateIssueRead(inIssueReadDetails, inIssue, inUser, inputDate) {
let returnIssueReadList = await inIssueReadDetails.find({ issueRef: inIssue._id, userRef: inUser._id });

Expand Down
10 changes: 5 additions & 5 deletions backendsrc/oneOffScriptHelpers.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,22 +2,22 @@
module.exports = {
async AddEmbeddingsToIssuesInRepo(inIssueDetails, inEmbeddingsHandler, inRepo) {

if (inRepo.shortURL == "microsoft/terminal") {
if (inRepo.shortURL == "microsoft/winget-pkgs") {
try {
let startPeriod = new Date((new Date().getTime() - (20 * 12 * 4 * 7 * 24 * 60 * 60 * 1000))); // 20 years ago
let totalIssues = await inIssueDetails.countDocuments({
repoRef: inRepo._id,
created_at: { $gte: startPeriod }
});
let pageSize = 100;
console.log(`Total issues for ${inRepo.shortURL}: ${totalIssues}`)
let pageSize = 1;
let pages = Math.ceil(totalIssues / pageSize);

for (let i = 0; i < pages; i++) {
let issueList = await inIssueDetails.find({
repoRef: inRepo._id,
created_at: { $gte: startPeriod }
}).sort({ number: 1 }).skip(i * pageSize).limit(pageSize);
await inEmbeddingsHandler.addMultipleEmbeddings(issueList);
}).sort({ number: 1 }).skip((i * pageSize)).limit(pageSize);
await inEmbeddingsHandler.addEmbedding(issueList[0]);
let percentComplete = ((i + 1) / pages) * 100;
let beginningNumber = i * pageSize + 1;
let endNumber = Math.min((i + 1) * pageSize, totalIssues);
Expand Down
12 changes: 3 additions & 9 deletions backendsrc/refreshRepoHandler.js
Original file line number Diff line number Diff line change
Expand Up @@ -224,7 +224,6 @@ class RefreshRepoTask {

async storeInDatabase(data) {
var response = 'success';
var insertedIssueArray = [];

await Promise.all(data.map(async (responseItem) => {
// for (let i = 0; i < data.length; i++) {
Expand Down Expand Up @@ -293,7 +292,8 @@ class RefreshRepoTask {
// Check if an issue was inserted
if (!updateResultRaw.lastErrorObject.updatedExisting) {
// Add inserted issue to list
insertedIssueArray.push(updateResult);
let embeddingsPromise = this.embeddingsHandler.addEmbedding(updateResult);
finalAwaitPromiseArray.push(embeddingsPromise);
}

if (updateResult.closed_by) {
Expand Down Expand Up @@ -324,7 +324,7 @@ class RefreshRepoTask {
// For each name in the mention array, attempt to create a mention
if (!updateResultRaw.lastErrorObject.updatedExisting) {
finalAwaitPromiseArray.push(helperFunctions.CreateMentionsFromIssueList(mentionsArray, this.IssueCommentMentionDetails, this.UserDetails, this.IssueReadDetails, updateResult));
}
}

await Promise.all(finalAwaitPromiseArray);

Expand All @@ -337,12 +337,6 @@ class RefreshRepoTask {
}
}));

// Log to console how many issues we're adding embeddings for and the repo name
console.log("Adding embeddings for " + insertedIssueArray.length + " issues in " + this.shortRepoUrl);

// Add embeddings for all inserted issues
await this.embeddingsHandler.addMultipleEmbeddings(insertedIssueArray);

return response;
}

Expand Down
7 changes: 5 additions & 2 deletions backendsrc/webDataHandler.js
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ const axios = require('axios');
const mongoose = require('mongoose');
const ObjectId = mongoose.Types.ObjectId;
const oneOffScriptHelpers = require('./oneOffScriptHelpers');
const { GetDescription } = require('./helpers');

class WebDataHandler {
constructor(inRepoDetails, inIssueDetails, inUserDetails, inSiteIssueLabelDetails, inIssueCommentDetails, inIssueCommentMentionDetails,
Expand Down Expand Up @@ -2140,7 +2141,9 @@ class WebDataHandler {
}

async getSimilarIssues(queryData) {
const { organizationName, repoName, issueTitle } = queryData;
const { organizationName, repoName, issueTitle, issueBody } = queryData;

let issueDescription = GetDescription(issueTitle, issueBody) // to do rewrite to take in issue title and body

let dbRepoName = (organizationName + "/" + repoName).toLowerCase();

Expand All @@ -2152,7 +2155,7 @@ class WebDataHandler {

let issue = await this.IssueDetails.findOne({ title: issueTitle, repoRef: repo._id });

let similarIssueIDArray = await this.embeddingsHandler.getSimilarIssueIDs(repo, issueTitle, issue);
let similarIssueIDArray = await this.embeddingsHandler.getSimilarIssueIDs(repo, issueDescription, issue);

// Make a new array that finds each issue with the id specified in the array above
let similarIssuesArray = await Promise.all(similarIssueIDArray.map(similarIssueIDObject => this.IssueDetails.findOne({ _id: similarIssueIDObject.id })));
Expand Down
4 changes: 2 additions & 2 deletions defaultconfig.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ module.exports = {
'sessionSecret': 'somesessionsecret',
// https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token
'ghToken': null,
'azureSearchAPIKey': "key",
'azureSearchURL' : "url",
'azureOpenAIAPIKey': "key",
'azureEndpointURL' : "url",
};
Loading