feat(scripts): use pg instead of supabase (#10337)
* feat(scripts): use pg instead of supabase

* fix error destructuring

* use pg uri to connect

* log formatted updates
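
The heart of the change: instead of going through the Supabase client, the script now talks to Postgres directly via node-postgres and registers the pgvector type handler. A minimal sketch of the new connection setup, distilled from the scripts/ai-help-macros.ts diff below:

import pg from "pg";
import pgvector from "pgvector/pg";
import { PG_URI } from "../libs/env/index.js";

// Connect with a single connection URI instead of a Supabase URL + key.
const pgClient = new pg.Client({ connectionString: PG_URI });
await pgClient.connect();
// Ensure the vector extension exists, then register the vector type with
// the driver so embeddings round-trip as plain number arrays.
await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector");
await pgvector.registerType(pgClient);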
fiji-flo authored Jan 29, 2024
1 parent c9f1522 commit 5ce3a65
Showing 7 changed files with 218 additions and 79 deletions.
3 changes: 1 addition & 2 deletions .github/workflows/prod-build.yml
@@ -384,8 +384,7 @@ jobs:
         run: yarn ai-help-macros update-index
         env:
           OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
-          SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }}
-          SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+          PG_URI: ${{ secrets.PG_URI }}
 
       - name: Slack Notification
         if: failure()
3 changes: 1 addition & 2 deletions .github/workflows/stage-build.yml
@@ -377,8 +377,7 @@ jobs:
         run: yarn ai-help-macros update-index
         env:
           OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
-          SUPABASE_SERVICE_ROLE_KEY: ${{ secrets.SUPABASE_SERVICE_ROLE_KEY }}
-          SUPABASE_URL: ${{ secrets.SUPABASE_URL }}
+          PG_URI: ${{ secrets.PG_URI }}
 
       - name: Slack Notification
         if: failure()
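Both workflows swap the two Supabase secrets for a single PG_URI secret. For context, PG_URI is expected to hold a standard libpq-style connection URI; a hypothetical example of its shape (placeholder values, not a real credential):

// Placeholder only: the real value comes from repository secrets.
const PG_URI = "postgres://ai_help:s3cret@db.internal.example:5432/mdn";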
1 change: 1 addition & 0 deletions libs/env/index.d.ts
@@ -30,6 +30,7 @@ export const OFFLINE_CONTENT: boolean;
 export const FAKE_V1_API: boolean;
 export const SENTRY_DSN_BUILD: string;
 export const OPENAI_KEY: string;
+export const PG_URI: string;
 export const SUPABASE_URL: string;
 export const SUPABASE_SERVICE_ROLE_KEY: string;
 export const SAMPLE_SIGN_KEY: Buffer;
1 change: 1 addition & 0 deletions libs/env/index.js
@@ -168,6 +168,7 @@ export const FAKE_V1_API = JSON.parse(process.env.SERVER_FAKE_V1_API || false);
 // ----
 
 export const OPENAI_KEY = process.env.OPENAI_KEY || "";
+export const PG_URI = process.env.PG_URI || "";
 export const SUPABASE_URL = process.env.SUPABASE_URL || "";
 export const SUPABASE_SERVICE_ROLE_KEY =
   process.env.SUPABASE_SERVICE_ROLE_KEY || "";
2 changes: 2 additions & 0 deletions package.json
@@ -118,6 +118,8 @@
     "open": "^10.0.3",
     "open-editor": "^4.1.1",
     "openai": "^4.26.0",
+    "pg": "^8.11.3",
+    "pgvector": "^0.1.7",
     "prism-svelte": "^0.5.0",
     "prismjs": "^1.29.0",
     "react-markdown": "^9.0.1",
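The two new dependencies divide the work: pg is the node-postgres driver, while pgvector handles (de)serialization of the vector column type. A small sketch of how they cooperate, assuming the vector extension is installed in the target database (the query and values are illustrative):

import pg from "pg";
import pgvector from "pgvector/pg";

const client = new pg.Client({ connectionString: process.env.PG_URI });
await client.connect();
// After registerType, vector columns are parsed into number[] values.
await pgvector.registerType(client);
// toSql() serializes a JS array into the vector wire format.
const { rows } = await client.query("SELECT $1::vector AS embedding", [
  pgvector.toSql([1, 2, 3]),
]);
console.log(rows[0].embedding); // [1, 2, 3]
await client.end();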
187 changes: 112 additions & 75 deletions scripts/ai-help-macros.ts
@@ -2,18 +2,14 @@ import { createHash } from "node:crypto";
 import { readFile } from "node:fs/promises";
 
 import caporal from "@caporal/core";
-import { SupabaseClient, createClient } from "@supabase/supabase-js";
+import pg from "pg";
+import pgvector from "pgvector/pg";
 import { fdir } from "fdir";
 import OpenAI from "openai";
 import { load as cheerio } from "cheerio";
 
 import { DocMetadata } from "../libs/types/document.js";
-import {
-  BUILD_OUT_ROOT,
-  OPENAI_KEY,
-  SUPABASE_SERVICE_ROLE_KEY,
-  SUPABASE_URL,
-} from "../libs/env/index.js";
+import { BUILD_OUT_ROOT, OPENAI_KEY, PG_URI } from "../libs/env/index.js";
 import {
   getBCDDataForPath,
   SimpleSupportStatementExtended,
@@ -49,14 +45,18 @@ export async function updateEmbeddings(
   directory: string,
   updateFormatting: boolean
 ) {
-  if (!OPENAI_KEY || !SUPABASE_URL || !SUPABASE_SERVICE_ROLE_KEY) {
-    throw Error(
-      "Please set these environment variables: OPENAI_KEY, SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY"
-    );
+  if (!OPENAI_KEY || !PG_URI) {
+    throw Error("Please set these environment variables: OPENAI_KEY, PG_URI");
   }
 
-  // Supabase.
-  const supabaseClient = createClient(SUPABASE_URL, SUPABASE_SERVICE_ROLE_KEY);
+  // Postgres.
+  const pgClient = new pg.Client({
+    connectionString: PG_URI,
+  });
+
+  await pgClient.connect();
+  await pgClient.query("CREATE EXTENSION IF NOT EXISTS vector");
+  await pgvector.registerType(pgClient);
 
   // Open AI.
   const openai = new OpenAI({
@@ -70,16 +70,9 @@
         model: "text-embedding-ada-002",
         input,
       });
-    } catch (e: any) {
-      const {
-        data: {
-          error: { message, type },
-        },
-        status,
-        statusText,
-      } = e.response;
+    } catch ({ error: { message, type }, status }: any) {
       console.error(
-        `[!] Failed to create embedding (${status} ${statusText}): ${type} - ${message}`
+        `[!] Failed to create embedding (${status}): ${type} - ${message}`
       );
       // Try again with trimmed content.
       embeddingResponse = await openai.embeddings.create({
@@ -100,7 +93,7 @@
   };
 
   console.log(`Retrieving all indexed documents...`);
-  const existingDocs = await fetchAllExistingDocs(supabaseClient);
+  const existingDocs = await fetchAllExistingDocs(pgClient);
   console.log(`-> Done.`);
 
   const existingDocByUrl = new Map<string, IndexedDoc>(
@@ -143,7 +136,7 @@
   }
 
   console.log(
-    `-> ${updates.length} of ${seenUrls.size} documents were changed (or added).`
+    `-> ${updates.length} (${formattingUpdates.length}) of ${seenUrls.size} documents were changed or added (or formatted).`
   );
   const deletions: IndexedDoc[] = [...existingDocByUrl.entries()]
     .filter(([key]) => !seenUrls.has(key))
@@ -162,23 +155,41 @@
       const { total_tokens, embedding } = await createEmbedding(text);
 
       // Create/update document record.
-      await supabaseClient
-        .from("mdn_doc_macro")
-        .upsert(
-          {
-            mdn_url,
-            title,
-            hash,
-            html,
-            token_count: total_tokens,
-            embedding,
-            text_hash,
-          },
-          { onConflict: "mdn_url" }
-        )
-        .select()
-        .single()
-        .throwOnError();
+      const query = {
+        name: "upsert-embedding-doc",
+        text: `
+          INSERT INTO mdn_doc_macro(
+              mdn_url,
+              title,
+              hash,
+              html,
+              token_count,
+              embedding,
+              text_hash
+            )
+          VALUES($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (mdn_url) DO
+          UPDATE
+          SET mdn_url = $1,
+            title = $2,
+            hash = $3,
+            html = $4,
+            token_count = $5,
+            embedding = $6,
+            text_hash = $7
+        `,
+        values: [
+          mdn_url,
+          title,
+          hash,
+          html,
+          total_tokens,
+          pgvector.toSql(embedding),
+          text_hash,
+        ],
+        rowMode: "array",
+      };
+
+      await pgClient.query(query);
     } catch (err: any) {
       console.error(`!> [${mdn_url}] Failed to update document.`);
       const context = err?.response?.data ?? err?.response ?? err;
@@ -192,20 +203,22 @@
         );
 
         // Create/update document record.
-        await supabaseClient
-          .from("mdn_doc_macro")
-          .upsert(
-            {
-              mdn_url,
-              title,
-              hash,
-              html,
-            },
-            { onConflict: "mdn_url" }
-          )
-          .select()
-          .single()
-          .throwOnError();
+        const query = {
+          name: "upsert-doc",
+          text: `
+            INSERT INTO mdn_doc_macro(mdn_url, title, hash, html)
+            VALUES($1, $2, $3, $4) ON CONFLICT (mdn_url) DO
+            UPDATE
+            SET mdn_url = $1,
+              title = $2,
+              hash = $3,
+              html = $4
+          `,
+          values: [mdn_url, title, hash, html],
+          rowMode: "array",
+        };
+
+        await pgClient.query(query);
       } catch (err: any) {
         console.error(`!> [${mdn_url}] Failed to update document.`);
         const context = err?.response?.data ?? err?.response ?? err;
@@ -219,14 +232,18 @@
     console.log(`Applying deletions...`);
     for (const { id, mdn_url } of deletions) {
       console.log(`-> [${mdn_url}] Deleting indexed document...`);
-      await supabaseClient
-        .from("mdn_doc_macro")
-        .delete()
-        .eq("id", id)
-        .throwOnError();
+      const query = {
+        name: "delete-doc",
+        text: `DELETE from mdn_doc_macro WHERE id = $1`,
+        values: [id],
+        rowMode: "array",
+      };
+
+      await pgClient.query(query);
     }
     console.log(`-> Done.`);
   }
+  pgClient.end();
 }
 
 async function formatDocs(directory: string) {
@@ -449,24 +466,44 @@ export function isNotSupportedAtAll(support: SimpleSupportStatement) {
   return !support.version_added && !hasLimitation(support);
 }
 
-async function fetchAllExistingDocs(supabase: SupabaseClient) {
+async function fetchAllExistingDocs(pgClient) {
   const PAGE_SIZE = 1000;
-  const selectDocs = () =>
-    supabase
-      .from("mdn_doc_macro")
-      .select("id, mdn_url, title, hash, token_count, text_hash")
-      .order("id")
-      .limit(PAGE_SIZE);
+  const selectDocs = async (lastId) => {
+    const query = {
+      name: "fetch-all-doc",
+      text: `
+        SELECT id,
+          mdn_url,
+          title,
+          hash,
+          token_count,
+          text_hash
+        from mdn_doc_macro
+        WHERE id > $1
+        ORDER BY id ASC
+        LIMIT $2
+      `,
+      values: [lastId, PAGE_SIZE],
+      rowMode: "array",
+    };
+    const result = await pgClient.query(query);
+    return result.rows.map(
+      ([id, mdn_url, title, hash, token_count, text_hash]) => {
+        return { id, mdn_url, title, hash, token_count, text_hash };
+      }
+    );
+  };
 
-  let { data } = await selectDocs().throwOnError();
-  let allData = data;
-  while (data.length === PAGE_SIZE) {
-    const lastItem = data[data.length - 1];
-    ({ data } = await selectDocs().gt("id", lastItem.id).throwOnError());
-    allData = [...allData, ...data];
+  const allDocs = [];
+  let docs = await selectDocs(0);
+  allDocs.push(...docs);
+  while (docs.length === PAGE_SIZE) {
+    const lastItem = docs[docs.length - 1];
+    docs = await selectDocs(lastItem.id);
+    allDocs.push(...docs);
   }
 
-  return allData;
+  return allDocs;
 }
 
 // CLI.
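Two details in the rewritten script are easy to miss. First, every query object sets rowMode: "array", so node-postgres returns each row as a positional array rather than an object keyed by column name; that is why fetchAllExistingDocs destructures rows positionally. Second, pagination is now keyset-based (WHERE id > $1 ORDER BY id ASC LIMIT $2) instead of Supabase's .gt()/.limit() builders. A minimal illustration of the rowMode behavior (columns assumed from the mdn_doc_macro table above):

const result = await pgClient.query({
  text: "SELECT id, mdn_url FROM mdn_doc_macro ORDER BY id LIMIT 1",
  rowMode: "array",
});
// With rowMode: "array", each row is [id, mdn_url], not { id, mdn_url }.
const [id, mdn_url] = result.rows[0] ?? [];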