Skip to content

Commit

Permalink
feat(ai-help): index content as markdown (#10330)
Browse files Browse the repository at this point in the history
Convert the document used as context from HTML to Markdown.
This reduces the token count by ~18% on average.

Co-authored-by: Claas Augner <[email protected]>
  • Loading branch information
fiji-flo and caugner authored Jan 30, 2024
1 parent ea3a849 commit 337d0b1
Show file tree
Hide file tree
Showing 6 changed files with 126 additions and 13 deletions.
15 changes: 15 additions & 0 deletions markdown/h2m/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { unified } from "unified";
import rehypeParse from "rehype-parse";
import rehypeRemark from "rehype-remark";
import remarkStringify from "remark-stringify";
import remarkGfm from "remark-gfm";

/**
 * Synchronously converts an HTML string to Markdown.
 *
 * The pipeline parses the input with `rehype-parse`, bridges the result to
 * the remark ecosystem with `rehype-remark`, and serializes it with
 * `remark-stringify`, with the `remark-gfm` plugin enabled.
 *
 * @param html - The HTML source to convert.
 * @returns The serialized output of the pipeline as a string.
 */
export function h2mSync(html: string) {
  // Assemble the HTML -> Markdown processor first, then run it in one step.
  const htmlToMarkdown = unified()
    .use(rehypeParse)
    .use(rehypeRemark)
    .use(remarkGfm)
    .use(remarkStringify);

  const output = htmlToMarkdown.processSync(html);
  return String(output);
}
1 change: 1 addition & 0 deletions markdown/index.ts
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
export * from "./utils/index.js";
export * from "./m2h/index.js";
export * from "./h2m/index.js";
3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -126,12 +126,15 @@
"react-modal": "^3.16.1",
"read-chunk": "^4.0.3",
"rehype-format": "^5.0.0",
"rehype-parse": "^9.0.0",
"rehype-raw": "^7.0.0",
"rehype-remark": "^10.0.0",
"rehype-sanitize": "^6.0.0",
"rehype-stringify": "^10.0.0",
"remark-gfm": "^4.0.0",
"remark-parse": "^11.0.0",
"remark-rehype": "^11.1.0",
"remark-stringify": "^11.0.0",
"sanitize-filename": "^1.6.3",
"send": "^0.18.0",
"source-map-support": "^0.5.21",
Expand Down
44 changes: 31 additions & 13 deletions scripts/ai-help-macros.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import {
SimpleSupportStatement,
VersionValue,
} from "@mdn/browser-compat-data/types";
import { h2mSync } from "../markdown/index.js";

const { program } = caporal;

Expand All @@ -37,6 +38,7 @@ interface Doc {
title: string;
hash: string;
html: string;
markdown: string;
text?: string;
text_hash?: string;
}
Expand Down Expand Up @@ -106,7 +108,7 @@ export async function updateEmbeddings(
const updates: Doc[] = [];
const formattingUpdates: Doc[] = [];

for await (const { mdn_url, title, hash, html, text } of builtDocs(
for await (const { mdn_url, title, hash, html, markdown, text } of builtDocs(
directory
)) {
seenUrls.add(mdn_url);
Expand All @@ -122,6 +124,7 @@ export async function updateEmbeddings(
title,
hash,
html,
markdown,
text,
text_hash,
});
Expand All @@ -131,6 +134,7 @@ export async function updateEmbeddings(
title,
hash,
html,
markdown,
});
}
}
Expand All @@ -147,7 +151,15 @@ export async function updateEmbeddings(

if (updates.length > 0 || formattingUpdates.length > 0) {
console.log(`Applying updates...`);
for (const { mdn_url, title, hash, html, text, text_hash } of updates) {
for (const {
mdn_url,
title,
hash,
html,
markdown,
text,
text_hash,
} of updates) {
try {
console.log(`-> [${mdn_url}] Updating document...`);

Expand All @@ -163,25 +175,28 @@ export async function updateEmbeddings(
title,
hash,
html,
markdown,
token_count,
embedding,
text_hash
)
VALUES($1, $2, $3, $4, $5, $6, $7) ON CONFLICT (mdn_url) DO
VALUES($1, $2, $3, $4, $5, $6, $7, $8) ON CONFLICT (mdn_url) DO
UPDATE
SET mdn_url = $1,
title = $2,
hash = $3,
html = $4,
token_count = $5,
embedding = $6,
text_hash = $7
markdown = $5,
token_count = $6,
embedding = $7,
text_hash = $8
`,
values: [
mdn_url,
title,
hash,
html,
markdown,
total_tokens,
pgvector.toSql(embedding),
text_hash,
Expand All @@ -196,7 +211,7 @@ export async function updateEmbeddings(
console.error(context);
}
}
for (const { mdn_url, title, hash, html } of formattingUpdates) {
for (const { mdn_url, title, hash, html, markdown } of formattingUpdates) {
try {
console.log(
`-> [${mdn_url}] Updating document without generating new embedding...`
Expand All @@ -206,15 +221,16 @@ export async function updateEmbeddings(
const query = {
name: "upsert-doc",
text: `
INSERT INTO mdn_doc_macro(mdn_url, title, hash, html)
VALUES($1, $2, $3, $4) ON CONFLICT (mdn_url) DO
INSERT INTO mdn_doc_macro(mdn_url, title, hash, html, markdown)
VALUES($1, $2, $3, $4, $5) ON CONFLICT (mdn_url) DO
UPDATE
SET mdn_url = $1,
title = $2,
hash = $3,
html = $4
html = $4,
markdown = $5
`,
values: [mdn_url, title, hash, html],
values: [mdn_url, title, hash, html, markdown],
rowMode: "array",
};

Expand Down Expand Up @@ -247,8 +263,8 @@ export async function updateEmbeddings(
}

async function formatDocs(directory: string) {
for await (const { html, text } of builtDocs(directory)) {
console.log(html, text);
for await (const { html, markdown, text } of builtDocs(directory)) {
console.log(html, markdown, text);
}
}

Expand Down Expand Up @@ -288,6 +304,7 @@ async function* builtDocs(directory: string) {
$(el).replaceWith(buildBCDTable($(el).data("query") as string));
});
const html = $.html();
const markdown = h2mSync(html);

// reformat text version, used for embedding
$("title").remove();
Expand All @@ -299,6 +316,7 @@ async function* builtDocs(directory: string) {
title,
hash,
html,
markdown,
text,
};
} catch (e) {
Expand Down
1 change: 1 addition & 0 deletions scripts/ai-help.sql
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ create table
title text not null,
mdn_url text not null,
html text null,
markdown text null,
token_count integer null,
embedding extensions.vector null,
text_hash text null,
Expand Down
75 changes: 75 additions & 0 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -7754,6 +7754,18 @@ hast-util-embedded@^3.0.0:
"@types/hast" "^3.0.0"
hast-util-is-element "^3.0.0"

hast-util-from-html@^2.0.0:
version "2.0.1"
resolved "https://registry.yarnpkg.com/hast-util-from-html/-/hast-util-from-html-2.0.1.tgz#9cd38ee81bf40b2607368b92a04b0905fa987488"
integrity sha512-RXQBLMl9kjKVNkJTIO6bZyb2n+cUH8LFaSSzo82jiLT6Tfc+Pt7VQCS+/h3YwG4jaNE2TA2sdJisGWR+aJrp0g==
dependencies:
"@types/hast" "^3.0.0"
devlop "^1.1.0"
hast-util-from-parse5 "^8.0.0"
parse5 "^7.0.0"
vfile "^6.0.0"
vfile-message "^4.0.0"

hast-util-from-parse5@^8.0.0:
version "8.0.1"
resolved "https://registry.yarnpkg.com/hast-util-from-parse5/-/hast-util-from-parse5-8.0.1.tgz#654a5676a41211e14ee80d1b1758c399a0327651"
Expand Down Expand Up @@ -7868,6 +7880,26 @@ hast-util-to-jsx-runtime@^2.0.0:
unist-util-position "^5.0.0"
vfile-message "^4.0.0"

hast-util-to-mdast@^10.0.0:
version "10.1.0"
resolved "https://registry.yarnpkg.com/hast-util-to-mdast/-/hast-util-to-mdast-10.1.0.tgz#906c80fc263a9f09a33462317ffc6ad94f4ee3db"
integrity sha512-DsL/SvCK9V7+vfc6SLQ+vKIyBDXTk2KLSbfBYkH4zeF/uR1yBajHRhkzuaUSGOB1WJSTieJBdHwxlC+HLKvZZw==
dependencies:
"@types/hast" "^3.0.0"
"@types/mdast" "^4.0.0"
"@ungap/structured-clone" "^1.0.0"
hast-util-phrasing "^3.0.0"
hast-util-to-html "^9.0.0"
hast-util-to-text "^4.0.0"
hast-util-whitespace "^3.0.0"
mdast-util-phrasing "^4.0.0"
mdast-util-to-hast "^13.0.0"
mdast-util-to-string "^4.0.0"
rehype-minify-whitespace "^6.0.0"
trim-trailing-lines "^2.0.0"
unist-util-position "^5.0.0"
unist-util-visit "^5.0.0"

hast-util-to-parse5@^8.0.0:
version "8.0.0"
resolved "https://registry.yarnpkg.com/hast-util-to-parse5/-/hast-util-to-parse5-8.0.0.tgz#477cd42d278d4f036bc2ea58586130f6f39ee6ed"
Expand All @@ -7881,6 +7913,16 @@ hast-util-to-parse5@^8.0.0:
web-namespaces "^2.0.0"
zwitch "^2.0.0"

hast-util-to-text@^4.0.0:
version "4.0.0"
resolved "https://registry.yarnpkg.com/hast-util-to-text/-/hast-util-to-text-4.0.0.tgz#7f33a45d0bf7981ead44e82d9d8d75f511b3642f"
integrity sha512-EWiE1FSArNBPUo1cKWtzqgnuRQwEeQbQtnFJRYV1hb1BWDgrAlBU0ExptvZMM/KSA82cDpm2sFGf3Dmc5Mza3w==
dependencies:
"@types/hast" "^3.0.0"
"@types/unist" "^3.0.0"
hast-util-is-element "^3.0.0"
unist-util-find-after "^5.0.0"

hast-util-whitespace@^3.0.0:
version "3.0.0"
resolved "https://registry.yarnpkg.com/hast-util-whitespace/-/hast-util-whitespace-3.0.0.tgz#7778ed9d3c92dd9e8c5c8f648a49c21fc51cb621"
Expand Down Expand Up @@ -12870,6 +12912,15 @@ rehype-minify-whitespace@^6.0.0:
hast-util-whitespace "^3.0.0"
unist-util-is "^6.0.0"

rehype-parse@^9.0.0:
version "9.0.0"
resolved "https://registry.yarnpkg.com/rehype-parse/-/rehype-parse-9.0.0.tgz#3949faeec6f466ec57774215661e0d75469195d9"
integrity sha512-WG7nfvmWWkCR++KEkZevZb/uw41E8TsH4DsY9UxsTbIXCVGbAs4S+r8FrQ+OtH5EEQAs+5UxKC42VinkmpA1Yw==
dependencies:
"@types/hast" "^3.0.0"
hast-util-from-html "^2.0.0"
unified "^11.0.0"

rehype-raw@^7.0.0:
version "7.0.0"
resolved "https://registry.yarnpkg.com/rehype-raw/-/rehype-raw-7.0.0.tgz#59d7348fd5dbef3807bbaa1d443efd2dd85ecee4"
Expand All @@ -12879,6 +12930,17 @@ rehype-raw@^7.0.0:
hast-util-raw "^9.0.0"
vfile "^6.0.0"

rehype-remark@^10.0.0:
version "10.0.0"
resolved "https://registry.yarnpkg.com/rehype-remark/-/rehype-remark-10.0.0.tgz#de15bf1f920ce519291848cd0d99aabaad44cf71"
integrity sha512-+aDXY/icqMFOafJQomVjxe3BAP7aR3lIsQ3GV6VIwpbCD2nvNFOXjGvotMe5p0Ny+Gt6L13DhEf/FjOOpTuUbQ==
dependencies:
"@types/hast" "^3.0.0"
"@types/mdast" "^4.0.0"
hast-util-to-mdast "^10.0.0"
unified "^11.0.0"
vfile "^6.0.0"

rehype-sanitize@^6.0.0:
version "6.0.0"
resolved "https://registry.yarnpkg.com/rehype-sanitize/-/rehype-sanitize-6.0.0.tgz#16e95f4a67a69cbf0f79e113c8e0df48203db73c"
Expand Down Expand Up @@ -14686,6 +14748,11 @@ trim-repeated@^1.0.0:
dependencies:
escape-string-regexp "^1.0.2"

trim-trailing-lines@^2.0.0:
version "2.1.0"
resolved "https://registry.yarnpkg.com/trim-trailing-lines/-/trim-trailing-lines-2.1.0.tgz#9aac7e89b09cb35badf663de7133c6de164f86df"
integrity sha512-5UR5Biq4VlVOtzqkm2AZlgvSlDJtME46uV0br0gENbwN4l5+mMKT4b9gJKqWtuL2zAIqajGJGuvbCbcAJUZqBg==

triple-beam@^1.3.0:
version "1.3.0"
resolved "https://registry.yarnpkg.com/triple-beam/-/triple-beam-1.3.0.tgz#a595214c7298db8339eeeee083e4d10bd8cb8dd9"
Expand Down Expand Up @@ -15002,6 +15069,14 @@ unist-builder@^4.0.0:
dependencies:
"@types/unist" "^3.0.0"

unist-util-find-after@^5.0.0:
version "5.0.0"
resolved "https://registry.yarnpkg.com/unist-util-find-after/-/unist-util-find-after-5.0.0.tgz#3fccc1b086b56f34c8b798e1ff90b5c54468e896"
integrity sha512-amQa0Ep2m6hE2g72AugUItjbuM8X8cGQnFoHk0pGfrFeT9GZhzN5SW8nRsiGKK7Aif4CrACPENkA6P/Lw6fHGQ==
dependencies:
"@types/unist" "^3.0.0"
unist-util-is "^6.0.0"

unist-util-is@^5.0.0:
version "5.1.1"
resolved "https://registry.yarnpkg.com/unist-util-is/-/unist-util-is-5.1.1.tgz#e8aece0b102fa9bc097b0fef8f870c496d4a6236"
Expand Down

0 comments on commit 337d0b1

Please sign in to comment.