From 32840b341fee56ca9cdf08cb5b134318ab9acd3b Mon Sep 17 00:00:00 2001 From: Eze Date: Sat, 2 Dec 2023 01:00:15 -0300 Subject: [PATCH 1/3] Add ObsidianLoader integration --- docs/api_refs/typedoc.json | 1 + .../test-exports-bun/src/entrypoints.js | 1 + .../test-exports-cf/src/entrypoints.js | 1 + .../test-exports-cjs/src/entrypoints.js | 1 + .../test-exports-esbuild/src/entrypoints.js | 1 + .../test-exports-esm/src/entrypoints.js | 1 + .../test-exports-vercel/src/entrypoints.js | 1 + .../test-exports-vite/src/entrypoints.js | 1 + .../example_data/obsidian/bad_frontmatter.md | 9 + .../example_data/obsidian/frontmatter.md | 5 + .../example_data/obsidian/no_frontmatter.md | 13 + .../example_data/obsidian/no_metadata.md | 1 + .../obsidian/tags_and_frontmatter.md | 35 +++ examples/src/document_loaders/obsidian.ts | 9 + langchain/.gitignore | 3 + langchain/package.json | 8 + langchain/scripts/create-entrypoints.js | 1 + langchain/src/document_loaders/fs/obsidian.ts | 262 ++++++++++++++++++ .../example_data/obsidian/bad_frontmatter.md | 9 + .../example_data/obsidian/frontmatter.md | 5 + .../example_data/obsidian/no_frontmatter.md | 13 + .../example_data/obsidian/no_metadata.md | 1 + .../obsidian/tags_and_frontmatter.md | 35 +++ .../document_loaders/tests/obsidian.test.ts | 185 +++++++++++++ langchain/src/load/import_map.ts | 1 + 25 files changed, 603 insertions(+) create mode 100644 examples/src/document_loaders/example_data/obsidian/bad_frontmatter.md create mode 100644 examples/src/document_loaders/example_data/obsidian/frontmatter.md create mode 100644 examples/src/document_loaders/example_data/obsidian/no_frontmatter.md create mode 100644 examples/src/document_loaders/example_data/obsidian/no_metadata.md create mode 100644 examples/src/document_loaders/example_data/obsidian/tags_and_frontmatter.md create mode 100644 examples/src/document_loaders/obsidian.ts create mode 100644 langchain/src/document_loaders/fs/obsidian.ts create mode 100644 langchain/src/document_loaders/tests/example_data/obsidian/bad_frontmatter.md create mode 100644 langchain/src/document_loaders/tests/example_data/obsidian/frontmatter.md create mode 100644 langchain/src/document_loaders/tests/example_data/obsidian/no_frontmatter.md create mode 100644 langchain/src/document_loaders/tests/example_data/obsidian/no_metadata.md create mode 100644 langchain/src/document_loaders/tests/example_data/obsidian/tags_and_frontmatter.md create mode 100644 langchain/src/document_loaders/tests/obsidian.test.ts diff --git a/docs/api_refs/typedoc.json b/docs/api_refs/typedoc.json index 7ff44c31fe65..7a129d80af97 100644 --- a/docs/api_refs/typedoc.json +++ b/docs/api_refs/typedoc.json @@ -174,6 +174,7 @@ "./langchain/src/document_loaders/fs/epub.ts", "./langchain/src/document_loaders/fs/csv.ts", "./langchain/src/document_loaders/fs/notion.ts", + "./langchain/src/document_loaders/fs/obsidian.ts", "./langchain/src/document_loaders/fs/unstructured.ts", "./langchain/src/document_loaders/fs/openai_whisper_audio.ts", "./langchain/src/document_loaders/fs/pptx.ts", diff --git a/environment_tests/test-exports-bun/src/entrypoints.js b/environment_tests/test-exports-bun/src/entrypoints.js index 61b2f0553453..f4dd67cb26b8 100644 --- a/environment_tests/test-exports-bun/src/entrypoints.js +++ b/environment_tests/test-exports-bun/src/entrypoints.js @@ -46,6 +46,7 @@ export * from "langchain/document_loaders/base"; export * from "langchain/document_loaders/web/searchapi"; export * from "langchain/document_loaders/web/serpapi"; export * from "langchain/document_loaders/web/sort_xyz_blockchain"; +export * from "langchain/document_loaders/fs/obsidian"; export * from "langchain/document_transformers/openai_functions"; export * from "langchain/chat_models/base"; export * from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-cf/src/entrypoints.js b/environment_tests/test-exports-cf/src/entrypoints.js index 61b2f0553453..f4dd67cb26b8 100644 --- a/environment_tests/test-exports-cf/src/entrypoints.js +++ b/environment_tests/test-exports-cf/src/entrypoints.js @@ -46,6 +46,7 @@ export * from "langchain/document_loaders/base"; export * from "langchain/document_loaders/web/searchapi"; export * from "langchain/document_loaders/web/serpapi"; export * from "langchain/document_loaders/web/sort_xyz_blockchain"; +export * from "langchain/document_loaders/fs/obsidian"; export * from "langchain/document_transformers/openai_functions"; export * from "langchain/chat_models/base"; export * from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-cjs/src/entrypoints.js b/environment_tests/test-exports-cjs/src/entrypoints.js index 725b9cd0477c..8526ba2a7eba 100644 --- a/environment_tests/test-exports-cjs/src/entrypoints.js +++ b/environment_tests/test-exports-cjs/src/entrypoints.js @@ -46,6 +46,7 @@ const document_loaders_base = require("langchain/document_loaders/base"); const document_loaders_web_searchapi = require("langchain/document_loaders/web/searchapi"); const document_loaders_web_serpapi = require("langchain/document_loaders/web/serpapi"); const document_loaders_web_sort_xyz_blockchain = require("langchain/document_loaders/web/sort_xyz_blockchain"); +const document_loaders_fs_obsidian = require("langchain/document_loaders/fs/obsidian"); const document_transformers_openai_functions = require("langchain/document_transformers/openai_functions"); const chat_models_base = require("langchain/chat_models/base"); const chat_models_openai = require("langchain/chat_models/openai"); diff --git a/environment_tests/test-exports-esbuild/src/entrypoints.js b/environment_tests/test-exports-esbuild/src/entrypoints.js index 9c4a916789c5..e54679ba65a5 100644 --- a/environment_tests/test-exports-esbuild/src/entrypoints.js +++ b/environment_tests/test-exports-esbuild/src/entrypoints.js @@ -46,6 +46,7 @@ import * as document_loaders_base from "langchain/document_loaders/base"; import * as document_loaders_web_searchapi from "langchain/document_loaders/web/searchapi"; import * as document_loaders_web_serpapi from "langchain/document_loaders/web/serpapi"; import * as document_loaders_web_sort_xyz_blockchain from "langchain/document_loaders/web/sort_xyz_blockchain"; +import * as document_loaders_fs_obsidian from "langchain/document_loaders/fs/obsidian"; import * as document_transformers_openai_functions from "langchain/document_transformers/openai_functions"; import * as chat_models_base from "langchain/chat_models/base"; import * as chat_models_openai from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-esm/src/entrypoints.js b/environment_tests/test-exports-esm/src/entrypoints.js index 9c4a916789c5..e54679ba65a5 100644 --- a/environment_tests/test-exports-esm/src/entrypoints.js +++ b/environment_tests/test-exports-esm/src/entrypoints.js @@ -46,6 +46,7 @@ import * as document_loaders_base from "langchain/document_loaders/base"; import * as document_loaders_web_searchapi from "langchain/document_loaders/web/searchapi"; import * as document_loaders_web_serpapi from "langchain/document_loaders/web/serpapi"; import * as document_loaders_web_sort_xyz_blockchain from "langchain/document_loaders/web/sort_xyz_blockchain"; +import * as document_loaders_fs_obsidian from "langchain/document_loaders/fs/obsidian"; import * as document_transformers_openai_functions from "langchain/document_transformers/openai_functions"; import * as chat_models_base from "langchain/chat_models/base"; import * as chat_models_openai from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-vercel/src/entrypoints.js b/environment_tests/test-exports-vercel/src/entrypoints.js index 61b2f0553453..f4dd67cb26b8 100644 --- a/environment_tests/test-exports-vercel/src/entrypoints.js +++ b/environment_tests/test-exports-vercel/src/entrypoints.js @@ -46,6 +46,7 @@ export * from "langchain/document_loaders/base"; export * from "langchain/document_loaders/web/searchapi"; export * from "langchain/document_loaders/web/serpapi"; export * from "langchain/document_loaders/web/sort_xyz_blockchain"; +export * from "langchain/document_loaders/fs/obsidian"; export * from "langchain/document_transformers/openai_functions"; export * from "langchain/chat_models/base"; export * from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-vite/src/entrypoints.js b/environment_tests/test-exports-vite/src/entrypoints.js index 61b2f0553453..f4dd67cb26b8 100644 --- a/environment_tests/test-exports-vite/src/entrypoints.js +++ b/environment_tests/test-exports-vite/src/entrypoints.js @@ -46,6 +46,7 @@ export * from "langchain/document_loaders/base"; export * from "langchain/document_loaders/web/searchapi"; export * from "langchain/document_loaders/web/serpapi"; export * from "langchain/document_loaders/web/sort_xyz_blockchain"; +export * from "langchain/document_loaders/fs/obsidian"; export * from "langchain/document_transformers/openai_functions"; export * from "langchain/chat_models/base"; export * from "langchain/chat_models/openai"; diff --git a/examples/src/document_loaders/example_data/obsidian/bad_frontmatter.md b/examples/src/document_loaders/example_data/obsidian/bad_frontmatter.md new file mode 100644 index 000000000000..57698653173d --- /dev/null +++ b/examples/src/document_loaders/example_data/obsidian/bad_frontmatter.md @@ -0,0 +1,9 @@ +--- +anArray: + one +- two +- three +tags: 'onetag', 'twotag' ] +--- + +A document with frontmatter that isn't valid. \ No newline at end of file diff --git a/examples/src/document_loaders/example_data/obsidian/frontmatter.md b/examples/src/document_loaders/example_data/obsidian/frontmatter.md new file mode 100644 index 000000000000..80396d268f94 --- /dev/null +++ b/examples/src/document_loaders/example_data/obsidian/frontmatter.md @@ -0,0 +1,5 @@ +--- +tags: journal/entry, obsidian +--- + +No other content than the frontmatter. \ No newline at end of file diff --git a/examples/src/document_loaders/example_data/obsidian/no_frontmatter.md b/examples/src/document_loaders/example_data/obsidian/no_frontmatter.md new file mode 100644 index 000000000000..74c2405506e2 --- /dev/null +++ b/examples/src/document_loaders/example_data/obsidian/no_frontmatter.md @@ -0,0 +1,13 @@ +### Description +#recipes #dessert #cookies + +A document with HR elements that might trip up a front matter parser: + +--- + +### Ingredients + +- 3/4 cup (170g) **unsalted butter**, slightly softened to room temperature. +- 1 and 1/2 cups (180g) **confectioners’ sugar** + +--- \ No newline at end of file diff --git a/examples/src/document_loaders/example_data/obsidian/no_metadata.md b/examples/src/document_loaders/example_data/obsidian/no_metadata.md new file mode 100644 index 000000000000..991d076e28da --- /dev/null +++ b/examples/src/document_loaders/example_data/obsidian/no_metadata.md @@ -0,0 +1 @@ +A markdown document with no additional metadata. \ No newline at end of file diff --git a/examples/src/document_loaders/example_data/obsidian/tags_and_frontmatter.md b/examples/src/document_loaders/example_data/obsidian/tags_and_frontmatter.md new file mode 100644 index 000000000000..cb373d396806 --- /dev/null +++ b/examples/src/document_loaders/example_data/obsidian/tags_and_frontmatter.md @@ -0,0 +1,35 @@ +--- +aFloat: 13.12345 +anInt: 15 +aBool: true +aString: string value +anArray: +- one +- two +- three +aDict: + dictId1: '58417' + dictId2: 1500 +tags: [ 'onetag', 'twotag' ] +--- + +# Tags + + ()#notatag +#12345 + #read +something #tagWithCases +- #tag-with-dash +#tag_with_underscore #tag/with/nesting + +# Dataview + +Here is some data in a [dataview1:: a value] line. +Here is even more data in a (dataview2:: another value) line. +dataview3:: more data +notdataview4: this is not a field +notdataview5: this is not a field + +# Text content + +https://example.com/blog/#not-a-tag \ No newline at end of file diff --git a/examples/src/document_loaders/obsidian.ts b/examples/src/document_loaders/obsidian.ts new file mode 100644 index 000000000000..093c345d2966 --- /dev/null +++ b/examples/src/document_loaders/obsidian.ts @@ -0,0 +1,9 @@ +import { ObsidianLoader } from "langchain/document_loaders/fs/obsidian"; + +export const run = async () => { + const loader = new ObsidianLoader("src/document_loaders/example_data/obsidian"); + + const docs = await loader.load(); + + console.log({ docs }); +}; diff --git a/langchain/.gitignore b/langchain/.gitignore index 886cf43390bd..5e8b191edfc2 100644 --- a/langchain/.gitignore +++ b/langchain/.gitignore @@ -466,6 +466,9 @@ document_loaders/fs/csv.d.ts document_loaders/fs/notion.cjs document_loaders/fs/notion.js document_loaders/fs/notion.d.ts +document_loaders/fs/obsidian.cjs +document_loaders/fs/obsidian.js +document_loaders/fs/obsidian.d.ts document_loaders/fs/unstructured.cjs document_loaders/fs/unstructured.js document_loaders/fs/unstructured.d.ts diff --git a/langchain/package.json b/langchain/package.json index c855ae367c8f..fd4164fa0860 100644 --- a/langchain/package.json +++ b/langchain/package.json @@ -478,6 +478,9 @@ "document_loaders/fs/notion.cjs", "document_loaders/fs/notion.js", "document_loaders/fs/notion.d.ts", + "document_loaders/fs/obsidian.cjs", + "document_loaders/fs/obsidian.js", + "document_loaders/fs/obsidian.d.ts", "document_loaders/fs/unstructured.cjs", "document_loaders/fs/unstructured.js", "document_loaders/fs/unstructured.d.ts", @@ -2233,6 +2236,11 @@ "import": "./document_loaders/fs/notion.js", "require": "./document_loaders/fs/notion.cjs" }, + "./document_loaders/fs/obsidian": { + "types": "./document_loaders/fs/obsidian.d.ts", + "import": "./document_loaders/fs/obsidian.js", + "require": "./document_loaders/fs/obsidian.cjs" + }, "./document_loaders/fs/unstructured": { "types": "./document_loaders/fs/unstructured.d.ts", "import": "./document_loaders/fs/unstructured.js", diff --git a/langchain/scripts/create-entrypoints.js b/langchain/scripts/create-entrypoints.js index 446eaac7856b..0ceeeaa6e33e 100644 --- a/langchain/scripts/create-entrypoints.js +++ b/langchain/scripts/create-entrypoints.js @@ -181,6 +181,7 @@ const entrypoints = { "document_loaders/fs/epub": "document_loaders/fs/epub", "document_loaders/fs/csv": "document_loaders/fs/csv", "document_loaders/fs/notion": "document_loaders/fs/notion", + "document_loaders/fs/obsidian": "document_loaders/fs/obsidian", "document_loaders/fs/unstructured": "document_loaders/fs/unstructured", "document_loaders/fs/openai_whisper_audio": "document_loaders/fs/openai_whisper_audio", diff --git a/langchain/src/document_loaders/fs/obsidian.ts b/langchain/src/document_loaders/fs/obsidian.ts new file mode 100644 index 000000000000..783ddae081bb --- /dev/null +++ b/langchain/src/document_loaders/fs/obsidian.ts @@ -0,0 +1,262 @@ +import type { basename as BasenameT } from "node:path"; +import type { readFile as ReadFileT, stat as StatT } from "node:fs/promises"; +import yaml from "js-yaml"; +import { getEnv } from "../../util/env.js"; +import { DirectoryLoader, UnknownHandling } from "./directory.js"; +import { BaseDocumentLoader } from "../base.js"; +import { Document } from "../../document.js"; + +export type FrontMatter = { + title?: string; + description?: string; + tags?: string[] | string; + [key: string]: unknown; +}; + +export interface ObsidianFileLoaderOptions { + encoding?: BufferEncoding; + collectMetadata?: boolean; +} + +/** + * Represents a loader for Obsidian markdown files. This loader extends the BaseDocumentLoader + * and provides functionality to parse and extract metadata, tags, and dataview fields from + * Obsidian markdown files. + */ +class ObsidianFileLoader extends BaseDocumentLoader { + private filePath: string; + + private encoding: BufferEncoding; + + private collectMetadata: boolean; + + /** + * Initializes a new instance of the ObsidianFileLoader class. + * @param filePath The path to the Obsidian markdown file. + * @param encoding The character encoding to use when reading the file. Defaults to 'utf-8'. + * @param collectMetadata Determines whether metadata should be collected from the file. Defaults to true. + */ + constructor(filePath: string, { encoding = "utf-8", collectMetadata = true }: ObsidianFileLoaderOptions = {}) { + super() + this.filePath = filePath; + this.encoding = encoding; + this.collectMetadata = collectMetadata; + } + + private static FRONT_MATTER_REGEX = /^---\n(.*?)\n---\n/s; + + /** + * Parses the YAML front matter from the given content string. + * @param content The string content of the markdown file. + * @returns An object representing the parsed front matter. + */ + private parseFrontMatter(content: string): FrontMatter { + if (!this.collectMetadata) { + return {}; + } + + const match = content.match(ObsidianFileLoader.FRONT_MATTER_REGEX); + if (!match) { + return {}; + } + + try { + const frontMatter = yaml.load(match[1]) as FrontMatter; + if (frontMatter.tags && typeof frontMatter.tags === "string") { + frontMatter.tags = frontMatter.tags.split(", "); + } + + return frontMatter; + } catch (e) { + console.warn("Encountered non-yaml frontmatter"); + return {} + } + } + + /** + * Removes YAML front matter from the given content string. + * @param content The string content of the markdown file. + * @returns The content string with the front matter removed. + */ + private removeFrontMatter(content: string): string { + if (!this.collectMetadata) { + return content; + } + + return content.replace(ObsidianFileLoader.FRONT_MATTER_REGEX, ""); + } + + private static TAG_REGEX = /(?:\s|^)#([a-zA-Z_][\w/-]*)/g; + + /** + * Parses Obsidian-style tags from the given content string. + * @param content The string content of the markdown file. + * @returns A set of parsed tags. + */ + private parseObsidianTags(content: string): Set { + if (!this.collectMetadata) { + return new Set(); + } + + const matches = content.matchAll(ObsidianFileLoader.TAG_REGEX); + const tags = new Set(); + for (const match of matches) { + tags.add(match[1]); + } + + return tags; + } + + private static DATAVIEW_LINE_REGEX = /^\s*(\w+)::\s*(.*)$/gm; + + private static DATAVIEW_INLINE_BRACKET_REGEX = /\[(\w+)::\s*(.*)\]/gm; + + private static DATAVIEW_INLINE_PAREN_REGEX = /\((\w+)::\s*(.*)\)/gm; + + /** + * Parses dataview fields from the given content string. + * @param content The string content of the markdown file. + * @returns A record object containing key-value pairs of dataview fields. + */ + private parseObsidianDataviewFields(content: string): Record { + if (!this.collectMetadata) { + return {}; + } + + const fields: Record = {}; + const lineMatches = content.matchAll( + ObsidianFileLoader.DATAVIEW_LINE_REGEX + ); + for (const [, key, value] of lineMatches) { + fields[key] = value; + } + + const bracketMatches = content.matchAll( + ObsidianFileLoader.DATAVIEW_INLINE_BRACKET_REGEX + ); + for (const [, key, value] of bracketMatches) { + fields[key] = value; + } + + const parenMatches = content.matchAll( + ObsidianFileLoader.DATAVIEW_INLINE_PAREN_REGEX + ); + for (const [, key, value] of parenMatches) { + fields[key] = value; + } + + return fields; + } + + /** + * Converts metadata to a format compatible with Langchain. + * @param metadata The metadata object to convert. + * @returns A record object containing key-value pairs of Langchain-compatible metadata. + */ + private toLangchainCompatibleMetadata( + metadata: Record + ) { + const result: Record = {}; + for (const [key, value] of Object.entries(metadata)) { + if (typeof value === "string" || typeof value === "number") { + result[key] = value; + } else { + result[key] = JSON.stringify(value); + } + } + return result; + } + + + /** + * It loads the Obsidian file, parses it, and returns a `Document` instance. + * @returns An array of `Document` instances to comply with the BaseDocumentLoader interface. + */ + public async load(): Promise { + const documents: Document[] = []; + + const { basename, readFile, stat } = await ObsidianFileLoader.imports(); + const fileName = basename(this.filePath) + const stats = await stat(this.filePath); + let content = await readFile(this.filePath, this.encoding); + + const frontMatter = this.parseFrontMatter(content); + const tags = this.parseObsidianTags(content); + const dataviewFields = this.parseObsidianDataviewFields(content); + content = this.removeFrontMatter(content); + + const metadata: Document["metadata"] = { + source: fileName, + path: this.filePath, + created: stats.birthtimeMs, + lastModified: stats.mtimeMs, + lastAccessed: stats.atimeMs, + ...this.toLangchainCompatibleMetadata(frontMatter), + ...dataviewFields, + }; + + if (tags.size || frontMatter.tags) { + metadata.tags = Array.from( + new Set([...tags, ...(frontMatter.tags ?? [])]) + ).join(","); + } + + documents.push( + new Document({ + pageContent: content, + metadata, + }) + ); + + return documents; + } + + /** + * Imports the necessary functions from the `node:path` and + * `node:fs/promises` modules. It is used to dynamically import the + * functions when needed. If the import fails, it throws an error + * indicating that the modules failed to load. + * @returns A promise that resolves to an object containing the imported functions. + */ + static async imports(): Promise<{ + basename: typeof BasenameT; + readFile: typeof ReadFileT; + stat: typeof StatT; + }> { + try { + const { basename } = await import("node:path"); + const { readFile, stat } = await import("node:fs/promises"); + return { basename, readFile, stat }; + } catch (e) { + console.error(e); + throw new Error( + `Failed to load fs/promises. ObsidianFileLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https:// for alternatives.` + ); + } + } +} + +/** + * Represents a loader for directories containing Obsidian markdown files. This loader extends + * the DirectoryLoader and provides functionality to load and parse '.md' files with YAML frontmatter, + * Obsidian tags, and Dataview fields. + */ +export class ObsidianLoader extends DirectoryLoader { + /** + * Initializes a new instance of the ObsidianLoader class. + * @param directoryPath The path to the directory containing Obsidian markdown files. + * @param encoding The character encoding to use when reading files. Defaults to 'utf-8'. + * @param collectMetadata Determines whether metadata should be collected from the files. Defaults to true. + */ + constructor(directoryPath: string, options?: ObsidianFileLoaderOptions) { + super( + directoryPath, + { + ".md": (filePath) => new ObsidianFileLoader(filePath, options), + }, + true, + UnknownHandling.Ignore + ); + } +} + diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/bad_frontmatter.md b/langchain/src/document_loaders/tests/example_data/obsidian/bad_frontmatter.md new file mode 100644 index 000000000000..57698653173d --- /dev/null +++ b/langchain/src/document_loaders/tests/example_data/obsidian/bad_frontmatter.md @@ -0,0 +1,9 @@ +--- +anArray: + one +- two +- three +tags: 'onetag', 'twotag' ] +--- + +A document with frontmatter that isn't valid. \ No newline at end of file diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/frontmatter.md b/langchain/src/document_loaders/tests/example_data/obsidian/frontmatter.md new file mode 100644 index 000000000000..80396d268f94 --- /dev/null +++ b/langchain/src/document_loaders/tests/example_data/obsidian/frontmatter.md @@ -0,0 +1,5 @@ +--- +tags: journal/entry, obsidian +--- + +No other content than the frontmatter. \ No newline at end of file diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/no_frontmatter.md b/langchain/src/document_loaders/tests/example_data/obsidian/no_frontmatter.md new file mode 100644 index 000000000000..74c2405506e2 --- /dev/null +++ b/langchain/src/document_loaders/tests/example_data/obsidian/no_frontmatter.md @@ -0,0 +1,13 @@ +### Description +#recipes #dessert #cookies + +A document with HR elements that might trip up a front matter parser: + +--- + +### Ingredients + +- 3/4 cup (170g) **unsalted butter**, slightly softened to room temperature. +- 1 and 1/2 cups (180g) **confectioners’ sugar** + +--- \ No newline at end of file diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/no_metadata.md b/langchain/src/document_loaders/tests/example_data/obsidian/no_metadata.md new file mode 100644 index 000000000000..991d076e28da --- /dev/null +++ b/langchain/src/document_loaders/tests/example_data/obsidian/no_metadata.md @@ -0,0 +1 @@ +A markdown document with no additional metadata. \ No newline at end of file diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/tags_and_frontmatter.md b/langchain/src/document_loaders/tests/example_data/obsidian/tags_and_frontmatter.md new file mode 100644 index 000000000000..cb373d396806 --- /dev/null +++ b/langchain/src/document_loaders/tests/example_data/obsidian/tags_and_frontmatter.md @@ -0,0 +1,35 @@ +--- +aFloat: 13.12345 +anInt: 15 +aBool: true +aString: string value +anArray: +- one +- two +- three +aDict: + dictId1: '58417' + dictId2: 1500 +tags: [ 'onetag', 'twotag' ] +--- + +# Tags + + ()#notatag +#12345 + #read +something #tagWithCases +- #tag-with-dash +#tag_with_underscore #tag/with/nesting + +# Dataview + +Here is some data in a [dataview1:: a value] line. +Here is even more data in a (dataview2:: another value) line. +dataview3:: more data +notdataview4: this is not a field +notdataview5: this is not a field + +# Text content + +https://example.com/blog/#not-a-tag \ No newline at end of file diff --git a/langchain/src/document_loaders/tests/obsidian.test.ts b/langchain/src/document_loaders/tests/obsidian.test.ts new file mode 100644 index 000000000000..cafdab84b5c7 --- /dev/null +++ b/langchain/src/document_loaders/tests/obsidian.test.ts @@ -0,0 +1,185 @@ +import { test, expect } from "@jest/globals"; +import * as url from "node:url"; +import * as path from "node:path"; +import { ObsidianLoader } from "../fs/obsidian.js"; +import { Document } from "../../document.js"; + + +const STANDARD_METADATA_FIELDS = [ + "created", + "path", + "source", + "lastAccessed", + "lastModified", +]; + +const FRONTMATTER_FIELDS = [ + "aBool", + "aFloat", + "anInt", + "anArray", + "aString", + "aDict", + "tags" +] + +const DATAVIEW_FIELDS = [ + "dataview1", + "dataview2", + "dataview3" +] + + +const directoryPath = path.resolve( + path.dirname(url.fileURLToPath(import.meta.url)), + "./example_data/obsidian" +); + +let docs: Document[]; + +beforeAll(async () => { + const loader = new ObsidianLoader(directoryPath); + docs = await loader.load(); +}); + +test("Test page content is loaded", async () => { + expect(docs.length).toBe(5); + docs.forEach(doc => { + expect(doc.pageContent).toBeTruthy(); + }); +}); + +test("Test no additional metadata is collected if collectMetadata is false", async () => { + const noMetadataLoader = new ObsidianLoader(directoryPath, { collectMetadata: false }); + const noMetadataDocs = await noMetadataLoader.load(); + + expect(noMetadataDocs.length).toBe(5); + expect( + noMetadataDocs.every( + (doc) => + Object.keys(doc.metadata).length === + Object.keys(STANDARD_METADATA_FIELDS).length && + Object.keys(doc.metadata).every((key) => + STANDARD_METADATA_FIELDS.includes(key) + ) + ) + ).toBe(true); +}); + +test("Test docs without frontmatter still have basic metadata", async () => { + const doc = docs.find( + (doc) => doc.metadata.source === "no_metadata.md" + ); + + if (!doc) { + fail("'no_metadata.md' not found."); + } + + expect( + Object.keys(doc.metadata).every((key) => + STANDARD_METADATA_FIELDS.includes(key) + ) + ).toBe(true); +}); + + test("Test standard frontmatter fields are loaded", async() => { + const doc = docs.find( + (doc) => doc.metadata.source === "frontmatter.md" + ); + + if (!doc) { + fail("'frontmatter.md' not found."); + } + + expect(Object.keys(doc.metadata)).toEqual(expect.arrayContaining(STANDARD_METADATA_FIELDS.concat("tags"))); + + const tagsSet = new Set(doc.metadata.tags?.split(",")); + expect(tagsSet.has("journal/entry")).toBe(true); + expect(tagsSet.has("obsidian")).toBe(true); + }); + +test("Test a doc with non-yaml frontmatter still have basic metadata", async() => { + const doc = docs.find( + (doc) => doc.metadata.source === "bad_frontmatter.md" + ); + + if (!doc) { + fail("'bad_frontmatter.md' not found."); + } + + expect( + Object.keys(doc.metadata).every((key) => + STANDARD_METADATA_FIELDS.includes(key) + ) + ).toBe(true); +}); + + +test("Test a doc with frontmatter and tags/dataview tags are all added to metadata", () => { + const doc = docs.find( + (doc) => doc.metadata.source === "tags_and_frontmatter.md" + ); + + if (!doc) { + fail("'tags_and_frontmatter.md' not found."); + } + + const expectedFields = [...STANDARD_METADATA_FIELDS, ...FRONTMATTER_FIELDS, ...DATAVIEW_FIELDS]; + expect(Object.keys(doc.metadata)).toEqual(expect.arrayContaining(expectedFields)); +}); + +test("Test float metadata is loaded correctly", () => { + const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + + if (!doc) { + fail("Document 'tags_and_frontmatter.md' not found."); + return; + } + + expect(doc.metadata["aFloat"]).toBe(13.12345); +}); + +test("Test int metadata is loaded correctly", () => { + const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + + if (!doc) { + fail("Document 'tags_and_frontmatter.md' not found."); + return; + } + + expect(doc.metadata["anInt"]).toBe(15); +}); + +test("Test string metadata is loaded correctly", () => { + const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + + if (!doc) { + fail("Document 'tags_and_frontmatter.md' not found."); + return; + } + + expect(doc.metadata["aString"]).toBe("string value"); +}); + +test("Test array metadata is loaded as a string", () => { + const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + + if (!doc) { + fail("Document 'tags_and_frontmatter.md' not found."); + return; + } + + expect(doc.metadata["anArray"]).toBe("[\"one\",\"two\",\"three\"]"); +}); + +test("Test dict metadata is stored as a string", () => { + const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + + if (!doc) { + fail("Document 'tags_and_frontmatter.md' not found."); + return; + } + + expect(doc.metadata["aDict"]).toBe("{\"dictId1\":\"58417\",\"dictId2\":1500}"); +}); + diff --git a/langchain/src/load/import_map.ts b/langchain/src/load/import_map.ts index a1d511062784..bd09713ae643 100644 --- a/langchain/src/load/import_map.ts +++ b/langchain/src/load/import_map.ts @@ -47,6 +47,7 @@ export * as document_loaders__base from "../document_loaders/base.js"; export * as document_loaders__web__searchapi from "../document_loaders/web/searchapi.js"; export * as document_loaders__web__serpapi from "../document_loaders/web/serpapi.js"; export * as document_loaders__web__sort_xyz_blockchain from "../document_loaders/web/sort_xyz_blockchain.js"; +export * as document_loaders__fs__obsidian from "../document_loaders/fs/obsidian.js"; export * as document_transformers__openai_functions from "../document_transformers/openai_functions.js"; export * as chat_models__base from "../chat_models/base.js"; export * as chat_models__openai from "../chat_models/openai.js"; From ca6dfdbe8675ab5507fe2af81338bb35b63d0a16 Mon Sep 17 00:00:00 2001 From: Eze Date: Sun, 3 Dec 2023 12:26:29 -0300 Subject: [PATCH 2/3] Fix Notion test not to consider Obsidian '.md' files --- .../test-exports-bun/src/entrypoints.js | 1 - .../test-exports-cf/src/entrypoints.js | 1 - .../test-exports-cjs/src/entrypoints.js | 1 - .../test-exports-esbuild/src/entrypoints.js | 1 - .../test-exports-esm/src/entrypoints.js | 1 - .../test-exports-vercel/src/entrypoints.js | 1 - .../test-exports-vite/src/entrypoints.js | 1 - examples/src/document_loaders/obsidian.ts | 4 +- langchain/scripts/create-entrypoints.js | 1 + langchain/src/document_loaders/fs/obsidian.ts | 72 +++++++-------- .../tests/example_data/{ => notion}/notion.md | 0 .../example_data/obsidian/bad_frontmatter.md | 2 +- .../example_data/obsidian/frontmatter.md | 2 +- .../example_data/obsidian/no_frontmatter.md | 9 +- .../example_data/obsidian/no_metadata.md | 2 +- .../obsidian/tags_and_frontmatter.md | 19 ++-- .../src/document_loaders/tests/notion.test.ts | 2 +- .../document_loaders/tests/obsidian.test.ts | 90 ++++++++++--------- langchain/src/load/import_constants.ts | 1 + langchain/src/load/import_map.ts | 1 - langchain/src/load/import_type.d.ts | 3 + 21 files changed, 112 insertions(+), 103 deletions(-) rename langchain/src/document_loaders/tests/example_data/{ => notion}/notion.md (100%) diff --git a/environment_tests/test-exports-bun/src/entrypoints.js b/environment_tests/test-exports-bun/src/entrypoints.js index f4dd67cb26b8..61b2f0553453 100644 --- a/environment_tests/test-exports-bun/src/entrypoints.js +++ b/environment_tests/test-exports-bun/src/entrypoints.js @@ -46,7 +46,6 @@ export * from "langchain/document_loaders/base"; export * from "langchain/document_loaders/web/searchapi"; export * from "langchain/document_loaders/web/serpapi"; export * from "langchain/document_loaders/web/sort_xyz_blockchain"; -export * from "langchain/document_loaders/fs/obsidian"; export * from "langchain/document_transformers/openai_functions"; export * from "langchain/chat_models/base"; export * from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-cf/src/entrypoints.js b/environment_tests/test-exports-cf/src/entrypoints.js index f4dd67cb26b8..61b2f0553453 100644 --- a/environment_tests/test-exports-cf/src/entrypoints.js +++ b/environment_tests/test-exports-cf/src/entrypoints.js @@ -46,7 +46,6 @@ export * from "langchain/document_loaders/base"; export * from "langchain/document_loaders/web/searchapi"; export * from "langchain/document_loaders/web/serpapi"; export * from "langchain/document_loaders/web/sort_xyz_blockchain"; -export * from "langchain/document_loaders/fs/obsidian"; export * from "langchain/document_transformers/openai_functions"; export * from "langchain/chat_models/base"; export * from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-cjs/src/entrypoints.js b/environment_tests/test-exports-cjs/src/entrypoints.js index 8526ba2a7eba..725b9cd0477c 100644 --- a/environment_tests/test-exports-cjs/src/entrypoints.js +++ b/environment_tests/test-exports-cjs/src/entrypoints.js @@ -46,7 +46,6 @@ const document_loaders_base = require("langchain/document_loaders/base"); const document_loaders_web_searchapi = require("langchain/document_loaders/web/searchapi"); const document_loaders_web_serpapi = require("langchain/document_loaders/web/serpapi"); const document_loaders_web_sort_xyz_blockchain = require("langchain/document_loaders/web/sort_xyz_blockchain"); -const document_loaders_fs_obsidian = require("langchain/document_loaders/fs/obsidian"); const document_transformers_openai_functions = require("langchain/document_transformers/openai_functions"); const chat_models_base = require("langchain/chat_models/base"); const chat_models_openai = require("langchain/chat_models/openai"); diff --git a/environment_tests/test-exports-esbuild/src/entrypoints.js b/environment_tests/test-exports-esbuild/src/entrypoints.js index e54679ba65a5..9c4a916789c5 100644 --- a/environment_tests/test-exports-esbuild/src/entrypoints.js +++ b/environment_tests/test-exports-esbuild/src/entrypoints.js @@ -46,7 +46,6 @@ import * as document_loaders_base from "langchain/document_loaders/base"; import * as document_loaders_web_searchapi from "langchain/document_loaders/web/searchapi"; import * as document_loaders_web_serpapi from "langchain/document_loaders/web/serpapi"; import * as document_loaders_web_sort_xyz_blockchain from "langchain/document_loaders/web/sort_xyz_blockchain"; -import * as document_loaders_fs_obsidian from "langchain/document_loaders/fs/obsidian"; import * as document_transformers_openai_functions from "langchain/document_transformers/openai_functions"; import * as chat_models_base from "langchain/chat_models/base"; import * as chat_models_openai from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-esm/src/entrypoints.js b/environment_tests/test-exports-esm/src/entrypoints.js index e54679ba65a5..9c4a916789c5 100644 --- a/environment_tests/test-exports-esm/src/entrypoints.js +++ b/environment_tests/test-exports-esm/src/entrypoints.js @@ -46,7 +46,6 @@ import * as document_loaders_base from "langchain/document_loaders/base"; import * as document_loaders_web_searchapi from "langchain/document_loaders/web/searchapi"; import * as document_loaders_web_serpapi from "langchain/document_loaders/web/serpapi"; import * as document_loaders_web_sort_xyz_blockchain from "langchain/document_loaders/web/sort_xyz_blockchain"; -import * as document_loaders_fs_obsidian from "langchain/document_loaders/fs/obsidian"; import * as document_transformers_openai_functions from "langchain/document_transformers/openai_functions"; import * as chat_models_base from "langchain/chat_models/base"; import * as chat_models_openai from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-vercel/src/entrypoints.js b/environment_tests/test-exports-vercel/src/entrypoints.js index f4dd67cb26b8..61b2f0553453 100644 --- a/environment_tests/test-exports-vercel/src/entrypoints.js +++ b/environment_tests/test-exports-vercel/src/entrypoints.js @@ -46,7 +46,6 @@ export * from "langchain/document_loaders/base"; export * from "langchain/document_loaders/web/searchapi"; export * from "langchain/document_loaders/web/serpapi"; export * from "langchain/document_loaders/web/sort_xyz_blockchain"; -export * from "langchain/document_loaders/fs/obsidian"; export * from "langchain/document_transformers/openai_functions"; export * from "langchain/chat_models/base"; export * from "langchain/chat_models/openai"; diff --git a/environment_tests/test-exports-vite/src/entrypoints.js b/environment_tests/test-exports-vite/src/entrypoints.js index f4dd67cb26b8..61b2f0553453 100644 --- a/environment_tests/test-exports-vite/src/entrypoints.js +++ b/environment_tests/test-exports-vite/src/entrypoints.js @@ -46,7 +46,6 @@ export * from "langchain/document_loaders/base"; export * from "langchain/document_loaders/web/searchapi"; export * from "langchain/document_loaders/web/serpapi"; export * from "langchain/document_loaders/web/sort_xyz_blockchain"; -export * from "langchain/document_loaders/fs/obsidian"; export * from "langchain/document_transformers/openai_functions"; export * from "langchain/chat_models/base"; export * from "langchain/chat_models/openai"; diff --git a/examples/src/document_loaders/obsidian.ts b/examples/src/document_loaders/obsidian.ts index 093c345d2966..279618b0d75d 100644 --- a/examples/src/document_loaders/obsidian.ts +++ b/examples/src/document_loaders/obsidian.ts @@ -1,7 +1,9 @@ import { ObsidianLoader } from "langchain/document_loaders/fs/obsidian"; export const run = async () => { - const loader = new ObsidianLoader("src/document_loaders/example_data/obsidian"); + const loader = new ObsidianLoader( + "src/document_loaders/example_data/obsidian" + ); const docs = await loader.load(); diff --git a/langchain/scripts/create-entrypoints.js b/langchain/scripts/create-entrypoints.js index 15fcc14cf6db..143a563d8f97 100644 --- a/langchain/scripts/create-entrypoints.js +++ b/langchain/scripts/create-entrypoints.js @@ -452,6 +452,7 @@ const requiresOptionalDependency = [ "document_loaders/fs/epub", "document_loaders/fs/csv", "document_loaders/fs/notion", + "document_loaders/fs/obsidian", "document_loaders/fs/unstructured", "document_loaders/fs/openai_whisper_audio", "document_loaders/fs/pptx", diff --git a/langchain/src/document_loaders/fs/obsidian.ts b/langchain/src/document_loaders/fs/obsidian.ts index 783ddae081bb..a2770cc7f877 100644 --- a/langchain/src/document_loaders/fs/obsidian.ts +++ b/langchain/src/document_loaders/fs/obsidian.ts @@ -25,7 +25,7 @@ export interface ObsidianFileLoaderOptions { */ class ObsidianFileLoader extends BaseDocumentLoader { private filePath: string; - + private encoding: BufferEncoding; private collectMetadata: boolean; @@ -36,11 +36,17 @@ class ObsidianFileLoader extends BaseDocumentLoader { * @param encoding The character encoding to use when reading the file. Defaults to 'utf-8'. * @param collectMetadata Determines whether metadata should be collected from the file. Defaults to true. */ - constructor(filePath: string, { encoding = "utf-8", collectMetadata = true }: ObsidianFileLoaderOptions = {}) { - super() + constructor( + filePath: string, + { + encoding = "utf-8", + collectMetadata = true, + }: ObsidianFileLoaderOptions = {} + ) { + super(); this.filePath = filePath; this.encoding = encoding; - this.collectMetadata = collectMetadata; + this.collectMetadata = collectMetadata; } private static FRONT_MATTER_REGEX = /^---\n(.*?)\n---\n/s; @@ -69,10 +75,10 @@ class ObsidianFileLoader extends BaseDocumentLoader { return frontMatter; } catch (e) { console.warn("Encountered non-yaml frontmatter"); - return {} + return {}; } } - + /** * Removes YAML front matter from the given content string. * @param content The string content of the markdown file. @@ -87,7 +93,7 @@ class ObsidianFileLoader extends BaseDocumentLoader { } private static TAG_REGEX = /(?:\s|^)#([a-zA-Z_][\w/-]*)/g; - + /** * Parses Obsidian-style tags from the given content string. * @param content The string content of the markdown file. @@ -110,7 +116,7 @@ class ObsidianFileLoader extends BaseDocumentLoader { private static DATAVIEW_LINE_REGEX = /^\s*(\w+)::\s*(.*)$/gm; private static DATAVIEW_INLINE_BRACKET_REGEX = /\[(\w+)::\s*(.*)\]/gm; - + private static DATAVIEW_INLINE_PAREN_REGEX = /\((\w+)::\s*(.*)\)/gm; /** @@ -118,7 +124,7 @@ class ObsidianFileLoader extends BaseDocumentLoader { * @param content The string content of the markdown file. * @returns A record object containing key-value pairs of dataview fields. */ - private parseObsidianDataviewFields(content: string): Record { + private parseObsidianDataviewFields(content: string): Record { if (!this.collectMetadata) { return {}; } @@ -153,9 +159,7 @@ class ObsidianFileLoader extends BaseDocumentLoader { * @param metadata The metadata object to convert. * @returns A record object containing key-value pairs of Langchain-compatible metadata. */ - private toLangchainCompatibleMetadata( - metadata: Record - ) { + private toLangchainCompatibleMetadata(metadata: Record) { const result: Record = {}; for (const [key, value] of Object.entries(metadata)) { if (typeof value === "string" || typeof value === "number") { @@ -167,16 +171,15 @@ class ObsidianFileLoader extends BaseDocumentLoader { return result; } - /** * It loads the Obsidian file, parses it, and returns a `Document` instance. * @returns An array of `Document` instances to comply with the BaseDocumentLoader interface. */ public async load(): Promise { const documents: Document[] = []; - + const { basename, readFile, stat } = await ObsidianFileLoader.imports(); - const fileName = basename(this.filePath) + const fileName = basename(this.filePath); const stats = await stat(this.filePath); let content = await readFile(this.filePath, this.encoding); @@ -198,9 +201,9 @@ class ObsidianFileLoader extends BaseDocumentLoader { if (tags.size || frontMatter.tags) { metadata.tags = Array.from( new Set([...tags, ...(frontMatter.tags ?? [])]) - ).join(","); + ).join(","); } - + documents.push( new Document({ pageContent: content, @@ -218,22 +221,22 @@ class ObsidianFileLoader extends BaseDocumentLoader { * indicating that the modules failed to load. * @returns A promise that resolves to an object containing the imported functions. */ - static async imports(): Promise<{ - basename: typeof BasenameT; - readFile: typeof ReadFileT; - stat: typeof StatT; - }> { - try { - const { basename } = await import("node:path"); - const { readFile, stat } = await import("node:fs/promises"); - return { basename, readFile, stat }; - } catch (e) { - console.error(e); - throw new Error( - `Failed to load fs/promises. ObsidianFileLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https:// for alternatives.` - ); - } + static async imports(): Promise<{ + basename: typeof BasenameT; + readFile: typeof ReadFileT; + stat: typeof StatT; + }> { + try { + const { basename } = await import("node:path"); + const { readFile, stat } = await import("node:fs/promises"); + return { basename, readFile, stat }; + } catch (e) { + console.error(e); + throw new Error( + `Failed to load fs/promises. ObsidianFileLoader available only on environment 'node'. It appears you are running environment '${getEnv()}'. See https:// for alternatives.` + ); } + } } /** @@ -242,7 +245,7 @@ class ObsidianFileLoader extends BaseDocumentLoader { * Obsidian tags, and Dataview fields. */ export class ObsidianLoader extends DirectoryLoader { - /** + /** * Initializes a new instance of the ObsidianLoader class. * @param directoryPath The path to the directory containing Obsidian markdown files. * @param encoding The character encoding to use when reading files. Defaults to 'utf-8'. @@ -256,7 +259,6 @@ export class ObsidianLoader extends DirectoryLoader { }, true, UnknownHandling.Ignore - ); + ); } } - diff --git a/langchain/src/document_loaders/tests/example_data/notion.md b/langchain/src/document_loaders/tests/example_data/notion/notion.md similarity index 100% rename from langchain/src/document_loaders/tests/example_data/notion.md rename to langchain/src/document_loaders/tests/example_data/notion/notion.md diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/bad_frontmatter.md b/langchain/src/document_loaders/tests/example_data/obsidian/bad_frontmatter.md index 57698653173d..edc335e195ff 100644 --- a/langchain/src/document_loaders/tests/example_data/obsidian/bad_frontmatter.md +++ b/langchain/src/document_loaders/tests/example_data/obsidian/bad_frontmatter.md @@ -6,4 +6,4 @@ anArray: tags: 'onetag', 'twotag' ] --- -A document with frontmatter that isn't valid. \ No newline at end of file +A document with frontmatter that isn't valid. diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/frontmatter.md b/langchain/src/document_loaders/tests/example_data/obsidian/frontmatter.md index 80396d268f94..bb1f5b9f0fc5 100644 --- a/langchain/src/document_loaders/tests/example_data/obsidian/frontmatter.md +++ b/langchain/src/document_loaders/tests/example_data/obsidian/frontmatter.md @@ -2,4 +2,4 @@ tags: journal/entry, obsidian --- -No other content than the frontmatter. \ No newline at end of file +No other content than the frontmatter. diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/no_frontmatter.md b/langchain/src/document_loaders/tests/example_data/obsidian/no_frontmatter.md index 74c2405506e2..3943ec888c7e 100644 --- a/langchain/src/document_loaders/tests/example_data/obsidian/no_frontmatter.md +++ b/langchain/src/document_loaders/tests/example_data/obsidian/no_frontmatter.md @@ -1,5 +1,6 @@ ### Description -#recipes #dessert #cookies + +#recipes #dessert #cookies A document with HR elements that might trip up a front matter parser: @@ -7,7 +8,7 @@ A document with HR elements that might trip up a front matter parser: ### Ingredients -- 3/4 cup (170g) **unsalted butter**, slightly softened to room temperature. -- 1 and 1/2 cups (180g) **confectioners’ sugar** +- 3/4 cup (170g) **unsalted butter**, slightly softened to room temperature. +- 1 and 1/2 cups (180g) **confectioners’ sugar** ---- \ No newline at end of file +--- diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/no_metadata.md b/langchain/src/document_loaders/tests/example_data/obsidian/no_metadata.md index 991d076e28da..70258e5aea71 100644 --- a/langchain/src/document_loaders/tests/example_data/obsidian/no_metadata.md +++ b/langchain/src/document_loaders/tests/example_data/obsidian/no_metadata.md @@ -1 +1 @@ -A markdown document with no additional metadata. \ No newline at end of file +A markdown document with no additional metadata. diff --git a/langchain/src/document_loaders/tests/example_data/obsidian/tags_and_frontmatter.md b/langchain/src/document_loaders/tests/example_data/obsidian/tags_and_frontmatter.md index cb373d396806..1d108a11ac87 100644 --- a/langchain/src/document_loaders/tests/example_data/obsidian/tags_and_frontmatter.md +++ b/langchain/src/document_loaders/tests/example_data/obsidian/tags_and_frontmatter.md @@ -4,23 +4,24 @@ anInt: 15 aBool: true aString: string value anArray: -- one -- two -- three + - one + - two + - three aDict: - dictId1: '58417' + dictId1: "58417" dictId2: 1500 -tags: [ 'onetag', 'twotag' ] +tags: ["onetag", "twotag"] --- # Tags - ()#notatag +()#notatag #12345 - #read +#read something #tagWithCases + - #tag-with-dash -#tag_with_underscore #tag/with/nesting + #tag_with_underscore #tag/with/nesting # Dataview @@ -32,4 +33,4 @@ notdataview5: this is not a field # Text content -https://example.com/blog/#not-a-tag \ No newline at end of file +https://example.com/blog/#not-a-tag diff --git a/langchain/src/document_loaders/tests/notion.test.ts b/langchain/src/document_loaders/tests/notion.test.ts index 025da7591f59..ca0680685023 100644 --- a/langchain/src/document_loaders/tests/notion.test.ts +++ b/langchain/src/document_loaders/tests/notion.test.ts @@ -6,7 +6,7 @@ import { NotionLoader } from "../fs/notion.js"; test("Test Notion Loader", async () => { const directoryPath = path.resolve( path.dirname(url.fileURLToPath(import.meta.url)), - "./example_data" + "./example_data/notion" ); const loader = new NotionLoader(directoryPath); const docs = await loader.load(); diff --git a/langchain/src/document_loaders/tests/obsidian.test.ts b/langchain/src/document_loaders/tests/obsidian.test.ts index cafdab84b5c7..fad894ad4fad 100644 --- a/langchain/src/document_loaders/tests/obsidian.test.ts +++ b/langchain/src/document_loaders/tests/obsidian.test.ts @@ -4,7 +4,6 @@ import * as path from "node:path"; import { ObsidianLoader } from "../fs/obsidian.js"; import { Document } from "../../document.js"; - const STANDARD_METADATA_FIELDS = [ "created", "path", @@ -13,22 +12,17 @@ const STANDARD_METADATA_FIELDS = [ "lastModified", ]; -const FRONTMATTER_FIELDS = [ +const FRONTMATTER_FIELDS = [ "aBool", "aFloat", "anInt", "anArray", "aString", "aDict", - "tags" -] - -const DATAVIEW_FIELDS = [ - "dataview1", - "dataview2", - "dataview3" -] + "tags", +]; +const DATAVIEW_FIELDS = ["dataview1", "dataview2", "dataview3"]; const directoryPath = path.resolve( path.dirname(url.fileURLToPath(import.meta.url)), @@ -44,13 +38,15 @@ beforeAll(async () => { test("Test page content is loaded", async () => { expect(docs.length).toBe(5); - docs.forEach(doc => { + docs.forEach((doc) => { expect(doc.pageContent).toBeTruthy(); }); }); test("Test no additional metadata is collected if collectMetadata is false", async () => { - const noMetadataLoader = new ObsidianLoader(directoryPath, { collectMetadata: false }); + const noMetadataLoader = new ObsidianLoader(directoryPath, { + collectMetadata: false, + }); const noMetadataDocs = await noMetadataLoader.load(); expect(noMetadataDocs.length).toBe(5); @@ -67,9 +63,7 @@ test("Test no additional metadata is collected if collectMetadata is false", asy }); test("Test docs without frontmatter still have basic metadata", async () => { - const doc = docs.find( - (doc) => doc.metadata.source === "no_metadata.md" - ); + const doc = docs.find((doc) => doc.metadata.source === "no_metadata.md"); if (!doc) { fail("'no_metadata.md' not found."); @@ -82,26 +76,24 @@ test("Test docs without frontmatter still have basic metadata", async () => { ).toBe(true); }); - test("Test standard frontmatter fields are loaded", async() => { - const doc = docs.find( - (doc) => doc.metadata.source === "frontmatter.md" - ); +test("Test standard frontmatter fields are loaded", async () => { + const doc = docs.find((doc) => doc.metadata.source === "frontmatter.md"); - if (!doc) { - fail("'frontmatter.md' not found."); - } + if (!doc) { + fail("'frontmatter.md' not found."); + } - expect(Object.keys(doc.metadata)).toEqual(expect.arrayContaining(STANDARD_METADATA_FIELDS.concat("tags"))); + expect(Object.keys(doc.metadata)).toEqual( + expect.arrayContaining(STANDARD_METADATA_FIELDS.concat("tags")) + ); - const tagsSet = new Set(doc.metadata.tags?.split(",")); - expect(tagsSet.has("journal/entry")).toBe(true); - expect(tagsSet.has("obsidian")).toBe(true); - }); + const tagsSet = new Set(doc.metadata.tags?.split(",")); + expect(tagsSet.has("journal/entry")).toBe(true); + expect(tagsSet.has("obsidian")).toBe(true); +}); -test("Test a doc with non-yaml frontmatter still have basic metadata", async() => { - const doc = docs.find( - (doc) => doc.metadata.source === "bad_frontmatter.md" - ); +test("Test a doc with non-yaml frontmatter still have basic metadata", async () => { + const doc = docs.find((doc) => doc.metadata.source === "bad_frontmatter.md"); if (!doc) { fail("'bad_frontmatter.md' not found."); @@ -114,7 +106,6 @@ test("Test a doc with non-yaml frontmatter still have basic metadata", async() = ).toBe(true); }); - test("Test a doc with frontmatter and tags/dataview tags are all added to metadata", () => { const doc = docs.find( (doc) => doc.metadata.source === "tags_and_frontmatter.md" @@ -124,12 +115,20 @@ test("Test a doc with frontmatter and tags/dataview tags are all added to metada fail("'tags_and_frontmatter.md' not found."); } - const expectedFields = [...STANDARD_METADATA_FIELDS, ...FRONTMATTER_FIELDS, ...DATAVIEW_FIELDS]; - expect(Object.keys(doc.metadata)).toEqual(expect.arrayContaining(expectedFields)); + const expectedFields = [ + ...STANDARD_METADATA_FIELDS, + ...FRONTMATTER_FIELDS, + ...DATAVIEW_FIELDS, + ]; + expect(Object.keys(doc.metadata)).toEqual( + expect.arrayContaining(expectedFields) + ); }); test("Test float metadata is loaded correctly", () => { - const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + const doc = docs.find( + (doc) => doc.metadata.source === "tags_and_frontmatter.md" + ); if (!doc) { fail("Document 'tags_and_frontmatter.md' not found."); @@ -140,7 +139,9 @@ test("Test float metadata is loaded correctly", () => { }); test("Test int metadata is loaded correctly", () => { - const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + const doc = docs.find( + (doc) => doc.metadata.source === "tags_and_frontmatter.md" + ); if (!doc) { fail("Document 'tags_and_frontmatter.md' not found."); @@ -151,7 +152,9 @@ test("Test int metadata is loaded correctly", () => { }); test("Test string metadata is loaded correctly", () => { - const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + const doc = docs.find( + (doc) => doc.metadata.source === "tags_and_frontmatter.md" + ); if (!doc) { fail("Document 'tags_and_frontmatter.md' not found."); @@ -162,24 +165,27 @@ test("Test string metadata is loaded correctly", () => { }); test("Test array metadata is loaded as a string", () => { - const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + const doc = docs.find( + (doc) => doc.metadata.source === "tags_and_frontmatter.md" + ); if (!doc) { fail("Document 'tags_and_frontmatter.md' not found."); return; } - expect(doc.metadata["anArray"]).toBe("[\"one\",\"two\",\"three\"]"); + expect(doc.metadata["anArray"]).toBe('["one","two","three"]'); }); test("Test dict metadata is stored as a string", () => { - const doc = docs.find(doc => doc.metadata.source === "tags_and_frontmatter.md"); + const doc = docs.find( + (doc) => doc.metadata.source === "tags_and_frontmatter.md" + ); if (!doc) { fail("Document 'tags_and_frontmatter.md' not found."); return; } - expect(doc.metadata["aDict"]).toBe("{\"dictId1\":\"58417\",\"dictId2\":1500}"); + expect(doc.metadata["aDict"]).toBe('{"dictId1":"58417","dictId2":1500}'); }); - diff --git a/langchain/src/load/import_constants.ts b/langchain/src/load/import_constants.ts index 9bf8f2b4cb46..cb3f4719cc88 100644 --- a/langchain/src/load/import_constants.ts +++ b/langchain/src/load/import_constants.ts @@ -110,6 +110,7 @@ export const optionalImportEntrypoints = [ "langchain/document_loaders/fs/epub", "langchain/document_loaders/fs/csv", "langchain/document_loaders/fs/notion", + "langchain/document_loaders/fs/obsidian", "langchain/document_loaders/fs/unstructured", "langchain/document_loaders/fs/openai_whisper_audio", "langchain/document_loaders/fs/pptx", diff --git a/langchain/src/load/import_map.ts b/langchain/src/load/import_map.ts index bd09713ae643..a1d511062784 100644 --- a/langchain/src/load/import_map.ts +++ b/langchain/src/load/import_map.ts @@ -47,7 +47,6 @@ export * as document_loaders__base from "../document_loaders/base.js"; export * as document_loaders__web__searchapi from "../document_loaders/web/searchapi.js"; export * as document_loaders__web__serpapi from "../document_loaders/web/serpapi.js"; export * as document_loaders__web__sort_xyz_blockchain from "../document_loaders/web/sort_xyz_blockchain.js"; -export * as document_loaders__fs__obsidian from "../document_loaders/fs/obsidian.js"; export * as document_transformers__openai_functions from "../document_transformers/openai_functions.js"; export * as chat_models__base from "../chat_models/base.js"; export * as chat_models__openai from "../chat_models/openai.js"; diff --git a/langchain/src/load/import_type.d.ts b/langchain/src/load/import_type.d.ts index 052c975320f8..94f1753cbb70 100644 --- a/langchain/src/load/import_type.d.ts +++ b/langchain/src/load/import_type.d.ts @@ -328,6 +328,9 @@ export interface OptionalImportMap { "langchain/document_loaders/fs/notion"?: | typeof import("../document_loaders/fs/notion.js") | Promise; + "langchain/document_loaders/fs/obsidian"?: + | typeof import("../document_loaders/fs/obsidian.js") + | Promise; "langchain/document_loaders/fs/unstructured"?: | typeof import("../document_loaders/fs/unstructured.js") | Promise; From 912027e822803583dc8219e90229319c0802f830 Mon Sep 17 00:00:00 2001 From: jacoblee93 Date: Mon, 4 Dec 2023 12:49:24 -0800 Subject: [PATCH 3/3] Fix lint --- langchain/src/document_loaders/tests/obsidian.test.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/langchain/src/document_loaders/tests/obsidian.test.ts b/langchain/src/document_loaders/tests/obsidian.test.ts index fad894ad4fad..0b0fcd8f906f 100644 --- a/langchain/src/document_loaders/tests/obsidian.test.ts +++ b/langchain/src/document_loaders/tests/obsidian.test.ts @@ -135,7 +135,7 @@ test("Test float metadata is loaded correctly", () => { return; } - expect(doc.metadata["aFloat"]).toBe(13.12345); + expect(doc.metadata.aFloat).toBe(13.12345); }); test("Test int metadata is loaded correctly", () => { @@ -148,7 +148,7 @@ test("Test int metadata is loaded correctly", () => { return; } - expect(doc.metadata["anInt"]).toBe(15); + expect(doc.metadata.anInt).toBe(15); }); test("Test string metadata is loaded correctly", () => { @@ -161,7 +161,7 @@ test("Test string metadata is loaded correctly", () => { return; } - expect(doc.metadata["aString"]).toBe("string value"); + expect(doc.metadata.aString).toBe("string value"); }); test("Test array metadata is loaded as a string", () => { @@ -174,7 +174,7 @@ test("Test array metadata is loaded as a string", () => { return; } - expect(doc.metadata["anArray"]).toBe('["one","two","three"]'); + expect(doc.metadata.anArray).toBe('["one","two","three"]'); }); test("Test dict metadata is stored as a string", () => { @@ -187,5 +187,5 @@ test("Test dict metadata is stored as a string", () => { return; } - expect(doc.metadata["aDict"]).toBe('{"dictId1":"58417","dictId2":1500}'); + expect(doc.metadata.aDict).toBe('{"dictId1":"58417","dictId2":1500}'); });