From 4c38c1be0b4d0b907d54e65941f6c4d3e8cf55c7 Mon Sep 17 00:00:00 2001 From: Alex Yang Date: Fri, 18 Oct 2024 09:36:01 -0700 Subject: [PATCH] fix: do not detect file type in sdk (#1340) --- .changeset/funny-dancers-listen.md | 6 + packages/cloud/package.json | 3 - packages/cloud/src/reader.ts | 121 +----------------- .../tests/readers/llama-parser-reader.test.ts | 15 --- pnpm-lock.yaml | 39 +----- 5 files changed, 9 insertions(+), 175 deletions(-) create mode 100644 .changeset/funny-dancers-listen.md delete mode 100644 packages/llamaindex/tests/readers/llama-parser-reader.test.ts diff --git a/.changeset/funny-dancers-listen.md b/.changeset/funny-dancers-listen.md new file mode 100644 index 0000000000..b1b6d5530d --- /dev/null +++ b/.changeset/funny-dancers-listen.md @@ -0,0 +1,6 @@ +--- +"@llamaindex/cloud": patch +"llamaindex": patch +--- + +fix(cloud): do not detect file type in llama parse diff --git a/packages/cloud/package.json b/packages/cloud/package.json index e30419c719..8e2377417e 100644 --- a/packages/cloud/package.json +++ b/packages/cloud/package.json @@ -58,8 +58,5 @@ "peerDependencies": { "@llamaindex/core": "workspace:*", "@llamaindex/env": "workspace:*" - }, - "dependencies": { - "magic-bytes.js": "^1.10.0" } } diff --git a/packages/cloud/src/reader.ts b/packages/cloud/src/reader.ts index 88c0fac38a..3a2c9a9460 100644 --- a/packages/cloud/src/reader.ts +++ b/packages/cloud/src/reader.ts @@ -1,7 +1,6 @@ import { type Client, createClient, createConfig } from "@hey-api/client-fetch"; import { Document, FileReader } from "@llamaindex/core/schema"; import { fs, getEnv, path } from "@llamaindex/env"; -import { filetypeinfo } from "magic-bytes.js"; import { type Body_upload_file_api_v1_parsing_upload_post, type ParserLanguages, @@ -13,99 +12,6 @@ export type Language = ParserLanguages; export type ResultType = "text" | "markdown" | "json"; -const SUPPORT_FILE_EXT: string[] = [ - ".pdf", - // document and presentations - ".602", - ".abw", - ".cgm", - ".cwk", - ".doc", - ".docx", - ".docm", - ".dot", - ".dotm", - ".hwp", - ".key", - ".lwp", - ".mw", - ".mcw", - ".pages", - ".pbd", - ".ppt", - ".pptm", - ".pptx", - ".pot", - ".potm", - ".potx", - ".rtf", - ".sda", - ".sdd", - ".sdp", - ".sdw", - ".sgl", - ".sti", - ".sxi", - ".sxw", - ".stw", - ".sxg", - ".txt", - ".uof", - ".uop", - ".uot", - ".vor", - ".wpd", - ".wps", - ".xml", - ".zabw", - ".epub", - // images - ".jpg", - ".jpeg", - ".png", - ".gif", - ".bmp", - ".svg", - ".tiff", - ".webp", - // web - ".htm", - ".html", - // spreadsheets - ".xlsx", - ".xls", - ".xlsm", - ".xlsb", - ".xlw", - ".csv", - ".dif", - ".sylk", - ".slk", - ".prn", - ".numbers", - ".et", - ".ods", - ".fods", - ".uos1", - ".uos2", - ".dbf", - ".wk1", - ".wk2", - ".wk3", - ".wk4", - ".wks", - ".123", - ".wq1", - ".wq2", - ".wb1", - ".wb2", - ".wb3", - ".qpw", - ".xlr", - ".eth", - ".tsv", -]; - //todo: should move into @llamaindex/env type WriteStream = { write: (text: string) => void; @@ -239,17 +145,12 @@ export class LlamaParseReader extends FileReader { // Create a job for the LlamaParse API private async createJob(data: Uint8Array): Promise { - // Load data, set the mime type - const { mime } = await LlamaParseReader.getMimeType(data); - if (this.verbose) { console.log("Started uploading the file"); } const body = { - file: new Blob([data], { - type: mime, - }), + file: new Blob([data]), language: this.language, parsing_instruction: this.parsingInstruction, skip_diagonal_text: this.skipDiagonalText, @@ -564,24 +465,4 @@ export class LlamaParseReader extends FileReader { }), ); } - - static async getMimeType( - data: Uint8Array, - ): Promise<{ mime: string; extension: string }> { - const typeinfos = filetypeinfo(data); - // find the first type info that matches the supported MIME types - // It could be happened that docx file is recognized as zip file, so we need to check the mime type - const info = typeinfos.find((info) => { - if (info.extension && SUPPORT_FILE_EXT.includes(`.${info.extension}`)) { - return info; - } - }); - if (!info || !info.mime || !info.extension) { - const ext = SUPPORT_FILE_EXT.join(", "); - throw new Error( - `File has type which does not match supported MIME Types. Supported formats include: ${ext}`, - ); - } - return { mime: info.mime, extension: info.extension }; - } } diff --git a/packages/llamaindex/tests/readers/llama-parser-reader.test.ts b/packages/llamaindex/tests/readers/llama-parser-reader.test.ts deleted file mode 100644 index b43cf99158..0000000000 --- a/packages/llamaindex/tests/readers/llama-parser-reader.test.ts +++ /dev/null @@ -1,15 +0,0 @@ -import { LlamaParseReader } from "llamaindex"; -import { readFile } from "node:fs/promises"; -import { join } from "node:path"; -import { fileURLToPath } from "node:url"; -import { expect, test } from "vitest"; - -const fixturesDir = fileURLToPath(new URL("./fixtures", import.meta.url)); - -test("file type should be detected correctly", async () => { - const xlsx = join(fixturesDir, "test.xlsx"); - const buffer = await readFile(xlsx); - const { mime, extension } = await LlamaParseReader.getMimeType(buffer); - expect(mime).toBe("application/vnd.oasis.opendocument.spreadsheet"); - expect(extension).toBe("ods"); -}); diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 0067bb768a..f3f66ce67c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -348,10 +348,6 @@ importers: version: 5.6.2 packages/cloud: - dependencies: - magic-bytes.js: - specifier: ^1.10.0 - version: 1.10.0 devDependencies: '@hey-api/client-fetch': specifier: ^0.2.4 @@ -6568,7 +6564,6 @@ packages: eslint@8.57.0: resolution: {integrity: sha512-dZ6+mexnaTIbSBZWgou51U6OmzIhYM2VcNdtiTtI7qPNZm35Akpr0f6vtw3w1Kmn5PYo+tZVfh13WrhpS6oLqQ==} engines: {node: ^12.22.0 || ^14.17.0 || >=16.0.0} - deprecated: This version is no longer supported. Please see https://eslint.org/version-support for other options. hasBin: true eslint@9.10.0: @@ -19534,7 +19529,7 @@ snapshots: '@typescript-eslint/parser': 7.2.0(eslint@8.57.0)(typescript@5.6.2) eslint: 8.57.0 eslint-import-resolver-node: 0.3.9 - eslint-import-resolver-typescript: 3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0) + eslint-import-resolver-typescript: 3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0) eslint-plugin-import: 2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-typescript@3.6.3)(eslint@8.57.0) eslint-plugin-jsx-a11y: 6.9.0(eslint@8.57.0) eslint-plugin-react: 7.35.0(eslint@8.57.0) @@ -19582,25 +19577,6 @@ snapshots: transitivePeerDependencies: - supports-color - eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0): - dependencies: - '@nolyfill/is-core-module': 1.0.39 - debug: 4.3.7 - enhanced-resolve: 5.17.1 - eslint: 8.57.0 - eslint-module-utils: 2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0))(eslint@8.57.0) - fast-glob: 3.3.2 - get-tsconfig: 4.8.0 - is-bun-module: 1.1.0 - is-glob: 4.0.3 - optionalDependencies: - eslint-plugin-import: 2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-typescript@3.6.3)(eslint@8.57.0) - transitivePeerDependencies: - - '@typescript-eslint/parser' - - eslint-import-resolver-node - - eslint-import-resolver-webpack - - supports-color - eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0): dependencies: '@nolyfill/is-core-module': 1.0.39 @@ -19620,17 +19596,6 @@ snapshots: - eslint-import-resolver-webpack - supports-color - eslint-module-utils@2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0))(eslint@8.57.0): - dependencies: - debug: 3.2.7 - optionalDependencies: - '@typescript-eslint/parser': 7.2.0(eslint@8.57.0)(typescript@5.6.2) - eslint: 8.57.0 - eslint-import-resolver-node: 0.3.9 - eslint-import-resolver-typescript: 3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0) - transitivePeerDependencies: - - supports-color - eslint-module-utils@2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0))(eslint@8.57.0): dependencies: debug: 3.2.7 @@ -19652,7 +19617,7 @@ snapshots: doctrine: 2.1.0 eslint: 8.57.0 eslint-import-resolver-node: 0.3.9 - eslint-module-utils: 2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint@8.57.0))(eslint@8.57.0))(eslint@8.57.0) + eslint-module-utils: 2.8.2(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-import-resolver-typescript@3.6.3(@typescript-eslint/parser@7.2.0(eslint@8.57.0)(typescript@5.6.2))(eslint-import-resolver-node@0.3.9)(eslint-plugin-import@2.29.1)(eslint@8.57.0))(eslint@8.57.0) hasown: 2.0.2 is-core-module: 2.15.1 is-glob: 4.0.3