Skip to content

Commit

Permalink
fix: do not detect file type in sdk (#1340)
Browse files: browse the repository at this point in the history
  • Loading branch information
himself65 authored Oct 18, 2024
1 parent 0dde0ca commit 4c38c1b
Show file tree
Hide file tree
Showing 5 changed files with 9 additions and 175 deletions.
6 changes: 6 additions & 0 deletions .changeset/funny-dancers-listen.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@llamaindex/cloud": patch
"llamaindex": patch
---

fix(cloud): do not detect file type in llama parse
3 changes: 0 additions & 3 deletions packages/cloud/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -58,8 +58,5 @@
"peerDependencies": {
"@llamaindex/core": "workspace:*",
"@llamaindex/env": "workspace:*"
},
"dependencies": {
"magic-bytes.js": "^1.10.0"
}
}
121 changes: 1 addition & 120 deletions packages/cloud/src/reader.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import { type Client, createClient, createConfig } from "@hey-api/client-fetch";
import { Document, FileReader } from "@llamaindex/core/schema";
import { fs, getEnv, path } from "@llamaindex/env";
import { filetypeinfo } from "magic-bytes.js";
import {
type Body_upload_file_api_v1_parsing_upload_post,
type ParserLanguages,
Expand All @@ -13,99 +12,6 @@ export type Language = ParserLanguages;

export type ResultType = "text" | "markdown" | "json";

const SUPPORT_FILE_EXT: string[] = [
".pdf",
// document and presentations
".602",
".abw",
".cgm",
".cwk",
".doc",
".docx",
".docm",
".dot",
".dotm",
".hwp",
".key",
".lwp",
".mw",
".mcw",
".pages",
".pbd",
".ppt",
".pptm",
".pptx",
".pot",
".potm",
".potx",
".rtf",
".sda",
".sdd",
".sdp",
".sdw",
".sgl",
".sti",
".sxi",
".sxw",
".stw",
".sxg",
".txt",
".uof",
".uop",
".uot",
".vor",
".wpd",
".wps",
".xml",
".zabw",
".epub",
// images
".jpg",
".jpeg",
".png",
".gif",
".bmp",
".svg",
".tiff",
".webp",
// web
".htm",
".html",
// spreadsheets
".xlsx",
".xls",
".xlsm",
".xlsb",
".xlw",
".csv",
".dif",
".sylk",
".slk",
".prn",
".numbers",
".et",
".ods",
".fods",
".uos1",
".uos2",
".dbf",
".wk1",
".wk2",
".wk3",
".wk4",
".wks",
".123",
".wq1",
".wq2",
".wb1",
".wb2",
".wb3",
".qpw",
".xlr",
".eth",
".tsv",
];

// TODO: move this WriteStream type into @llamaindex/env
type WriteStream = {
write: (text: string) => void;
Expand Down Expand Up @@ -239,17 +145,12 @@ export class LlamaParseReader extends FileReader {

// Create a job for the LlamaParse API
private async createJob(data: Uint8Array): Promise<string> {
// Load data, set the mime type
const { mime } = await LlamaParseReader.getMimeType(data);

if (this.verbose) {
console.log("Started uploading the file");
}

const body = {
file: new Blob([data], {
type: mime,
}),
file: new Blob([data]),
language: this.language,
parsing_instruction: this.parsingInstruction,
skip_diagonal_text: this.skipDiagonalText,
Expand Down Expand Up @@ -564,24 +465,4 @@ export class LlamaParseReader extends FileReader {
}),
);
}

static async getMimeType(
data: Uint8Array,
): Promise<{ mime: string; extension: string }> {
const typeinfos = filetypeinfo(data);
// Find the first detected type whose extension is in the supported list.
// A docx file may also be recognized as a zip file, so every candidate must be checked.
const info = typeinfos.find((info) => {
if (info.extension && SUPPORT_FILE_EXT.includes(`.${info.extension}`)) {
return info;
}
});
if (!info || !info.mime || !info.extension) {
const ext = SUPPORT_FILE_EXT.join(", ");
throw new Error(
`File has type which does not match supported MIME Types. Supported formats include: ${ext}`,
);
}
return { mime: info.mime, extension: info.extension };
}
}
15 changes: 0 additions & 15 deletions packages/llamaindex/tests/readers/llama-parser-reader.test.ts

This file was deleted.

39 changes: 2 additions & 37 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 comment on commit 4c38c1b

@jsmusgrave
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps this will resolve this discussion?

Please sign in to comment.