From 9c15a436fc685c81530892d1c002144f226b7455 Mon Sep 17 00:00:00 2001 From: wyang14 Date: Sat, 13 Apr 2024 01:37:13 +0100 Subject: [PATCH] bugfix: replaced puppeteer with cheerio for simpler deployment --- package.json | 2 +- pnpm-lock.yaml | 344 +++++++------------------------------------- server/utils/rag.ts | 17 +-- 3 files changed, 55 insertions(+), 308 deletions(-) diff --git a/package.json b/package.json index e19b193..e32489f 100644 --- a/package.json +++ b/package.json @@ -43,6 +43,7 @@ "@vueuse/nuxt": "^10.9.0", "@zilliz/milvus2-sdk-node": "^2.3.5", "ai": "^2.2.37", + "cheerio": "1.0.0-rc.12", "chromadb": "^1.8.1", "dexie": "^4.0.1", "highlight.js": "^11.9.0", @@ -63,7 +64,6 @@ "openai": "^4.33.0", "pdf-parse": "^1.1.1", "prisma": "^5.12.1", - "puppeteer": "^19.7.2", "ws": "^8.16.0" } } \ No newline at end of file diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 5e47dac..11e3e6b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -56,6 +56,9 @@ dependencies: ai: specifier: ^2.2.37 version: 2.2.37(react@18.2.0)(solid-js@1.8.16)(svelte@4.2.12)(vue@3.4.21) + cheerio: + specifier: 1.0.0-rc.12 + version: 1.0.0-rc.12 chromadb: specifier: ^1.8.1 version: 1.8.1(openai@4.33.0) @@ -70,7 +73,7 @@ dependencies: version: 5.3.2 langchain: specifier: ^0.1.31 - version: 0.1.31(@zilliz/milvus2-sdk-node@2.3.5)(chromadb@1.8.1)(ioredis@5.3.2)(mammoth@1.7.1)(pdf-parse@1.1.1)(puppeteer@19.7.2)(ws@8.16.0) + version: 0.1.31(@zilliz/milvus2-sdk-node@2.3.5)(cheerio@1.0.0-rc.12)(chromadb@1.8.1)(ioredis@5.3.2)(mammoth@1.7.1)(pdf-parse@1.1.1)(ws@8.16.0) langsmith: specifier: ^0.1.14 version: 0.1.14 @@ -116,9 +119,6 @@ dependencies: prisma: specifier: ^5.12.1 version: 5.12.1 - puppeteer: - specifier: ^19.7.2 - version: 19.7.2(typescript@5.4.4) ws: specifier: ^8.16.0 version: 8.16.0 @@ -2973,14 +2973,6 @@ packages: '@types/node': 20.12.5 dev: true - /@types/yauzl@2.10.3: - resolution: {integrity: sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==} - requiresBuild: true - dependencies: - '@types/node': 20.12.5 - dev: false - optional: true - /@unhead/dom@1.9.4: resolution: {integrity: sha512-nEaHOcCL0u56g4XOV5XGwRMFZ05eEINfp8nxVrPiIGLrS9BoFrZS7/6IYSkalkNRTmw8M5xqxt6BalBr594SaA==} dependencies: @@ -3957,14 +3949,6 @@ packages: /birpc@0.2.17: resolution: {integrity: sha512-+hkTxhot+dWsLpp3gia5AkVHIsKlZybNT5gIYiDlNzJrmYPcTM9k5/w2uaj3IPpd7LlEYpmCj4Jj1nC41VhDFg==} - /bl@4.1.0: - resolution: {integrity: sha512-1W07cM9gS6DcLperZfFSj+bWLtaPGSOHWhPiGzXmvVJbRLdG82sH/Kn8EtW1VqWVA54AKf2h5k5BbnIbwF3h6w==} - dependencies: - buffer: 5.7.1 - inherits: 2.0.4 - readable-stream: 3.6.2 - dev: false - /bluebird@3.4.7: resolution: {integrity: sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==} dev: false @@ -3999,10 +3983,6 @@ packages: node-releases: 2.0.14 update-browserslist-db: 1.0.13(browserslist@4.23.0) - /buffer-crc32@0.2.13: - resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==} - dev: false - /buffer-crc32@1.0.0: resolution: {integrity: sha512-Db1SbgBS/fg/392AblrMJk97KggmvYhr4pB5ZIMTWtaivCPMWLkmb7m21cJvpvgK+J3nsU2CmmixNBZx4vFj/w==} engines: {node: '>=8.0.0'} @@ -4010,13 +3990,6 @@ packages: /buffer-from@1.1.2: resolution: {integrity: sha512-E+XQCRwSbaaiChtv6k6Dwgc+bx+Bs6vuKJHHl5kox/BaKbhiXzqQOwK4cO22yElGp2OCmjwVhT3HmxgyPGnJfQ==} - /buffer@5.7.1: - resolution: {integrity: sha512-EHcyIPBQ4BSGlvjB16k5KgAJ27CIsHY/2JBmCRReo48y9rQ3MaUzWX3KVlBa4U7MyX02HdVj0K7C3WaB3ju7FQ==} - dependencies: - base64-js: 1.5.1 - ieee754: 1.2.1 - dev: false - /buffer@6.0.3: resolution: {integrity: sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==} dependencies: @@ -4133,6 +4106,30 @@ packages: resolution: {integrity: sha512-yrLQ/yVUFXkzg7EDQsPieE/53+0RlaWTs+wBrvW36cyilJ2SaDWfl4Yj7MtLTXleV9uEKefbAGUPv2/iWSooRA==} dev: false + /cheerio-select@2.1.0: + resolution: {integrity: sha512-9v9kG0LvzrlcungtnJtpGNxY+fzECQKhK4EGJX2vByejiMX84MFNQw4UxPJl3bFbTMw+Dfs37XaIkCwTZfLh4g==} + dependencies: + boolbase: 1.0.0 + css-select: 5.1.0 + css-what: 6.1.0 + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.1.0 + dev: false + + /cheerio@1.0.0-rc.12: + resolution: {integrity: sha512-VqR8m68vM46BNnuZ5NtnGBKIE/DfN0cRIzg9n40EIq9NOv90ayxLBXA8fXC5gquFRGJSTRqBq25Jt2ECLR431Q==} + engines: {node: '>= 6'} + dependencies: + cheerio-select: 2.1.0 + dom-serializer: 2.0.0 + domhandler: 5.0.3 + domutils: 3.1.0 + htmlparser2: 8.0.2 + parse5: 7.1.2 + parse5-htmlparser2-tree-adapter: 7.0.0 + dev: false + /chokidar@3.6.0: resolution: {integrity: sha512-7VT13fmjotKpGipCW9JEQAusEPE+Ei8nl6/g4FBAmIm0GOOLMua9NDDo/DWp0ZAxCr3cPq5ZpBqmPAQgDda2Pw==} engines: {node: '>= 8.10.0'} @@ -4147,10 +4144,6 @@ packages: optionalDependencies: fsevents: 2.3.3 - /chownr@1.1.4: - resolution: {integrity: sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg==} - dev: false - /chownr@2.0.0: resolution: {integrity: sha512-bIomtDF5KGpdogkLd9VspvFzk9KfpyyGlS8YFVZl7TGPBHL5snIOnxeshwVgPteQ9b4Eydl+pVbIyE1DcvCWgQ==} engines: {node: '>=10'} @@ -4177,15 +4170,6 @@ packages: - encoding dev: false - /chromium-bidi@0.4.4(devtools-protocol@0.0.1094867): - resolution: {integrity: sha512-4BX5cSaponuvVT1+SbLYTOAgDoVtX/Khoc9UsbFJ/AsPVUeFAM3RiIDFI6XFhLYMi9WmVJqh1ZH+dRpNKkKwiQ==} - peerDependencies: - devtools-protocol: '*' - dependencies: - devtools-protocol: 0.0.1094867 - mitt: 3.0.0 - dev: false - /ci-info@4.0.0: resolution: {integrity: sha512-TdHqgGf9odd8SXNuxtUBVx8Nv+qZOejE6qyqiy5NtbYYQOeFa6zmHkxlPzmaLxWWHsU6nJmB7AETdVPi+2NBUg==} engines: {node: '>=8'} @@ -4383,16 +4367,6 @@ packages: /core-util-is@1.0.3: resolution: {integrity: sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==} - /cosmiconfig@8.0.0: - resolution: {integrity: sha512-da1EafcpH6b/TD8vDRaWV7xFINlHlF6zKsGwS1TsuVJTZRkquaS5HTMq7uq6h31619QjbsYl21gVDOm32KM1vQ==} - engines: {node: '>=14'} - dependencies: - import-fresh: 3.3.0 - js-yaml: 4.1.0 - parse-json: 5.2.0 - path-type: 4.0.0 - dev: false - /crc-32@1.2.2: resolution: {integrity: sha512-ROmzCKrTnOwybPcJApAA6WBWij23HVfGVNKqqrZpuyZOHqK2CwHSvpGuyt/UNNvaIjEd8X5IFGp4Mh+Ie1IHJQ==} engines: {node: '>=0.8'} @@ -4416,14 +4390,6 @@ packages: resolution: {integrity: sha512-FWZBqdStQaPR8ZTBQGALh1EK9Hl1HcG70dyGvD1rKLPafFO3H73o38dz/e8YkIlbLn3JxmBI/f6Doe3Nh+DcEQ==} hasBin: true - /cross-fetch@3.1.5: - resolution: {integrity: sha512-lvb1SBsI0Z7GDwmuid+mU3kWVBwTVUbe7S0H52yaaAdQOXq2YktTCZdlAcNKFzE6QtRz0snpw9bNiPeOIkkQvw==} - dependencies: - node-fetch: 2.6.7 - transitivePeerDependencies: - - encoding - dev: false - /cross-spawn@7.0.3: resolution: {integrity: sha512-iRDPJKUPVEND7dHPO8rkbOnPpyDygcDFtWjpeWNCgy8WP2rXcxXL8TskReQl6OrB2G7+UJrags1q15Fudc7G6w==} engines: {node: '>= 8'} @@ -4679,10 +4645,6 @@ packages: /devalue@4.3.2: resolution: {integrity: sha512-KqFl6pOgOW+Y6wJgu80rHpo2/3H07vr8ntR9rkkFIRETewbf5GaYYcakYfiKz89K+sLsuPkQIZaXDMjUObZwWg==} - /devtools-protocol@0.0.1094867: - resolution: {integrity: sha512-pmMDBKiRVjh0uKK6CT1WqZmM3hBVSgD+N2MrgyV1uNizAZMw4tx6i/RTc+/uCsKSCmg0xXx7arCP/OFcIwTsiQ==} - dev: false - /dexie@4.0.1: resolution: {integrity: sha512-wSNn+TcCh+DuE2pdg058K3MhxA4g+IiZlW7yGz4cMd/t3z2rJXZcV3HDxZljbrICU2Iq0qY4UHnbolTMK/+bcA==} dev: false @@ -4780,12 +4742,6 @@ packages: iconv-lite: 0.6.3 optional: true - /end-of-stream@1.4.4: - resolution: {integrity: sha512-+uw1inIHVPQoaVuHzRyXd21icM+cnt4CzD5rW+NC1wjOUSTOs+Te7FOv7AhN7vS9x/oIyhLP5PR1H+phQAHu5Q==} - dependencies: - once: 1.4.0 - dev: false - /enhanced-resolve@5.16.0: resolution: {integrity: sha512-O+QWCviPNSSLAD9Ucn8Awv+poAkqn3T1XY5/N7kR7rQO9yfSGWkYZDwpJ+iKF7B8rxaQKWngSqACpgzeapSyoA==} engines: {node: '>=10.13.0'} @@ -4804,12 +4760,6 @@ packages: /err-code@2.0.3: resolution: {integrity: sha512-2bmlRpNKBxT/CRmPOlyISQpNj+qSeYvcym/uT0Jx2bMOlKLtSy1ZmLuVxSEKKyor/N5yhvp/ZiG1oE3DEYMSFA==} - /error-ex@1.3.2: - resolution: {integrity: sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==} - dependencies: - is-arrayish: 0.2.1 - dev: false - /error-stack-parser-es@0.1.1: resolution: {integrity: sha512-g/9rfnvnagiNf+DRMHEVGuGuIBlCIMDFoTA616HaP2l9PlCjGjVhD98PNbVSJvmK4TttqT5mV5tInMhoFgi+aA==} @@ -4944,20 +4894,6 @@ packages: pathe: 1.1.2 ufo: 1.5.3 - /extract-zip@2.0.1: - resolution: {integrity: sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==} - engines: {node: '>= 10.17.0'} - hasBin: true - dependencies: - debug: 4.3.4 - get-stream: 5.2.0 - yauzl: 2.10.0 - optionalDependencies: - '@types/yauzl': 2.10.3 - transitivePeerDependencies: - - supports-color - dev: false - /fast-fifo@1.3.2: resolution: {integrity: sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==} @@ -4983,12 +4919,6 @@ packages: dependencies: reusify: 1.0.4 - /fd-slicer@1.1.0: - resolution: {integrity: sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==} - dependencies: - pend: 1.2.0 - dev: false - /fecha@4.2.3: resolution: {integrity: sha512-OP2IUU6HeYKJi3i0z4A19kHMQoLVs4Hc+DPqqxI2h/DPZHTm/vjsfC6P0b4jCMy14XizLBqvndQ+UilD7707Jw==} dev: false @@ -5073,10 +5003,6 @@ packages: resolution: {integrity: sha512-zJ2mQYM18rEFOudeV4GShTGIQ7RbzA7ozbU9I/XBpm7kqgMywgmylMwXHxZJmkVoYkna9d2pVXVXPdYTP9ej8Q==} engines: {node: '>= 0.6'} - /fs-constants@1.0.0: - resolution: {integrity: sha512-y6OAwoSIf7FyjMIv94u+b5rdheZEjzR63GTyZJm5qh4Bi+2YgwLCcI/fPFZkL5PSixOt6ZNKm+w+Hfp/Bciwow==} - dev: false - /fs-extra@11.2.0: resolution: {integrity: sha512-PmDi3uwK5nFuXh7XDTlVnS17xJS7vW36is2+w3xcv8SVxiB4NyATf4ctkVY5bkSjX0Y4nbvZCq1/EjtEyr9ktw==} engines: {node: '>=14.14'} @@ -5150,13 +5076,6 @@ packages: /get-port-please@3.1.2: resolution: {integrity: sha512-Gxc29eLs1fbn6LQ4jSU4vXjlwyZhF5HsGuMAa7gqBP4Rw4yxxltyDUuF5MBclFzDTXO+ACchGQoeela4DSfzdQ==} - /get-stream@5.2.0: - resolution: {integrity: sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==} - engines: {node: '>=8'} - dependencies: - pump: 3.0.0 - dev: false - /get-stream@6.0.1: resolution: {integrity: sha512-ts6Wi+2j3jQjqi70w5AlN8DFnkSwC+MqmxEzdEALB2qXZYV3X/b1CTfgPLGJNMeAWxdPfU8FO1ms3NUfaHCPYg==} engines: {node: '>=10'} @@ -5374,6 +5293,15 @@ packages: resolution: {integrity: sha512-ztqyC3kLto0e9WbNp0aeP+M3kTt+nbaIveGmUxAtZa+8iFgKLUOD4YKM5j+f3QD89bra7UeumolZHKuOXnTmeQ==} engines: {node: '>=8'} + /htmlparser2@8.0.2: + resolution: {integrity: sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==} + dependencies: + domelementtype: 2.3.0 + domhandler: 5.0.3 + domutils: 3.1.0 + entities: 4.5.0 + dev: false + /http-assert@1.5.0: resolution: {integrity: sha512-uPpH7OKX4H25hBmU6G1jWNaqJGpTXxey+YOUizJUAgu0AjLUeC8D73hTrhvDS5D+GJN1DN1+hhc/eF/wpxtp0w==} engines: {node: '>= 0.8'} @@ -5499,14 +5427,6 @@ packages: /immutable@4.3.5: resolution: {integrity: sha512-8eabxkth9gZatlwl5TBuJnCsoTADlL6ftEr7A4qgdaTsPyreilDSnUk57SO+jfKcNtxPa22U5KK6DSeAYhpBJw==} - /import-fresh@3.3.0: - resolution: {integrity: sha512-veYYhQa+D1QBKznvhUHxb8faxlrwUnxseDAbAp457E0wLNio2bOSKnjYDhMj+YiAq61xrMGhQk9iXVk5FzgQMw==} - engines: {node: '>=6'} - dependencies: - parent-module: 1.0.1 - resolve-from: 4.0.0 - dev: false - /imurmurhash@0.1.4: resolution: {integrity: sha512-JmXMZ6wuvDmLiHEml9ykzqO6lwFbof0GG4IkcGaENdCRDDmMVnny7s5HsIgHCbaq0w2MyPhDqkhTUgS2LU2PHA==} engines: {node: '>=0.8.19'} @@ -5565,10 +5485,6 @@ packages: resolution: {integrity: sha512-UtilS7hLRu++wb/WBAw9bNuP1Eg04Ivn1vERJck8zJthEvXCBEBpGR/33u/xLKWEQf95803oalHrVDptcAvFdQ==} dev: false - /is-arrayish@0.2.1: - resolution: {integrity: sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==} - dev: false - /is-arrayish@0.3.2: resolution: {integrity: sha512-eVRqCvVlZbuw3GrM63ovNSNAeA1K16kaR/LRY/92w0zxQ5/1YzwblUX652i4Xs9RwAGjW9d9y6X88t8OaAJfWQ==} dev: false @@ -5756,10 +5672,6 @@ packages: engines: {node: '>=4'} hasBin: true - /json-parse-even-better-errors@2.3.1: - resolution: {integrity: sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==} - dev: false - /json-parse-even-better-errors@3.0.1: resolution: {integrity: sha512-aatBvbL26wVUCLmbWdCpeu9iF5wOyWpagiKkInA+kfws3sWdBrTnsvN2CKcyCYyUrc7rebNBlK6+kteg7ksecg==} engines: {node: ^14.17.0 || ^16.13.0 || >=18.0.0} @@ -5889,7 +5801,7 @@ packages: resolution: {integrity: sha512-Xq9nH7KlWZmXAtodXDDRE7vs6DU1gTU8zYDHDiWLSip45Egwq3plLHzPn27NgvzL2r1LMPC1vdqh98sQxtqj4A==} dev: false - /langchain@0.1.31(@zilliz/milvus2-sdk-node@2.3.5)(chromadb@1.8.1)(ioredis@5.3.2)(mammoth@1.7.1)(pdf-parse@1.1.1)(puppeteer@19.7.2)(ws@8.16.0): + /langchain@0.1.31(@zilliz/milvus2-sdk-node@2.3.5)(cheerio@1.0.0-rc.12)(chromadb@1.8.1)(ioredis@5.3.2)(mammoth@1.7.1)(pdf-parse@1.1.1)(ws@8.16.0): resolution: {integrity: sha512-J2iZZL48Nwdo1XDU5cS5YYm91b2L6sL3a8hH1gBljvDzCY7jbtUOTuIoRVx5iHN3DXuuLnszS4lhqg0u9hQxBQ==} engines: {node: '>=18'} peerDependencies: @@ -6053,6 +5965,7 @@ packages: '@langchain/core': 0.1.54 '@langchain/openai': 0.0.26 binary-extensions: 2.3.0 + cheerio: 1.0.0-rc.12 chromadb: 1.8.1(openai@4.33.0) ioredis: 5.3.2 js-tiktoken: 1.0.10 @@ -6065,7 +5978,6 @@ packages: openapi-types: 12.1.3 p-retry: 4.6.2 pdf-parse: 1.1.1 - puppeteer: 19.7.2(typescript@5.4.4) uuid: 9.0.1 ws: 8.16.0 yaml: 2.4.1 @@ -6612,17 +6524,9 @@ packages: /mitt@2.1.0: resolution: {integrity: sha512-ILj2TpLiysu2wkBbWjAmww7TkZb65aiQO+DkVdUTBpBXq+MHYiETENkKFMtsJZX1Lf4pe4QOrTSjIfUwN5lRdg==} - /mitt@3.0.0: - resolution: {integrity: sha512-7dX2/10ITVyqh4aOSVI9gdape+t9l2/8QxHrFmUXu4EEUpdlxl6RudZUPZoc+zuY2hk1j7XxVroIVIan/pD/SQ==} - dev: false - /mitt@3.0.1: resolution: {integrity: sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==} - /mkdirp-classic@0.5.3: - resolution: {integrity: sha512-gKLcREMhtuZRwRAfqP3RFW+TK4JqApVBtOIftVgjuABpAtpxhPGaDcfvbhNvD0B8iD1oUr/txX35NjcaY6Ns/A==} - dev: false - /mkdirp@0.5.6: resolution: {integrity: sha512-FP+p8RB8OWpF3YZBCrP5gtADmtXApB5AMLn+vdyA+PyxCjrCs00mjyUozssO33cwDeT3wNGdLxJ5M//YqtHAJw==} hasBin: true @@ -6834,18 +6738,6 @@ packages: /node-fetch-native@1.6.4: resolution: {integrity: sha512-IhOigYzAKHd244OC0JIMIUrjzctirCmPkaIfhDeGcEETWof5zKYUW7e7MYvChGWh/4CJeXEgsRyGzuF334rOOQ==} - /node-fetch@2.6.7: - resolution: {integrity: sha512-ZjMPFEfVx5j+y2yF35Kzx5sF7kDzxuDj6ziH4FFbOp87zKDZNx8yExJIb05OGF4Nlt9IHFIMBkRl41VdvcNdbQ==} - engines: {node: 4.x || >=6.0.0} - peerDependencies: - encoding: ^0.1.0 - peerDependenciesMeta: - encoding: - optional: true - dependencies: - whatwg-url: 5.0.0 - dev: false - /node-fetch@2.7.0: resolution: {integrity: sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==} engines: {node: 4.x || >=6.0.0} @@ -7354,13 +7246,6 @@ packages: resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} dev: false - /parent-module@1.0.1: - resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} - engines: {node: '>=6'} - dependencies: - callsites: 3.1.0 - dev: false - /parent-module@2.0.0: resolution: {integrity: sha512-uo0Z9JJeWzv8BG+tRcapBKNJ0dro9cLyczGzulS6EfeyAdeC9sbojtW6XwvYxJkEne9En+J2XEl4zyglVeIwFg==} engines: {node: '>=8'} @@ -7375,16 +7260,6 @@ packages: git-config-path: 2.0.0 ini: 1.3.8 - /parse-json@5.2.0: - resolution: {integrity: sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==} - engines: {node: '>=8'} - dependencies: - '@babel/code-frame': 7.24.2 - error-ex: 1.3.2 - json-parse-even-better-errors: 2.3.1 - lines-and-columns: 1.2.4 - dev: false - /parse-path@7.0.0: resolution: {integrity: sha512-Euf9GG8WT9CdqwuWJGdf3RkUcTBArppHABkO7Lm8IzRQp0e2r/kkFnmhu4TSK30Wcu5rVAZLmfPKSBBi9tWFog==} dependencies: @@ -7395,6 +7270,19 @@ packages: dependencies: parse-path: 7.0.0 + /parse5-htmlparser2-tree-adapter@7.0.0: + resolution: {integrity: sha512-B77tOZrqqfUfnVcOrUvfdLbz4pu4RopLD/4vmu3HUPswwTA8OH0EMW9BlWR2B0RCoiZRAHEUu7IxeP1Pd1UU+g==} + dependencies: + domhandler: 5.0.3 + parse5: 7.1.2 + dev: false + + /parse5@7.1.2: + resolution: {integrity: sha512-Czj1WaSVpaoj0wbhMzLmWD69anp2WH7FXMB9n1Sy8/ZFF9jolSQVMu1Ij5WIyGmcBmhk7EOndpO4mIpihVqAXw==} + dependencies: + entities: 4.5.0 + dev: false + /parseurl@1.3.3: resolution: {integrity: sha512-CiyeOxFT/JZyN5m0z9PfXw4SCBJ6Sygz1Dpl0wqjlhDEGGBP1GnsUVEL0p63hoG1fcj3fHynXi9NYO4nWOL+qQ==} engines: {node: '>= 0.8'} @@ -7436,11 +7324,6 @@ packages: resolution: {integrity: sha512-JLyh7xT1kizaEvcaXOQwOc2/Yhw6KZOvPf1S8401UyLk86CU79LN3vl7ztXGm/pZ+YjoyAJ4rxmHwbkBXJX+yw==} dev: false - /path-type@4.0.0: - resolution: {integrity: sha512-gDKb8aZMDeD/tZWs9P6+q0J9Mwkdl6xMV8TjnGP3qJVJ06bdMgkbBlLU8IdfOsIsFz2BW1rNVT3XuNEl8zPAvw==} - engines: {node: '>=8'} - dev: false - /path-type@5.0.0: resolution: {integrity: sha512-5HviZNaZcfqP95rwpv+1HDgUamezbqdSYTyzjTvwtJSnIH+3vnbmWsItli8OFEndS984VT55M3jduxZbX351gg==} engines: {node: '>=12'} @@ -7458,10 +7341,6 @@ packages: - supports-color dev: false - /pend@1.2.0: - resolution: {integrity: sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==} - dev: false - /perfect-debounce@1.0.0: resolution: {integrity: sha512-xCy9V055GLEqoFaHoC1SoLIaLmWctgCUaBaWxDZ7/Zx4CTyX7cJQLJOok/orfjZAh9kEYpjJa4d0KcJmCbctZA==} @@ -7882,11 +7761,6 @@ packages: resolution: {integrity: sha512-cdGef/drWFoydD1JsMzuFf8100nZl+GT+yacc2bEced5f9Rjk4z+WtFUTBu9PhOi9j/jfmBPu0mMEY4wIdAF8A==} engines: {node: '>= 0.6.0'} - /progress@2.0.3: - resolution: {integrity: sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==} - engines: {node: '>=0.4.0'} - dev: false - /promise-inflight@1.0.1: resolution: {integrity: sha512-6zWPyEOFaQBJYcGMHBKTKJ3u6TBsnMFOIZSa6ce1e/ZrrsOlnHRHbabMjLiBYKp+n44X9eUI6VUPaukCXHuG4g==} peerDependencies: @@ -7931,69 +7805,11 @@ packages: /protocols@2.0.1: resolution: {integrity: sha512-/XJ368cyBJ7fzLMwLKv1e4vLxOju2MNAIokcr7meSaNcVbWz/CPcW22cP04mwxOErdA5mwjA8Q6w/cdAQxVn7Q==} - /proxy-from-env@1.1.0: - resolution: {integrity: sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==} - dev: false - - /pump@3.0.0: - resolution: {integrity: sha512-LwZy+p3SFs1Pytd/jYct4wpv49HiYCqd9Rlc5ZVdk0V+8Yzv6jR5Blk3TRmPL1ft69TxP0IMZGJ+WPFU2BFhww==} - dependencies: - end-of-stream: 1.4.4 - once: 1.4.0 - dev: false - /punycode.js@2.3.1: resolution: {integrity: sha512-uxFIHU0YlHYhDQtV4R9J6a52SLx28BCjT+4ieh7IGbgwVJWO+km431c4yRlREUAsAmt/uMjQUyQHNEPf0M39CA==} engines: {node: '>=6'} dev: false - /puppeteer-core@19.7.2(typescript@5.4.4): - resolution: {integrity: sha512-PvI+fXqgP0uGJxkyZcX51bnzjFA73MODZOAv0fSD35yR7tvbqwtMV3/Y+hxQ0AMMwzxkEebP6c7po/muqxJvmQ==} - engines: {node: '>=14.1.0'} - peerDependencies: - typescript: '>= 4.7.4' - peerDependenciesMeta: - typescript: - optional: true - dependencies: - chromium-bidi: 0.4.4(devtools-protocol@0.0.1094867) - cross-fetch: 3.1.5 - debug: 4.3.4 - devtools-protocol: 0.0.1094867 - extract-zip: 2.0.1 - https-proxy-agent: 5.0.1 - proxy-from-env: 1.1.0 - rimraf: 3.0.2 - tar-fs: 2.1.1 - typescript: 5.4.4 - unbzip2-stream: 1.4.3 - ws: 8.11.0 - transitivePeerDependencies: - - bufferutil - - encoding - - supports-color - - utf-8-validate - dev: false - - /puppeteer@19.7.2(typescript@5.4.4): - resolution: {integrity: sha512-4Lm7Qpe/LU95Svirei/jDLDvR5oMrl9BPGd7HMY5+Q28n+BhvKuW97gKkR+1LlI86bO8J3g8rG/Ll5kv9J1nlQ==} - engines: {node: '>=14.1.0'} - deprecated: < 21.8.0 is no longer supported - requiresBuild: true - dependencies: - cosmiconfig: 8.0.0 - https-proxy-agent: 5.0.1 - progress: 2.0.3 - proxy-from-env: 1.1.0 - puppeteer-core: 19.7.2(typescript@5.4.4) - transitivePeerDependencies: - - bufferutil - - encoding - - supports-color - - typescript - - utf-8-validate - dev: false - /queue-microtask@1.2.3: resolution: {integrity: sha512-NuaNSa6flKT5JaSYQzJok04JzTL1CA6aGhv5rfLW3PgqA+M2ChpZQnAC8h8i4ZFkBS8X5RqkDBHA7r4hej3K9A==} @@ -8119,11 +7935,6 @@ packages: resolution: {integrity: sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==} engines: {node: '>=0.10.0'} - /resolve-from@4.0.0: - resolution: {integrity: sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==} - engines: {node: '>=4'} - dev: false - /resolve-from@5.0.0: resolution: {integrity: sha512-qYg9KP24dD5qka9J47d0aVky0N+b4fTU89LN9iDnjB5waksiC49rvMB0PrUJQGoTmH50XPiqOvAjDfaijGxYZw==} engines: {node: '>=8'} @@ -8763,26 +8574,6 @@ packages: resolution: {integrity: sha512-GNzQvQTOIP6RyTfE2Qxb8ZVlNmw0n88vp1szwWRimP02mnTsx3Wtn5qRdqY9w2XduFNUgvOwhNnQsjwCp+kqaQ==} engines: {node: '>=6'} - /tar-fs@2.1.1: - resolution: {integrity: sha512-V0r2Y9scmbDRLCNex/+hYzvp/zyYjvFbHPNgVTKfQvVrb6guiE/fxP+XblDNR011utopbkex2nM4dHNV6GDsng==} - dependencies: - chownr: 1.1.4 - mkdirp-classic: 0.5.3 - pump: 3.0.0 - tar-stream: 2.2.0 - dev: false - - /tar-stream@2.2.0: - resolution: {integrity: sha512-ujeqbceABgwMZxEJnk2HDY2DlnUZ+9oEcb1KzTVfYHio0UE6dG71n60d8D2I4qNvleWrrXpmjpt7vZeF1LnMZQ==} - engines: {node: '>=6'} - dependencies: - bl: 4.1.0 - end-of-stream: 1.4.4 - fs-constants: 1.0.0 - inherits: 2.0.4 - readable-stream: 3.6.2 - dev: false - /tar-stream@3.1.7: resolution: {integrity: sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==} dependencies: @@ -8826,10 +8617,6 @@ packages: dependencies: any-promise: 1.3.0 - /through@2.3.8: - resolution: {integrity: sha512-w89qg7PI8wAdvX60bMDP+bFoD5Dvhm9oLheFp5O4a2QF0cSBGsBX4qZmadPMvVqlLJBBci+WqGGOAPvcDeNSVg==} - dev: false - /tiny-invariant@1.3.3: resolution: {integrity: sha512-+FbBPE1o9QAYvviau/qC5SE3caw21q3xkvWKBtja5vgqOWIHHJ3ioaq1VPfn/Szqctz2bU/oYeKd9/z5BL+PVg==} @@ -8912,13 +8699,6 @@ packages: /ultrahtml@1.5.3: resolution: {integrity: sha512-GykOvZwgDWZlTQMtp5jrD4BVL+gNn2NVlVafjcFUJ7taY20tqYdwdoWBFy6GBJsNTZe1GkGPkSl5knQAjtgceg==} - /unbzip2-stream@1.4.3: - resolution: {integrity: sha512-mlExGW4w71ebDJviH16lQLtZS32VKqsSfk80GCfUlwT/4/hNRFsoscrF/c++9xinkMzECL1uL9DDwXqFWkruPg==} - dependencies: - buffer: 5.7.1 - through: 2.3.8 - dev: false - /unconfig@0.3.12: resolution: {integrity: sha512-oDtfWDC0TMYFuwdt7E7CaqYZGqq1wAiC12PRTFe/93IkgNi+wVlF/LCjcD/bgNkGoopb0RsU363Ge3YXy7NGSw==} dependencies: @@ -9577,19 +9357,6 @@ packages: /wrappy@1.0.2: resolution: {integrity: sha512-l4Sp/DRseor9wL6EvV2+TuQn63dMkPjZ/sp9XkghTEbV9KlPS1xUsZ3u7/IQO4wxtcFB4bgpQPRcR3QCvezPcQ==} - /ws@8.11.0: - resolution: {integrity: sha512-HPG3wQd9sNQoT9xHyNCXoDUa+Xw/VevmY9FoHyQ+g+rrMn4j6FB4np7Z0OhdTgjx6MgQLK7jwSy1YecU1+4Asg==} - engines: {node: '>=10.0.0'} - peerDependencies: - bufferutil: ^4.0.1 - utf-8-validate: ^5.0.2 - peerDependenciesMeta: - bufferutil: - optional: true - utf-8-validate: - optional: true - dev: false - /ws@8.16.0: resolution: {integrity: sha512-HS0c//TP7Ina87TfiPUz1rQzMhHrl/SG2guqRcTOIUYD2q8uhUdNHZYJUaQ8aTGPzCh+c6oawMKW35nFl1dxyQ==} engines: {node: '>=10.0.0'} @@ -9638,13 +9405,6 @@ packages: y18n: 5.0.8 yargs-parser: 21.1.1 - /yauzl@2.10.0: - resolution: {integrity: sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==} - dependencies: - buffer-crc32: 0.2.13 - fd-slicer: 1.1.0 - dev: false - /ylru@1.3.2: resolution: {integrity: sha512-RXRJzMiK6U2ye0BlGGZnmpwJDPgakn6aNQ0A7gHRbD4I0uvK4TW6UqkK1V0pp9jskjJBAXd3dRrbzWkqJ+6cxA==} engines: {node: '>= 4.0.0'} diff --git a/server/utils/rag.ts b/server/utils/rag.ts index b4b0e31..2904c5e 100644 --- a/server/utils/rag.ts +++ b/server/utils/rag.ts @@ -2,7 +2,7 @@ import { PDFLoader } from "langchain/document_loaders/fs/pdf" import { TextLoader } from "langchain/document_loaders/fs/text" import { JSONLoader } from "langchain/document_loaders/fs/json" import { DocxLoader } from "langchain/document_loaders/fs/docx" -import { PuppeteerWebBaseLoader } from "langchain/document_loaders/web/puppeteer" +import { CheerioWebBaseLoader } from "langchain/document_loaders/web/cheerio" import { MultiPartData, H3Event } from 'h3' import { createRetriever } from '@/server/retriever' @@ -26,20 +26,7 @@ export const loadDocuments = async (file: MultiPartData) => { export const loadURL = async (url: string) => { console.log("URL: ", url) - const loader = new PuppeteerWebBaseLoader(url, { - launchOptions: { - headless: true, - }, - gotoOptions: { - waitUntil: "domcontentloaded", - }, - async evaluate(page, browser) { - const result = await page.evaluate(() => document.body.innerText) - console.log("HTML result: ", result) - await browser.close() - return result - }, - }) + const loader = new CheerioWebBaseLoader(url) const docs = await loader.load() console.log(`Documents loaded and parsed from ${url}:`, docs) return docs