From d053ff2ae5d2c3d70dfb4475c317039a390adeb7 Mon Sep 17 00:00:00 2001 From: Andreas Motl Date: Thu, 31 Oct 2024 21:18:05 +0100 Subject: [PATCH] nlp: Update packages langchain, langchain-text-splitters, unstructured --- CHANGES.md | 1 + pueblo/nlp/resource.py | 13 ++++++++++--- pyproject.toml | 5 +++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index a2b0a0e..f7a8c70 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,6 +1,7 @@ # Changes for pueblo ## Unreleased +- nlp: Updated dependencies langchain, langchain-text-splitters, unstructured ## 2024-03-07 v0.0.9 - Testing: Add `pueblo.testing.notebook.{list_path,generate_tests}` diff --git a/pueblo/nlp/resource.py b/pueblo/nlp/resource.py index 9aec214..3e9ba51 100644 --- a/pueblo/nlp/resource.py +++ b/pueblo/nlp/resource.py @@ -39,10 +39,17 @@ def document_from_url(self) -> "Document": """ logger.info(f"Acquiring web resource: {self.url}") from langchain.schema import Document - from unstructured.partition.html import partition_html response = http.get(self.url) - elements = partition_html(text=response.text) + metadata = {"source": self.url} + return Document(page_content=response.text, metadata=metadata) + + def decode_html(self): + from langchain.schema import Document + from unstructured.partition.html import partition_html + + doc = self.document_from_url() + elements = partition_html(text=doc.page_content) text = "\n\n".join([str(el) for el in elements]) metadata = {"source": self.url} return Document(page_content=text, metadata=metadata) @@ -51,7 +58,7 @@ def langchain_documents(self, **kwargs) -> t.List["Document"]: """ Load URL resource, and split paragraphs in response into individual documents. """ - from langchain.text_splitter import CharacterTextSplitter + from langchain_text_splitters import CharacterTextSplitter documents = self.fetch_single() text_splitter = CharacterTextSplitter(**kwargs) diff --git a/pyproject.toml b/pyproject.toml index 3de46e3..255f025 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -95,8 +95,9 @@ optional-dependencies.fileio = [ ] optional-dependencies.nlp = [ "aiohttp<3.11", - "langchain", - "unstructured<0.15", + "langchain<0.4", + "langchain-text-splitters<0.4", + "unstructured<0.17", ] optional-dependencies.notebook = [ "nbclient<0.11",