Skip to content

Commit

Permalink
nlp: Update packages langchain, langchain-text-splitters, unstructured
Browse files Browse the repository at this point in the history
  • Loading branch information
amotl committed Oct 31, 2024
1 parent 776c4bb commit d053ff2
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 5 deletions.
1 change: 1 addition & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Changes for pueblo

## Unreleased
- nlp: Updated dependencies langchain, langchain-text-splitters, unstructured

## 2024-03-07 v0.0.9
- Testing: Add `pueblo.testing.notebook.{list_path,generate_tests}`
Expand Down
13 changes: 10 additions & 3 deletions pueblo/nlp/resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,17 @@ def document_from_url(self) -> "Document":
"""
logger.info(f"Acquiring web resource: {self.url}")
from langchain.schema import Document
from unstructured.partition.html import partition_html

response = http.get(self.url)
elements = partition_html(text=response.text)
metadata = {"source": self.url}
return Document(page_content=response.text, metadata=metadata)

def decode_html(self):
    """
    Acquire the web resource and return its HTML content as plain text.

    The raw document is obtained through `document_from_url`, its page
    content is partitioned into elements, and the string form of each
    element is joined using a blank line as separator. The resulting
    document carries the resource URL in its `source` metadata field.
    """
    # Keep the heavy third-party imports local to the call.
    from langchain.schema import Document
    from unstructured.partition.html import partition_html

    raw_document = self.document_from_url()
    parts = partition_html(text=raw_document.page_content)
    plain_text = "\n\n".join(map(str, parts))
    return Document(page_content=plain_text, metadata={"source": self.url})
Expand All @@ -51,7 +58,7 @@ def langchain_documents(self, **kwargs) -> t.List["Document"]:
"""
Load URL resource, and split paragraphs in response into individual documents.
"""
from langchain.text_splitter import CharacterTextSplitter
from langchain_text_splitters import CharacterTextSplitter

documents = self.fetch_single()
text_splitter = CharacterTextSplitter(**kwargs)
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,8 +95,9 @@ optional-dependencies.fileio = [
]
optional-dependencies.nlp = [
"aiohttp<3.11",
"langchain",
"unstructured<0.15",
"langchain<0.4",
"langchain-text-splitters<0.4",
"unstructured<0.17",
]
optional-dependencies.notebook = [
"nbclient<0.11",
Expand Down

0 comments on commit d053ff2

Please sign in to comment.