From 12fff6ff5c7c4767888b813af511312453c6afd8 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Wed, 3 Jul 2024 12:56:45 +0530 Subject: [PATCH 01/12] Replacing doc from payload with checksum while sending data to pebblo_cloud --- .../chains/pebblo_retrieval/base.py | 30 ++++++++----------- .../document_loaders/pebblo.py | 15 +++++++++- 2 files changed, 26 insertions(+), 19 deletions(-) diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py index 97c939b4fce38..f5849f8153057 100644 --- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py +++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py @@ -457,25 +457,19 @@ def _send_prompt(self, qa_payload: Qa) -> None: if self.api_key: if self.classifier_location == "local": if pebblo_resp: - payload["response"] = ( - json.loads(pebblo_resp.text) - .get("retrieval_data", {}) - .get("response", {}) - ) - payload["context"] = ( - json.loads(pebblo_resp.text) - .get("retrieval_data", {}) - .get("context", []) - ) - payload["prompt"] = ( - json.loads(pebblo_resp.text) - .get("retrieval_data", {}) - .get("prompt", {}) - ) + resp = json.loads(pebblo_resp.text) + if resp: + payload["response"].update( + resp.get("retrieval_data", {}) + .get("response", {}) + ) + payload["prompt"].update( + resp.get("retrieval_data", {}) + .get("prompt", {}) + ) else: - payload["response"] = None - payload["context"] = None - payload["prompt"] = None + payload["response"] = {} + payload["prompt"] = {} headers.update({"x-api-key": self.api_key}) pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{PROMPT_URL}" try: diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index a695582e2fa24..f7103ba24f7c3 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -257,7 +257,20 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list: if self.api_key: if self.classifier_location == "local": - payload["docs"] = classified_docs + payload_docs = [] + docs = payload["docs"] + for doc_data in docs: + for doc in classified_docs: + if doc_data["source_path"] == doc["source_path"]: + doc_data.update({ + "content_checksum": doc["content_checksum"], + "loader_source_path": doc["loader_source_path"] + }) + break + doc_data.pop("doc") + payload_docs.append(doc_data) + payload["docs"] = payload_docs + headers.update({"x-api-key": self.api_key}) pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{LOADER_DOC_URL}" try: From 1dd8f2d7bbdd824d85c4f61a4c05291323caabd3 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Wed, 3 Jul 2024 14:55:33 +0530 Subject: [PATCH 02/12] Adding id in doc --- .../document_loaders/pebblo.py | 59 ++++++++----------- 1 file changed, 26 insertions(+), 33 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index f7103ba24f7c3..86e8d88ceaa4a 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -89,17 +89,12 @@ def load(self) -> List[Document]: list: Documents fetched from load method of the wrapped `loader`. """ self.docs = self.loader.load() - # Add pebblo-specific metadata to docs - self._add_pebblo_specific_metadata() - if not self.load_semantic: - self._classify_doc(self.docs, loading_end=True) - return self.docs - self.docs_with_id = self._index_docs() - classified_docs = self._classify_doc(self.docs_with_id, loading_end=True) - self.docs_with_id = self._add_semantic_to_docs( - self.docs_with_id, classified_docs - ) - self.docs = self._unindex_docs(self.docs_with_id) # type: ignore + classified_docs = self._classify_doc(loading_end=True) + self._add_pebblo_specific_metadata(classified_docs) + if self.load_semantic: + self.docs = self._add_semantic_to_docs(classified_docs) + else: + self.docs = self._unindex_docs() # type: ignore return self.docs def lazy_load(self) -> Iterator[Document]: @@ -125,19 +120,14 @@ def lazy_load(self) -> Iterator[Document]: self.docs = [] break self.docs = list((doc,)) - # Add pebblo-specific metadata to docs - self._add_pebblo_specific_metadata() - if not self.load_semantic: - self._classify_doc(self.docs, loading_end=True) - yield self.docs[0] + self.docs_with_id = self._index_docs() + classified_doc = self._classify_doc() + self._add_pebblo_specific_metadata(classified_doc) + if self.load_semantic: + self.docs = self._add_semantic_to_docs(classified_doc) else: - self.docs_with_id = self._index_docs() - classified_doc = self._classify_doc(self.docs) - self.docs_with_id = self._add_semantic_to_docs( - self.docs_with_id, classified_doc - ) - self.docs = self._unindex_docs(self.docs_with_id) # type: ignore - yield self.docs[0] + self.docs = self._unindex_docs() + yield self.docs[0] @classmethod def set_discover_sent(cls) -> None: @@ -257,19 +247,22 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list: if self.api_key: if self.classifier_location == "local": - payload_docs = [] docs = payload["docs"] for doc_data in docs: - for doc in classified_docs: - if doc_data["source_path"] == doc["source_path"]: - doc_data.update({ - "content_checksum": doc["content_checksum"], - "loader_source_path": doc["loader_source_path"] - }) - break + classified_data = classified_docs.get(doc_data["pb_id"], {}) + doc_data.update( + { + "content_checksum": classified_data.get( + "content_checksum", None + ), + "loader_source_path": classified_data.get( + "loader_source_path", None + ), + "entities": classified_data.get("entities", {}), + "topics": classified_data.get("topics", {}), + } + ) doc_data.pop("doc") - payload_docs.append(doc_data) - payload["docs"] = payload_docs headers.update({"x-api-key": self.api_key}) pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{LOADER_DOC_URL}" From 0b9199e9f1789d2ef96fff20602d971d90d74be8 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Thu, 4 Jul 2024 18:22:22 +0530 Subject: [PATCH 03/12] Updating IndexedDocument class --- .../document_loaders/pebblo.py | 76 +++++++++---------- .../langchain_community/utilities/pebblo.py | 2 +- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index 86e8d88ceaa4a..a766856975fa8 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -5,7 +5,7 @@ import os import uuid from http import HTTPStatus -from typing import Any, Dict, Iterator, List, Optional, Union +from typing import Any, Dict, Iterator, List, Optional import requests # type: ignore from langchain_core.documents import Document @@ -38,16 +38,16 @@ class PebbloSafeLoader(BaseLoader): _loader_sent: bool = False def __init__( - self, - langchain_loader: BaseLoader, - name: str, - owner: str = "", - description: str = "", - api_key: Optional[str] = None, - load_semantic: bool = False, - classifier_url: Optional[str] = None, - *, - classifier_location: str = "local", + self, + langchain_loader: BaseLoader, + name: str, + owner: str = "", + description: str = "", + api_key: Optional[str] = None, + load_semantic: bool = False, + classifier_url: Optional[str] = None, + *, + classifier_location: str = "local", ): if not name or not isinstance(name, str): raise NameError("Must specify a valid name.") @@ -61,7 +61,7 @@ def __init__( self.source_path = get_loader_full_path(self.loader) self.source_owner = PebbloSafeLoader.get_file_owner_from_path(self.source_path) self.docs: List[Document] = [] - self.docs_with_id: Union[List[IndexedDocument], List[Document], List] = [] + self.docs_with_id: List[IndexedDocument] = [] loader_name = str(type(self.loader)).split(".")[-1].split("'")[0] self.source_type = get_loader_type(loader_name) self.source_path_size = self.get_source_size(self.source_path) @@ -89,6 +89,7 @@ def load(self) -> List[Document]: list: Documents fetched from load method of the wrapped `loader`. """ self.docs = self.loader.load() + self.docs_with_id = self._index_docs() classified_docs = self._classify_doc(loading_end=True) self._add_pebblo_specific_metadata(classified_docs) if self.load_semantic: @@ -137,13 +138,12 @@ def set_discover_sent(cls) -> None: def set_loader_sent(cls) -> None: cls._loader_sent = True - def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list: + def _classify_doc(self, loading_end: bool = False) -> dict: """Send documents fetched from loader to pebblo-server. Then send classified documents to Daxa cloud(If api_key is present). Internal method. Args: - loaded_docs (list): List of documents fetched from loader's load operation. loading_end (bool, optional): Flag indicating the halt of data loading by loader. Defaults to False. """ @@ -153,9 +153,8 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list: } if loading_end is True: PebbloSafeLoader.set_loader_sent() - doc_content = [doc.dict() for doc in loaded_docs] + doc_content = [doc.dict() for doc in self.docs_with_id] docs = [] - classified_docs = [] for doc in doc_content: doc_metadata = doc.get("metadata", {}) doc_authorized_identities = doc_metadata.get("authorized_identities", []) @@ -173,12 +172,12 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list: page_content = str(doc.get("page_content")) page_content_size = self.calculate_content_size(page_content) self.source_aggregate_size += page_content_size - doc_id = doc.get("id", None) or 0 + doc_id = doc.get("pb_id", None) or 0 docs.append( { "doc": page_content, "source_path": doc_source_path, - "id": doc_id, + "pb_id": doc_id, "last_modified": doc.get("metadata", {}).get("last_modified"), "file_owner": doc_source_owner, **( @@ -211,6 +210,7 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list: self.source_aggregate_size ) payload = Doc(**payload).dict(exclude_unset=True) + classified_docs = {} # Raw payload to be sent to classifier if self.classifier_location == "local": load_doc_url = f"{self.classifier_url}{LOADER_DOC_URL}" @@ -218,7 +218,10 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list: pebblo_resp = requests.post( load_doc_url, headers=headers, json=payload, timeout=300 ) - classified_docs = json.loads(pebblo_resp.text).get("docs", None) + + # Updating the structure of pebblo response docs for efficient searching + for classified_doc in json.loads(pebblo_resp.text).get("docs", []): + classified_docs.update({classified_doc["pb_id"]: classified_doc}) if pebblo_resp.status_code not in [ HTTPStatus.OK, HTTPStatus.BAD_GATEWAY, @@ -459,33 +462,29 @@ def _index_docs(self) -> List[IndexedDocument]: List[IndexedDocument]: A list of IndexedDocument objects with unique IDs. """ docs_with_id = [ - IndexedDocument(id=hex(i)[2:], **doc.dict()) + IndexedDocument(pb_id=str(i), **doc.dict()) for i, doc in enumerate(self.docs) ] return docs_with_id - def _add_semantic_to_docs( - self, docs_with_id: List[IndexedDocument], classified_docs: List[dict] - ) -> List[Document]: + def _add_semantic_to_docs(self, classified_docs: Dict) -> List[Document]: """ Adds semantic metadata to the given list of documents. Args: - docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects - containing the documents with their IDs. - classified_docs (List[dict]): A list of dictionaries containing the - classified documents. + classified_docs (Dict): A dictionary of dictionaries containing the + classified documents with pb_id as key. Returns: List[Document]: A list of Document objects with added semantic metadata. """ indexed_docs = { - doc.id: Document(page_content=doc.page_content, metadata=doc.metadata) - for doc in docs_with_id + doc.pb_id: Document(page_content=doc.page_content, metadata=doc.metadata) + for doc in self.docs_with_id } - for classified_doc in classified_docs: - doc_id = classified_doc.get("id") + for classified_doc in classified_docs.values(): + doc_id = classified_doc.get("pb_id") if doc_id in indexed_docs: self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc) @@ -493,19 +492,16 @@ def _add_semantic_to_docs( return semantic_metadata_docs - def _unindex_docs(self, docs_with_id: List[IndexedDocument]) -> List[Document]: + def _unindex_docs(self) -> List[Document]: """ Converts a list of IndexedDocument objects to a list of Document objects. - Args: - docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects. - Returns: List[Document]: A list of Document objects. """ docs = [ Document(page_content=doc.page_content, metadata=doc.metadata) - for i, doc in enumerate(docs_with_id) + for i, doc in enumerate(self.docs_with_id) ] return docs @@ -528,12 +524,16 @@ def _add_semantic_to_doc(self, doc: Document, classified_doc: dict) -> Document: ) return doc - def _add_pebblo_specific_metadata(self) -> None: + def _add_pebblo_specific_metadata(self, classified_docs: dict) -> None: """Add Pebblo specific metadata to documents.""" - for doc in self.docs: + for doc in self.docs_with_id: doc_metadata = doc.metadata doc_metadata["full_path"] = get_full_path( doc_metadata.get( "full_path", doc_metadata.get("source", self.source_path) ) ) + doc_metadata["pb_id"] = doc.pb_id + doc_metadata["content_checksum"] = classified_docs.get(doc.pb_id, {}).get( + "content_checksum", None + ) diff --git a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py index 377155c71f4a3..9d8ffb13e4723 100644 --- a/libs/community/langchain_community/utilities/pebblo.py +++ b/libs/community/langchain_community/utilities/pebblo.py @@ -65,7 +65,7 @@ class IndexedDocument(Document): """Pebblo Indexed Document.""" - id: str + pb_id: str """Unique ID of the document.""" From 35d725a579dcda50056b8c393607379be89f4f4e Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Fri, 5 Jul 2024 12:32:58 +0530 Subject: [PATCH 04/12] Fixing lint and Uts --- .../unit_tests/document_loaders/test_pebblo.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py index 1cee8a849d1c6..922acbc8818e6 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py +++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py @@ -65,12 +65,24 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None: full_file_path = os.path.abspath(file_path) expected_docs = [ Document( + metadata={ + "source": full_file_path, + "row": 0, + "full_path": full_file_path, + "pb_id": "0", + "content_checksum": None, # For UT as here we are not calculating checksum + }, page_content="column1: value1\ncolumn2: value2\ncolumn3: value3", - metadata={"source": file_path, "row": 0, "full_path": full_file_path}, ), Document( + metadata={ + "source": full_file_path, + "row": 1, + "full_path": full_file_path, + "pb_id": "1", + "content_checksum": None, # For UT as here we are not calculating checksum + }, page_content="column1: value4\ncolumn2: value5\ncolumn3: value6", - metadata={"source": file_path, "row": 1, "full_path": full_file_path}, ), ] From 9ba1c1111dcd43b22d5b24bdc14596d24f27d064 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Tue, 9 Jul 2024 09:56:51 +0530 Subject: [PATCH 05/12] Fixing lints --- .../tests/unit_tests/document_loaders/test_pebblo.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py index 922acbc8818e6..89f2e8e8a905c 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py +++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py @@ -70,7 +70,8 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None: "row": 0, "full_path": full_file_path, "pb_id": "0", - "content_checksum": None, # For UT as here we are not calculating checksum + # For UT as here we are not calculating checksum + "content_checksum": None, }, page_content="column1: value1\ncolumn2: value2\ncolumn3: value3", ), @@ -80,7 +81,8 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None: "row": 1, "full_path": full_file_path, "pb_id": "1", - "content_checksum": None, # For UT as here we are not calculating checksum + # For UT as here we are not calculating checksum + "content_checksum": None, }, page_content="column1: value4\ncolumn2: value5\ncolumn3: value6", ), From 26ded3639e56f53486df26d5ceede9c8c801986c Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Tue, 9 Jul 2024 10:02:02 +0530 Subject: [PATCH 06/12] Fixing lints --- .../chains/pebblo_retrieval/base.py | 6 ++---- .../document_loaders/pebblo.py | 20 +++++++++---------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py index f5849f8153057..8b10e82538bcd 100644 --- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py +++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py @@ -460,12 +460,10 @@ def _send_prompt(self, qa_payload: Qa) -> None: resp = json.loads(pebblo_resp.text) if resp: payload["response"].update( - resp.get("retrieval_data", {}) - .get("response", {}) + resp.get("retrieval_data", {}).get("response", {}) ) payload["prompt"].update( - resp.get("retrieval_data", {}) - .get("prompt", {}) + resp.get("retrieval_data", {}).get("prompt", {}) ) else: payload["response"] = {} diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index a766856975fa8..41ff9b3aceff4 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -38,16 +38,16 @@ class PebbloSafeLoader(BaseLoader): _loader_sent: bool = False def __init__( - self, - langchain_loader: BaseLoader, - name: str, - owner: str = "", - description: str = "", - api_key: Optional[str] = None, - load_semantic: bool = False, - classifier_url: Optional[str] = None, - *, - classifier_location: str = "local", + self, + langchain_loader: BaseLoader, + name: str, + owner: str = "", + description: str = "", + api_key: Optional[str] = None, + load_semantic: bool = False, + classifier_url: Optional[str] = None, + *, + classifier_location: str = "local", ): if not name or not isinstance(name, str): raise NameError("Must specify a valid name.") From 8533c536363087110ccf901071726e9f9b05af73 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Tue, 9 Jul 2024 11:27:37 +0530 Subject: [PATCH 07/12] Updating field name --- .../langchain_community/document_loaders/pebblo.py | 8 ++++---- .../tests/unit_tests/document_loaders/test_pebblo.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index 41ff9b3aceff4..175a904807313 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -255,8 +255,8 @@ def _classify_doc(self, loading_end: bool = False) -> dict: classified_data = classified_docs.get(doc_data["pb_id"], {}) doc_data.update( { - "content_checksum": classified_data.get( - "content_checksum", None + "pb_checksum": classified_data.get( + "pb_checksum", None ), "loader_source_path": classified_data.get( "loader_source_path", None @@ -534,6 +534,6 @@ def _add_pebblo_specific_metadata(self, classified_docs: dict) -> None: ) ) doc_metadata["pb_id"] = doc.pb_id - doc_metadata["content_checksum"] = classified_docs.get(doc.pb_id, {}).get( - "content_checksum", None + doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get( + "pb_checksum", None ) diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py index 89f2e8e8a905c..d0a71faae7a8e 100644 --- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py +++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py @@ -71,7 +71,7 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None: "full_path": full_file_path, "pb_id": "0", # For UT as here we are not calculating checksum - "content_checksum": None, + "pb_checksum": None, }, page_content="column1: value1\ncolumn2: value2\ncolumn3: value3", ), @@ -82,7 +82,7 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None: "full_path": full_file_path, "pb_id": "1", # For UT as here we are not calculating checksum - "content_checksum": None, + "pb_checksum": None, }, page_content="column1: value4\ncolumn2: value5\ncolumn3: value6", ), From 5bec2304db496953c9f59a2cc46ce50724c153e0 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Tue, 9 Jul 2024 11:34:31 +0530 Subject: [PATCH 08/12] Fixing lints --- libs/community/langchain_community/document_loaders/pebblo.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py index 175a904807313..2e31b370cbd23 100644 --- a/libs/community/langchain_community/document_loaders/pebblo.py +++ b/libs/community/langchain_community/document_loaders/pebblo.py @@ -255,9 +255,7 @@ def _classify_doc(self, loading_end: bool = False) -> dict: classified_data = classified_docs.get(doc_data["pb_id"], {}) doc_data.update( { - "pb_checksum": classified_data.get( - "pb_checksum", None - ), + "pb_checksum": classified_data.get("pb_checksum", None), "loader_source_path": classified_data.get( "loader_source_path", None ), From d42724b4e7882c7dcc673df94454b7260e7a6f30 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Wed, 10 Jul 2024 09:57:50 +0530 Subject: [PATCH 09/12] Updating retrieval model --- .../langchain_community/chains/pebblo_retrieval/models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/models.py b/libs/community/langchain_community/chains/pebblo_retrieval/models.py index 3b7f94d44c8a4..13a54537b9169 100644 --- a/libs/community/langchain_community/chains/pebblo_retrieval/models.py +++ b/libs/community/langchain_community/chains/pebblo_retrieval/models.py @@ -129,6 +129,7 @@ class Context(BaseModel): retrieved_from: Optional[str] doc: Optional[str] vector_db: str + pb_checksum: Optional[str] class Prompt(BaseModel): From 5acce1b7fa36a383632a0f04fe0e0639a4e654a8 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Wed, 10 Jul 2024 11:04:56 +0530 Subject: [PATCH 10/12] Updating pebblo_retrieval_qa --- .../chains/pebblo_retrieval/base.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py index 8b10e82538bcd..b13523d16e7e7 100644 --- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py +++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py @@ -124,6 +124,11 @@ def _call( ), "doc": doc.page_content, "vector_db": self.retriever.vectorstore.__class__.__name__, + **( + {"content_checksum": doc.metadata.get("content_checksum")} + if doc.metadata.get("content_checksum") + else {} + ), } for doc in docs if isinstance(doc, Document) @@ -465,9 +470,14 @@ def _send_prompt(self, qa_payload: Qa) -> None: payload["prompt"].update( resp.get("retrieval_data", {}).get("prompt", {}) ) + context = payload["context"] + for context_data in context: + context_data.pop("doc") + payload["context"] = context else: payload["response"] = {} payload["prompt"] = {} + payload["context"] = [] headers.update({"x-api-key": self.api_key}) pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{PROMPT_URL}" try: From d4f1a107c3a8bec6c07087f04f77bf8ae6e1de13 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Thu, 11 Jul 2024 13:17:57 +0530 Subject: [PATCH 11/12] Updating variable name --- .../langchain_community/chains/pebblo_retrieval/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py index b13523d16e7e7..629aca740d51d 100644 --- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py +++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py @@ -125,8 +125,8 @@ def _call( "doc": doc.page_content, "vector_db": self.retriever.vectorstore.__class__.__name__, **( - {"content_checksum": doc.metadata.get("content_checksum")} - if doc.metadata.get("content_checksum") + {"pb_checksum": doc.metadata.get("pb_checksum")} + if doc.metadata.get("pb_checksum") else {} ), } From b0a444526e15acdea48c75af56cd04880d723a50 Mon Sep 17 00:00:00 2001 From: "dristy.cd" Date: Thu, 11 Jul 2024 14:52:18 +0530 Subject: [PATCH 12/12] Removing actual content from prompt API --- .../langchain_community/chains/pebblo_retrieval/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py index 629aca740d51d..02d3553c4b464 100644 --- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py +++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py @@ -467,9 +467,11 @@ def _send_prompt(self, qa_payload: Qa) -> None: payload["response"].update( resp.get("retrieval_data", {}).get("response", {}) ) + payload["response"].pop("data") payload["prompt"].update( resp.get("retrieval_data", {}).get("prompt", {}) ) + payload["prompt"].pop("data") context = payload["context"] for context_data in context: context_data.pop("doc")