From 12fff6ff5c7c4767888b813af511312453c6afd8 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Wed, 3 Jul 2024 12:56:45 +0530
Subject: [PATCH 01/12] Replacing doc from payload with checksum while sending
 data to pebblo_cloud

---
 .../chains/pebblo_retrieval/base.py           | 30 ++++++++-----------
 .../document_loaders/pebblo.py                | 15 +++++++++-
 2 files changed, 26 insertions(+), 19 deletions(-)

diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
index 97c939b4fce38..f5849f8153057 100644
--- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py
+++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
@@ -457,25 +457,19 @@ def _send_prompt(self, qa_payload: Qa) -> None:
         if self.api_key:
             if self.classifier_location == "local":
                 if pebblo_resp:
-                    payload["response"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("response", {})
-                    )
-                    payload["context"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("context", [])
-                    )
-                    payload["prompt"] = (
-                        json.loads(pebblo_resp.text)
-                        .get("retrieval_data", {})
-                        .get("prompt", {})
-                    )
+                    resp = json.loads(pebblo_resp.text)
+                    if resp:
+                        payload["response"].update(
+                            resp.get("retrieval_data", {})
+                            .get("response", {})
+                        )
+                        payload["prompt"].update(
+                            resp.get("retrieval_data", {})
+                            .get("prompt", {})
+                        )
                 else:
-                    payload["response"] = None
-                    payload["context"] = None
-                    payload["prompt"] = None
+                    payload["response"] = {}
+                    payload["prompt"] = {}
             headers.update({"x-api-key": self.api_key})
             pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{PROMPT_URL}"
             try:
diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index a695582e2fa24..f7103ba24f7c3 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -257,7 +257,20 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
 
         if self.api_key:
             if self.classifier_location == "local":
-                payload["docs"] = classified_docs
+                payload_docs = []
+                docs = payload["docs"]
+                for doc_data in docs:
+                    for doc in classified_docs:
+                        if doc_data["source_path"] == doc["source_path"]:
+                            doc_data.update({
+                                "content_checksum": doc["content_checksum"],
+                                "loader_source_path": doc["loader_source_path"]
+                            })
+                        break
+                    doc_data.pop("doc")
+                    payload_docs.append(doc_data)
+                payload["docs"] = payload_docs
+
             headers.update({"x-api-key": self.api_key})
             pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{LOADER_DOC_URL}"
             try:

From 1dd8f2d7bbdd824d85c4f61a4c05291323caabd3 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Wed, 3 Jul 2024 14:55:33 +0530
Subject: [PATCH 02/12] Adding id in doc

---
 .../document_loaders/pebblo.py                | 59 ++++++++-----------
 1 file changed, 26 insertions(+), 33 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index f7103ba24f7c3..86e8d88ceaa4a 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -89,17 +89,12 @@ def load(self) -> List[Document]:
             list: Documents fetched from load method of the wrapped `loader`.
         """
         self.docs = self.loader.load()
-        # Add pebblo-specific metadata to docs
-        self._add_pebblo_specific_metadata()
-        if not self.load_semantic:
-            self._classify_doc(self.docs, loading_end=True)
-            return self.docs
-        self.docs_with_id = self._index_docs()
-        classified_docs = self._classify_doc(self.docs_with_id, loading_end=True)
-        self.docs_with_id = self._add_semantic_to_docs(
-            self.docs_with_id, classified_docs
-        )
-        self.docs = self._unindex_docs(self.docs_with_id)  # type: ignore
+        classified_docs = self._classify_doc(loading_end=True)
+        self._add_pebblo_specific_metadata(classified_docs)
+        if self.load_semantic:
+            self.docs = self._add_semantic_to_docs(classified_docs)
+        else:
+            self.docs = self._unindex_docs()  # type: ignore
         return self.docs
 
     def lazy_load(self) -> Iterator[Document]:
@@ -125,19 +120,14 @@ def lazy_load(self) -> Iterator[Document]:
                 self.docs = []
                 break
             self.docs = list((doc,))
-            # Add pebblo-specific metadata to docs
-            self._add_pebblo_specific_metadata()
-            if not self.load_semantic:
-                self._classify_doc(self.docs, loading_end=True)
-                yield self.docs[0]
+            self.docs_with_id = self._index_docs()
+            classified_doc = self._classify_doc()
+            self._add_pebblo_specific_metadata(classified_doc)
+            if self.load_semantic:
+                self.docs = self._add_semantic_to_docs(classified_doc)
             else:
-                self.docs_with_id = self._index_docs()
-                classified_doc = self._classify_doc(self.docs)
-                self.docs_with_id = self._add_semantic_to_docs(
-                    self.docs_with_id, classified_doc
-                )
-                self.docs = self._unindex_docs(self.docs_with_id)  # type: ignore
-                yield self.docs[0]
+                self.docs = self._unindex_docs()
+            yield self.docs[0]
 
     @classmethod
     def set_discover_sent(cls) -> None:
@@ -257,19 +247,22 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
 
         if self.api_key:
             if self.classifier_location == "local":
-                payload_docs = []
                 docs = payload["docs"]
                 for doc_data in docs:
-                    for doc in classified_docs:
-                        if doc_data["source_path"] == doc["source_path"]:
-                            doc_data.update({
-                                "content_checksum": doc["content_checksum"],
-                                "loader_source_path": doc["loader_source_path"]
-                            })
-                        break
+                    classified_data = classified_docs.get(doc_data["pb_id"], {})
+                    doc_data.update(
+                        {
+                            "content_checksum": classified_data.get(
+                                "content_checksum", None
+                            ),
+                            "loader_source_path": classified_data.get(
+                                "loader_source_path", None
+                            ),
+                            "entities": classified_data.get("entities", {}),
+                            "topics": classified_data.get("topics", {}),
+                        }
+                    )
                     doc_data.pop("doc")
-                    payload_docs.append(doc_data)
-                payload["docs"] = payload_docs
 
             headers.update({"x-api-key": self.api_key})
             pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{LOADER_DOC_URL}"

From 0b9199e9f1789d2ef96fff20602d971d90d74be8 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Thu, 4 Jul 2024 18:22:22 +0530
Subject: [PATCH 03/12] Updating IndexedDocument class

---
 .../document_loaders/pebblo.py                | 76 +++++++++----------
 .../langchain_community/utilities/pebblo.py   |  2 +-
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index 86e8d88ceaa4a..a766856975fa8 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -5,7 +5,7 @@
 import os
 import uuid
 from http import HTTPStatus
-from typing import Any, Dict, Iterator, List, Optional, Union
+from typing import Any, Dict, Iterator, List, Optional
 
 import requests  # type: ignore
 from langchain_core.documents import Document
@@ -38,16 +38,16 @@ class PebbloSafeLoader(BaseLoader):
     _loader_sent: bool = False
 
     def __init__(
-        self,
-        langchain_loader: BaseLoader,
-        name: str,
-        owner: str = "",
-        description: str = "",
-        api_key: Optional[str] = None,
-        load_semantic: bool = False,
-        classifier_url: Optional[str] = None,
-        *,
-        classifier_location: str = "local",
+            self,
+            langchain_loader: BaseLoader,
+            name: str,
+            owner: str = "",
+            description: str = "",
+            api_key: Optional[str] = None,
+            load_semantic: bool = False,
+            classifier_url: Optional[str] = None,
+            *,
+            classifier_location: str = "local",
     ):
         if not name or not isinstance(name, str):
             raise NameError("Must specify a valid name.")
@@ -61,7 +61,7 @@ def __init__(
         self.source_path = get_loader_full_path(self.loader)
         self.source_owner = PebbloSafeLoader.get_file_owner_from_path(self.source_path)
         self.docs: List[Document] = []
-        self.docs_with_id: Union[List[IndexedDocument], List[Document], List] = []
+        self.docs_with_id: List[IndexedDocument] = []
         loader_name = str(type(self.loader)).split(".")[-1].split("'")[0]
         self.source_type = get_loader_type(loader_name)
         self.source_path_size = self.get_source_size(self.source_path)
@@ -89,6 +89,7 @@ def load(self) -> List[Document]:
             list: Documents fetched from load method of the wrapped `loader`.
         """
         self.docs = self.loader.load()
+        self.docs_with_id = self._index_docs()
         classified_docs = self._classify_doc(loading_end=True)
         self._add_pebblo_specific_metadata(classified_docs)
         if self.load_semantic:
@@ -137,13 +138,12 @@ def set_discover_sent(cls) -> None:
     def set_loader_sent(cls) -> None:
         cls._loader_sent = True
 
-    def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
+    def _classify_doc(self, loading_end: bool = False) -> dict:
         """Send documents fetched from loader to pebblo-server. Then send
         classified documents to Daxa cloud(If api_key is present). Internal method.
 
         Args:
 
-            loaded_docs (list): List of documents fetched from loader's load operation.
             loading_end (bool, optional): Flag indicating the halt of data
                                           loading by loader. Defaults to False.
         """
@@ -153,9 +153,8 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
         }
         if loading_end is True:
             PebbloSafeLoader.set_loader_sent()
-        doc_content = [doc.dict() for doc in loaded_docs]
+        doc_content = [doc.dict() for doc in self.docs_with_id]
         docs = []
-        classified_docs = []
         for doc in doc_content:
             doc_metadata = doc.get("metadata", {})
             doc_authorized_identities = doc_metadata.get("authorized_identities", [])
@@ -173,12 +172,12 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
             page_content = str(doc.get("page_content"))
             page_content_size = self.calculate_content_size(page_content)
             self.source_aggregate_size += page_content_size
-            doc_id = doc.get("id", None) or 0
+            doc_id = doc.get("pb_id", None) or 0
             docs.append(
                 {
                     "doc": page_content,
                     "source_path": doc_source_path,
-                    "id": doc_id,
+                    "pb_id": doc_id,
                     "last_modified": doc.get("metadata", {}).get("last_modified"),
                     "file_owner": doc_source_owner,
                     **(
@@ -211,6 +210,7 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
                     self.source_aggregate_size
                 )
         payload = Doc(**payload).dict(exclude_unset=True)
+        classified_docs = {}
         # Raw payload to be sent to classifier
         if self.classifier_location == "local":
             load_doc_url = f"{self.classifier_url}{LOADER_DOC_URL}"
@@ -218,7 +218,10 @@ def _classify_doc(self, loaded_docs: list, loading_end: bool = False) -> list:
                 pebblo_resp = requests.post(
                     load_doc_url, headers=headers, json=payload, timeout=300
                 )
-                classified_docs = json.loads(pebblo_resp.text).get("docs", None)
+
+                # Updating the structure of pebblo response docs for efficient searching
+                for classified_doc in json.loads(pebblo_resp.text).get("docs", []):
+                    classified_docs.update({classified_doc["pb_id"]: classified_doc})
                 if pebblo_resp.status_code not in [
                     HTTPStatus.OK,
                     HTTPStatus.BAD_GATEWAY,
@@ -459,33 +462,29 @@ def _index_docs(self) -> List[IndexedDocument]:
             List[IndexedDocument]: A list of IndexedDocument objects with unique IDs.
         """
         docs_with_id = [
-            IndexedDocument(id=hex(i)[2:], **doc.dict())
+            IndexedDocument(pb_id=str(i), **doc.dict())
             for i, doc in enumerate(self.docs)
         ]
         return docs_with_id
 
-    def _add_semantic_to_docs(
-        self, docs_with_id: List[IndexedDocument], classified_docs: List[dict]
-    ) -> List[Document]:
+    def _add_semantic_to_docs(self, classified_docs: Dict) -> List[Document]:
         """
         Adds semantic metadata to the given list of documents.
 
         Args:
-            docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects
-                containing the documents with their IDs.
-            classified_docs (List[dict]): A list of dictionaries containing the
-                classified documents.
+            classified_docs (Dict): A dictionary of dictionaries containing the
+                classified documents with pb_id as key.
 
         Returns:
             List[Document]: A list of Document objects with added semantic metadata.
         """
         indexed_docs = {
-            doc.id: Document(page_content=doc.page_content, metadata=doc.metadata)
-            for doc in docs_with_id
+            doc.pb_id: Document(page_content=doc.page_content, metadata=doc.metadata)
+            for doc in self.docs_with_id
         }
 
-        for classified_doc in classified_docs:
-            doc_id = classified_doc.get("id")
+        for classified_doc in classified_docs.values():
+            doc_id = classified_doc.get("pb_id")
             if doc_id in indexed_docs:
                 self._add_semantic_to_doc(indexed_docs[doc_id], classified_doc)
 
@@ -493,19 +492,16 @@ def _add_semantic_to_docs(
 
         return semantic_metadata_docs
 
-    def _unindex_docs(self, docs_with_id: List[IndexedDocument]) -> List[Document]:
+    def _unindex_docs(self) -> List[Document]:
         """
         Converts a list of IndexedDocument objects to a list of Document objects.
 
-        Args:
-            docs_with_id (List[IndexedDocument]): A list of IndexedDocument objects.
-
         Returns:
             List[Document]: A list of Document objects.
         """
         docs = [
             Document(page_content=doc.page_content, metadata=doc.metadata)
-            for i, doc in enumerate(docs_with_id)
+            for i, doc in enumerate(self.docs_with_id)
         ]
         return docs
 
@@ -528,12 +524,16 @@ def _add_semantic_to_doc(self, doc: Document, classified_doc: dict) -> Document:
         )
         return doc
 
-    def _add_pebblo_specific_metadata(self) -> None:
+    def _add_pebblo_specific_metadata(self, classified_docs: dict) -> None:
         """Add Pebblo specific metadata to documents."""
-        for doc in self.docs:
+        for doc in self.docs_with_id:
             doc_metadata = doc.metadata
             doc_metadata["full_path"] = get_full_path(
                 doc_metadata.get(
                     "full_path", doc_metadata.get("source", self.source_path)
                 )
             )
+            doc_metadata["pb_id"] = doc.pb_id
+            doc_metadata["content_checksum"] = classified_docs.get(doc.pb_id, {}).get(
+                "content_checksum", None
+            )
diff --git a/libs/community/langchain_community/utilities/pebblo.py b/libs/community/langchain_community/utilities/pebblo.py
index 377155c71f4a3..9d8ffb13e4723 100644
--- a/libs/community/langchain_community/utilities/pebblo.py
+++ b/libs/community/langchain_community/utilities/pebblo.py
@@ -65,7 +65,7 @@
 class IndexedDocument(Document):
     """Pebblo Indexed Document."""
 
-    id: str
+    pb_id: str
     """Unique ID of the document."""
 
 

From 35d725a579dcda50056b8c393607379be89f4f4e Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Fri, 5 Jul 2024 12:32:58 +0530
Subject: [PATCH 04/12] Fixing lint and unit tests

---
 .../unit_tests/document_loaders/test_pebblo.py   | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
index 1cee8a849d1c6..922acbc8818e6 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
@@ -65,12 +65,24 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
     full_file_path = os.path.abspath(file_path)
     expected_docs = [
         Document(
+            metadata={
+                "source": full_file_path,
+                "row": 0,
+                "full_path": full_file_path,
+                "pb_id": "0",
+                "content_checksum": None,  # For UT as here we are not calculating checksum
+            },
             page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
-            metadata={"source": file_path, "row": 0, "full_path": full_file_path},
         ),
         Document(
+            metadata={
+                "source": full_file_path,
+                "row": 1,
+                "full_path": full_file_path,
+                "pb_id": "1",
+                "content_checksum": None,  # For UT as here we are not calculating checksum
+            },
             page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
-            metadata={"source": file_path, "row": 1, "full_path": full_file_path},
         ),
     ]
 

From 9ba1c1111dcd43b22d5b24bdc14596d24f27d064 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Tue, 9 Jul 2024 09:56:51 +0530
Subject: [PATCH 05/12] Fixing lints

---
 .../tests/unit_tests/document_loaders/test_pebblo.py        | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
index 922acbc8818e6..89f2e8e8a905c 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
@@ -70,7 +70,8 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
                 "row": 0,
                 "full_path": full_file_path,
                 "pb_id": "0",
-                "content_checksum": None,  # For UT as here we are not calculating checksum
+                # For UT as here we are not calculating checksum
+                "content_checksum": None,
             },
             page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
         ),
@@ -80,7 +81,8 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
                 "row": 1,
                 "full_path": full_file_path,
                 "pb_id": "1",
-                "content_checksum": None,  # For UT as here we are not calculating checksum
+                # For UT as here we are not calculating checksum
+                "content_checksum": None,
             },
             page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
         ),

From 26ded3639e56f53486df26d5ceede9c8c801986c Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Tue, 9 Jul 2024 10:02:02 +0530
Subject: [PATCH 06/12] Fixing lints

---
 .../chains/pebblo_retrieval/base.py           |  6 ++----
 .../document_loaders/pebblo.py                | 20 +++++++++----------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
index f5849f8153057..8b10e82538bcd 100644
--- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py
+++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
@@ -460,12 +460,10 @@ def _send_prompt(self, qa_payload: Qa) -> None:
                     resp = json.loads(pebblo_resp.text)
                     if resp:
                         payload["response"].update(
-                            resp.get("retrieval_data", {})
-                            .get("response", {})
+                            resp.get("retrieval_data", {}).get("response", {})
                         )
                         payload["prompt"].update(
-                            resp.get("retrieval_data", {})
-                            .get("prompt", {})
+                            resp.get("retrieval_data", {}).get("prompt", {})
                         )
                 else:
                     payload["response"] = {}
diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index a766856975fa8..41ff9b3aceff4 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -38,16 +38,16 @@ class PebbloSafeLoader(BaseLoader):
     _loader_sent: bool = False
 
     def __init__(
-            self,
-            langchain_loader: BaseLoader,
-            name: str,
-            owner: str = "",
-            description: str = "",
-            api_key: Optional[str] = None,
-            load_semantic: bool = False,
-            classifier_url: Optional[str] = None,
-            *,
-            classifier_location: str = "local",
+        self,
+        langchain_loader: BaseLoader,
+        name: str,
+        owner: str = "",
+        description: str = "",
+        api_key: Optional[str] = None,
+        load_semantic: bool = False,
+        classifier_url: Optional[str] = None,
+        *,
+        classifier_location: str = "local",
     ):
         if not name or not isinstance(name, str):
             raise NameError("Must specify a valid name.")

From 8533c536363087110ccf901071726e9f9b05af73 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Tue, 9 Jul 2024 11:27:37 +0530
Subject: [PATCH 07/12] Updating field name

---
 .../langchain_community/document_loaders/pebblo.py        | 8 ++++----
 .../tests/unit_tests/document_loaders/test_pebblo.py      | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index 41ff9b3aceff4..175a904807313 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -255,8 +255,8 @@ def _classify_doc(self, loading_end: bool = False) -> dict:
                     classified_data = classified_docs.get(doc_data["pb_id"], {})
                     doc_data.update(
                         {
-                            "content_checksum": classified_data.get(
-                                "content_checksum", None
+                            "pb_checksum": classified_data.get(
+                                "pb_checksum", None
                             ),
                             "loader_source_path": classified_data.get(
                                 "loader_source_path", None
@@ -534,6 +534,6 @@ def _add_pebblo_specific_metadata(self, classified_docs: dict) -> None:
                 )
             )
             doc_metadata["pb_id"] = doc.pb_id
-            doc_metadata["content_checksum"] = classified_docs.get(doc.pb_id, {}).get(
-                "content_checksum", None
+            doc_metadata["pb_checksum"] = classified_docs.get(doc.pb_id, {}).get(
+                "pb_checksum", None
             )
diff --git a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
index 89f2e8e8a905c..d0a71faae7a8e 100644
--- a/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
+++ b/libs/community/tests/unit_tests/document_loaders/test_pebblo.py
@@ -71,7 +71,7 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
                 "full_path": full_file_path,
                 "pb_id": "0",
                 # For UT as here we are not calculating checksum
-                "content_checksum": None,
+                "pb_checksum": None,
             },
             page_content="column1: value1\ncolumn2: value2\ncolumn3: value3",
         ),
@@ -82,7 +82,7 @@ def test_csv_loader_load_valid_data(mocker: MockerFixture) -> None:
                 "full_path": full_file_path,
                 "pb_id": "1",
                 # For UT as here we are not calculating checksum
-                "content_checksum": None,
+                "pb_checksum": None,
             },
             page_content="column1: value4\ncolumn2: value5\ncolumn3: value6",
         ),

From 5bec2304db496953c9f59a2cc46ce50724c153e0 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Tue, 9 Jul 2024 11:34:31 +0530
Subject: [PATCH 08/12] Fixing lints

---
 libs/community/langchain_community/document_loaders/pebblo.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/libs/community/langchain_community/document_loaders/pebblo.py b/libs/community/langchain_community/document_loaders/pebblo.py
index 175a904807313..2e31b370cbd23 100644
--- a/libs/community/langchain_community/document_loaders/pebblo.py
+++ b/libs/community/langchain_community/document_loaders/pebblo.py
@@ -255,9 +255,7 @@ def _classify_doc(self, loading_end: bool = False) -> dict:
                     classified_data = classified_docs.get(doc_data["pb_id"], {})
                     doc_data.update(
                         {
-                            "pb_checksum": classified_data.get(
-                                "pb_checksum", None
-                            ),
+                            "pb_checksum": classified_data.get("pb_checksum", None),
                             "loader_source_path": classified_data.get(
                                 "loader_source_path", None
                             ),

From d42724b4e7882c7dcc673df94454b7260e7a6f30 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Wed, 10 Jul 2024 09:57:50 +0530
Subject: [PATCH 09/12] Updating retrieval model

---
 .../langchain_community/chains/pebblo_retrieval/models.py        | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/models.py b/libs/community/langchain_community/chains/pebblo_retrieval/models.py
index 3b7f94d44c8a4..13a54537b9169 100644
--- a/libs/community/langchain_community/chains/pebblo_retrieval/models.py
+++ b/libs/community/langchain_community/chains/pebblo_retrieval/models.py
@@ -129,6 +129,7 @@ class Context(BaseModel):
     retrieved_from: Optional[str]
     doc: Optional[str]
     vector_db: str
+    pb_checksum: Optional[str]
 
 
 class Prompt(BaseModel):

From 5acce1b7fa36a383632a0f04fe0e0639a4e654a8 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Wed, 10 Jul 2024 11:04:56 +0530
Subject: [PATCH 10/12] Updating pebblo_retrieval_qa

---
 .../chains/pebblo_retrieval/base.py                    | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
index 8b10e82538bcd..b13523d16e7e7 100644
--- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py
+++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
@@ -124,6 +124,11 @@ def _call(
                     ),
                     "doc": doc.page_content,
                     "vector_db": self.retriever.vectorstore.__class__.__name__,
+                    **(
+                        {"content_checksum": doc.metadata.get("content_checksum")}
+                        if doc.metadata.get("content_checksum")
+                        else {}
+                    ),
                 }
                 for doc in docs
                 if isinstance(doc, Document)
@@ -465,9 +470,14 @@ def _send_prompt(self, qa_payload: Qa) -> None:
                         payload["prompt"].update(
                             resp.get("retrieval_data", {}).get("prompt", {})
                         )
+                        context = payload["context"]
+                        for context_data in context:
+                            context_data.pop("doc")
+                        payload["context"] = context
                 else:
                     payload["response"] = {}
                     payload["prompt"] = {}
+                    payload["context"] = []
             headers.update({"x-api-key": self.api_key})
             pebblo_cloud_url = f"{PEBBLO_CLOUD_URL}{PROMPT_URL}"
             try:

From d4f1a107c3a8bec6c07087f04f77bf8ae6e1de13 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Thu, 11 Jul 2024 13:17:57 +0530
Subject: [PATCH 11/12] Updating variable name

---
 .../langchain_community/chains/pebblo_retrieval/base.py       | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
index b13523d16e7e7..629aca740d51d 100644
--- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py
+++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
@@ -125,8 +125,8 @@ def _call(
                     "doc": doc.page_content,
                     "vector_db": self.retriever.vectorstore.__class__.__name__,
                     **(
-                        {"content_checksum": doc.metadata.get("content_checksum")}
-                        if doc.metadata.get("content_checksum")
+                        {"pb_checksum": doc.metadata.get("pb_checksum")}
+                        if doc.metadata.get("pb_checksum")
                         else {}
                     ),
                 }

From b0a444526e15acdea48c75af56cd04880d723a50 Mon Sep 17 00:00:00 2001
From: "dristy.cd" <dristy@clouddefense.io>
Date: Thu, 11 Jul 2024 14:52:18 +0530
Subject: [PATCH 12/12] Removing actual content from prompt API

---
 .../langchain_community/chains/pebblo_retrieval/base.py         | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libs/community/langchain_community/chains/pebblo_retrieval/base.py b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
index 629aca740d51d..02d3553c4b464 100644
--- a/libs/community/langchain_community/chains/pebblo_retrieval/base.py
+++ b/libs/community/langchain_community/chains/pebblo_retrieval/base.py
@@ -467,9 +467,11 @@ def _send_prompt(self, qa_payload: Qa) -> None:
                         payload["response"].update(
                             resp.get("retrieval_data", {}).get("response", {})
                         )
+                        payload["response"].pop("data")
                         payload["prompt"].update(
                             resp.get("retrieval_data", {}).get("prompt", {})
                         )
+                        payload["prompt"].pop("data")
                         context = payload["context"]
                         for context_data in context:
                             context_data.pop("doc")