diff --git a/pebblo_safeloader/langchain/textloader_postgress/.env.sample b/pebblo_safeloader/langchain/textloader_postgress/.env.sample index 9cbefec8..f3241005 100644 --- a/pebblo_safeloader/langchain/textloader_postgress/.env.sample +++ b/pebblo_safeloader/langchain/textloader_postgress/.env.sample @@ -1,11 +1,11 @@ # OpenAI credentials OPENAI_API_KEY= -# Pebblo configuration -PEBBLO_CLOUD_URL= -PEBBLO_API_KEY= -PEBBLO_CLASSIFIER_URL="http://localhost:8000/" - # Postgres configuration PG_CONNECTION_STRING = "postgresql://:@:/" +# Pebblo configuration +PEBBLO_CLASSIFIER_URL="http://localhost:8000/" +# Optional (only if you are using Pebblo Cloud) +PEBBLO_CLOUD_URL= +PEBBLO_API_KEY= diff --git a/pebblo_safeloader/langchain/textloader_postgress/README.md b/pebblo_safeloader/langchain/textloader_postgress/README.md new file mode 100644 index 00000000..f6bfa349 --- /dev/null +++ b/pebblo_safeloader/langchain/textloader_postgress/README.md @@ -0,0 +1,62 @@ +# Pebblo Text Loader + +This is a sample application that demonstrates how to use the `Pebblo Text Loader` to load the text data +with the `Pebblo Safe Loader` into a `Postgres` Vector Database. + +\* This solution uses predefined text data and metadata from the utility functions to demonstrate the loading of +in-memory text data using Pebblo Safe Loader. Real-world applications can use this solution to load text data from +various sources. + +**PebbloTextLoader**: PebbloTextLoader is a loader for text data. Since PebbloSafeLoader is a wrapper around document +loaders, this loader is used to load text data directly into Documents. + +**This solution uses:** + +- PostgreSQL 15.7 +- langchain-community from daxa-ai/langchain branch (pebblo-0.1.19) + +### Instructions + +1. Create a Python virtual environment + +```console +$ python3 -m venv .venv +$ source .venv/bin/activate +``` + +2. Install dependencies + +```console +$ pip3 install -r requirements.txt +``` + +3. 
Install langchain-community from the branch `pebblo-0.1.19` + +```console +$ git clone https://github.com/daxa-ai/langchain.git +$ cd langchain +$ git fetch && git checkout pebblo-0.1.19 +$ cd libs/community +$ pip3 install langchain-community . +``` + +4. Copy the `.env.sample` file to `.env` and populate the necessary environment variables. The `.env` file should look + like this: + +```console +$ cat .env +# OpenAI credentials +OPENAI_API_KEY= + +# Postgres configuration +PG_CONNECTION_STRING = "postgresql://:@:/" +``` + +5. Run Pebblo Safe Loader sample app + +```console +$ python3 pebblo_safeload.py +``` + +6. Retrieve the Pebblo PDF report from the `$HOME/.pebblo/pebblo-safe-loader-text-loader/pebblo_report.pdf` file path on the + system where `Pebblo Server` is running. diff --git a/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py b/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py index b2eea758..56410c71 100644 --- a/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py +++ b/pebblo_safeloader/langchain/textloader_postgress/pebblo_safeload.py @@ -48,6 +48,7 @@ def __init__(self, collection_name: str): description="Identity & Semantic enabled SafeLoader app using Pebblo", # Description (Optional) load_semantic=True, api_key=PEBBLO_API_KEY, + anonymize_snippets=True, ) self.documents = self.loader.load() unique_identities = set() diff --git a/pebblo_safeloader/langchain/textloader_postgress/requirements.txt b/pebblo_safeloader/langchain/textloader_postgress/requirements.txt index c263668b..33b1638a 100644 --- a/pebblo_safeloader/langchain/textloader_postgress/requirements.txt +++ b/pebblo_safeloader/langchain/textloader_postgress/requirements.txt @@ -2,7 +2,7 @@ python-dotenv==1.0.0 tiktoken # OpenAI tokenizer langchain-openai>=0.1.7 # For OpenAI LLM and OpenAIEmbeddings -langchain-community>=0.2.16,<0.3 # for PebbloSafeLoader, PebbloRetrievalQA +#langchain-community>=0.2.16,<0.3 # for PebbloSafeLoader, 
PebbloRetrievalQA psycopg2-binary # For Postgres VectorStore langchain-postgres # For Postgres VectorStore \ No newline at end of file diff --git a/pebblo_safeloader/langchain/textloader_postgress/util.py b/pebblo_safeloader/langchain/textloader_postgress/util.py index c082cd04..cacd1255 100644 --- a/pebblo_safeloader/langchain/textloader_postgress/util.py +++ b/pebblo_safeloader/langchain/textloader_postgress/util.py @@ -40,8 +40,12 @@ def get_data( if metadatas: # Metadata(source: fake news web url) for each text _metadata_list = [ - {"source": f"https://www.acme.org/news/{i}"} - for i in range(1, len(texts) + 1) + { + "source": f"https://www.acme.org/news/{i + 1}", + "owner": "Joe Smith", + "size": f"{len(texts[i])}", + } + for i in range(len(texts)) ] else: _metadata_list = None