Skip to content

Commit

Permalink
Added langchain document chunking notebook to nbtest (#151)
Browse files Browse the repository at this point in the history
  • Loading branch information
miguelgrinberg authored Jan 22, 2024
1 parent 611bf5f commit ebd2e96
Show file tree
Hide file tree
Showing 9 changed files with 173 additions and 120 deletions.
5 changes: 4 additions & 1 deletion notebooks/document-chunking/.nbtest.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@ masks:
- "'cluster_name': '[^']+'"
- "'cluster_uuid': '[^']+'"
- "'build_flavor': '[^']+'"
- '[0-9]+\.[0-9]+\.[0-9]+'
- '[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?'
- "'build_snapshot': [^,]+"
- "'build_hash': '[^']+'"
- "'build_date': '[^']+'"
- '^ID: .*$'
- '^Score: [0-9]+\.[0-9][0-9]*$'
5 changes: 3 additions & 2 deletions notebooks/document-chunking/Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
# Runs the nbtest tool against each notebook named in NOTEBOOKS.
# `make all` tests every listed notebook; `make <notebook>.ipynb` tests one.
NBTEST = ../../bin/nbtest
NOTEBOOKS = \
with-index-pipelines.ipynb
with-index-pipelines.ipynb \
with-langchain-splitters.ipynb

# The notebook names are declared phony, so their recipe runs on every
# invocation even when a file of that name is present.
.PHONY: all $(NOTEBOOKS)

all: $(NOTEBOOKS)

# One rule per notebook; $@ expands to the notebook currently being tested.
# NOTE(review): on the recipe line that starts with `-`, make ignores a
# non-zero exit status from nbtest, so the remaining notebooks still run but
# `make` itself reports success even if a notebook failed. If the intent is
# only "keep going", `make -k` does that while still failing overall - confirm
# which behaviour callers (e.g. CI) rely on.
$(NOTEBOOKS):
$(NBTEST) $@
-$(NBTEST) $@
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,11 @@
"metadata": {},
"outputs": [],
"source": [
"client.indices.delete(index=\"chunk_passages_example\")\n",
"client.ingest.delete_pipeline(id=\"chunk_text_to_passages\")"
"client.indices.delete(index=\"chunk_passages_example\", ignore_unavailable=True)\n",
"try:\n",
" client.ingest.delete_pipeline(id=\"chunk_text_to_passages\")\n",
"except:\n",
" pass"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "1422b7bb-bc8c-42bb-b070-53fce3cf6144",
"metadata": {},
"outputs": [],
"source": [
"from elasticsearch import Elasticsearch\n",
"from getpass import getpass\n",
"\n",
"# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#finding-your-cloud-id\n",
"ELASTIC_CLOUD_ID = getpass(\"Elastic Cloud ID: \")\n",
"\n",
"# https://www.elastic.co/search-labs/tutorials/install-elasticsearch/elastic-cloud#creating-an-api-key\n",
"ELASTIC_API_KEY = getpass(\"Elastic Api Key: \")\n",
"\n",
"# Create the client instance\n",
"client = Elasticsearch(\n",
" # For local development\n",
" # hosts=[\"http://localhost:9200\"] \n",
" cloud_id=ELASTIC_CLOUD_ID,\n",
" api_key=ELASTIC_API_KEY,\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e4a89367-d23a-4340-bc92-2dcabd18adcd",
"metadata": {},
"outputs": [],
"source": [
"client.indices.delete(index=\"nb_parent_retriever_index\", ignore_unavailable=True)\n",
"try:\n",
" client.ingest.delete_pipeline(id=\"chunk_text_to_passages\")\n",
"except:\n",
" pass"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4ac37f1b-6122-49fe-a3b8-e8f2025a0961",
"metadata": {},
"outputs": [],
"source": [
"try:\n",
" client.ml.delete_trained_model(model_id=\"sentence-transformers__all-minilm-l6-v2\", force=True)\n",
"except:\n",
" pass"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
133 changes: 72 additions & 61 deletions notebooks/document-chunking/with-langchain-splitters.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -27,23 +27,11 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"langserve 0.0.21 requires pydantic<2,>=1, but you have pydantic 2.3.0 which is incompatible.\n",
"poetry 1.6.1 requires build<0.11.0,>=0.10.0, but you have build 1.0.3 which is incompatible.\n",
"poetry 1.6.1 requires jsonschema<4.18.0,>=4.10.0, but you have jsonschema 4.19.1 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0m"
]
}
],
"outputs": [],
"source": [
"!python3 -m pip install -qU langchain elasticsearch "
"!python3 -m pip install -qU langchain elasticsearch eland jq"
]
},
{
Expand All @@ -59,7 +47,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -177,7 +165,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"outputs": [
{
Expand All @@ -186,7 +174,7 @@
"ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'nb_parent_retriever_index'})"
]
},
"execution_count": 9,
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -274,7 +262,7 @@
},
{
"cell_type": "code",
"execution_count": 10,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -313,7 +301,7 @@
},
{
"cell_type": "code",
"execution_count": 11,
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
Expand Down Expand Up @@ -356,7 +344,7 @@
},
{
"cell_type": "code",
"execution_count": 12,
"execution_count": 14,
"metadata": {},
"outputs": [
{
Expand All @@ -378,7 +366,10 @@
" index=INDEX_NAME\n",
")\n",
"\n",
"print(f\"Indexed {count} documents with {errors} errors\")"
"print(f\"Indexed {count} documents with {errors} errors\")\n",
"\n",
"import time\n",
"time.sleep(5)"
]
},
{
Expand All @@ -391,15 +382,15 @@
},
{
"cell_type": "code",
"execution_count": 13,
"execution_count": 15,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"ID: HP6WfIwBeCQuLJUs19ov\n",
"ID: 1XvjyowBidHK_OJxJozM\n",
"Doc Title: Work From Home Policy\n",
"parent text:\n",
"\n",
Expand All @@ -414,11 +405,11 @@
"Eligibility\n",
"\n",
"\n",
"Score: 0.8483097\n",
"Score: 0.84830964\n",
"\n",
"---\n",
"\n",
"ID: I_6WfIwBeCQuLJUs19ov\n",
"ID: 3HvjyowBidHK_OJxJozM\n",
"Doc Title: Intellectual Property Policy\n",
"parent text:\n",
"\n",
Expand All @@ -434,7 +425,7 @@
"\n",
"---\n",
"\n",
"ID: IP6WfIwBeCQuLJUs19ov\n",
"ID: 2XvjyowBidHK_OJxJozM\n",
"Doc Title: Company Vacation Policy\n",
"parent text:\n",
"\n",
Expand All @@ -452,7 +443,7 @@
"\n",
"---\n",
"\n",
"ID: Hv6WfIwBeCQuLJUs19ov\n",
"ID: 13vjyowBidHK_OJxJozM\n",
"Doc Title: Wfh Policy Update May 2023\n",
"parent text:\n",
"\n",
Expand All @@ -464,7 +455,7 @@
"\n",
"---\n",
"\n",
"ID: Kv6WfIwBeCQuLJUs19ov\n",
"ID: 43vjyowBidHK_OJxJozM\n",
"Doc Title: New Employee Onboarding Guide\n",
"parent text:\n",
"\n",
Expand Down Expand Up @@ -519,21 +510,51 @@
},
{
"cell_type": "code",
"execution_count": 14,
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[Document(page_content='Effective: March 2020\\nPurpose\\n\\nThe purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\\nScope\\n\\nThis policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\\nEligibility\\n\\n', metadata={'summary': 'This policy outlines the guidelines for full-time remote work, including eligibility, equipment and resources, workspace requirements, communication expectations, performance expectations, time tracking and overtime, confidentiality and data security, health and well-being, and policy reviews and updates. Employees are encouraged to direct any questions or concerns', 'updated_at': '2020-03-01', 'name': 'Work From Home Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/document-chunking/temp.json', 'category': 'teams', 'seq_num': 1, 'url': './sharepoint/Work from home policy.txt'}),\n",
" Document(page_content='Purpose\\nThe purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\\n\\nScope\\nThis policy applies to all employees, including full-time, part-time, temporary, and contract employees.\\n\\n', metadata={'summary': \"This Intellectual Property Policy outlines guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. It establishes the company's ownership of work generated on company time, while recognizing employee ownership of work generated outside of company time without the use of company resources. The policy\", 'updated_at': None, 'name': 'Intellectual Property Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/document-chunking/temp.json', 'category': 'sharepoint', 'seq_num': 8, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/EWz3cYEVdzBNsiHsYbKhms4BVYGhravyrUw3T3lzxL4pTg?e=mPIgbO'}),\n",
" Document(page_content='Purpose\\n\\nThe purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\\nScope\\n\\nThis policy applies to all full-time and part-time employees who have completed their probationary period.\\nVacation Accrual\\n\\n', metadata={'summary': ': This policy outlines the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. Full-time employees accrue vacation time at a rate of [X hours] per month, equivalent to [Y days] per year. Vacation requests must be submitted to supervisors at least', 'updated_at': '2018-04-16', 'name': 'Company Vacation Policy', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/document-chunking/temp.json', 'category': 'sharepoint', 'seq_num': 5, 'url': 'https://enterprisesearch.sharepoint.com/:t:/s/MSBuilddemo/ES6rw9bKZxVBobG1WUoJpikBF9Bhx1pw_GvJWbsg-Z_HNA?e=faSHVt'}),\n",
" Document(page_content='As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\\n\\n', metadata={'summary': 'Starting May 1, 2023, our hybrid work policy will require employees to work from the office three days a week and two days remotely.', 'updated_at': '2023-05-01', 'name': 'Wfh Policy Update May 2023', 'source': '/Users/joe/projects/elastic/elasticsearch-labs/notebooks/document-chunking/temp.json', 'category': 'teams', 'seq_num': 3, 'url': './sharepoint/WFH policy update May 2023.txt'})]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
"name": "stdout",
"output_type": "stream",
"text": [
"Doc title: Work From Home Policy\n",
"Text:\n",
"Effective: March 2020\n",
"Purpose\n",
"\n",
"The purpose of this full-time work-from-home policy is to provide guidelines and support for employees to conduct their work remotely, ensuring the continuity and productivity of business operations during the COVID-19 pandemic and beyond.\n",
"Scope\n",
"\n",
"This policy applies to all employees who are eligible for remote work as determined by their role and responsibilities. It is designed to allow employees to work from home full time while maintaining the same level of performance and collaboration as they would in the office.\n",
"Eligibility\n",
"\n",
"\n",
"Doc title: Intellectual Property Policy\n",
"Text:\n",
"Purpose\n",
"The purpose of this Intellectual Property Policy is to establish guidelines and procedures for the ownership, protection, and utilization of intellectual property generated by employees during their employment. This policy aims to encourage creativity and innovation while ensuring that the interests of both the company and its employees are protected.\n",
"\n",
"Scope\n",
"This policy applies to all employees, including full-time, part-time, temporary, and contract employees.\n",
"\n",
"\n",
"Doc title: Company Vacation Policy\n",
"Text:\n",
"Purpose\n",
"\n",
"The purpose of this vacation policy is to outline the guidelines and procedures for requesting and taking time off from work for personal and leisure purposes. This policy aims to promote a healthy work-life balance and encourage employees to take time to rest and recharge.\n",
"Scope\n",
"\n",
"This policy applies to all full-time and part-time employees who have completed their probationary period.\n",
"Vacation Accrual\n",
"\n",
"\n",
"Doc title: Wfh Policy Update May 2023\n",
"Text:\n",
"As we continue to prioritize the well-being of our employees, we are making a slight adjustment to our hybrid work policy. Starting May 1, 2023, employees will be required to work from the office three days a week, with two days designated for remote work. Please communicate with your supervisor and HR department to establish your updated in-office workdays.\n",
"\n",
"\n"
]
}
],
"source": [
Expand Down Expand Up @@ -593,33 +614,23 @@
" metadata=hit[\"_source\"][\"metadata\"],\n",
" )\n",
"\n",
"vector_store.similarity_search(query=\"Whats the work from home policy?\", doc_builder=doc_builder)"
"results = vector_store.similarity_search(query=\"Whats the work from home policy?\", doc_builder=doc_builder)\n",
"for result in results:\n",
" print(f'Doc title: {result.metadata[\"name\"]}')\n",
" print(f'Text:\\n{result.page_content}')"
]
},
{
"cell_type": "code",
"execution_count": 8,
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ObjectApiResponse({'acknowledged': True})"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"client.indices.delete(index=INDEX_NAME)"
]
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
Expand All @@ -633,9 +644,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.3"
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
3 changes: 2 additions & 1 deletion notebooks/search/.nbtest.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
masks:
- "'name': '[^']+'"
- "'build_flavor': '[^']+'"
- '[0-9]+\.[0-9]+\.[0-9]+'
- '[0-9]+\.[0-9]+\.[0-9]+(-SNAPSHOT)?'
- "'build_snapshot': [^,]+"
- "'cluster_name': '[^']+'"
- "'cluster_uuid': '[^']+'"
- "'build_hash': '[^']+'"
Expand Down
Loading

0 comments on commit ebd2e96

Please sign in to comment.