diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt index ca77427c3e6..37ed7e538d7 100644 --- a/.github/actions/spelling/allow.txt +++ b/.github/actions/spelling/allow.txt @@ -41,6 +41,7 @@ CHECKOV COCOAPODS COINIT CONOUT +COUNTIF CUCUONAR CWLEY CZE @@ -77,7 +78,6 @@ ESG Eliud Embs Envane -envsubst Esin Eventarc FAISS @@ -114,17 +114,16 @@ Glickman Gmb GmbH Googlers -Hadoop HBox HDFC HDFS -hdfs HIDPI HIGHCPU HMO HREDRAW HSA HVDC +Hadoop Hamamoto Hamblin Hamers @@ -155,6 +154,7 @@ Junzhe KFBI KNN KPIs +KSA Kaelen Kaggle Kamradt @@ -170,7 +170,6 @@ Kipchoge Knopf Kohli Kraizt -KSA Kubeflow Kudrow LCEL @@ -189,7 +188,6 @@ Lego Leung Llion Loghub -loghub Logrus Lottry MLB @@ -229,7 +227,6 @@ Niitsuma Nintendo Nominatim Noogler -novnc ODb OOTB Oberst @@ -261,8 +258,8 @@ Qwiklabs RAGAS RLHF RMSE -RNNs RNN +RNNs ROOTSPAN RRF RTN @@ -282,8 +279,8 @@ SEK SEO SIMONE SKUs -SNE SNB +SNE SPII SPLADE SSRF @@ -315,6 +312,7 @@ TGI TOKENLIST TPU TPUs +TSNE Tadao Tafel Tbk @@ -330,8 +328,6 @@ Traceloop Trapp Tribbiani Tricyle -TSNE -tsne UDFs USERDATA Unimicron @@ -358,12 +354,14 @@ WAI WDIR WFH WNDCLASS +WXGA Wakatipu Weaviate Wehn Welwyn Wnd Womens +XGA XSum XXE Xiang @@ -372,6 +370,7 @@ Yuxuan Yuzuru Zhao Zhaohua +Zhu Zijin Zom Zscaler @@ -457,7 +456,6 @@ colwidth constexpr corpuses countplot -COUNTIF csa cse ctd @@ -498,6 +496,7 @@ embvs emojis ename engi +envsubst epath epoc erty @@ -569,6 +568,7 @@ gunicorn hadolint hashtag hashtags +hdfs hdlr heatmap heatmapgl @@ -628,6 +628,7 @@ linted linting llm llms +loghub logparser logprobs lparam @@ -643,8 +644,8 @@ meme memes metadatas mgrs -miranda millis +miranda morty moviepy mpld @@ -669,6 +670,7 @@ noabe nobserved norigin notetaker +novnc nrows ntheory nunique @@ -812,6 +814,7 @@ traceloop treeah tritan tseslint +tsne tsv tures ubuntu @@ -834,8 +837,8 @@ vtotal waterjet wcontext wcslen -websites webpages +websites welcom wiffle windspeed @@ -846,7 +849,6 @@ wparam wscore wscores wstring -WXGA 
xaxes xaxis xcassets @@ -854,7 +856,6 @@ xcconfig xcodeproj xcscheme xctest -XGA xlabel xmltodict xsi @@ -867,4 +868,3 @@ youtube ytd yticks zaxis -Zhu diff --git a/search/create_datastore_and_search.ipynb b/search/create_datastore_and_search.ipynb index 6085d99592d..5ca793daa37 100644 --- a/search/create_datastore_and_search.ipynb +++ b/search/create_datastore_and_search.ipynb @@ -1,533 +1,511 @@ { - "cells": [ - { - "cell_type": "markdown", - "id": "829cc6c6-f8c3-4fc6-942b-f698be5fa1a2", - "metadata": { - "id": "ff04e7883b50" - }, - "source": [ - "# Create a Vertex AI Datastore and Search Engine\n", - "\n", - "\n", - "\n", - " \n", - " \n", - " \n", - "
\n", - " \n", - " \"Colab Run in Colab\n", - " \n", - " \n", - " \n", - " \"GitHub\n", - " View on GitHub\n", - " \n", - " \n", - " \n", - " \"Vertex\n", - " Open in Vertex AI Workbench\n", - " \n", - "
" - ] - }, - { - "cell_type": "markdown", - "id": "0460bdbe-05db-42f8-822c-82195de8329a", - "metadata": { - "id": "adc3b22c5002" - }, - "source": [ - "---\n", - "\n", - "\n", - "* Author(s): [Kara Greenfield](https://github.com/kgreenfield2)\n", - "* Created: 22 Nov 2023\n", - "\n", - "---\n", - "\n", - "## Objective\n", - "\n", - "This notebook shows how to create and populate a Vertex AI Search Datastore, how to create a search app connected to that datastore, and how to submit queries through the search engine.\n" - ] - }, - { - "cell_type": "markdown", - "id": "0b4b1050-7113-487e-aeaf-55690c831a1d", - "metadata": { - "id": "6fed77d4ef65" - }, - "source": [ - "Services used in the notebook:\n", - "\n", - "- ✅ Vertex AI Search for document search and retrieval" - ] - }, - { - "cell_type": "markdown", - "id": "43828625-a130-449c-ba5f-6a948220f559", - "metadata": { - "id": "21f197020ea9" - }, - "source": [ - "## Install pre-requisites\n", - "\n", - "If running in Colab install the pre-requisites into the runtime. Otherwise it is assumed that the notebook is running in Vertex AI Workbench. " - ] - }, - { - "cell_type": "code", - "execution_count": 62, - "id": "8ea5db8a-dccc-4442-b5d7-7088d5ffb5ac", - "metadata": { - "id": "b54b892d8af9" - }, - "outputs": [], - "source": [ - "%pip install --upgrade google-cloud-discoveryengine -q --user" - ] - }, - { - "cell_type": "markdown", - "id": "10f9e321", - "metadata": { - "id": "R5Xep4W9lq-Z" - }, - "source": [ - "### Restart current runtime\n", - "\n", - "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "da755736", - "metadata": { - "id": "XRvKdaPDTznN" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Restart kernel after installs so that your environment can access the new packages\n", - "\n", - "import IPython\n", - "\n", - "app = IPython.Application.instance()\n", - "app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "id": "9c31dbe0", - "metadata": { - "id": "SbmM4z7FOBpM" - }, - "source": [ - "
\n", - "⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️\n", - "
\n" - ] - }, - { - "cell_type": "markdown", - "id": "5f6a5de5-156e-4f14-99e6-3ab33e076c81", - "metadata": { - "id": "444de7e71596" - }, - "source": [ - "## Authenticate\n", - "\n", - "If running in Colab authenticate with `google.colab.google.auth` otherwise assume that running on Vertex AI Workbench." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "df3b7309-c51c-44a0-a466-b0b2733e0c28", - "metadata": { - "id": "05ef8b1def58" - }, - "outputs": [], - "source": [ - "import sys\n", - "\n", - "if \"google.colab\" in sys.modules:\n", - " from google.colab import auth as google_auth\n", - "\n", - " google_auth.authenticate_user()\n", - "\n", - "from google.auth import default\n", - "\n", - "creds, _ = default()" - ] - }, - { - "cell_type": "markdown", - "id": "ae7a4925-145a-40f3-9fa1-3b69a42d488d", - "metadata": { - "id": "fb56e24e33f5" - }, - "source": [ - "## Configure notebook environment" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "389b9e51-1ce4-4bdf-80b5-fcdc1882f853", - "metadata": { - "id": "dbf6d0810272" - }, - "outputs": [], - "source": [ - "from google.api_core.client_options import ClientOptions\n", - "from google.cloud import discoveryengine_v1alpha as discoveryengine\n", - "\n", - "PROJECT_ID = \"\" # @param {type:\"string\"}\n", - "LOCATION = \"global\"" - ] - }, - { - "cell_type": "markdown", - "id": "07f1aecd-4633-4451-b5a7-1f26e4cb2631", - "metadata": { - "id": "9294ec3e10c7" - }, - "source": [ - "## Create and Populate a Datastore" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "9a1d0021-e299-49d4-a657-2e101ae49eb6", - "metadata": { - "id": "a94dfef0f3e5" - }, - "outputs": [], - "source": [ - "def create_data_store(\n", - " project_id: str, location: str, data_store_name: str, data_store_id: str\n", - "):\n", - " # Create a client\n", - " client_options = (\n", - " ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n", - " if location != \"global\"\n", - " else 
None\n", - " )\n", - " client = discoveryengine.DataStoreServiceClient(client_options=client_options)\n", - "\n", - " # Initialize request argument(s)\n", - " data_store = discoveryengine.DataStore(\n", - " display_name=data_store_name,\n", - " industry_vertical=\"GENERIC\",\n", - " content_config=\"CONTENT_REQUIRED\",\n", - " )\n", - "\n", - " request = discoveryengine.CreateDataStoreRequest(\n", - " parent=discoveryengine.DataStoreServiceClient.collection_path(\n", - " project_id, location, \"default_collection\"\n", - " ),\n", - " data_store=data_store,\n", - " data_store_id=data_store_id,\n", - " )\n", - " operation = client.create_data_store(request=request)\n", - "\n", - " # Make the request\n", - " # The try block is necessary to prevent execution from halting due to an error being thrown when the datastore takes a while to instantiate\n", - " try:\n", - " response = operation.result(timeout=90)\n", - " except:\n", - " print(\"long-running operation\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8a4db726-8da1-4c76-8934-944aaf5f9b53", - "metadata": { - "id": "ac2d667b374a" - }, - "outputs": [], - "source": [ - "# The datastore name can only contain lowercase letters, numbers, and hyphens\n", - "DATASTORE_NAME = \"alphabet-contracts\"\n", - "DATASTORE_ID = f\"{DATASTORE_NAME}-id\"\n", - "\n", - "create_data_store(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "id": "03121270-5d2f-403b-81ea-c1d241357bd1", - "metadata": { - "id": "1b96e4f793f9" - }, - "outputs": [], - "source": [ - "def import_documents(\n", - " project_id: str,\n", - " location: str,\n", - " data_store_id: str,\n", - " gcs_uri: str,\n", - "):\n", - " # Create a client\n", - " client_options = (\n", - " ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n", - " if location != \"global\"\n", - " else None\n", - " )\n", - " client = 
discoveryengine.DocumentServiceClient(client_options=client_options)\n", - "\n", - " # The full resource name of the search engine branch.\n", - " # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}\n", - " parent = client.branch_path(\n", - " project=project_id,\n", - " location=location,\n", - " data_store=data_store_id,\n", - " branch=\"default_branch\",\n", - " )\n", - "\n", - " source_documents = [f\"{gcs_uri}/*\"]\n", - "\n", - " request = discoveryengine.ImportDocumentsRequest(\n", - " parent=parent,\n", - " gcs_source=discoveryengine.GcsSource(\n", - " input_uris=source_documents, data_schema=\"content\"\n", - " ),\n", - " # Options: `FULL`, `INCREMENTAL`\n", - " reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,\n", - " )\n", - "\n", - " # Make the request\n", - " operation = client.import_documents(request=request)\n", - "\n", - " response = operation.result()\n", - "\n", - " # Once the operation is complete,\n", - " # get information from operation metadata\n", - " metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)\n", - "\n", - " # Handle the response\n", - " return operation.operation.name" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1ddfe66a-acb4-4fdb-b9ed-76a332bb0f0c", - "metadata": { - "id": "d6b868ba2342" - }, - "outputs": [], - "source": [ - "source_documents_gs_uri = (\n", - " \"gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs\"\n", - ")\n", - "\n", - "import_documents(PROJECT_ID, LOCATION, DATASTORE_ID, source_documents_gs_uri)" - ] - }, - { - "cell_type": "markdown", - "id": "7a957202-b67e-47ca-84c3-b8a62cfbe405", - "metadata": { - "id": "78abf9383982" - }, - "source": [ - "## Create a Search Engine\n", - "\n", - "This is used to set the `search_tier` to enterprise and to enable advanced LLM features.\n", - "\n", - "Enterprise tier is required to get extractive answers from a search query and 
advanced LLM features are required to summarize search results." - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "id": "0e39e9bb-f381-44f6-a45e-3322669a171f", - "metadata": { - "id": "d48dce7f750e" - }, - "outputs": [], - "source": [ - "def create_engine(\n", - " project_id: str, location: str, data_store_name: str, data_store_id: str\n", - "):\n", - " # Create a client\n", - " client_options = (\n", - " ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n", - " if location != \"global\"\n", - " else None\n", - " )\n", - " client = discoveryengine.EngineServiceClient(client_options=client_options)\n", - "\n", - " # Initialize request argument(s)\n", - " config = discoveryengine.Engine.SearchEngineConfig(\n", - " search_tier=\"SEARCH_TIER_ENTERPRISE\", search_add_ons=[\"SEARCH_ADD_ON_LLM\"]\n", - " )\n", - "\n", - " engine = discoveryengine.Engine(\n", - " display_name=data_store_name,\n", - " solution_type=\"SOLUTION_TYPE_SEARCH\",\n", - " industry_vertical=\"GENERIC\",\n", - " data_store_ids=[data_store_id],\n", - " search_engine_config=config,\n", - " )\n", - "\n", - " request = discoveryengine.CreateEngineRequest(\n", - " parent=discoveryengine.DataStoreServiceClient.collection_path(\n", - " project_id, location, \"default_collection\"\n", - " ),\n", - " engine=engine,\n", - " engine_id=engine.display_name,\n", - " )\n", - "\n", - " # Make the request\n", - " operation = client.create_engine(request=request)\n", - " response = operation.result(timeout=90)" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "4a853982-8c2e-402a-a808-5364bf932619", - "metadata": { - "id": "2094d759826c" - }, - "outputs": [], - "source": [ - "create_engine(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)" - ] - }, - { - "cell_type": "markdown", - "id": "e9f4d978-9164-4de3-b01a-179051706313", - "metadata": { - "id": "2fc1b872bb29" - }, - "source": [ - "## Query your Datastore" - ] - }, - { - "cell_type": "code", - 
"execution_count": 27, - "id": "1c4dfb62-7846-43d0-9cba-fd8886ce5546", - "metadata": { - "id": "cd25728cba55" - }, - "outputs": [], - "source": [ - "def search_sample(\n", - " project_id: str,\n", - " location: str,\n", - " data_store_id: str,\n", - " search_query: str,\n", - ") -> list[discoveryengine.SearchResponse]:\n", - " # For more information, refer to:\n", - " # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store\n", - " client_options = (\n", - " ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n", - " if LOCATION != \"global\"\n", - " else None\n", - " )\n", - "\n", - " # Create a client\n", - " client = discoveryengine.SearchServiceClient(client_options=client_options)\n", - "\n", - " # The full resource name of the search engine serving config\n", - " # e.g. projects/{project_id}/locations/{location}/dataStores/{data_store_id}/servingConfigs/{serving_config_id}\n", - " serving_config = client.serving_config_path(\n", - " project=project_id,\n", - " location=location,\n", - " data_store=data_store_id,\n", - " serving_config=\"default_config\",\n", - " )\n", - "\n", - " # Optional: Configuration options for search\n", - " # Refer to the `ContentSearchSpec` reference for all supported fields:\n", - " # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec\n", - " content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(\n", - " # For information about snippets, refer to:\n", - " # https://cloud.google.com/generative-ai-app-builder/docs/snippets\n", - " snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(\n", - " return_snippet=True\n", - " ),\n", - " # For information about search summaries, refer to:\n", - " # https://cloud.google.com/generative-ai-app-builder/docs/get-search-summaries\n", - " 
summary_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(\n", - " summary_result_count=5,\n", - " include_citations=True,\n", - " ignore_adversarial_query=True,\n", - " ignore_non_summary_seeking_query=True,\n", - " ),\n", - " )\n", - "\n", - " # Refer to the `SearchRequest` reference for all supported fields:\n", - " # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest\n", - " request = discoveryengine.SearchRequest(\n", - " serving_config=serving_config,\n", - " query=search_query,\n", - " page_size=10,\n", - " content_search_spec=content_search_spec,\n", - " query_expansion_spec=discoveryengine.SearchRequest.QueryExpansionSpec(\n", - " condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,\n", - " ),\n", - " spell_correction_spec=discoveryengine.SearchRequest.SpellCorrectionSpec(\n", - " mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO\n", - " ),\n", - " )\n", - "\n", - " response = client.search(request)\n", - " return response" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "id": "ad41e18b-38d2-4f4c-98ae-df14eda900ae", - "metadata": { - "id": "e6429a6f9333" - }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Sundar Pichai is the CEO of Google and Alphabet [1]. 
He has served as CEO of Google since 2015 and as CEO of Alphabet since 2019 [1].\n" - ] - } - ], - "source": [ - "query = \"Who is the CEO of Google?\"\n", - "\n", - "print(search_sample(PROJECT_ID, LOCATION, DATASTORE_ID, query).summary.summary_text)" - ] - } - ], - "metadata": { - "colab": { - "name": "create_datastore_and_search.ipynb", - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "829cc6c6-f8c3-4fc6-942b-f698be5fa1a2" + }, + "source": [ + "# Create a Vertex AI Datastore and Search Engine\n", + "\n", + "\n", + "\n", + " \n", + " \n", + " \n", + "
\n", + " \n", + " \"Colab Run in Colab\n", + " \n", + " \n", + " \n", + " \"GitHub\n", + " View on GitHub\n", + " \n", + " \n", + " \n", + " \"Vertex\n", + " Open in Vertex AI Workbench\n", + " \n", + "
" + ] }, - "nbformat": 4, - "nbformat_minor": 0 + { + "cell_type": "markdown", + "metadata": { + "id": "0460bdbe-05db-42f8-822c-82195de8329a" + }, + "source": [ + "---\n", + "\n", + "* Author(s): [Kara Greenfield](https://github.com/kgreenfield2)\n", + "* Created: 22 Nov 2023\n", + "* Updated: 31 Oct 2024\n", + "\n", + "---\n", + "\n", + "## Objective\n", + "\n", + "This notebook shows how to create and populate a Vertex AI Search Datastore, how to create a search app connected to that datastore, and how to submit queries through the search engine.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0b4b1050-7113-487e-aeaf-55690c831a1d" + }, + "source": [ + "Services used in the notebook:\n", + "\n", + "- ✅ Vertex AI Search for document search and retrieval" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "43828625-a130-449c-ba5f-6a948220f559" + }, + "source": [ + "## Install pre-requisites\n", + "\n", + "If running in Colab install the pre-requisites into the runtime. Otherwise it is assumed that the notebook is running in Vertex AI Workbench." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "id": "8ea5db8a-dccc-4442-b5d7-7088d5ffb5ac" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --user -q google-cloud-discoveryengine" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "10f9e321" + }, + "source": [ + "### Restart current runtime\n", + "\n", + "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "da755736" + }, + "outputs": [], + "source": [ + "# Restart kernel after installs so that your environment can access the new packages\n", + "\n", + "import IPython\n", + "\n", + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9c31dbe0" + }, + "source": [ + "
\n", + "⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️\n", + "
def create_data_store(
    project_id: str, location: str, data_store_name: str, data_store_id: str
) -> None:
    """Create a Vertex AI Search data store and wait briefly for it to finish.

    The data store is created in the project's ``default_collection`` with the
    GENERIC industry vertical and the CONTENT_REQUIRED content config.

    Args:
        project_id: Google Cloud project that will own the data store.
        location: Data store location. Any value other than ``"global"``
            selects the ``{location}-discoveryengine.googleapis.com`` endpoint.
        data_store_name: Display name of the new data store.
        data_store_id: Resource ID of the new data store.

    Returns:
        None. Waiting on the create operation is best-effort: if it fails or
        exceeds 90 seconds the error is printed rather than raised.
    """
    # The default endpoint is used for "global"; other locations need their
    # own regional endpoint.
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DataStoreServiceClient(client_options=client_options)

    # Initialize request argument(s)
    data_store = discoveryengine.DataStore(
        display_name=data_store_name,
        industry_vertical=discoveryengine.IndustryVertical.GENERIC,
        content_config=discoveryengine.DataStore.ContentConfig.CONTENT_REQUIRED,
    )

    # Make the request (returns a long-running operation).
    operation = client.create_data_store(
        request=discoveryengine.CreateDataStoreRequest(
            parent=client.collection_path(
                project_id, location, "default_collection"
            ),
            data_store=data_store,
            data_store_id=data_store_id,
        )
    )

    # The try block keeps the notebook running when the data store takes a
    # while to instantiate. Only `Exception` is caught so that Ctrl-C /
    # kernel interrupts still propagate, and the error itself is shown so a
    # real failure is not mistaken for a slow operation.
    try:
        operation.result(timeout=90)
    except Exception as exc:
        print(f"long-running operation error: {exc!r}")


def import_documents(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str,
) -> str:
    """Import every object under a Cloud Storage prefix into a data store.

    Documents are imported into the data store's ``default_branch`` using the
    ``"content"`` data schema and INCREMENTAL reconciliation. The call blocks
    (without a timeout) until the import operation completes.

    Args:
        project_id: Google Cloud project that owns the data store.
        location: Data store location. Any value other than ``"global"``
            selects the ``{location}-discoveryengine.googleapis.com`` endpoint.
        data_store_id: Resource ID of the target data store.
        gcs_uri: ``gs://`` folder whose contents are imported; a trailing
            slash is tolerated.

    Returns:
        The name of the long-running import operation.
    """
    # Create a client
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.DocumentServiceClient(client_options=client_options)

    # The full resource name of the search engine branch.
    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    # Strip trailing slashes so "gs://bucket/dir/" does not become
    # "gs://bucket/dir//*".
    source_documents = [f"{gcs_uri.rstrip('/')}/*"]

    request = discoveryengine.ImportDocumentsRequest(
        parent=parent,
        gcs_source=discoveryengine.GcsSource(
            input_uris=source_documents, data_schema="content"
        ),
        # Options: `FULL`, `INCREMENTAL`
        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    )

    # Make the request and wait for the import to finish.
    operation = client.import_documents(request=request)
    operation.result()

    return operation.operation.name
def create_engine(
    project_id: str,
    location: str,
    engine_name: str,
    engine_id: str,
    data_store_id: str,
) -> None:
    """Create an Enterprise-tier search engine (app) backed by one data store.

    The engine is created in the project's ``default_collection`` with the
    LLM add-on enabled. The call waits up to 90 seconds for the create
    operation and raises if it fails or does not finish in that time.

    Args:
        project_id: Google Cloud project that will own the engine.
        location: Engine location. Any value other than ``"global"`` selects
            the ``{location}-discoveryengine.googleapis.com`` endpoint.
        engine_name: Display name of the engine.
        engine_id: Resource ID of the engine; this is the ID that must be
            used later when querying the engine's serving config.
        data_store_id: ID of the data store the engine searches.
    """
    # Create a client
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )
    client = discoveryengine.EngineServiceClient(client_options=client_options)

    # Initialize request argument(s)
    engine = discoveryengine.Engine(
        display_name=engine_name,
        solution_type=discoveryengine.SolutionType.SOLUTION_TYPE_SEARCH,
        industry_vertical=discoveryengine.IndustryVertical.GENERIC,
        data_store_ids=[data_store_id],
        search_engine_config=discoveryengine.Engine.SearchEngineConfig(
            search_tier=discoveryengine.SearchTier.SEARCH_TIER_ENTERPRISE,
            search_add_ons=[discoveryengine.SearchAddOn.SEARCH_ADD_ON_LLM],
        ),
    )

    request = discoveryengine.CreateEngineRequest(
        parent=client.collection_path(project_id, location, "default_collection"),
        engine=engine,
        # Use the caller-supplied ID. Previously the display name was passed
        # here, so the engine was created under a different ID than the one
        # callers later query with.
        engine_id=engine_id,
    )

    # Make the request and wait for the engine to be created.
    operation = client.create_engine(request=request)
    operation.result(timeout=90)
def search_sample(
    project_id: str,
    location: str,
    engine_id: str,
    search_query: str,
) -> "google.cloud.discoveryengine_v1.services.search_service.pagers.SearchPager":
    """Run a search query against an engine's ``default_search`` serving config.

    The request asks for snippets, a summary built from up to 5 results with
    citations, automatic query expansion and automatic spell correction, and
    a page size of 10.

    Args:
        project_id: Google Cloud project that owns the engine.
        location: Engine location. Any value other than ``"global"`` selects
            the ``{location}-discoveryengine.googleapis.com`` endpoint.
        engine_id: Resource ID of the engine to query.
        search_query: The user's query text.

    Returns:
        Whatever ``SearchServiceClient.search`` returns for the request (a
        pager over search responses, not a ``list``).
    """
    # For more information, refer to:
    # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
    # Decide on the endpoint from the `location` argument, not from the
    # notebook-level LOCATION constant, so the function honours its inputs.
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    # Create a client
    client = discoveryengine.SearchServiceClient(client_options=client_options)

    # The full resource name of the search engine serving config
    # e.g. projects/{project_id}/locations/{location}/collections/default_collection/engines/{engine_id}/servingConfigs/{serving_config_id}
    serving_config = f"projects/{project_id}/locations/{location}/collections/default_collection/engines/{engine_id}/servingConfigs/default_search"

    # Optional: Configuration options for search
    # Refer to the `ContentSearchSpec` reference for all supported fields:
    # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec
    content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(
        # For information about snippets, refer to:
        # https://cloud.google.com/generative-ai-app-builder/docs/snippets
        snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(
            return_snippet=True
        ),
        # For information about search summaries, refer to:
        # https://cloud.google.com/generative-ai-app-builder/docs/get-search-summaries
        summary_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(
            summary_result_count=5,
            include_citations=True,
            ignore_adversarial_query=True,
            ignore_non_summary_seeking_query=True,
        ),
    )

    # Refer to the `SearchRequest` reference for all supported fields:
    # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest
    request = discoveryengine.SearchRequest(
        serving_config=serving_config,
        query=search_query,
        page_size=10,
        content_search_spec=content_search_spec,
        query_expansion_spec=discoveryengine.SearchRequest.QueryExpansionSpec(
            condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,
        ),
        spell_correction_spec=discoveryengine.SearchRequest.SpellCorrectionSpec(
            mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO
        ),
    )

    return client.search(request)