diff --git a/.github/actions/spelling/allow.txt b/.github/actions/spelling/allow.txt
index ca77427c3e6..37ed7e538d7 100644
--- a/.github/actions/spelling/allow.txt
+++ b/.github/actions/spelling/allow.txt
@@ -41,6 +41,7 @@ CHECKOV
COCOAPODS
COINIT
CONOUT
+COUNTIF
CUCUONAR
CWLEY
CZE
@@ -77,7 +78,6 @@ ESG
Eliud
Embs
Envane
-envsubst
Esin
Eventarc
FAISS
@@ -114,17 +114,16 @@ Glickman
Gmb
GmbH
Googlers
-Hadoop
HBox
HDFC
HDFS
-hdfs
HIDPI
HIGHCPU
HMO
HREDRAW
HSA
HVDC
+Hadoop
Hamamoto
Hamblin
Hamers
@@ -155,6 +154,7 @@ Junzhe
KFBI
KNN
KPIs
+KSA
Kaelen
Kaggle
Kamradt
@@ -170,7 +170,6 @@ Kipchoge
Knopf
Kohli
Kraizt
-KSA
Kubeflow
Kudrow
LCEL
@@ -189,7 +188,6 @@ Lego
Leung
Llion
Loghub
-loghub
Logrus
Lottry
MLB
@@ -229,7 +227,6 @@ Niitsuma
Nintendo
Nominatim
Noogler
-novnc
ODb
OOTB
Oberst
@@ -261,8 +258,8 @@ Qwiklabs
RAGAS
RLHF
RMSE
-RNNs
RNN
+RNNs
ROOTSPAN
RRF
RTN
@@ -282,8 +279,8 @@ SEK
SEO
SIMONE
SKUs
-SNE
SNB
+SNE
SPII
SPLADE
SSRF
@@ -315,6 +312,7 @@ TGI
TOKENLIST
TPU
TPUs
+TSNE
Tadao
Tafel
Tbk
@@ -330,8 +328,6 @@ Traceloop
Trapp
Tribbiani
Tricyle
-TSNE
-tsne
UDFs
USERDATA
Unimicron
@@ -358,12 +354,14 @@ WAI
WDIR
WFH
WNDCLASS
+WXGA
Wakatipu
Weaviate
Wehn
Welwyn
Wnd
Womens
+XGA
XSum
XXE
Xiang
@@ -372,6 +370,7 @@ Yuxuan
Yuzuru
Zhao
Zhaohua
+Zhu
Zijin
Zom
Zscaler
@@ -457,7 +456,6 @@ colwidth
constexpr
corpuses
countplot
-COUNTIF
csa
cse
ctd
@@ -498,6 +496,7 @@ embvs
emojis
ename
engi
+envsubst
epath
epoc
erty
@@ -569,6 +568,7 @@ gunicorn
hadolint
hashtag
hashtags
+hdfs
hdlr
heatmap
heatmapgl
@@ -628,6 +628,7 @@ linted
linting
llm
llms
+loghub
logparser
logprobs
lparam
@@ -643,8 +644,8 @@ meme
memes
metadatas
mgrs
-miranda
millis
+miranda
morty
moviepy
mpld
@@ -669,6 +670,7 @@ noabe
nobserved
norigin
notetaker
+novnc
nrows
ntheory
nunique
@@ -812,6 +814,7 @@ traceloop
treeah
tritan
tseslint
+tsne
tsv
tures
ubuntu
@@ -834,8 +837,8 @@ vtotal
waterjet
wcontext
wcslen
-websites
webpages
+websites
welcom
wiffle
windspeed
@@ -846,7 +849,6 @@ wparam
wscore
wscores
wstring
-WXGA
xaxes
xaxis
xcassets
@@ -854,7 +856,6 @@ xcconfig
xcodeproj
xcscheme
xctest
-XGA
xlabel
xmltodict
xsi
@@ -867,4 +868,3 @@ youtube
ytd
yticks
zaxis
-Zhu
diff --git a/search/create_datastore_and_search.ipynb b/search/create_datastore_and_search.ipynb
index 6085d99592d..5ca793daa37 100644
--- a/search/create_datastore_and_search.ipynb
+++ b/search/create_datastore_and_search.ipynb
@@ -1,533 +1,511 @@
{
- "cells": [
- {
- "cell_type": "markdown",
- "id": "829cc6c6-f8c3-4fc6-942b-f698be5fa1a2",
- "metadata": {
- "id": "ff04e7883b50"
- },
- "source": [
- "# Create a Vertex AI Datastore and Search Engine\n",
- "\n",
- "
\n",
- "\n",
- " \n",
- " \n",
- " Run in Colab\n",
- " \n",
- " | \n",
- " \n",
- " \n",
- " \n",
- " View on GitHub\n",
- " \n",
- " | \n",
- " \n",
- " \n",
- " \n",
- " Open in Vertex AI Workbench\n",
- " \n",
- " | \n",
- "
"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0460bdbe-05db-42f8-822c-82195de8329a",
- "metadata": {
- "id": "adc3b22c5002"
- },
- "source": [
- "---\n",
- "\n",
- "\n",
- "* Author(s): [Kara Greenfield](https://github.com/kgreenfield2)\n",
- "* Created: 22 Nov 2023\n",
- "\n",
- "---\n",
- "\n",
- "## Objective\n",
- "\n",
- "This notebook shows how to create and populate a Vertex AI Search Datastore, how to create a search app connected to that datastore, and how to submit queries through the search engine.\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "0b4b1050-7113-487e-aeaf-55690c831a1d",
- "metadata": {
- "id": "6fed77d4ef65"
- },
- "source": [
- "Services used in the notebook:\n",
- "\n",
- "- ✅ Vertex AI Search for document search and retrieval"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "43828625-a130-449c-ba5f-6a948220f559",
- "metadata": {
- "id": "21f197020ea9"
- },
- "source": [
- "## Install pre-requisites\n",
- "\n",
- "If running in Colab install the pre-requisites into the runtime. Otherwise it is assumed that the notebook is running in Vertex AI Workbench. "
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 62,
- "id": "8ea5db8a-dccc-4442-b5d7-7088d5ffb5ac",
- "metadata": {
- "id": "b54b892d8af9"
- },
- "outputs": [],
- "source": [
- "%pip install --upgrade google-cloud-discoveryengine -q --user"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "10f9e321",
- "metadata": {
- "id": "R5Xep4W9lq-Z"
- },
- "source": [
- "### Restart current runtime\n",
- "\n",
- "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "da755736",
- "metadata": {
- "id": "XRvKdaPDTznN"
- },
- "outputs": [
- {
- "data": {
- "text/plain": [
- "{'status': 'ok', 'restart': True}"
- ]
- },
- "execution_count": 2,
- "metadata": {},
- "output_type": "execute_result"
- }
- ],
- "source": [
- "# Restart kernel after installs so that your environment can access the new packages\n",
- "\n",
- "import IPython\n",
- "\n",
- "app = IPython.Application.instance()\n",
- "app.kernel.do_shutdown(True)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "9c31dbe0",
- "metadata": {
- "id": "SbmM4z7FOBpM"
- },
- "source": [
- "\n",
- "⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️\n",
- "
\n"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "5f6a5de5-156e-4f14-99e6-3ab33e076c81",
- "metadata": {
- "id": "444de7e71596"
- },
- "source": [
- "## Authenticate\n",
- "\n",
- "If running in Colab authenticate with `google.colab.google.auth` otherwise assume that running on Vertex AI Workbench."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "id": "df3b7309-c51c-44a0-a466-b0b2733e0c28",
- "metadata": {
- "id": "05ef8b1def58"
- },
- "outputs": [],
- "source": [
- "import sys\n",
- "\n",
- "if \"google.colab\" in sys.modules:\n",
- " from google.colab import auth as google_auth\n",
- "\n",
- " google_auth.authenticate_user()\n",
- "\n",
- "from google.auth import default\n",
- "\n",
- "creds, _ = default()"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "ae7a4925-145a-40f3-9fa1-3b69a42d488d",
- "metadata": {
- "id": "fb56e24e33f5"
- },
- "source": [
- "## Configure notebook environment"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "id": "389b9e51-1ce4-4bdf-80b5-fcdc1882f853",
- "metadata": {
- "id": "dbf6d0810272"
- },
- "outputs": [],
- "source": [
- "from google.api_core.client_options import ClientOptions\n",
- "from google.cloud import discoveryengine_v1alpha as discoveryengine\n",
- "\n",
- "PROJECT_ID = \"\" # @param {type:\"string\"}\n",
- "LOCATION = \"global\""
- ]
- },
- {
- "cell_type": "markdown",
- "id": "07f1aecd-4633-4451-b5a7-1f26e4cb2631",
- "metadata": {
- "id": "9294ec3e10c7"
- },
- "source": [
- "## Create and Populate a Datastore"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 21,
- "id": "9a1d0021-e299-49d4-a657-2e101ae49eb6",
- "metadata": {
- "id": "a94dfef0f3e5"
- },
- "outputs": [],
- "source": [
- "def create_data_store(\n",
- " project_id: str, location: str, data_store_name: str, data_store_id: str\n",
- "):\n",
- " # Create a client\n",
- " client_options = (\n",
- " ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
- " if location != \"global\"\n",
- " else None\n",
- " )\n",
- " client = discoveryengine.DataStoreServiceClient(client_options=client_options)\n",
- "\n",
- " # Initialize request argument(s)\n",
- " data_store = discoveryengine.DataStore(\n",
- " display_name=data_store_name,\n",
- " industry_vertical=\"GENERIC\",\n",
- " content_config=\"CONTENT_REQUIRED\",\n",
- " )\n",
- "\n",
- " request = discoveryengine.CreateDataStoreRequest(\n",
- " parent=discoveryengine.DataStoreServiceClient.collection_path(\n",
- " project_id, location, \"default_collection\"\n",
- " ),\n",
- " data_store=data_store,\n",
- " data_store_id=data_store_id,\n",
- " )\n",
- " operation = client.create_data_store(request=request)\n",
- "\n",
- " # Make the request\n",
- " # The try block is necessary to prevent execution from halting due to an error being thrown when the datastore takes a while to instantiate\n",
- " try:\n",
- " response = operation.result(timeout=90)\n",
- " except:\n",
- " print(\"long-running operation\")"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "8a4db726-8da1-4c76-8934-944aaf5f9b53",
- "metadata": {
- "id": "ac2d667b374a"
- },
- "outputs": [],
- "source": [
- "# The datastore name can only contain lowercase letters, numbers, and hyphens\n",
- "DATASTORE_NAME = \"alphabet-contracts\"\n",
- "DATASTORE_ID = f\"{DATASTORE_NAME}-id\"\n",
- "\n",
- "create_data_store(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 23,
- "id": "03121270-5d2f-403b-81ea-c1d241357bd1",
- "metadata": {
- "id": "1b96e4f793f9"
- },
- "outputs": [],
- "source": [
- "def import_documents(\n",
- " project_id: str,\n",
- " location: str,\n",
- " data_store_id: str,\n",
- " gcs_uri: str,\n",
- "):\n",
- " # Create a client\n",
- " client_options = (\n",
- " ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
- " if location != \"global\"\n",
- " else None\n",
- " )\n",
- " client = discoveryengine.DocumentServiceClient(client_options=client_options)\n",
- "\n",
- " # The full resource name of the search engine branch.\n",
- " # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}\n",
- " parent = client.branch_path(\n",
- " project=project_id,\n",
- " location=location,\n",
- " data_store=data_store_id,\n",
- " branch=\"default_branch\",\n",
- " )\n",
- "\n",
- " source_documents = [f\"{gcs_uri}/*\"]\n",
- "\n",
- " request = discoveryengine.ImportDocumentsRequest(\n",
- " parent=parent,\n",
- " gcs_source=discoveryengine.GcsSource(\n",
- " input_uris=source_documents, data_schema=\"content\"\n",
- " ),\n",
- " # Options: `FULL`, `INCREMENTAL`\n",
- " reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,\n",
- " )\n",
- "\n",
- " # Make the request\n",
- " operation = client.import_documents(request=request)\n",
- "\n",
- " response = operation.result()\n",
- "\n",
- " # Once the operation is complete,\n",
- " # get information from operation metadata\n",
- " metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)\n",
- "\n",
- " # Handle the response\n",
- " return operation.operation.name"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "id": "1ddfe66a-acb4-4fdb-b9ed-76a332bb0f0c",
- "metadata": {
- "id": "d6b868ba2342"
- },
- "outputs": [],
- "source": [
- "source_documents_gs_uri = (\n",
- " \"gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs\"\n",
- ")\n",
- "\n",
- "import_documents(PROJECT_ID, LOCATION, DATASTORE_ID, source_documents_gs_uri)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "7a957202-b67e-47ca-84c3-b8a62cfbe405",
- "metadata": {
- "id": "78abf9383982"
- },
- "source": [
- "## Create a Search Engine\n",
- "\n",
- "This is used to set the `search_tier` to enterprise and to enable advanced LLM features.\n",
- "\n",
- "Enterprise tier is required to get extractive answers from a search query and advanced LLM features are required to summarize search results."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 25,
- "id": "0e39e9bb-f381-44f6-a45e-3322669a171f",
- "metadata": {
- "id": "d48dce7f750e"
- },
- "outputs": [],
- "source": [
- "def create_engine(\n",
- " project_id: str, location: str, data_store_name: str, data_store_id: str\n",
- "):\n",
- " # Create a client\n",
- " client_options = (\n",
- " ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
- " if location != \"global\"\n",
- " else None\n",
- " )\n",
- " client = discoveryengine.EngineServiceClient(client_options=client_options)\n",
- "\n",
- " # Initialize request argument(s)\n",
- " config = discoveryengine.Engine.SearchEngineConfig(\n",
- " search_tier=\"SEARCH_TIER_ENTERPRISE\", search_add_ons=[\"SEARCH_ADD_ON_LLM\"]\n",
- " )\n",
- "\n",
- " engine = discoveryengine.Engine(\n",
- " display_name=data_store_name,\n",
- " solution_type=\"SOLUTION_TYPE_SEARCH\",\n",
- " industry_vertical=\"GENERIC\",\n",
- " data_store_ids=[data_store_id],\n",
- " search_engine_config=config,\n",
- " )\n",
- "\n",
- " request = discoveryengine.CreateEngineRequest(\n",
- " parent=discoveryengine.DataStoreServiceClient.collection_path(\n",
- " project_id, location, \"default_collection\"\n",
- " ),\n",
- " engine=engine,\n",
- " engine_id=engine.display_name,\n",
- " )\n",
- "\n",
- " # Make the request\n",
- " operation = client.create_engine(request=request)\n",
- " response = operation.result(timeout=90)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 26,
- "id": "4a853982-8c2e-402a-a808-5364bf932619",
- "metadata": {
- "id": "2094d759826c"
- },
- "outputs": [],
- "source": [
- "create_engine(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)"
- ]
- },
- {
- "cell_type": "markdown",
- "id": "e9f4d978-9164-4de3-b01a-179051706313",
- "metadata": {
- "id": "2fc1b872bb29"
- },
- "source": [
- "## Query your Datastore"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 27,
- "id": "1c4dfb62-7846-43d0-9cba-fd8886ce5546",
- "metadata": {
- "id": "cd25728cba55"
- },
- "outputs": [],
- "source": [
- "def search_sample(\n",
- " project_id: str,\n",
- " location: str,\n",
- " data_store_id: str,\n",
- " search_query: str,\n",
- ") -> list[discoveryengine.SearchResponse]:\n",
- " # For more information, refer to:\n",
- " # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store\n",
- " client_options = (\n",
- " ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
- " if LOCATION != \"global\"\n",
- " else None\n",
- " )\n",
- "\n",
- " # Create a client\n",
- " client = discoveryengine.SearchServiceClient(client_options=client_options)\n",
- "\n",
- " # The full resource name of the search engine serving config\n",
- " # e.g. projects/{project_id}/locations/{location}/dataStores/{data_store_id}/servingConfigs/{serving_config_id}\n",
- " serving_config = client.serving_config_path(\n",
- " project=project_id,\n",
- " location=location,\n",
- " data_store=data_store_id,\n",
- " serving_config=\"default_config\",\n",
- " )\n",
- "\n",
- " # Optional: Configuration options for search\n",
- " # Refer to the `ContentSearchSpec` reference for all supported fields:\n",
- " # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec\n",
- " content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(\n",
- " # For information about snippets, refer to:\n",
- " # https://cloud.google.com/generative-ai-app-builder/docs/snippets\n",
- " snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(\n",
- " return_snippet=True\n",
- " ),\n",
- " # For information about search summaries, refer to:\n",
- " # https://cloud.google.com/generative-ai-app-builder/docs/get-search-summaries\n",
- " summary_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(\n",
- " summary_result_count=5,\n",
- " include_citations=True,\n",
- " ignore_adversarial_query=True,\n",
- " ignore_non_summary_seeking_query=True,\n",
- " ),\n",
- " )\n",
- "\n",
- " # Refer to the `SearchRequest` reference for all supported fields:\n",
- " # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest\n",
- " request = discoveryengine.SearchRequest(\n",
- " serving_config=serving_config,\n",
- " query=search_query,\n",
- " page_size=10,\n",
- " content_search_spec=content_search_spec,\n",
- " query_expansion_spec=discoveryengine.SearchRequest.QueryExpansionSpec(\n",
- " condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,\n",
- " ),\n",
- " spell_correction_spec=discoveryengine.SearchRequest.SpellCorrectionSpec(\n",
- " mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO\n",
- " ),\n",
- " )\n",
- "\n",
- " response = client.search(request)\n",
- " return response"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 29,
- "id": "ad41e18b-38d2-4f4c-98ae-df14eda900ae",
- "metadata": {
- "id": "e6429a6f9333"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Sundar Pichai is the CEO of Google and Alphabet [1]. He has served as CEO of Google since 2015 and as CEO of Alphabet since 2019 [1].\n"
- ]
- }
- ],
- "source": [
- "query = \"Who is the CEO of Google?\"\n",
- "\n",
- "print(search_sample(PROJECT_ID, LOCATION, DATASTORE_ID, query).summary.summary_text)"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "name": "create_datastore_and_search.ipynb",
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- }
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "829cc6c6-f8c3-4fc6-942b-f698be5fa1a2"
+ },
+ "source": [
+ "# Create a Vertex AI Datastore and Search Engine\n",
+ "\n",
+ "\n",
+ "\n",
+ " \n",
+ " \n",
+ " Run in Colab\n",
+ " \n",
+ " | \n",
+ " \n",
+ " \n",
+ " \n",
+ " View on GitHub\n",
+ " \n",
+ " | \n",
+ " \n",
+ " \n",
+ " \n",
+ " Open in Vertex AI Workbench\n",
+ " \n",
+ " | \n",
+ "
"
+ ]
},
- "nbformat": 4,
- "nbformat_minor": 0
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0460bdbe-05db-42f8-822c-82195de8329a"
+ },
+ "source": [
+ "---\n",
+ "\n",
+ "* Author(s): [Kara Greenfield](https://github.com/kgreenfield2)\n",
+ "* Created: 22 Nov 2023\n",
+ "* Updated: 31 Oct 2024\n",
+ "\n",
+ "---\n",
+ "\n",
+ "## Objective\n",
+ "\n",
+ "This notebook shows how to create and populate a Vertex AI Search Datastore, how to create a search app connected to that datastore, and how to submit queries through the search engine.\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0b4b1050-7113-487e-aeaf-55690c831a1d"
+ },
+ "source": [
+ "Services used in the notebook:\n",
+ "\n",
+ "- ✅ Vertex AI Search for document search and retrieval"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "43828625-a130-449c-ba5f-6a948220f559"
+ },
+ "source": [
+ "## Install pre-requisites\n",
+ "\n",
+ "If running in Colab install the pre-requisites into the runtime. Otherwise it is assumed that the notebook is running in Vertex AI Workbench."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "id": "8ea5db8a-dccc-4442-b5d7-7088d5ffb5ac"
+ },
+ "outputs": [],
+ "source": [
+ "%pip install --upgrade --user -q google-cloud-discoveryengine"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "10f9e321"
+ },
+ "source": [
+ "### Restart current runtime\n",
+ "\n",
+ "To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "da755736"
+ },
+ "outputs": [],
+ "source": [
+ "# Restart kernel after installs so that your environment can access the new packages\n",
+ "\n",
+ "import IPython\n",
+ "\n",
+ "app = IPython.Application.instance()\n",
+ "app.kernel.do_shutdown(True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9c31dbe0"
+ },
+ "source": [
+ "\n",
+ "⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️\n",
+ "
\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "5f6a5de5-156e-4f14-99e6-3ab33e076c81"
+ },
+ "source": [
+ "## Authenticate\n",
+ "\n",
+ "If running in Colab, authenticate with `google.colab.auth`; otherwise the notebook is assumed to be running on Vertex AI Workbench."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "df3b7309-c51c-44a0-a466-b0b2733e0c28"
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "\n",
+ "if \"google.colab\" in sys.modules:\n",
+ " from google.colab import auth\n",
+ "\n",
+ " auth.authenticate_user()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "ae7a4925-145a-40f3-9fa1-3b69a42d488d"
+ },
+ "source": [
+ "## Configure notebook environment"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "389b9e51-1ce4-4bdf-80b5-fcdc1882f853"
+ },
+ "outputs": [],
+ "source": [
+ "from google.api_core.client_options import ClientOptions\n",
+ "from google.cloud import discoveryengine\n",
+ "\n",
+ "PROJECT_ID = \"YOUR_PROJECT_ID\" # @param {type:\"string\"}\n",
+ "LOCATION = \"global\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "HjoerqoksBdx"
+ },
+ "source": [
+ "Set [Application Default Credentials](https://cloud.google.com/docs/authentication/application-default-credentials)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "s-jH2yG1rtxn"
+ },
+ "outputs": [],
+ "source": [
+ "!gcloud auth application-default login --project {PROJECT_ID}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "07f1aecd-4633-4451-b5a7-1f26e4cb2631"
+ },
+ "source": [
+ "## Create and Populate a Datastore"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "9a1d0021-e299-49d4-a657-2e101ae49eb6"
+ },
+ "outputs": [],
+ "source": [
+ "def create_data_store(\n",
+ "    project_id: str, location: str, data_store_name: str, data_store_id: str\n",
+ "):\n",
+ "    \"\"\"Create a GENERIC, content-required data store in `default_collection`.\"\"\"\n",
+ "    client_options = (\n",
+ "        ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
+ "        if location != \"global\"\n",
+ "        else None\n",
+ "    )\n",
+ "    client = discoveryengine.DataStoreServiceClient(client_options=client_options)\n",
+ "\n",
+ "    # Initialize request argument(s)\n",
+ "    data_store = discoveryengine.DataStore(\n",
+ "        display_name=data_store_name,\n",
+ "        industry_vertical=discoveryengine.IndustryVertical.GENERIC,\n",
+ "        content_config=discoveryengine.DataStore.ContentConfig.CONTENT_REQUIRED,\n",
+ "    )\n",
+ "\n",
+ "    operation = client.create_data_store(\n",
+ "        request=discoveryengine.CreateDataStoreRequest(\n",
+ "            parent=client.collection_path(\n",
+ "                project_id, location, \"default_collection\"\n",
+ "            ),\n",
+ "            data_store=data_store,\n",
+ "            data_store_id=data_store_id,\n",
+ "        )\n",
+ "    )\n",
+ "\n",
+ "    # Wait up to 90 seconds for the long-running operation. Data store creation can take\n",
+ "    # longer than that, so report the error (with its details) instead of halting the notebook.\n",
+ "    try:\n",
+ "        operation.result(timeout=90)\n",
+ "    except Exception as exc:\n",
+ "        print(f\"long-running operation error: {exc!r}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "8a4db726-8da1-4c76-8934-944aaf5f9b53"
+ },
+ "outputs": [],
+ "source": [
+ "# The datastore name can only contain lowercase letters, numbers, and hyphens\n",
+ "DATASTORE_NAME = \"alphabet-contracts\"\n",
+ "DATASTORE_ID = f\"{DATASTORE_NAME}-id\"\n",
+ "\n",
+ "create_data_store(PROJECT_ID, LOCATION, DATASTORE_NAME, DATASTORE_ID)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "03121270-5d2f-403b-81ea-c1d241357bd1"
+ },
+ "outputs": [],
+ "source": [
+ "def import_documents(\n",
+ "    project_id: str,\n",
+ "    location: str,\n",
+ "    data_store_id: str,\n",
+ "    gcs_uri: str,\n",
+ ") -> str:\n",
+ "    \"\"\"Import every object under `gcs_uri` into the data store's default branch.\n",
+ "\n",
+ "    Blocks until the import operation finishes, then returns the operation name.\n",
+ "    \"\"\"\n",
+ "    client_options = (\n",
+ "        ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
+ "        if location != \"global\"\n",
+ "        else None\n",
+ "    )\n",
+ "    client = discoveryengine.DocumentServiceClient(client_options=client_options)\n",
+ "\n",
+ "    # The full resource name of the search engine branch.\n",
+ "    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}\n",
+ "    parent = client.branch_path(\n",
+ "        project=project_id,\n",
+ "        location=location,\n",
+ "        data_store=data_store_id,\n",
+ "        branch=\"default_branch\",\n",
+ "    )\n",
+ "\n",
+ "    request = discoveryengine.ImportDocumentsRequest(\n",
+ "        parent=parent,\n",
+ "        gcs_source=discoveryengine.GcsSource(\n",
+ "            # The trailing wildcard selects every object under the given prefix.\n",
+ "            input_uris=[f\"{gcs_uri}/*\"],\n",
+ "            data_schema=\"content\",\n",
+ "        ),\n",
+ "        # Options: `FULL`, `INCREMENTAL`\n",
+ "        reconciliation_mode=discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,\n",
+ "    )\n",
+ "\n",
+ "    # Start the import; the client returns a long-running operation.\n",
+ "    operation = client.import_documents(request=request)\n",
+ "\n",
+ "    # Block until the import operation has completed.\n",
+ "    operation.result()\n",
+ "\n",
+ "    # Return the operation name so the import can be looked up later.\n",
+ "    return operation.operation.name"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "id": "1ddfe66a-acb4-4fdb-b9ed-76a332bb0f0c"
+ },
+ "outputs": [],
+ "source": [
+ "source_documents_gs_uri = (\n",
+ " \"gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs\"\n",
+ ")\n",
+ "\n",
+ "import_documents(PROJECT_ID, LOCATION, DATASTORE_ID, source_documents_gs_uri)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "7a957202-b67e-47ca-84c3-b8a62cfbe405"
+ },
+ "source": [
+ "## Create a Search Engine\n",
+ "\n",
+ "This is used to set the `search_tier` to enterprise and to enable advanced LLM features.\n",
+ "\n",
+ "Enterprise tier is required to get extractive answers from a search query and advanced LLM features are required to summarize search results."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {
+ "id": "0e39e9bb-f381-44f6-a45e-3322669a171f"
+ },
+ "outputs": [],
+ "source": [
+ "def create_engine(\n",
+ "    project_id: str, location: str, engine_name: str, engine_id: str, data_store_id: str\n",
+ "):\n",
+ "    \"\"\"Create an Enterprise-tier search engine with the LLM add-on over one data store.\"\"\"\n",
+ "    client_options = (\n",
+ "        ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
+ "        if location != \"global\"\n",
+ "        else None\n",
+ "    )\n",
+ "    client = discoveryengine.EngineServiceClient(client_options=client_options)\n",
+ "\n",
+ "    # Initialize request argument(s)\n",
+ "    engine = discoveryengine.Engine(\n",
+ "        display_name=engine_name,\n",
+ "        solution_type=discoveryengine.SolutionType.SOLUTION_TYPE_SEARCH,\n",
+ "        industry_vertical=discoveryengine.IndustryVertical.GENERIC,\n",
+ "        data_store_ids=[data_store_id],\n",
+ "        search_engine_config=discoveryengine.Engine.SearchEngineConfig(\n",
+ "            search_tier=discoveryengine.SearchTier.SEARCH_TIER_ENTERPRISE,\n",
+ "            search_add_ons=[discoveryengine.SearchAddOn.SEARCH_ADD_ON_LLM],\n",
+ "        ),\n",
+ "    )\n",
+ "\n",
+ "    request = discoveryengine.CreateEngineRequest(\n",
+ "        parent=client.collection_path(project_id, location, \"default_collection\"),\n",
+ "        engine=engine,\n",
+ "        engine_id=engine_id,\n",
+ "    )\n",
+ "\n",
+ "    # Make the request and wait up to 90 seconds for the engine to be created\n",
+ "    operation = client.create_engine(request=request)\n",
+ "    operation.result(timeout=90)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "id": "4a853982-8c2e-402a-a808-5364bf932619"
+ },
+ "outputs": [],
+ "source": [
+ "ENGINE_NAME = DATASTORE_NAME\n",
+ "ENGINE_ID = DATASTORE_ID\n",
+ "create_engine(PROJECT_ID, LOCATION, ENGINE_NAME, ENGINE_ID, DATASTORE_ID)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "e9f4d978-9164-4de3-b01a-179051706313"
+ },
+ "source": [
+ "## Query your Search Engine\n",
+ "\n",
+ "Note: The Engine will take some time to be ready to query.\n",
+ "\n",
+ "If you recently created an engine and you receive an error similar to:\n",
+ "\n",
+ "`404 Engine {ENGINE_NAME} is not found`\n",
+ "\n",
+ "Then wait a few minutes and try your query again."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {
+ "id": "1c4dfb62-7846-43d0-9cba-fd8886ce5546"
+ },
+ "outputs": [],
+ "source": [
+ "def search_sample(\n",
+ "    project_id: str,\n",
+ "    location: str,\n",
+ "    engine_id: str,\n",
+ "    search_query: str,\n",
+ ") -> list[discoveryengine.SearchResponse]:\n",
+ "    \"\"\"Query the engine's `default_search` serving config, requesting snippets and a summary.\"\"\"\n",
+ "    # Regional endpoints: https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store\n",
+ "    client_options = (\n",
+ "        ClientOptions(api_endpoint=f\"{location}-discoveryengine.googleapis.com\")\n",
+ "        if location != \"global\"\n",
+ "        else None\n",
+ "    )\n",
+ "\n",
+ "    # Create a client\n",
+ "    client = discoveryengine.SearchServiceClient(client_options=client_options)\n",
+ "\n",
+ "    # The full resource name of the search engine serving config\n",
+ "    # e.g. projects/{project_id}/locations/{location}/collections/default_collection/engines/{engine_id}/servingConfigs/{serving_config_id}\n",
+ "    serving_config = f\"projects/{project_id}/locations/{location}/collections/default_collection/engines/{engine_id}/servingConfigs/default_search\"\n",
+ "\n",
+ "    # Optional: Configuration options for search\n",
+ "    # Refer to the `ContentSearchSpec` reference for all supported fields:\n",
+ "    # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest.ContentSearchSpec\n",
+ "    content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(\n",
+ "        # For information about snippets, refer to:\n",
+ "        # https://cloud.google.com/generative-ai-app-builder/docs/snippets\n",
+ "        snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(\n",
+ "            return_snippet=True\n",
+ "        ),\n",
+ "        # For information about search summaries, refer to:\n",
+ "        # https://cloud.google.com/generative-ai-app-builder/docs/get-search-summaries\n",
+ "        summary_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(\n",
+ "            summary_result_count=5,\n",
+ "            include_citations=True,\n",
+ "            ignore_adversarial_query=True,\n",
+ "            ignore_non_summary_seeking_query=True,\n",
+ "        ),\n",
+ "    )\n",
+ "\n",
+ "    # Refer to the `SearchRequest` reference for all supported fields:\n",
+ "    # https://cloud.google.com/python/docs/reference/discoveryengine/latest/google.cloud.discoveryengine_v1.types.SearchRequest\n",
+ "    request = discoveryengine.SearchRequest(\n",
+ "        serving_config=serving_config,\n",
+ "        query=search_query,\n",
+ "        page_size=10,\n",
+ "        content_search_spec=content_search_spec,\n",
+ "        query_expansion_spec=discoveryengine.SearchRequest.QueryExpansionSpec(\n",
+ "            condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,\n",
+ "        ),\n",
+ "        spell_correction_spec=discoveryengine.SearchRequest.SpellCorrectionSpec(\n",
+ "            mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO\n",
+ "        ),\n",
+ "    )\n",
+ "\n",
+ "    response = client.search(request)\n",
+ "    return response"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "id": "ad41e18b-38d2-4f4c-98ae-df14eda900ae"
+ },
+ "outputs": [],
+ "source": [
+ "query = \"Who is the CEO of Google?\"\n",
+ "\n",
+ "response = search_sample(PROJECT_ID, LOCATION, ENGINE_ID, query)\n",
+ "print(response.summary.summary_text)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "name": "create_datastore_and_search.ipynb",
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "name": "python3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
}