From 562be3ff6b6e5c2a71558bb2204d1a8a43f860bd Mon Sep 17 00:00:00 2001 From: Eric Schmidt Date: Sun, 23 Jul 2023 13:26:38 -0700 Subject: [PATCH 1/4] feat: final polish of notebook; clean up --- main.tf | 1 + notebook/gen_ai_jss.ipynb | 501 ++++++++++++++++++++++++++++++++---- webhook/document_extract.py | 5 +- webhook/main.py | 22 -- webhook/vertex_llm.py | 7 +- 5 files changed, 455 insertions(+), 81 deletions(-) diff --git a/main.tf b/main.tf index 70b51d8..1413930 100644 --- a/main.tf +++ b/main.tf @@ -34,6 +34,7 @@ module "project_services" { "cloudbuild.googleapis.com", "run.googleapis.com", "iam.googleapis.com", + "notebooks.googleapis.com", ] } diff --git a/notebook/gen_ai_jss.ipynb b/notebook/gen_ai_jss.ipynb index 20094ff..f86f3e0 100644 --- a/notebook/gen_ai_jss.ipynb +++ b/notebook/gen_ai_jss.ipynb @@ -45,13 +45,7 @@ " \"GitHub\n", " View on GitHub\n", " \n", - " \n", - " \n", - " \n", - " \"Vertex\n", - " Open in Vertex AI Workbench\n", - " \n", - " \n", + " \n", "" ] }, @@ -76,7 +70,7 @@ "source": [ "## Overview\n", "\n", - "This notebook is a companion to the [Generative AI Document Summarization Jump Start Solution](https://cloud.google.com/blog/products/application-modernization/introducing-google-cloud-jump-start-solutions) **TODO: better link target**. With this notebook, you can use the summarization solution to create summaries of academic PDF files. In the notebook, you will programmatically upload a PDF file to a Cloud Storage bucket and then view the summary of that PDF in a BigQuery table. \n", + "This notebook is a companion to the [Generative AI Document Summarization Jump Start Solution](https://cloud.google.com/architecture/ai-ml/generative-ai-document-summarization). With this notebook, you can use the summarization solution to create summaries of academic PDF files. In the notebook, you will programmatically upload a PDF file to a Cloud Storage bucket and then view the summary of that PDF in a BigQuery table. 
\n", "\n", "+ Learn more about [using text chat LLM with Vertex AI](https://cloud.google.com/vertex-ai/docs/generative-ai/learn/overview).\n", "+ Learn more about [querying tables in Cloud BigQuery](https://cloud.google.com/bigquery/docs/tables).\n", @@ -94,7 +88,7 @@ "source": [ "### Objective\n", "\n", - "In this tutorial, you learn how to create a Cloud Function process that transcribes characters from a PDF, stores the complete PDF text in a Storage bucket, summarizes the PDF, and then upserts the document data (summary, complete text, URI) into a BigQuery table.\n", + "In this tutorial, you learn how to trigger a Cloud Function process that transcribes characters from a PDF, stores the complete PDF text in a Storage bucket, summarizes the PDF, and then upserts the document data (summary, complete text, URI) into a BigQuery table.\n", "\n", "This tutorial uses the following Google Cloud services and resources:\n", "\n", @@ -177,6 +171,7 @@ "google-cloud-bigquery\n", "google-cloud-logging\n", "google-cloud-storage\n", + "google-cloud-vision\n", "polling2\n", "tqdm" ] @@ -206,7 +201,7 @@ "id": "58707a750154" }, "source": [ - "### Colab only: Uncomment the following cell to restart the kernel." + "### Restart the kernel" ] }, { @@ -217,11 +212,11 @@ }, "outputs": [], "source": [ - "# # Automatically restart kernel after installs so that your environment can access the new packages\n", - "# import IPython\n", + "# Automatically restart kernel after installs so that your environment can access the new packages\n", + "import IPython\n", "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" + "app = IPython.Application.instance()\n", + "app.kernel.do_shutdown(True)" ] }, { @@ -248,7 +243,7 @@ " - [Cloud Vision API](https://console.cloud.google.com/flows/enableapi?apiid=vision.googleapis.com)\n", "\n", "\n", - "
Note: It is recommended to run this notebook from Vertex AI Workbench. If you are running this notebook locally instead, you need to install the Cloud SDK.
" + "
Note: It is recommended to run this notebook from Google Colaboratory. If you are running this notebook locally instead, you need to install the Cloud SDK.
" ] }, { @@ -319,54 +314,43 @@ "attachments": {}, "cell_type": "markdown", "metadata": { - "id": "74ccc9e52986" - }, - "source": [ - "**1. Vertex AI Workbench**\n", - "* Do nothing as you are already authenticated." - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "de775a3773ba" + "id": "ef21552ccea8" }, "source": [ - "**2. Local JupyterLab instance, uncomment and run:**" + "**1. Colab, uncomment and run**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "254614fa0c46" + "id": "603adbbf0532" }, "outputs": [], "source": [ - "# ! gcloud auth login" + "# from google.colab import auth\n", + "# auth.authenticate_user()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": { - "id": "ef21552ccea8" + "id": "de775a3773ba" }, "source": [ - "**3. Colab, uncomment and run:**" + "**2. Local JupyterLab instance, uncomment and run:**" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "603adbbf0532" + "id": "254614fa0c46" }, "outputs": [], "source": [ - "# from google.colab import auth\n", - "# auth.authenticate_user()" + "# ! gcloud auth login" ] }, { @@ -376,7 +360,7 @@ "id": "f6b2ccc891ed" }, "source": [ - "**4. Service account or other**\n", + "**3. Service account or other**\n", "* See how to grant Cloud Storage permissions to your service account at https://cloud.google.com/storage/docs/gsutil/commands/iam#ch-examples." 
] }, @@ -398,16 +382,21 @@ }, "outputs": [], "source": [ + "from datetime import datetime\n", "import os\n", "import polling2\n", "import re\n", - "import time\n", "\n", + "from typing import Sequence, Mapping\n", "from tqdm.notebook import tqdm\n", "from google.cloud import aiplatform\n", "from google.cloud import bigquery\n", "from google.cloud import logging\n", - "from google.cloud import storage" + "from google.cloud import storage\n", + "from google.cloud import vision\n", + "\n", + "import vertexai\n", + "from vertexai.preview.language_models import TextGenerationModel\n" ] }, { @@ -417,7 +406,7 @@ "source": [ "## Download test data\n", "\n", - "This Jump Start Solution uses data from [arXiv.org](https://arxiv.org/) to demonstrate the summarization capabilities of Vertex AI. arXiv, through [Kaggle.com](https://www.kaggle.com/datasets/Cornell-University/arxiv) has made many scholarly papers available, free of charge, from a Google Cloud Storage bucket." + "This Jump Start Solution uses data from [arXiv.org](https://arxiv.org/) to demonstrate the summarization capabilities of Vertex AI. [Kaggle.com](https://www.kaggle.com/datasets/Cornell-University/arxiv) has made many arXiv.org scholarly papers available, free of charge, from a Google Cloud Storage bucket." ] }, { @@ -430,6 +419,13 @@ "! gsutil ls gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One of the available arXiv.org papers has been selected for you in the following cell. You can swap out the selected paper with another from the same source. Be sure to choose a paper that is single-column format." + ] + }, { "cell_type": "code", "execution_count": null, @@ -453,7 +449,7 @@ "source": [ "## Upload test data to Storage bucket\n", "\n", - "The Terraform scripts for this JSS applies an EventArc trigger to a Cloud Storage bucket. When a PDF is uploaded to the storage bucket, the EventArc trigger fires, starting the summarization process." 
+ "The Terraform scripts for this solution apply an EventArc trigger to a Cloud Storage bucket. When a PDF is uploaded to the storage bucket, the EventArc trigger fires, starting the summarization process." ] }, { @@ -470,7 +466,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Running the next cell uploads a local PDF file (downloaded in the previous section) to the target Cloud Storage bucket. " + "Running the following cell uploads a local PDF file (downloaded in the previous section) to the target Cloud Storage bucket. " ] }, { @@ -481,15 +477,7 @@ "source": [ "file_complete_text = f'{filename}_summary.txt'\n", "pdf = f'pdfs/{filename}.pdf'\n", - "logger_name = 'summarization-by-llm'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ + "\n", "storage_client = storage.Client()\n", "bucket = storage_client.bucket(INPUT_BUCKET)\n", "blob = bucket.blob(pdf)\n", @@ -501,9 +489,9 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "This upload process kicks off the summarization process. You can view the progress of the summarization process in the [Cloud Console](https://console.cloud.google.com/functions/details/us-central1/jss16-1).\n", + "This upload process kicks off the summarization process. You can view the progress of the summarization process in the [Logs Explorer](https://console.cloud.google.com/logs/query) for your project.\n", "\n", - "**TODO: Ensure that Cloud Console links go to correct console locations.**" + "To filter the logs, click the **Log name** drop-down menu and type \"summarization-by-llm\". Select the \"summarization-by-llm\" logger in the menu and then click apply to close the drop-down."
] }, { @@ -522,6 +510,8 @@ "metadata": {}, "outputs": [], "source": [ + "logger_name = 'summarization-by-llm'\n", + "\n", "@polling2.poll_decorator(check_success=lambda x: x != '', step=0.5, timeout=90)\n", "def get_cloud_event_id(pdf_filename, bar):\n", " logging_client = logging.Client(project=PROJECT_ID)\n", @@ -558,7 +548,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Now that we have the `cloud_event_id`, we can filter on just this cloud event and get updates for just this event." + "Now that we have the `cloud_event_id`, we can filter for this cloud event." ] }, { @@ -643,6 +633,411 @@ "\n", "summary['summary']" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Optional: Run pipeline components individually\n", + "\n", + "The summarization pipeline is composed of multiple independent components. There is a component that performs optical character recognition on the PDF, another that stores data in a Storage bucket, another that performs summarization with a LLM, and yet another that stores new rows into the BigQuery table.\n", + "\n", + "In this section, you can run each component individually to understand how they work together." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Perform OCR with Cloud Vision\n", + "\n", + "The first component in the pipeline performs optical character recognition (OCR) using Cloud Vision. Run the following cells to run optical character recognition on the PDF file you downloaded previously." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def document_extract(\n", + " bucket: str,\n", + " name: str,\n", + " output_bucket: str,\n", + " timeout: int = 420,\n", + ") -> str:\n", + " \"\"\"Perform OCR with PDF/TIFF as source files on GCS.\n", + "\n", + " Original sample is here:\n", + " https://github.com/GoogleCloudPlatform/python-docs-samples/blob/main/vision/snippets/detect/detect.py#L806\n", + "\n", + " Note: This function can cause the IOPub data rate to be exceeded on a\n", + " Jupyter server. This rate can be changed by setting the variable\n", + " `--ServerApp.iopub_data_rate_limit\n", + "\n", + " Args:\n", + " bucket (str): GCS URI of the bucket containing the PDF/TIFF files.\n", + " name (str): name of the PDF/TIFF file.\n", + " output_bucket: bucket to store output in\n", + " timeout (int): Timeout in seconds for the request.\n", + "\n", + "\n", + " Returns:\n", + " str: the complete text\n", + " \"\"\"\n", + "\n", + " gcs_source_uri = f\"gs://{bucket}/{name}\"\n", + " prefix = \"ocr\"\n", + " gcs_destination_uri = f\"gs://{output_bucket}/{prefix}/\"\n", + " mime_type = \"application/pdf\"\n", + " batch_size = 2\n", + "\n", + " # Perform Vision OCR\n", + " client = vision.ImageAnnotatorClient()\n", + "\n", + " feature = vision.Feature(type_=vision.Feature.Type.DOCUMENT_TEXT_DETECTION)\n", + "\n", + " gcs_source = vision.GcsSource(uri=gcs_source_uri)\n", + " input_config = vision.InputConfig(gcs_source=gcs_source, mime_type=mime_type)\n", + "\n", + " gcs_destination = vision.GcsDestination(uri=gcs_destination_uri)\n", + " output_config = vision.OutputConfig(\n", + " gcs_destination=gcs_destination, batch_size=batch_size\n", + " )\n", + "\n", + " async_request = vision.AsyncAnnotateFileRequest(\n", + " features=[feature], input_config=input_config, output_config=output_config\n", + " )\n", + "\n", + " operation = client.async_batch_annotate_files(requests=[async_request])\n", + 
"\n", + " print(\"OCR: waiting for the operation to finish.\")\n", + " operation.result(timeout=timeout)\n", + "\n", + " # Once the request has completed and the output has been\n", + " # written to GCS, we can list all the output files.\n", + " return get_ocr_output_from_bucket(gcs_destination_uri, output_bucket)\n", + "\n", + "\n", + "def get_ocr_output_from_bucket(gcs_destination_uri: str, bucket_name: str) -> str:\n", + " \"\"\"Iterates over blobs in output bucket to get full OCR result.\n", + "\n", + " Arguments:\n", + " gcs_destination_uri: the URI where the OCR output was saved.\n", + " bucket_name: the name of the bucket where the output was saved.\n", + "\n", + " Returns:\n", + " The full text of the document\n", + " \"\"\"\n", + " storage_client = storage.Client()\n", + "\n", + " match = re.match(r\"gs://([^/]+)/(.+)\", gcs_destination_uri)\n", + " prefix = match.group(2)\n", + " bucket = storage_client.get_bucket(bucket_name)\n", + "\n", + " # List objects with the given prefix, filtering out folders.\n", + " blob_list = [\n", + " blob\n", + " for blob in list(bucket.list_blobs(prefix=prefix))\n", + " if not blob.name.endswith(\"/\")\n", + " ]\n", + "\n", + " # Concatenate all text from the blobs\n", + " complete_text = \"\"\n", + " for output in blob_list:\n", + " json_string = output.download_as_bytes().decode(\"utf-8\")\n", + " response = json.loads(json_string)\n", + "\n", + " # The actual response for the first page of the input file.\n", + " page_response = response[\"responses\"][0]\n", + " annotation = page_response[\"fullTextAnnotation\"]\n", + "\n", + " complete_text = complete_text + annotation[\"text\"]\n", + "\n", + " return complete_text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "bucket = \"arxiv-dataset\"\n", + "pdf_name = \"arxiv/cmp-lg/pdf/9404/9404002v1.pdf\"\n", + "output_bucket = f\"{PROJECT_ID}_output\"\n", + "\n", + "complete_text = 
document_extract(bucket=bucket,\n", + " name=pdf_name,\n", + " output_bucket=output_bucket)\n", + "\n", + "# Entire text is long; print just first 1000 characters\n", + "print(complete_text[:1000])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summarize with Vertex AI LLM\n", + "\n", + "Next, you can send the complete text of the PDF to be summarized. Vertex AI allows you to use many different types of LLM models. In this case, you use a LLM model designed for text summarization, `text-bison@001`. You send a prediction request to Vertex AI, providing the name of the LLM you want to use. The Vertex AI service then sends the model's response back to you. In the following cells, the Python SDK for Vertex AI provides all of the helper methods and classes you need to perform this process.\n", + "\n", + "Note that Vertex AI predictions have a limit of characters that can be sent in a request payload. For this reason, a heuristic is needed to isolate only certain text blocks that you need--the abstract and the conclusion." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def predict_large_language_model(\n", + " project_id: str,\n", + " model_name: str,\n", + " temperature: float,\n", + " max_decode_steps: int,\n", + " top_p: float,\n", + " top_k: int,\n", + " content: str,\n", + " location: str = \"us-central1\",\n", + " tuned_model_name: str = \"\",\n", + ") -> str:\n", + " \"\"\"Predict using a Large Language Model.\n", + "\n", + " Args:\n", + " project_id (str): the Google Cloud project ID\n", + " model_name (str): the name of the LLM model to use\n", + " temperature (float): controls the randomness of predictions\n", + " max_decode_steps (int): the maximum number of decode steps\n", + " top_p (float): cumulative probability of parameter highest vocabulary tokens\n", + " top_k (int): number of highest propbability vocabulary tokens to keep for top-k-filtering\n", + " content (str): the text to summarize\n", + " location (str): the Google Cloud region to run in\n", + " tuned_model_name (str): a tuned LLM model to use; default is none\n", + "\n", + " Returns:\n", + " The summarization of the content\n", + " \"\"\"\n", + " vertexai.init(\n", + " project=project_id,\n", + " location=location,\n", + " )\n", + "\n", + " model = TextGenerationModel.from_pretrained(model_name)\n", + " if tuned_model_name:\n", + " model = model.get_tuned_model(tuned_model_name)\n", + " response = model.predict(\n", + " content,\n", + " temperature=temperature,\n", + " max_output_tokens=max_decode_steps,\n", + " top_k=top_k,\n", + " top_p=top_p,\n", + " )\n", + " return response.text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def truncate_complete_text(complete_text: str) -> str:\n", + " \"\"\"Extracts the abstract and conclusion from an academic paper.\n", + "\n", + " Uses a heuristics to approximate the extent of the abstract and conclusion.\n", + " For abstract: 
assumes beginning after the string `abstract` and extends for 6-7 sentences\n", + " For conclusion: assumes beginning after the string `conclusion` and extends for 7-9 sentences\n", + "\n", + " Args:\n", + " complete_text (str): the complete text of the academic paper\n", + "\n", + " Returns\n", + " str: the truncated paper\n", + " \"\"\"\n", + " complete_text = complete_text.lower()\n", + " abstract_start = complete_text.find(ABSTRACT_H1)\n", + " conclusion_start = complete_text.find(CONCLUSION_H1)\n", + "\n", + " abstract = complete_text[abstract_start:ABSTRACT_LENGTH]\n", + " conclusion = complete_text[conclusion_start:]\n", + " if len(conclusion) > CONCLUSION_LENGTH:\n", + " conclusion = conclusion[:CONCLUSION_LENGTH]\n", + "\n", + " return f\"\"\"\n", + " Abstract: {abstract}\n", + "\n", + " Conclusion: {conclusion}\n", + " \"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model_name = \"text-bison@001\"\n", + "temperature = 0.2\n", + "max_decode_steps = 1024\n", + "top_p = 0.8\n", + "top_k = 40\n", + "\n", + "extracted_text_trunc = truncate_complete_text(complete_text=complete_text)\n", + "content = f\"Summarize:\\n{extracted_text_trunc}\",\n", + "\n", + "summary = predict_large_language_model(\n", + " project_id=PROJECT_ID,\n", + " model_name=model_name,\n", + " temperature=temperature,\n", + " top_p=top_p,\n", + " top_k=top_k,\n", + " max_decode_steps=max_decode_steps,\n", + " content=content)\n", + "\n", + "print(summary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Store summary in Cloud Storage\n", + "\n", + "The output from multiple steps in the summarization process is stored in Cloud Storage. The following cells save the summarization text as a TXT file in a Storage bucket."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def upload_to_gcs(bucket: str, name: str, data: str):\n", + " \"\"\"Upload a string to Google Cloud Storage bucket.\n", + "\n", + " Args:\n", + " bucket (str): the name of the Storage bucket. Do not include \"gs://\"\n", + " name (str): the name of the file to create in the bucket\n", + " data (str): the data to store\n", + "\n", + " \"\"\"\n", + " client = storage.Client()\n", + " bucket = client.get_bucket(bucket)\n", + " blob = bucket.blob(name)\n", + " blob.upload_from_string(data)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "summary_text_filename = \"summaries/manual.txt\"\n", + "\n", + "upload_to_gcs(bucket=output_bucket, name=summary_text_filename, data=summary)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Upsert data into BigQuery\n", + "\n", + "Now that you have the summary for the file, you can update the BigQuery table that contains all of the file summaries.\n", + "\n", + "The following cells update the BigQuery table in your project named `summary_dataset.summary_table` with the summaries created by Vertex LLM."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def write_summarization_to_table(\n", + " project_id: str,\n", + " dataset_id: str,\n", + " table_id: str,\n", + " bucket: str,\n", + " filename: str,\n", + " complete_text: str,\n", + " complete_text_uri: str,\n", + " summary: str,\n", + " summary_uri: str,\n", + " timestamp: datetime,\n", + ") -> Sequence[Mapping]:\n", + " \"\"\"Updates the BigQuery table with the document summarization\n", + "\n", + " Original sample is here:\n", + " https://cloud.google.com/bigquery/docs/samples/bigquery-table-insert-rows-explicit-none-insert-ids\n", + "\n", + " Args:\n", + " project_id (str): the Google Cloud project ID\n", + " dataset_id (str): the name of the BigQuery dataset\n", + " table_id (str): the name of the BigQuery table\n", + " bucket (str): the name of the bucket with the PDF\n", + " filename (str): path of PDF relative to bucket root\n", + " complete_text (str): the complete text of the PDF\n", + " complete_text_uri (str): the Storage URI of the complete TXT document\n", + " summary (str): the text summary of the document\n", + " summary_uri (str): the Storage URI of the summary TXT document\n", + " timestamp (datetime): when the processing occurred\n", + " \"\"\"\n", + " client = bigquery.Client()\n", + "\n", + " table_name = f\"{project_id}.{dataset_id}.{table_id}\"\n", + "\n", + " rows_to_insert = [\n", + " {\n", + " \"bucket\": bucket,\n", + " \"filename\": filename,\n", + " \"extracted_text\": complete_text,\n", + " \"summary_uri\": summary_uri,\n", + " \"summary\": summary,\n", + " \"complete_text_uri\": complete_text_uri,\n", + " \"timestamp\": timestamp.isoformat(),\n", + " }\n", + " ]\n", + "\n", + " errors = client.insert_rows_json(\n", + " table_name, rows_to_insert, row_ids=bigquery.AutoRowIDs.GENERATE_UUID\n", + " )\n", + " if errors != []:\n", + " logging_client = logging.Client()\n", + " logger = logging_client.logger(logger_name)\n", 
+ " logger.log(\n", + " f\"Encountered errors while inserting rows: {errors}\", severity=\"ERROR\"\n", + " )\n", + " return errors\n", + "\n", + " return []" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dataset_id = \"summary_dataset\"\n", + "table_id = \"summary_table\"\n", + "bucket = \"gs://arxiv-dataset\"\n", + "\n", + "errors = write_summarization_to_table(\n", + " dataset_id=dataset_id,\n", + " table_id=table_id,\n", + " bucket=bucket,\n", + " filename=pdf_name,\n", + " complete_text=complete_text,\n", + " complete_text_uri=\"\",\n", + " summary=summary,\n", + " summary_uri=f\"gs://{output_bucket}/{summary_text_filename}\",\n", + " timestamp=datetime().isoformat(),\n", + ")" + ] } ], "metadata": { diff --git a/webhook/document_extract.py b/webhook/document_extract.py index 363e1c0..12c7a88 100644 --- a/webhook/document_extract.py +++ b/webhook/document_extract.py @@ -69,7 +69,7 @@ def async_document_extract( operation = client.async_batch_annotate_files(requests=[async_request]) - print("Waiting for the operation to finish.") + print("OCR: waiting for the operation to finish.") operation.result(timeout=timeout) # Once the request has completed and the output has been @@ -84,7 +84,8 @@ def get_ocr_output_from_bucket(gcs_destination_uri: str, bucket_name: str) -> st gcs_destination_uri: the URI where the OCR output was saved. bucket_name: the name of the bucket where the output was saved. - Returns the full text of the document. 
+ Returns: + The full text of the document """ storage_client = storage.Client() diff --git a/webhook/main.py b/webhook/main.py index 90eb940..ce4d9e1 100644 --- a/webhook/main.py +++ b/webhook/main.py @@ -50,28 +50,6 @@ def default_marshaller(o: object) -> str: return str(o) -def summarize_text(text: str, parameters: None | dict[str, int | float] = None) -> str: - """Summarization Example with a Large Language Model""" - vertexai.init( - project=_PROJECT_ID, - location=_LOCATION, - ) - - final_parameters = _DEFAULT_PARAMETERS.copy() - if parameters: - final_parameters.update(parameters) - - model = TextGenerationModel.from_pretrained("text-bison@001") - response = model.predict( - f"Provide a summary with about two sentences for the following article: {text}\n" - "Summary:", - **final_parameters, - ) - print(f"Response from Model: {response.text}") - - return response.text - - def redirect_and_reply(previous_data): endpoint = f'https://{_LOCATION}-{_PROJECT_ID}.cloudfunctions.net/{os.environ["K_SERVICE"]}' logging_client = logging.Client() diff --git a/webhook/vertex_llm.py b/webhook/vertex_llm.py index 2d4ec88..549a875 100644 --- a/webhook/vertex_llm.py +++ b/webhook/vertex_llm.py @@ -33,13 +33,12 @@ def predict_large_language_model( project_id (str): the Google Cloud project ID model_name (str): the name of the LLM model to use temperature (float): controls the randomness of predictions - max_decode_steps (int): TODO(nicain) + max_decode_steps (int): the maximum number of decode steps top_p (float): cumulative probability of parameter highest vocabulary tokens top_k (int): number of highest propbability vocabulary tokens to keep for top-k-filtering content (str): the text to summarize location (str): the Google Cloud region to run in - tuned_model_name (str): TODO(nicain) - credentials: TODO(nicain) + tuned_model_name (str): the LLM model to use Returns: The summarization of the content @@ -48,7 +47,7 @@ def predict_large_language_model( project=project_id, 
location=location, ) - print("FOO", vertexai.init) + model = TextGenerationModel.from_pretrained(model_name) if tuned_model_name: model = model.get_tuned_model(tuned_model_name) From 66447c58693b7e4c0214951f18cebd1e98225cff Mon Sep 17 00:00:00 2001 From: Eric Schmidt Date: Sun, 23 Jul 2023 14:49:29 -0700 Subject: [PATCH 2/4] notebook tested locally --- main.tf | 1 + notebook/gen_ai_jss.ipynb | 429 ++++++++++++++++++++++++++++++++------ 2 files changed, 371 insertions(+), 59 deletions(-) diff --git a/main.tf b/main.tf index 1413930..1924420 100644 --- a/main.tf +++ b/main.tf @@ -35,6 +35,7 @@ module "project_services" { "run.googleapis.com", "iam.googleapis.com", "notebooks.googleapis.com", + "dataform.googleapis.com", ] } diff --git a/notebook/gen_ai_jss.ipynb b/notebook/gen_ai_jss.ipynb index f86f3e0..530413b 100644 --- a/notebook/gen_ai_jss.ipynb +++ b/notebook/gen_ai_jss.ipynb @@ -36,12 +36,12 @@ "\n", "\n", " \n", "
\n", - " \n", + " \n", " \"Colab Run in Colab\n", " \n", " \n", - " \n", + " \n", " \"GitHub\n", " View on GitHub\n", " \n", @@ -58,7 +58,7 @@ "source": [ "**_NOTE_**: This notebook has been tested in the following environment:\n", "\n", - "* Python version = 3.9" + "* Python version = 3.8.16 (local)" ] }, { @@ -141,7 +141,7 @@ "and [Cloud Functions pricing](https://cloud.google.com/functions/pricing),\n", "and [Cloud EventArc pricing](https://cloud.google.com/eventarc/pricing),\n", "and [Cloud Storage pricing](https://cloud.google.com/storage/pricing), \n", - "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/)\n", + "and use the [Pricing Calculator](https://cloud.google.com/products/calculator/?hl=en_US&_ga=2.16285759.-826855678.1689377111#id=78888c9b-02ac-4130-9327-fecd7f4cfb11)\n", "to generate a cost estimate based on your projected usage." ] }, @@ -159,11 +159,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting requirements.txt\n" + ] + } + ], "source": [ "%%writefile requirements.txt\n", "\n", @@ -178,11 +186,77 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": { "id": "2b4ef9b72d43" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Collecting google-cloud-aiplatform\n", + " Using cached google_cloud_aiplatform-1.28.1-py2.py3-none-any.whl (2.7 MB)\n", + "Collecting google-cloud-bigquery\n", + " Using cached google_cloud_bigquery-3.11.4-py2.py3-none-any.whl (219 kB)\n", + "Collecting google-cloud-logging\n", + " Using cached google_cloud_logging-3.6.0-py2.py3-none-any.whl (195 kB)\n", + "Collecting google-cloud-storage\n", + " Using cached google_cloud_storage-2.10.0-py2.py3-none-any.whl (114 kB)\n", + "Collecting google-cloud-vision\n", + " Using cached 
google_cloud_vision-3.4.4-py2.py3-none-any.whl (444 kB)\n", + "Collecting polling2\n", + " Using cached polling2-0.5.0-py2.py3-none-any.whl (6.4 kB)\n", + "Collecting tqdm\n", + " Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)\n", + "Requirement already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-aiplatform->-r requirements.txt (line 2)) (1.22.2)\n", + "Collecting shapely<2.0.0\n", + " Using cached Shapely-1.8.5.post1-cp38-cp38-macosx_10_9_x86_64.whl (1.2 MB)\n", + "Requirement already satisfied: google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-aiplatform->-r requirements.txt (line 2)) (2.11.0)\n", + "Requirement already satisfied: packaging>=14.3 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-aiplatform->-r requirements.txt (line 2)) (23.0)\n", + "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-aiplatform->-r requirements.txt (line 2)) (3.19.6)\n", + "Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3\n", + " Using cached google_cloud_resource_manager-1.10.2-py2.py3-none-any.whl (321 kB)\n", + "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-bigquery->-r requirements.txt (line 3)) (2.3.2)\n", + "Requirement already satisfied: requests<3.0.0dev,>=2.21.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-bigquery->-r requirements.txt (line 3)) (2.28.2)\n", + "Collecting google-resumable-media<3.0dev,>=0.6.0\n", + " Using cached google_resumable_media-2.5.0-py2.py3-none-any.whl (77 
kB)\n", + "Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-bigquery->-r requirements.txt (line 3)) (2.8.2)\n", + "Requirement already satisfied: grpcio<2.0dev,>=1.47.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-bigquery->-r requirements.txt (line 3)) (1.51.3)\n", + "Collecting grpc-google-iam-v1<1.0.0dev,>=0.12.4\n", + " Using cached grpc_google_iam_v1-0.12.6-py2.py3-none-any.whl (26 kB)\n", + "Collecting google-cloud-appengine-logging<2.0.0dev,>=0.1.0\n", + " Using cached google_cloud_appengine_logging-1.3.1-py2.py3-none-any.whl (16 kB)\n", + "Collecting google-cloud-audit-log<1.0.0dev,>=0.1.0\n", + " Using cached google_cloud_audit_log-0.2.5-py2.py3-none-any.whl (12 kB)\n", + "Requirement already satisfied: google-auth<3.0dev,>=1.25.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-storage->-r requirements.txt (line 5)) (2.16.2)\n", + "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.56.2 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r requirements.txt (line 2)) (1.59.0)\n", + "Requirement already satisfied: grpcio-status<2.0dev,>=1.33.2 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r requirements.txt (line 2)) (1.48.2)\n", + "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (5.3.0)\n", + "Requirement already satisfied: six>=1.9.0 in /Users/erschmid/.local/lib/python3.8/site-packages 
(from google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (1.15.0)\n", + "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (0.2.8)\n", + "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (4.9)\n", + "Collecting google-crc32c<2.0dev,>=1.0\n", + " Using cached google_crc32c-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl (30 kB)\n", + "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.21.0->google-cloud-bigquery->-r requirements.txt (line 3)) (1.26.15)\n", + "Requirement already satisfied: idna<4,>=2.5 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.21.0->google-cloud-bigquery->-r requirements.txt (line 3)) (3.4)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.21.0->google-cloud-bigquery->-r requirements.txt (line 3)) (3.1.0)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.21.0->google-cloud-bigquery->-r requirements.txt (line 3)) (2022.12.7)\n", + "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (0.4.8)\n", + "Installing collected packages: polling2, tqdm, shapely, google-crc32c, google-resumable-media, google-cloud-audit-log, grpc-google-iam-v1, google-cloud-vision, google-cloud-storage, 
google-cloud-resource-manager, google-cloud-bigquery, google-cloud-appengine-logging, google-cloud-logging, google-cloud-aiplatform\n", + "\u001b[33m WARNING: The script tqdm is installed in '/Users/erschmid/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " NOTE: The current PATH contains path(s) starting with `~`, which may not be expanded by all applications.\u001b[0m\u001b[33m\n", + "\u001b[0m\u001b[33m WARNING: The script tb-gcp-uploader is installed in '/Users/erschmid/.local/bin' which is not on PATH.\n", + " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", + " NOTE: The current PATH contains path(s) starting with `~`, which may not be expanded by all applications.\u001b[0m\u001b[33m\n", + "\u001b[0mSuccessfully installed google-cloud-aiplatform-1.28.1 google-cloud-appengine-logging-1.3.1 google-cloud-audit-log-0.2.5 google-cloud-bigquery-3.11.4 google-cloud-logging-3.6.0 google-cloud-resource-manager-1.10.2 google-cloud-storage-2.10.0 google-cloud-vision-3.4.4 google-crc32c-1.5.0 google-resumable-media-2.5.0 grpc-google-iam-v1-0.12.6 polling2-0.5.0 shapely-1.8.5.post1 tqdm-4.65.0\n", + "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n", + "You should consider upgrading via the '/Users/erschmid/.pyenv/versions/3.8.16/bin/python3.8 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", + "\u001b[0m" + ] + } + ], "source": [ "# Install the packages\n", "import os\n", @@ -201,16 +275,35 @@ "id": "58707a750154" }, "source": [ - "### Restart the kernel" + "### Restart the kernel (Colab only)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "f200f10a1da3" }, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "{'status': 'ok', 'restart': True}" + ] + }, + 
"execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + }, + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. Click here for more info. View Jupyter log for further details." + ] + } + ], "source": [ "# Automatically restart kernel after installs so that your environment can access the new packages\n", "import IPython\n", @@ -263,13 +356,21 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": { "id": "oM1iC_MfAts1" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Updated property [core/project].\n" + ] + } + ], "source": [ - "PROJECT_ID = \"[your-project-id]\" # @param {type:\"string\"}\n", + "PROJECT_ID = \"[your-project-name]\" # @param {type:\"string\"}\n", "\n", "# Set the project id\n", "! 
gcloud config set project {PROJECT_ID}" @@ -289,7 +390,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "metadata": { "id": "region" }, @@ -344,11 +445,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "metadata": { "id": "254614fa0c46" }, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Your browser has been opened to visit:\n", + "\n", + " https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&code_challenge=9QrnfgtBYc8TN96oasecu3uRYnTwHn7vkkNYSHI8w2M&code_challenge_method=S256&access_type=offline&response_type=code&prompt=select_account\n", + "\n", + "\n", + "\n", + "You are now logged in as [erschmid@google.com].\n", + "Your current project is [jss-16p1-test]. You can change this setting by running:\n", + " $ gcloud config set project PROJECT_ID\n" + ] + } + ], "source": [ "# ! 
gcloud auth login" ] @@ -376,17 +493,18 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 25, "metadata": { "id": "PyQmSRbKA8r-" }, "outputs": [], "source": [ - "from datetime import datetime\n", + "import json\n", "import os\n", "import polling2\n", "import re\n", "\n", + "from datetime import datetime\n", "from typing import Sequence, Mapping\n", "from tqdm.notebook import tqdm\n", "from google.cloud import aiplatform\n", @@ -411,9 +529,26 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404001v1.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404002v1.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404003v2.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404004v1.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404005v1.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404007v1.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404008v1.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404009v3.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404010v2.pdf\n", + "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404011v2.pdf\n" + ] + } + ], "source": [ "# List all the comparative linguistics papers from Cloud Storage\n", "! gsutil ls gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404" @@ -428,9 +563,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Copying gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404002v1.pdf...\n", + "/ [1 files][196.1 KiB/196.1 KiB] \n", + "Operation completed over 1 objects/196.1 KiB. 
\n" + ] + } + ], "source": [ "filename = '9404002v1'\n", "file_uri = f'gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/{filename}.pdf'\n", @@ -454,7 +599,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -471,14 +616,14 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ "file_complete_text = f'{filename}_summary.txt'\n", "pdf = f'pdfs/{filename}.pdf'\n", "\n", - "storage_client = storage.Client()\n", + "storage_client = storage.Client(project=PROJECT_ID)\n", "bucket = storage_client.bucket(INPUT_BUCKET)\n", "blob = bucket.blob(pdf)\n", "blob.upload_from_filename(pdf)" @@ -506,7 +651,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -534,9 +679,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "156d17bd47064565a855dc630c186afe", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + " 0%| | 0/100 [00:00\n", + "cloud_event_id(jss-16p1-test_uploads/pdfs/9404002v1.pdf/1690146389113776): DB_WRITE\n", + "polling\n", + "polling\n", + "polling\n", + "polling\n", + "polling\n" + ] + } + ], "source": [ "entries = []\n", "bar = tqdm(total=6)\n", @@ -613,9 +821,20 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 18, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "'The Loebner prize competition is not a good way to measure the progress of AI. The competition is not well-designed, and it is not clear what purpose it serves. 
The Loebner prize is also inappropriate given the current level of technology.'" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ "bigquery_client = bigquery.Client(project=PROJECT_ID)\n", "\n", @@ -631,7 +850,7 @@ "if len(row_list) != 0:\n", " summary = row_list[0]\n", "\n", - "summary['summary']" + "print(summary['summary'])" ] }, { @@ -651,12 +870,14 @@ "source": [ "### Perform OCR with Cloud Vision\n", "\n", - "The first component in the pipeline performs optical character recognition (OCR) using Cloud Vision. Run the following cells to run optical character recognition on the PDF file you downloaded previously." + "The first component in the pipeline performs optical character recognition (OCR) using Cloud Vision. Run the following cells to run optical character recognition on the PDF file you downloaded previously.\n", + "\n", + "Note that OCR can take a while to complete. You might need to wait for a result." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -664,6 +885,7 @@ " bucket: str,\n", " name: str,\n", " output_bucket: str,\n", + " project_id: str,\n", " timeout: int = 420,\n", ") -> str:\n", " \"\"\"Perform OCR with PDF/TIFF as source files on GCS.\n", @@ -716,10 +938,12 @@ "\n", " # Once the request has completed and the output has been\n", " # written to GCS, we can list all the output files.\n", - " return get_ocr_output_from_bucket(gcs_destination_uri, output_bucket)\n", + " return get_ocr_output_from_bucket(gcs_destination_uri, output_bucket, project_id)\n", "\n", "\n", - "def get_ocr_output_from_bucket(gcs_destination_uri: str, bucket_name: str) -> str:\n", + "def get_ocr_output_from_bucket(gcs_destination_uri: str,\n", + " bucket_name: str,\n", + " project_id: str) -> str:\n", " \"\"\"Iterates over blobs in output bucket to get full OCR result.\n", "\n", " Arguments:\n", @@ -729,7 +953,7 @@ " Returns:\n", " The 
full text of the document\n", " \"\"\"\n", - " storage_client = storage.Client()\n", + " storage_client = storage.Client(project=project_id)\n", "\n", " match = re.match(r\"gs://([^/]+)/(.+)\", gcs_destination_uri)\n", " prefix = match.group(2)\n", @@ -759,9 +983,39 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 27, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "OCR: waiting for the operation to finish.\n", + "arXiv:cmp-lg/9404002v1 4 Apr 1994\n", + "Lessons from a\n", + "Restricted Turing Test\n", + "Stuart M. Shieber\n", + "Aiken Computation Laboratory\n", + "Division of Applied Sciences\n", + "Harvard University\n", + "March 28, 1994\n", + "(Revision 6)\n", + "Abstract\n", + "We report on the recent Loebner prize competition inspired by Turing's\n", + "test of intelligent behavior. The presentation covers the structure of the\n", + "competition and the outcome of its first instantiation in an actual event,\n", + "and an analysis of the purpose, design, and appropriateness of such a\n", + "competition. We argue that the competition has no clear purpose, that\n", + "its design prevents any useful outcome, and that such a competition is\n", + "inappropriate given the current level of technology. 
We then speculate as\n", + "to suitable alternatives to the Loebner prize.\n", + "This paper is to appear in Communications of the Association for Comput-\n", + "ing Machinery, and is available from the Center for Research in Computing\n", + "Technology, Harvard University, as Technical Report TR-19-92 and from the\n", + "Computation and Langua\n" + ] + } + ], "source": [ "bucket = \"arxiv-dataset\"\n", "pdf_name = \"arxiv/cmp-lg/pdf/9404/9404002v1.pdf\"\n", @@ -769,7 +1023,8 @@ "\n", "complete_text = document_extract(bucket=bucket,\n", " name=pdf_name,\n", - " output_bucket=output_bucket)\n", + " output_bucket=output_bucket,\n", + " project_id=PROJECT_ID)\n", "\n", "# Entire text is long; print just first 1000 characters\n", "print(complete_text[:1000])" @@ -788,7 +1043,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -839,10 +1094,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 31, "metadata": {}, "outputs": [], "source": [ + "ABSTRACT_H1 = \"abstract\"\n", + "CONCLUSION_H1 = \"conclusion\"\n", + "ABSTRACT_LENGTH = 150 * 10 # Abstract recommended max word length * avg 10 letters long\n", + "CONCLUSION_LENGTH = 200 * 10 # Conclusion max word legnth * avg 10 letters long\n", + "\n", "def truncate_complete_text(complete_text: str) -> str:\n", " \"\"\"Extracts the abstract and conclusion from an academic paper.\n", "\n", @@ -874,9 +1134,19 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 32, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The Loebner Prize competition is inspired by Turing's test of intelligent behavior.\n", + "The authors argue that the competition has no clear purpose, that its design prevents any useful outcome, and that such a competition is inappropriate given the current level of technology.\n", + "They then speculate as to suitable 
alternatives to the Loebner prize.\n" + ] + } + ], "source": [ "model_name = \"text-bison@001\"\n", "temperature = 0.2\n", @@ -910,11 +1180,11 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 35, "metadata": {}, "outputs": [], "source": [ - "def upload_to_gcs(bucket: str, name: str, data: str):\n", + "def upload_to_gcs(project: str, bucket: str, name: str, data: str):\n", " \"\"\"Upload a string to Google Cloud Storage bucket.\n", "\n", " Args:\n", @@ -923,7 +1193,7 @@ " data (str): the data to store\n", "\n", " \"\"\"\n", - " client = storage.Client()\n", + " client = storage.Client(project=project)\n", " bucket = client.get_bucket(bucket)\n", " blob = bucket.blob(name)\n", " blob.upload_from_string(data)\n" @@ -931,13 +1201,13 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "summary_text_filename = \"summaries/manual.txt\"\n", "\n", - "upload_to_gcs(bucket=output_bucket, name=summary_text_filename, data=summary)" + "upload_to_gcs(project=PROJECT_ID, bucket=output_bucket, name=summary_text_filename, data=summary)" ] }, { @@ -953,7 +1223,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "metadata": {}, "outputs": [], "source": [ @@ -986,7 +1256,7 @@ " summary_uri (str): the Storage URI of the summary TXT document\n", " timestamp (datetime): when the processing occurred\n", " \"\"\"\n", - " client = bigquery.Client()\n", + " client = bigquery.Client(project=project_id)\n", "\n", " table_name = f\"{project_id}.{dataset_id}.{table_id}\"\n", "\n", @@ -1018,7 +1288,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 44, "metadata": {}, "outputs": [], "source": [ @@ -1027,6 +1297,7 @@ "bucket = \"gs://arxiv-dataset\"\n", "\n", "errors = write_summarization_to_table(\n", + " project_id=PROJECT_ID,\n", " dataset_id=dataset_id,\n", " table_id=table_id,\n", " bucket=bucket,\n", @@ -1035,9 +1306,49 @@ " 
complete_text_uri=\"\",\n", " summary=summary,\n", " summary_uri=f\"gs://{output_bucket}/{summary_text_filename}\",\n", - " timestamp=datetime().isoformat(),\n", + " timestamp=datetime.now(),\n", ")" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Finally, you can query the BigQuery table to ensure that the PDF summary has been inserted into the table." + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The Loebner Prize competition is inspired by Turing's test of intelligent behavior.\n", + "The authors argue that the competition has no clear purpose, that its design prevents any useful outcome, and that such a competition is inappropriate given the current level of technology.\n", + "They then speculate as to suitable alternatives to the Loebner prize.\n" + ] + } + ], + "source": [ + "bigquery_client = bigquery.Client(project=PROJECT_ID)\n", + "\n", + "table_name = f\"{PROJECT_ID}.summary_dataset.summary_table\"\n", + "\n", + "# Compose the SQL query to select the summary for the PDF document\n", + "sql_query = f\"SELECT summary FROM `{table_name}` WHERE filename LIKE '%{pdf_name}%'\"\n", + "\n", + "job = bigquery_client.query(sql_query)\n", + "rows = job.result()\n", + "row_list = list(rows)\n", + "\n", + "if len(row_list) != 0:\n", + " summary = row_list[0]\n", + "\n", + "print(summary['summary'])" + ] } ], "metadata": { @@ -1067,7 +1378,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.12" + "version": "3.8.16" } }, "nbformat": 4, From f8c14e4b1f5c57cd4318c275639a1c9c7ae89985 Mon Sep 17 00:00:00 2001 From: Eric Schmidt Date: Mon, 24 Jul 2023 10:49:05 -0700 Subject: [PATCH 3/4] linter --- webhook/main.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/webhook/main.py b/webhook/main.py index ce4d9e1..8742632 100644 --- a/webhook/main.py +++ b/webhook/main.py @@ -15,8 +15,6 @@ 
import datetime import os from google.cloud import logging -import vertexai -from vertexai.preview.language_models import TextGenerationModel import google.auth.transport.requests import google.oauth2.id_token import requests From 63a3fbed2094df8d7dac6d8bb7b0e1c789643b56 Mon Sep 17 00:00:00 2001 From: Eric Schmidt Date: Mon, 24 Jul 2023 12:36:40 -0700 Subject: [PATCH 4/4] per reviewer --- notebook/gen_ai_jss.ipynb | 352 +++++--------------------------------- 1 file changed, 42 insertions(+), 310 deletions(-) diff --git a/notebook/gen_ai_jss.ipynb b/notebook/gen_ai_jss.ipynb index 530413b..1fbdd44 100644 --- a/notebook/gen_ai_jss.ipynb +++ b/notebook/gen_ai_jss.ipynb @@ -159,19 +159,11 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Overwriting requirements.txt\n" - ] - } - ], + "outputs": [], "source": [ "%%writefile requirements.txt\n", "\n", @@ -186,77 +178,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "id": "2b4ef9b72d43" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Collecting google-cloud-aiplatform\n", - " Using cached google_cloud_aiplatform-1.28.1-py2.py3-none-any.whl (2.7 MB)\n", - "Collecting google-cloud-bigquery\n", - " Using cached google_cloud_bigquery-3.11.4-py2.py3-none-any.whl (219 kB)\n", - "Collecting google-cloud-logging\n", - " Using cached google_cloud_logging-3.6.0-py2.py3-none-any.whl (195 kB)\n", - "Collecting google-cloud-storage\n", - " Using cached google_cloud_storage-2.10.0-py2.py3-none-any.whl (114 kB)\n", - "Collecting google-cloud-vision\n", - " Using cached google_cloud_vision-3.4.4-py2.py3-none-any.whl (444 kB)\n", - "Collecting polling2\n", - " Using cached polling2-0.5.0-py2.py3-none-any.whl (6.4 kB)\n", - "Collecting tqdm\n", - " Using cached tqdm-4.65.0-py3-none-any.whl (77 kB)\n", - "Requirement 
already satisfied: proto-plus<2.0.0dev,>=1.22.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-aiplatform->-r requirements.txt (line 2)) (1.22.2)\n", - "Collecting shapely<2.0.0\n", - " Using cached Shapely-1.8.5.post1-cp38-cp38-macosx_10_9_x86_64.whl (1.2 MB)\n", - "Requirement already satisfied: google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-aiplatform->-r requirements.txt (line 2)) (2.11.0)\n", - "Requirement already satisfied: packaging>=14.3 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-aiplatform->-r requirements.txt (line 2)) (23.0)\n", - "Requirement already satisfied: protobuf!=3.20.0,!=3.20.1,!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.19.5 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-aiplatform->-r requirements.txt (line 2)) (3.19.6)\n", - "Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3\n", - " Using cached google_cloud_resource_manager-1.10.2-py2.py3-none-any.whl (321 kB)\n", - "Requirement already satisfied: google-cloud-core<3.0.0dev,>=1.6.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-bigquery->-r requirements.txt (line 3)) (2.3.2)\n", - "Requirement already satisfied: requests<3.0.0dev,>=2.21.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-bigquery->-r requirements.txt (line 3)) (2.28.2)\n", - "Collecting google-resumable-media<3.0dev,>=0.6.0\n", - " Using cached google_resumable_media-2.5.0-py2.py3-none-any.whl (77 kB)\n", - "Requirement already satisfied: python-dateutil<3.0dev,>=2.7.2 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-bigquery->-r requirements.txt (line 3)) (2.8.2)\n", - "Requirement already satisfied: 
grpcio<2.0dev,>=1.47.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-bigquery->-r requirements.txt (line 3)) (1.51.3)\n", - "Collecting grpc-google-iam-v1<1.0.0dev,>=0.12.4\n", - " Using cached grpc_google_iam_v1-0.12.6-py2.py3-none-any.whl (26 kB)\n", - "Collecting google-cloud-appengine-logging<2.0.0dev,>=0.1.0\n", - " Using cached google_cloud_appengine_logging-1.3.1-py2.py3-none-any.whl (16 kB)\n", - "Collecting google-cloud-audit-log<1.0.0dev,>=0.1.0\n", - " Using cached google_cloud_audit_log-0.2.5-py2.py3-none-any.whl (12 kB)\n", - "Requirement already satisfied: google-auth<3.0dev,>=1.25.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-cloud-storage->-r requirements.txt (line 5)) (2.16.2)\n", - "Requirement already satisfied: googleapis-common-protos<2.0dev,>=1.56.2 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r requirements.txt (line 2)) (1.59.0)\n", - "Requirement already satisfied: grpcio-status<2.0dev,>=1.33.2 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-api-core[grpc]!=2.0.*,!=2.1.*,!=2.2.*,!=2.3.*,!=2.4.*,!=2.5.*,!=2.6.*,!=2.7.*,<3.0.0dev,>=1.32.0->google-cloud-aiplatform->-r requirements.txt (line 2)) (1.48.2)\n", - "Requirement already satisfied: cachetools<6.0,>=2.0.0 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (5.3.0)\n", - "Requirement already satisfied: six>=1.9.0 in /Users/erschmid/.local/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (1.15.0)\n", - "Requirement already satisfied: pyasn1-modules>=0.2.1 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from 
google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (0.2.8)\n", - "Requirement already satisfied: rsa<5,>=3.1.4 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (4.9)\n", - "Collecting google-crc32c<2.0dev,>=1.0\n", - " Using cached google_crc32c-1.5.0-cp38-cp38-macosx_10_9_x86_64.whl (30 kB)\n", - "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.21.0->google-cloud-bigquery->-r requirements.txt (line 3)) (1.26.15)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.21.0->google-cloud-bigquery->-r requirements.txt (line 3)) (3.4)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.21.0->google-cloud-bigquery->-r requirements.txt (line 3)) (3.1.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from requests<3.0.0dev,>=2.21.0->google-cloud-bigquery->-r requirements.txt (line 3)) (2022.12.7)\n", - "Requirement already satisfied: pyasn1<0.5.0,>=0.4.6 in /Users/erschmid/.pyenv/versions/3.8.16/lib/python3.8/site-packages (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=1.25.0->google-cloud-storage->-r requirements.txt (line 5)) (0.4.8)\n", - "Installing collected packages: polling2, tqdm, shapely, google-crc32c, google-resumable-media, google-cloud-audit-log, grpc-google-iam-v1, google-cloud-vision, google-cloud-storage, google-cloud-resource-manager, google-cloud-bigquery, google-cloud-appengine-logging, google-cloud-logging, google-cloud-aiplatform\n", - "\u001b[33m WARNING: The script tqdm is installed in '/Users/erschmid/.local/bin' which is not on 
PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", - " NOTE: The current PATH contains path(s) starting with `~`, which may not be expanded by all applications.\u001b[0m\u001b[33m\n", - "\u001b[0m\u001b[33m WARNING: The script tb-gcp-uploader is installed in '/Users/erschmid/.local/bin' which is not on PATH.\n", - " Consider adding this directory to PATH or, if you prefer to suppress this warning, use --no-warn-script-location.\n", - " NOTE: The current PATH contains path(s) starting with `~`, which may not be expanded by all applications.\u001b[0m\u001b[33m\n", - "\u001b[0mSuccessfully installed google-cloud-aiplatform-1.28.1 google-cloud-appengine-logging-1.3.1 google-cloud-audit-log-0.2.5 google-cloud-bigquery-3.11.4 google-cloud-logging-3.6.0 google-cloud-resource-manager-1.10.2 google-cloud-storage-2.10.0 google-cloud-vision-3.4.4 google-crc32c-1.5.0 google-resumable-media-2.5.0 grpc-google-iam-v1-0.12.6 polling2-0.5.0 shapely-1.8.5.post1 tqdm-4.65.0\n", - "\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.2.1 is available.\n", - "You should consider upgrading via the '/Users/erschmid/.pyenv/versions/3.8.16/bin/python3.8 -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", - "\u001b[0m" - ] - } - ], + "outputs": [], "source": [ "# Install the packages\n", "import os\n", @@ -280,30 +206,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "f200f10a1da3" }, - "outputs": [ - { - "data": { - "text/plain": [ - "{'status': 'ok', 'restart': True}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - }, - { - "ename": "", - "evalue": "", - "output_type": "error", - "traceback": [ - "\u001b[1;31mThe Kernel crashed while executing code in the the current cell or a previous cell. Please review the code in the cell(s) to identify a possible cause of the failure. 
Click here for more info. View Jupyter log for further details." - ] - } - ], + "outputs": [], "source": [ "# Automatically restart kernel after installs so that your environment can access the new packages\n", "import IPython\n", @@ -323,7 +230,7 @@ "\n", "### Set up your Google Cloud project\n", "\n", - "This notebook assumes that you have already deployed this solution using either the [Terraform script]() **TODO: fix target for link** or using the [Solutions console](https://console.cloud.google.com/products/solutions/catalog). During this deployment, several actions required to run this solution were performed on your behalf:\n", + "This notebook assumes that you have already deployed this solution using either the [Terraform script](https://github.com/GoogleCloudPlatform/terraform-genai-doc-summarization) or using the [Solutions console](https://console.cloud.google.com/products/solutions/details/generative-ai-document-summarization). During this deployment, several actions required to run this solution were performed on your behalf:\n", "\n", "1. 
The [Cloud Function](https://console.cloud.google.com/functions/list) was deployed.\n", "\n", @@ -356,19 +263,11 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "metadata": { "id": "oM1iC_MfAts1" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Updated property [core/project].\n" - ] - } - ], + "outputs": [], "source": [ "PROJECT_ID = \"[your-project-name]\" # @param {type:\"string\"}\n", "\n", @@ -390,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "id": "region" }, @@ -445,27 +344,11 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "id": "254614fa0c46" }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Your browser has been opened to visit:\n", - "\n", - " https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8085%2F&scope=openid+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fuserinfo.email+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcloud-platform+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fappengine.admin+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fcompute+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Faccounts.reauth&code_challenge=9QrnfgtBYc8TN96oasecu3uRYnTwHn7vkkNYSHI8w2M&code_challenge_method=S256&access_type=offline&response_type=code&prompt=select_account\n", - "\n", - "\n", - "\n", - "You are now logged in as [erschmid@google.com].\n", - "Your current project is [jss-16p1-test]. You can change this setting by running:\n", - " $ gcloud config set project PROJECT_ID\n" - ] - } - ], + "outputs": [], "source": [ "# ! 
gcloud auth login" ] @@ -493,7 +376,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": null, "metadata": { "id": "PyQmSRbKA8r-" }, @@ -529,26 +412,9 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404001v1.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404002v1.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404003v2.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404004v1.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404005v1.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404007v1.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404008v1.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404009v3.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404010v2.pdf\n", - "gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404011v2.pdf\n" - ] - } - ], + "outputs": [], "source": [ "# List all the comparative linguistics papers from Cloud Storage\n", "! gsutil ls gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404" @@ -563,19 +429,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Copying gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/9404002v1.pdf...\n", - "/ [1 files][196.1 KiB/196.1 KiB] \n", - "Operation completed over 1 objects/196.1 KiB. 
\n" - ] - } - ], + "outputs": [], "source": [ "filename = '9404002v1'\n", "file_uri = f'gs://arxiv-dataset/arxiv/cmp-lg/pdf/9404/{filename}.pdf'\n", @@ -599,7 +455,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -616,7 +472,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -651,7 +507,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -679,31 +535,9 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "156d17bd47064565a855dc630c186afe", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/100 [00:00\n", - "cloud_event_id(jss-16p1-test_uploads/pdfs/9404002v1.pdf/1690146389113776): DB_WRITE\n", - "polling\n", - "polling\n", - "polling\n", - "polling\n", - "polling\n" - ] - } - ], + "outputs": [], "source": [ "entries = []\n", "bar = tqdm(total=6)\n", @@ -821,20 +614,9 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'The Loebner prize competition is not a good way to measure the progress of AI. The competition is not well-designed, and it is not clear what purpose it serves. 
The Loebner prize is also inappropriate given the current level of technology.'" - ] - }, - "execution_count": 18, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "bigquery_client = bigquery.Client(project=PROJECT_ID)\n", "\n", @@ -877,7 +659,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -983,39 +765,9 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "OCR: waiting for the operation to finish.\n", - "arXiv:cmp-lg/9404002v1 4 Apr 1994\n", - "Lessons from a\n", - "Restricted Turing Test\n", - "Stuart M. Shieber\n", - "Aiken Computation Laboratory\n", - "Division of Applied Sciences\n", - "Harvard University\n", - "March 28, 1994\n", - "(Revision 6)\n", - "Abstract\n", - "We report on the recent Loebner prize competition inspired by Turing's\n", - "test of intelligent behavior. The presentation covers the structure of the\n", - "competition and the outcome of its first instantiation in an actual event,\n", - "and an analysis of the purpose, design, and appropriateness of such a\n", - "competition. We argue that the competition has no clear purpose, that\n", - "its design prevents any useful outcome, and that such a competition is\n", - "inappropriate given the current level of technology. 
We then speculate as\n", - "to suitable alternatives to the Loebner prize.\n", - "This paper is to appear in Communications of the Association for Comput-\n", - "ing Machinery, and is available from the Center for Research in Computing\n", - "Technology, Harvard University, as Technical Report TR-19-92 and from the\n", - "Computation and Langua\n" - ] - } - ], + "outputs": [], "source": [ "bucket = \"arxiv-dataset\"\n", "pdf_name = \"arxiv/cmp-lg/pdf/9404/9404002v1.pdf\"\n", @@ -1043,7 +795,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1094,7 +846,7 @@ }, { "cell_type": "code", - "execution_count": 31, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1134,19 +886,9 @@ }, { "cell_type": "code", - "execution_count": 32, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The Loebner Prize competition is inspired by Turing's test of intelligent behavior.\n", - "The authors argue that the competition has no clear purpose, that its design prevents any useful outcome, and that such a competition is inappropriate given the current level of technology.\n", - "They then speculate as to suitable alternatives to the Loebner prize.\n" - ] - } - ], + "outputs": [], "source": [ "model_name = \"text-bison@001\"\n", "temperature = 0.2\n", @@ -1180,7 +922,7 @@ }, { "cell_type": "code", - "execution_count": 35, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1201,7 +943,7 @@ }, { "cell_type": "code", - "execution_count": 36, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1223,7 +965,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -1288,7 +1030,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ 
-1319,19 +1061,9 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The Loebner Prize competition is inspired by Turing's test of intelligent behavior.\n", - "The authors argue that the competition has no clear purpose, that its design prevents any useful outcome, and that such a competition is inappropriate given the current level of technology.\n", - "They then speculate as to suitable alternatives to the Loebner prize.\n" - ] - } - ], + "outputs": [], "source": [ "bigquery_client = bigquery.Client(project=PROJECT_ID)\n", "\n",