
Commit

feat: adds BQ file
telpirion committed May 11, 2023
1 parent f0c72cd commit acdc411
Showing 4 changed files with 85 additions and 2 deletions.
72 changes: 72 additions & 0 deletions src/bigquery.py
@@ -0,0 +1,72 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from datetime import datetime

from google.cloud import bigquery
from google.cloud import logging

BIGQUERY_UPSERT_LOGGER = "BigQueryUpsertLogger"


def write_summarization_to_table(
    project_id: str,
    dataset_name: str,
    table_name: str,
    bucket: str,
    filename: str,
    complete_text: str,
    complete_text_uri: str,
    summary: str,
    summary_uri: str,
    timestamp: datetime,
):
    """Updates the BigQuery table with the document summarization
    Args:
        project_id (str): the Google Cloud project ID
        dataset_name (str): the name of the BigQuery dataset
        table_name (str): the name of the BigQuery table
        bucket (str): the name of the bucket with the PDF
        filename (str): path of PDF relative to bucket root
        complete_text (str): the complete text of the PDF
        complete_text_uri (str): the Storage URI of the complete TXT document
        summary (str): the text summary of the document
        summary_uri (str): the Storage URI of the summary TXT document
        timestamp (datetime): when the processing occurred
    """
    client = bigquery.Client()

    table_id = f"{project_id}.{dataset_name}.{table_name}"

    rows_to_insert = [
        {
            "bucket": bucket,
            "filename": filename,
            "extracted_text": complete_text,
            "summary_uri": summary_uri,
            "summary": summary,
            "complete_text_uri": complete_text_uri,
            "timestamp": timestamp,
        }
    ]

    errors = client.insert_rows_json(
        table_id, rows_to_insert, row_ids=[None] * len(rows_to_insert)
    )
    if errors != []:
        logging_client = logging.Client()
        logger = logging_client.logger(BIGQUERY_UPSERT_LOGGER)
        logger.log(
            f"Encountered errors while inserting rows: {errors}",
            severity="ERROR",
        )
11 changes: 10 additions & 1 deletion src/function.py
@@ -62,7 +62,16 @@ def read_datetimes(cls, kwargs):

 # WEBHOOK FUNCTION
 @functions_framework.cloud_event
-def entrypoint(cloud_event):
+def entrypoint(cloud_event) -> dict:
+    """Entrypoint for Cloud Function
+    Args:
+        cloud_event (CloudEvent): an event from EventArc
+    Returns:
+        dictionary with 'summary' and 'output_filename' keys
+    """
+
     event_id = cloud_event["id"]
     event_type = cloud_event["type"]
     bucket = cloud_event.data["bucket"]
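
A rough local-testing sketch for the new `entrypoint` signature, assuming the `cloudevents` package that functions-framework builds on; the event attributes, bucket, and object names below are placeholders, not values from this commit:

```python
from cloudevents.http import CloudEvent

# Hypothetical import path; depends on how src/ is packaged.
from function import entrypoint

# Minimal fake Cloud Storage "object finalized" event. Running the real
# function still requires Google Cloud credentials and the downstream
# OCR/summarization services to be reachable.
attributes = {
    "id": "1234567890",
    "type": "google.cloud.storage.object.v1.finalized",
    "source": "//storage.googleapis.com/projects/_/buckets/my-docs-bucket",
}
data = {"bucket": "my-docs-bucket", "name": "reports/q1.pdf"}

event = CloudEvent(attributes, data)
result = entrypoint(event)  # expected: dict with 'summary' and 'output_filename'
print(result)
```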
2 changes: 2 additions & 0 deletions src/requirements.txt
@@ -1,5 +1,7 @@
 functions-framework
 google-auth
 google-cloud-aiplatform==1.25.0
+google-cloud-bigquery
+google-cloud-logging
 google-cloud-storage
 google-cloud-vision
2 changes: 1 addition & 1 deletion src/vertex_llm.py
@@ -126,4 +126,4 @@ def predict_large_language_model_hack(
json=json_data,
)

-    return response.json()['predictions'][0]['content']
+    return response.json()['predictions'][0]['content']
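
The changed line indexes straight into `predictions[0]['content']`, which assumes the Vertex AI predict response is shaped like `{"predictions": [{"content": ...}]}`. A small, purely illustrative helper (not part of this commit) that parses the same shape more defensively:

```python
def extract_prediction_content(response_json: dict) -> str:
    """Pulls the generated text out of a Vertex AI predict response.

    Assumes a payload shaped like {"predictions": [{"content": "..."}]},
    the same structure the line above indexes into directly.
    """
    predictions = response_json.get("predictions") or []
    if not predictions:
        raise ValueError(f"No predictions in response: {response_json}")
    return predictions[0].get("content", "")
```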
