Skip to content

Commit

Permalink
chore(deps): bump google.golang.org/protobuf from 1.30.0 to 1.33.0 in /cx-content-moderation (#766)
Browse files Browse the repository at this point in the history

Bumps google.golang.org/protobuf from 1.30.0 to 1.33.0.


[![Dependabot compatibility
score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=google.golang.org/protobuf&package-manager=go_modules&previous-version=1.30.0&new-version=1.33.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't
alter it yourself. You can also trigger a rebase manually by commenting
`@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---

<details>
<summary>Dependabot commands and options</summary>
<br />

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits
that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after
your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge
and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating
it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all
of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop
Dependabot creating any more for this major version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop
Dependabot creating any more for this minor version (unless you reopen
the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop
Dependabot creating any more for this dependency (unless you reopen the
PR or upgrade to it yourself)

You can disable automated security fix PRs for this repo from the
[Security Alerts
page](https://github.com/GoogleCloudPlatform/document-ai-samples/network/alerts).

</details>
  • Loading branch information
ghchinoy authored Mar 14, 2024
2 parents 2bd2d76 + b4a2452 commit c4bc200
Show file tree
Hide file tree
Showing 7 changed files with 456 additions and 294 deletions.
2 changes: 1 addition & 1 deletion cx-content-moderation/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -32,5 +32,5 @@ require (
google.golang.org/genproto/googleapis/api v0.0.0-20230530153820-e85fd2cbaebc // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20230530153820-e85fd2cbaebc // indirect
google.golang.org/grpc v1.56.3 // indirect
google.golang.org/protobuf v1.30.0 // indirect
google.golang.org/protobuf v1.33.0 // indirect
)
4 changes: 2 additions & 2 deletions cx-content-moderation/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,8 @@ google.golang.org/protobuf v1.23.1-0.20200526195155-81db48ad09cc/go.mod h1:EGpAD
google.golang.org/protobuf v1.25.0/go.mod h1:9JNX74DMeImyA3h4bdi1ymwjUzf21/xIlbajtzgsN7c=
google.golang.org/protobuf v1.26.0-rc.1/go.mod h1:jlhhOSvTdKEhbULTjvd4ARK9grFBp09yW+WbY/TyQbw=
google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc=
google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng=
google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I=
google.golang.org/protobuf v1.33.0 h1:uNO2rsAINq/JlFpSdYEKIZ0uKD/R9cpdv0T+yoGwGmI=
google.golang.org/protobuf v1.33.0/go.mod h1:c6P6GXX6sHbq/GpV6MGZEdwhWPcYBgnhAHhKbcUYpos=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
Expand Down
34 changes: 7 additions & 27 deletions cx-content-moderation/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"log"
"os"

"cloud.google.com/go/compute/metadata"
documentai "cloud.google.com/go/documentai/apiv1beta3"
"cloud.google.com/go/documentai/apiv1beta3/documentaipb"

Expand All @@ -30,10 +29,8 @@ import (
)

var (
projectID string = getProjectID()
contentModerationProcessorName string = envCheck("CONTENT_MODERATION_NAME", "")
port string = envCheck("PORT", "8080")
logname string = envCheck("LOGNAME", "cx-content-moderation")
contentModerationProcessorName = envCheck("CONTENT_MODERATION_NAME", "")
port = envCheck("PORT", "8080")
)

const location = "us"
Expand All @@ -56,7 +53,7 @@ func analyzeCommentHandler(res *ezcx.WebhookResponse, req *ezcx.WebhookRequest)
params := req.GetSessionParameters()
text := req.GetText()
if text == "" {
return fmt.Errorf("No text provided.")
return fmt.Errorf("no text provided")
}

// perform content moderation on text
Expand All @@ -73,7 +70,7 @@ func analyzeCommentHandler(res *ezcx.WebhookResponse, req *ezcx.WebhookRequest)
attributes[attribute.GetType()] = attribute.GetConfidence()
}
if params == nil {
params = make(map[string]any)
params = make(map[string]interface{})
}
params["content-moderation"] = attributes

Expand Down Expand Up @@ -123,26 +120,9 @@ func apiEndpoint() string {

// envCheck checks for an environment variable, otherwise returns default
func envCheck(environmentVariable, defaultVar string) string {
if envar, ok := os.LookupEnv(environmentVariable); !ok {
envar, ok := os.LookupEnv(environmentVariable)
if envar == "" || !ok {
return defaultVar
} else if envar == "" {
return defaultVar
} else {
return envar
}
}

// getProjectID checks for a local environment variable and then GCP metadata to determine the Google Cloud project ID
func getProjectID() string {
projectID := envCheck("PROJECT_ID", "")
if projectID == "" { // local
projectID = envCheck("GOOGLE_CLOUD_PROJECT", "") // appengine
if projectID == "" { // gcp metadata
var err error
if projectID, err = metadata.ProjectID(); err != nil {
log.Fatal("Unable to get Google Cloud Project ID", err)
}
}
}
return projectID
return envar
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,14 @@

import concurrent.futures
from typing import List

from google.cloud import documentai
from google.cloud import firestore
from google.cloud import storage
import pandas as pd
from google.cloud import documentai, firestore, storage
from utilities import batch_process_documents_sample, copy_blob, list_blobs
from utilities import batch_process_documents_sample
from utilities import copy_blob
from utilities import list_blobs

INPUT_BUCKET_NAME = "your_test_bucket_name"
GCS_OUTPUT_URI_PREFIX = "your_output_folder_prefix"
Expand Down Expand Up @@ -200,8 +205,11 @@ def metadata_reader(metadata: documentai.BatchProcessMetadata) -> List:
"operation_id": i.output_gcs_destination.split("/")[-2],
"file_output_gcs_destination": i.output_gcs_destination,
"file_human_review_status": i.human_review_status.state.name,
"file_human_review_operation_id":
i.human_review_status.human_review_operation.split("/")[-1],
"file_human_review_operation_id": i.human_review_status.human_review_operation.split( # pylint: line-too-long
"/"
)[
-1
],
}
)
return info_array
Expand Down Expand Up @@ -242,9 +250,7 @@ def file_copy(array_having_file_names: List, bucket_name_with_folder: str) -> No
)


def concurrent_processing(
daira_output_test: str, batch_array: List
) -> None:
def concurrent_processing(daira_output_test: str, batch_array: List) -> None:
"""
To create a concurrent process for batch processing the files .
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,23 +16,25 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""This module contains helper functions for Advance Table Parsing Tool"""
from collections import defaultdict
from io import BytesIO
import math
import re
import time
from collections import defaultdict
from io import BytesIO
from typing import Dict, List, MutableSequence, Tuple, Union, Any
from typing import Any, Dict, List, MutableSequence, Tuple, Union

import numpy as np
import pandas as pd
import PyPDF2
from google.api_core.client_options import ClientOptions
from google.api_core.exceptions import InternalServerError, RetryError
from google.cloud import documentai, storage
from google.api_core.exceptions import InternalServerError
from google.api_core.exceptions import RetryError
from google.cloud import documentai
from google.cloud import storage
from google.longrunning import operations_pb2
from google.longrunning.operations_pb2 import GetOperationRequest
import numpy as np
import pandas as pd
from PIL import Image as PilImage
from PIL import ImageDraw
import PyPDF2


def batch_process_documents(
Expand Down Expand Up @@ -382,9 +384,7 @@ def poll_hitl_operations(
]
if not operations:
break
print(
f"Still waiting for {len(operations)} HITL operations to complete"
)
print(f"Still waiting for {len(operations)} HITL operations to complete")
time.sleep(100)
print(f"Finished waiting for all {num_operations} HITL operations.")

Expand Down Expand Up @@ -449,7 +449,7 @@ def parse_document_tables(output_bucket, output_prefix, output_csv_prefix):
output_bucket=output_bucket, output_prefix=output_prefix
)
for file_key, document in doc_obj_dict.items():
for _ , page in enumerate(document.pages):
for _, page in enumerate(document.pages):
header_row_values: List[List[str]] = []
body_row_values: List[List[str]] = []
for index, table in enumerate(page.tables):
Expand Down Expand Up @@ -820,7 +820,7 @@ def process_taxonomy_disclosure(st: str) -> str:
ea = re.search(r"^[A-Z]\.\s[a-zA-Z\s-]+", st)
if ea:
span = ea.span()
interstr = st[span[0]:span[1]].split("\n")[0]
interstr = st[span[0] : span[1]].split("\n")[0]
return interstr


Expand All @@ -839,7 +839,7 @@ def process_taxonomy_disclosure_complex(st: str) -> Tuple[str, str]:
ea = re.search(r"^[A-Z]\.[1-9](.|)[a-zA-Z()\s-]+", st)
if ea:
span = ea.span()
interstr = st[span[0]:span[1]].split("\n")[0:-1]
interstr = st[span[0] : span[1]].split("\n")[0:-1]
ans = " ".join(interstr)
st = st.replace(st[span[0] : span[1]], "")
return st, ans
Expand All @@ -859,7 +859,9 @@ def process_taxonomy_disclosure_multiple(row: pd.Series) -> None:
st = row["taxonomy_disclosure"]
row_ea = re.findall(r"\d.\d+ [a-zA-Z\s]+", st)
if len(row_ea) > 1:
row["taxonomy_disclosure"] = "\n".join([ea.replace("\n", " ").strip() for ea in row_ea])
row["taxonomy_disclosure"] = "\n".join(
[ea.replace("\n", " ").strip() for ea in row_ea]
)


def collect_multiple_values(row: pd.Series, col: str) -> List:
Expand All @@ -884,8 +886,9 @@ def collect_multiple_values(row: pd.Series, col: str) -> List:
return split_row


def collect_and_extend_values(final_df_: pd.DataFrame, final_data_: dict,
row: pd.Series, col: str) -> None:
def collect_and_extend_values(
final_df_: pd.DataFrame, final_data_: dict, row: pd.Series, col: str
) -> None:
"""
Collect and extend values from a specific column in a row to the final data structure.
Expand Down Expand Up @@ -914,8 +917,9 @@ def collect_and_extend_values(final_df_: pd.DataFrame, final_data_: dict,
final_data_ = update_data(final_df_, final_data_, ea_)


def extend_column_data(final_data_: dict, row: pd.Series,
column: str, split_row: List[str]) -> None:
def extend_column_data(
final_data_: dict, row: pd.Series, column: str, split_row: List[str]
) -> None:
"""
Extend column data in the final data structure.
Expand Down Expand Up @@ -985,13 +989,18 @@ def post_process(
)
# Post-processing code matches expected values and rearranges them into the final dataframe
final_data_: Dict[Any, Any] = defaultdict(list)
for _ , row in dest_df.iterrows():
for _, row in dest_df.iterrows():
if row["taxonomy_disclosure"] is np.nan:
continue
st = row["taxonomy_disclosure"]
st = st.replace(process_taxonomy_disclosure(row["taxonomy_disclosure"]) + "\n", "").strip()
final_data_ = update_data(final_df_, final_data_, process_taxonomy_disclosure(
row["taxonomy_disclosure"]))
st = st.replace(
process_taxonomy_disclosure(row["taxonomy_disclosure"]) + "\n", ""
).strip()
final_data_ = update_data(
final_df_,
final_data_,
process_taxonomy_disclosure(row["taxonomy_disclosure"]),
)
row["taxonomy_disclosure"] = st

st = row["taxonomy_disclosure"]
Expand Down Expand Up @@ -1053,10 +1062,10 @@ def run_table_extractor_pipeline(
)
final_data_2_processed = final_data_new2.copy()
nrows = 0 # num of rows
for _ , v in final_data_new2.items():
for _, v in final_data_new2.items():
nrows = max(len(v), nrows)

for _ , v in final_data_2_processed.items():
for _, v in final_data_2_processed.items():
length = len(v)
if length != nrows:
v.extend([np.nan] * (nrows - length))
Expand Down Expand Up @@ -1103,7 +1112,7 @@ def walk_the_ocr(
)
cde_document = cde_jsons[file[:-4]]
print("NO HITL")
_ , y_coord, row_map_cde, _ = get_coordinates_map(cde_document)
_, y_coord, row_map_cde, _ = get_coordinates_map(cde_document)
fp_document_path = fp_input_output_map[file]
fp_document = read_json_output(
output_bucket=gcs_output_bucket, output_prefix=fp_document_path
Expand Down Expand Up @@ -1159,8 +1168,7 @@ def draw_vertical(
)
if (
n + 1 < len(x_coordinates[idx])
and (x_coordinates[idx][n + 1][1] + voffset // 2)
- (cor[1] + voffset // 2)
and (x_coordinates[idx][n + 1][1] + voffset // 2) - (cor[1] + voffset // 2)
> 50
):
draw.line(
Expand Down Expand Up @@ -1275,7 +1283,7 @@ def enhance_and_save_pdfs(
try:
images_for_pdf = []
for idx, page in enumerate(document.pages):
x_coordinates, _ , _ , max_ycd = get_coordinates_map(document)
x_coordinates, _, _, max_ycd = get_coordinates_map(document)
image_content = page.image.content
image = PilImage.open(BytesIO(image_content))
draw = ImageDraw.Draw(image)
Expand All @@ -1284,8 +1292,18 @@ def enhance_and_save_pdfs(
hoffset_ = factor * voffset
# Draw horizontal
if idx in max_ycd:
draw_horizontal(idx, max_ycd, hoffset, hoffset_, min_x,
min_height, max_x, line_colour, line_width, draw)
draw_horizontal(
idx,
max_ycd,
hoffset,
hoffset_,
min_x,
min_height,
max_x,
line_colour,
line_width,
draw,
)
# for n, y in enumerate(max_ycd[idx]):
# if n == 0: # column header min y coord
# draw.line(
Expand All @@ -1311,8 +1329,17 @@ def enhance_and_save_pdfs(
# )
# Drawing vertical lines
if idx in x_coordinates:
draw_vertical(idx, x_coordinates, hoffset_, min_height,
max_height, line_colour, line_width, voffset, draw)
draw_vertical(
idx,
x_coordinates,
hoffset_,
min_height,
max_height,
line_colour,
line_width,
voffset,
draw,
)
# for n, cor in enumerate(x_coordinates[idx]):
# if n == 0:
# draw.line(
Expand Down
Loading

0 comments on commit c4bc200

Please sign in to comment.