From b0b9a2c9556da750806e6e34ddcdfd58203db2ae Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Tue, 19 Dec 2023 12:39:26 -0600
Subject: [PATCH] chore(deps): bump golang.org/x/crypto from 0.14.0 to 0.17.0
 in /cx-content-moderation (#699)

Bumps [golang.org/x/crypto](https://github.com/golang/crypto) from 0.14.0 to 0.17.0.
[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=golang.org/x/crypto&package-manager=go_modules&previous-version=0.14.0&new-version=0.17.0)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores)

Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`.

[//]: # (dependabot-automerge-start)
[//]: # (dependabot-automerge-end)

---
Dependabot commands and options

You can trigger Dependabot actions by commenting on this PR:
- `@dependabot rebase` will rebase this PR
- `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it
- `@dependabot merge` will merge this PR after your CI passes on it
- `@dependabot squash and merge` will squash and merge this PR after your CI passes on it
- `@dependabot cancel merge` will cancel a previously requested merge and block automerging
- `@dependabot reopen` will reopen this PR if it is closed
- `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually
- `@dependabot show <dependency name> ignore conditions` will show all of the ignore conditions of the specified dependency
- `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself)
- `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself)

You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/GoogleCloudPlatform/document-ai-samples/network/alerts).
--------- Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> Co-authored-by: Owl Bot --- cx-content-moderation/go.mod | 6 +- cx-content-moderation/go.sum | 12 +- .../date_entities_annotation.ipynb | 18 +- .../docai_processor_migration.ipynb | 566 +++++++++++------- .../parsed_json_split_address.ipynb | 3 +- .../schema_comparision.ipynb | 10 +- 6 files changed, 375 insertions(+), 240 deletions(-) diff --git a/cx-content-moderation/go.mod b/cx-content-moderation/go.mod index 59c7eb877..15eb942b3 100644 --- a/cx-content-moderation/go.mod +++ b/cx-content-moderation/go.mod @@ -22,11 +22,11 @@ require ( github.com/googleapis/enterprise-certificate-proxy v0.2.3 // indirect github.com/googleapis/gax-go/v2 v2.10.0 // indirect go.opencensus.io v0.24.0 // indirect - golang.org/x/crypto v0.14.0 // indirect + golang.org/x/crypto v0.17.0 // indirect golang.org/x/net v0.17.0 // indirect golang.org/x/oauth2 v0.8.0 // indirect - golang.org/x/sys v0.13.0 // indirect - golang.org/x/text v0.13.0 // indirect + golang.org/x/sys v0.15.0 // indirect + golang.org/x/text v0.14.0 // indirect google.golang.org/appengine v1.6.7 // indirect google.golang.org/genproto v0.0.0-20230530153820-e85fd2cbaebc // indirect google.golang.org/genproto/googleapis/api v0.0.0-20230530153820-e85fd2cbaebc // indirect diff --git a/cx-content-moderation/go.sum b/cx-content-moderation/go.sum index 5b663c690..113430bce 100644 --- a/cx-content-moderation/go.sum +++ b/cx-content-moderation/go.sum @@ -93,8 +93,8 @@ golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACk golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20220314234659-1baeb1ce4c0b/go.mod h1:IxCIyHEi3zRg3s0A5j5BB6A9Jmi73HwBIUl50j+osU4= -golang.org/x/crypto v0.14.0 h1:wBqGXzWJW6m1XrIKlAH0Hs1JJ7+9KBwnIO8v66Q9cHc= -golang.org/x/crypto v0.14.0/go.mod h1:MVFd36DqK4CsrnJYDkBA3VC4m2GkXAM0PvzMCn4JQf4= +golang.org/x/crypto v0.17.0 h1:r8bRNjWL3GshPW3gkd+RpvzWrZAwPS49OmTGZ/uhM4k= +golang.org/x/crypto v0.17.0/go.mod h1:gCAAfMLgwOJRpTjQ2zCCt2OcSfYMTeZVSRtQlPC7Nq4= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvxsM5YxQ5yQlVC4a0KAMCusXpPoU= @@ -135,8 +135,8 @@ golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.13.0 h1:Af8nKPmuFypiUBjVoU9V20FiaFXOcuZI21p0ycVYYGE= -golang.org/x/sys v0.13.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.15.0 h1:h48lPFYpsTvQJZF4EKyI4aLHaev3CxivZmv7yZig9pc= +golang.org/x/sys v0.15.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod 
h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -145,8 +145,8 @@ golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.3.8/go.mod h1:E6s5w1FMmriuDzIBO73fBruAKo1PCIq6d2Q6DHfQ8WQ= -golang.org/x/text v0.13.0 h1:ablQoSUd0tRdKxZewP80B+BaqeKJuVhuRxj/dkrun3k= -golang.org/x/text v0.13.0/go.mod h1:TvPlkZtksWOMsz7fbANvkp4WM8x/WCo/om8BMLbz+aE= +golang.org/x/text v0.14.0 h1:ScX5w1eTa3QqT8oi6+ziP7dTV1S2+ALU0bI+0zXKWiQ= +golang.org/x/text v0.14.0/go.mod h1:18ZOQIKpY8NJVqYksKHtTdi31H5itFRjB5/qKTNYzSU= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= diff --git a/incubator-tools/date_entities_annotation_tool/date_entities_annotation.ipynb b/incubator-tools/date_entities_annotation_tool/date_entities_annotation.ipynb index a7c67b544..01ec5050c 100644 --- a/incubator-tools/date_entities_annotation_tool/date_entities_annotation.ipynb +++ b/incubator-tools/date_entities_annotation_tool/date_entities_annotation.ipynb @@ -188,9 +188,7 @@ "metadata": {}, "outputs": [], "source": [ - "def get_token(\n", - " json_dict: object, page: str, text_anchors_check: list\n", - "):\n", + "def get_token(json_dict: object, page: str, text_anchors_check: list):\n", " \"\"\"THIS FUNCTION USED LOADED JSON, PAGE NUMBER AND TEXT ANCHORS AS INPUT AND GIVES THE X AND Y COORDINATES\n", "\n", " Args:\n", @@ -278,9 +276,7 @@ " return entities_page\n", "\n", "\n", - "def get_min_max_y_lineitem(\n", - " json_dict: object, page: int, ent2: list\n", - "):\n", + "def get_min_max_y_lineitem(json_dict: object, page: int, ent2: list):\n", " \"\"\"\n", " Extracts minimum and maximum Y-coordinates for line items from a JSON dictionary.\n", "\n", @@ -571,7 +567,7 @@ " return final_match\n", "\n", " date_entities_final = []\n", - " \n", + "\n", " try:\n", " page_wise_ent = get_page_wise_entities(json_dict)\n", " except Exception as e:\n", @@ -618,17 +614,13 @@ " continue\n", "\n", " try:\n", - " final_match = final_match(\n", - " final_dates_dict, header_exist_dict\n", - " )\n", + " final_match = final_match(final_dates_dict, header_exist_dict)\n", " except Exception as e:\n", " print(\"Match not found-->\", e)\n", " continue\n", "\n", " try:\n", - " date_entities = create_entities(\n", - " final_match, page, json_dict\n", - " )\n", + " date_entities = create_entities(final_match, page, json_dict)\n", " except Exception as e:\n", " print(\"COULDNT CREATE ENTITIES--> \", e)\n", "\n", diff --git a/incubator-tools/docai_processor_migration/docai_processor_migration.ipynb b/incubator-tools/docai_processor_migration/docai_processor_migration.ipynb index 3d87a90f5..8fa85a37e 100644 --- a/incubator-tools/docai_processor_migration/docai_processor_migration.ipynb +++ b/incubator-tools/docai_processor_migration/docai_processor_migration.ipynb @@ -284,446 +284,549 @@ }, "outputs": [], "source": [ - "def create_destination_dataset_bucket(project_id : str, destination_exported_dataset_gcs_uri : str) -> None:\n", + "def create_destination_dataset_bucket(\n", + " project_id: str, 
destination_exported_dataset_gcs_uri: str\n", + ") -> None:\n", " \"\"\"\n", " This function will create destination dataset bucket.\n", - " \n", + "\n", " Args:\n", - " project_id (str): The number representing the Google Cloud project. \n", - " destination_exported_dataset_gcs_uri (str): This is the GCS bucket path where the dataset from the source processor will be copied over to the destination project. \n", + " project_id (str): The number representing the Google Cloud project.\n", + " destination_exported_dataset_gcs_uri (str): This is the GCS bucket path where the dataset from the source processor will be copied over to the destination project.\n", " Simply provide an empty bucket path here.\n", - " \n", + "\n", " Returns:\n", " None\n", " \"\"\"\n", - " \n", - " client = storage.Client(project=project_id) \n", - " bucket = client.bucket(destination_exported_dataset_gcs_uri.split('//')[1]) \n", - " if not bucket.exists(): \n", - " tqdm.write(f\"Creating bucket {bucket.name}\") \n", + "\n", + " client = storage.Client(project=project_id)\n", + " bucket = client.bucket(destination_exported_dataset_gcs_uri.split(\"//\")[1])\n", + " if not bucket.exists():\n", + " tqdm.write(f\"Creating bucket {bucket.name}\")\n", " client.create_bucket(bucket)\n", "\n", - "def move_exported_dataset(source_exported_dataset_gcs_uri : str, destination_exported_dataset_gcs_uri : str) -> None:\n", + "\n", + "def move_exported_dataset(\n", + " source_exported_dataset_gcs_uri: str, destination_exported_dataset_gcs_uri: str\n", + ") -> None:\n", " \"\"\"\n", - " This function will copy files from source exported dataset bucket into destination exported dataset bucket and splitting train and test documents.\n", - " \n", - " Args:\n", - " source_exported_dataset_gcs_uri (str) : This is the bucket path where the dataset from the source processor has been exported.\n", - " destination_exported_dataset_gcs_uri (str): This is the GCS bucket path where the dataset from the source processor will be copied over to the destination project. 
\n", - " Simply provide an empty bucket path here.\n", - " \n", - " Returns:\n", - " None\n", + " This function will copy files from source exported dataset bucket into destination exported dataset bucket and splitting train and test documents.\n", + "\n", + " Args:\n", + " source_exported_dataset_gcs_uri (str) : This is the bucket path where the dataset from the source processor has been exported.\n", + " destination_exported_dataset_gcs_uri (str): This is the GCS bucket path where the dataset from the source processor will be copied over to the destination project.\n", + " Simply provide an empty bucket path here.\n", + "\n", + " Returns:\n", + " None\n", " \"\"\"\n", - " \n", + "\n", " client = storage.Client()\n", - " bucket_src = client.get_bucket(source_exported_dataset_gcs_uri.split('//')[1])\n", - " blobs_src = client.list_blobs(source_exported_dataset_gcs_uri.split('//')[1])\n", - " bucket_dest = storage.Bucket(client, destination_exported_dataset_gcs_uri.split('//')[1])\n", + " bucket_src = client.get_bucket(source_exported_dataset_gcs_uri.split(\"//\")[1])\n", + " blobs_src = client.list_blobs(source_exported_dataset_gcs_uri.split(\"//\")[1])\n", + " bucket_dest = storage.Bucket(\n", + " client, destination_exported_dataset_gcs_uri.split(\"//\")[1]\n", + " )\n", "\n", " from datetime import datetime\n", + "\n", " now = datetime.now()\n", " dt_string = now.strftime(\"%Y-%m-%d-%H-%M-%S\")\n", " print(\"date and time =\", dt_string)\n", " for blob_src in blobs_src:\n", - " blob_new = bucket_src.copy_blob(blob_src, bucket_dest, new_name= dt_string + '/' + blob_src.name)\n", - " print(f'Copied [{source_exported_dataset_gcs_uri}/{blob_src.name}] into: [{destination_exported_dataset_gcs_uri}/{dt_string}]')\n", - " gcs_document = { 'gcsUri': destination_exported_dataset_gcs_uri + '/' + dt_string + '/' + blob_src.name, 'mimeType': \"application/json\" }\n", - " if blob_src.name.split('/')[0] == 'train' :\n", - " gcs_documents_train.append(gcs_document) \n", - " gcs_documents_train.append(gcs_document) \n", - " if blob_src.name.split('/')[0] == 'test' :\n", - " gcs_documents_test.append(gcs_document) \n", - " gcs_documents_test.append(gcs_document) \n", - "\n", - " print('gcs_documents_train:')\n", + " blob_new = bucket_src.copy_blob(\n", + " blob_src, bucket_dest, new_name=dt_string + \"/\" + blob_src.name\n", + " )\n", + " print(\n", + " f\"Copied [{source_exported_dataset_gcs_uri}/{blob_src.name}] into: [{destination_exported_dataset_gcs_uri}/{dt_string}]\"\n", + " )\n", + " gcs_document = {\n", + " \"gcsUri\": destination_exported_dataset_gcs_uri\n", + " + \"/\"\n", + " + dt_string\n", + " + \"/\"\n", + " + blob_src.name,\n", + " \"mimeType\": \"application/json\",\n", + " }\n", + " if blob_src.name.split(\"/\")[0] == \"train\":\n", + " gcs_documents_train.append(gcs_document)\n", + " gcs_documents_train.append(gcs_document)\n", + " if blob_src.name.split(\"/\")[0] == \"test\":\n", + " gcs_documents_test.append(gcs_document)\n", + " gcs_documents_test.append(gcs_document)\n", + "\n", + " print(\"gcs_documents_train:\")\n", " print(gcs_documents_train)\n", - " print('\\n')\n", - " print('gcs_documents_test:')\n", - " print(gcs_documents_test) \n", - " \n", - "def import_document_by_type(destination_processor_name : str, gcs_documents : List[str], dataset_type : str)-> Dict[str, str]:\n", + " print(\"\\n\")\n", + " print(\"gcs_documents_test:\")\n", + " print(gcs_documents_test)\n", + "\n", + "\n", + "def import_document_by_type(\n", + " destination_processor_name: str, gcs_documents: 
List[str], dataset_type: str\n", + ") -> Dict[str, str]:\n", " \"\"\"\n", " This function will import document to its destination processor by its document type either test or train.\n", - " \n", + "\n", " Args:\n", " destination_processor_name (str) : Name of the destination processor.\n", " gcs_documents (list) : Takes the list of files from splitted train or splitted test documents from destination exported dataset bucket.\n", " dataset_type (str) : Takes the values 'DATASET_SPLIT_TEST' or 'DATASET_SPLIT_TRAIN'.\n", - " \n", + "\n", " Returns:\n", - " Dictionary representing JSON data using their names. \n", + " Dictionary representing JSON data using their names.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Import document\")\n", " url = get_base_url(destination_processor_name) + \"/dataset:importDocuments\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " import_documents_request = {\n", - " 'batch_documents_import_configs': {\n", - " 'dataset_split': dataset_type,\n", - " 'batch_input_config': {\n", - " 'gcs_documents': {\n", - " 'documents': gcs_documents\n", - " }\n", - " }\n", - " }\n", + " \"batch_documents_import_configs\": {\n", + " \"dataset_split\": dataset_type,\n", + " \"batch_input_config\": {\"gcs_documents\": {\"documents\": gcs_documents}},\n", " }\n", - " import_document_response = requests.post(url, headers=headers, json=import_documents_request)\n", + " }\n", + " import_document_response = requests.post(\n", + " url, headers=headers, json=import_documents_request\n", + " )\n", " import_document_response.raise_for_status()\n", - " import_document_result = get_operation_result(import_document_response.json()['name'])\n", + " import_document_result = get_operation_result(\n", + " import_document_response.json()[\"name\"]\n", + " )\n", " return import_document_result\n", - " \n", + "\n", + "\n", "def get_access_token() -> str:\n", " \"\"\"\n", - " This function is used as an authentication mechanism to obtain current user / service account credentials. 
\n", - " \n", + " This function is used as an authentication mechanism to obtain current user / service account credentials.\n", + "\n", " Returns:\n", " A string representing the access token.\n", " \"\"\"\n", - " \n", - " credentials, _ = auth.default() \n", - " credentials.refresh(google.auth.transport.requests.Request()) \n", + "\n", + " credentials, _ = auth.default()\n", + " credentials.refresh(google.auth.transport.requests.Request())\n", " return credentials.token\n", "\n", - "def get_base_url(name : str) -> str:\n", + "\n", + "def get_base_url(name: str) -> str:\n", " \"\"\"\n", " The function uses a regular expression to extract a specific part of the input name.\n", - " \n", + "\n", " Args:\n", " name (str) : This is a string containing some kind of identifier or path.\n", - " \n", + "\n", " Returns:\n", " A formatted URL string using the extracted location and name.\n", " \"\"\"\n", - " \n", + "\n", " location = re.search(r\"projects/[^/]+/locations/([^/]+)/.*\", name).group(1)\n", " return f\"https://{location}-documentai.googleapis.com/v1beta3/{name}\"\n", "\n", - "def get_operation_result(operation_name :str, message : str = \"Waiting for operation to finish.\") -> Dict[str,str]:\n", + "\n", + "def get_operation_result(\n", + " operation_name: str, message: str = \"Waiting for operation to finish.\"\n", + ") -> Dict[str, str]:\n", " \"\"\"\n", - " This function retrieves the result of a long-running operation. \n", + " This function retrieves the result of a long-running operation.\n", " It interacts with an API using HTTP requests and uses the tqdm library for progress reporting\n", - " \n", + "\n", " Args:\n", " operation_name (str): This is a string representing the name or identifier of a long-running operation.\n", - " message (str with default value): This is a string that provides a message to be displayed while waiting for the operation to finish. 
\n", + " message (str with default value): This is a string that provides a message to be displayed while waiting for the operation to finish.\n", " It has a default value of \"Waiting for operation to finish.\"\n", - " \n", + "\n", " Returns:\n", " Dictionary representing JSON data.\n", " \"\"\"\n", - " \n", - " tqdm.write(message,end='')\n", + "\n", + " tqdm.write(message, end=\"\")\n", " url = get_base_url(operation_name)\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " get_operation_response = requests.get(url, headers=headers)\n", " get_operation_response.raise_for_status()\n", - " if not 'done' in get_operation_response.json() or not get_operation_response.json()['done']:\n", + " if (\n", + " not \"done\" in get_operation_response.json()\n", + " or not get_operation_response.json()[\"done\"]\n", + " ):\n", " time.sleep(1)\n", - " return get_operation_result(operation_name, message = \".\")\n", + " return get_operation_result(operation_name, message=\".\")\n", " tqdm.write(\"\")\n", " return get_operation_response.json()\n", "\n", - "def get_processor_details(processor_name :str) -> Dict[str,str]:\n", + "\n", + "def get_processor_details(processor_name: str) -> Dict[str, str]:\n", " \"\"\"\n", " This function is used to retrieve the processor details using processor name.\n", - " \n", - " Args: \n", + "\n", + " Args:\n", " processor_name (str) : This is the processor name for which you want to retrieve the details of a processor.\n", - " \n", + "\n", " Returns:\n", " Dictionary representing JSON data of processor.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Getting processor details\")\n", " url = get_base_url(processor_name)\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " get_processor_response = requests.get(url, headers=headers)\n", " get_processor_response.raise_for_status()\n", " return get_processor_response.json()\n", "\n", - "def get_processor_version_details(processor_name : str, version_name : str) -> str:\n", + "\n", + "def get_processor_version_details(processor_name: str, version_name: str) -> str:\n", " \"\"\"\n", " This function is used to get the processor version details.\n", - " \n", - " Args: \n", + "\n", + " Args:\n", " processor_name (str) : This is the name of the processor for which you want to retrieve details.\n", " version_name (str) : This is the name of the version for which you want to retrieve details.\n", - " \n", + "\n", " Returns:\n", " A String containing the deployed_version displayName.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Getting processor version details\")\n", " url = get_base_url(processor_name) + \"/processorVersions\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " get_processor_version_response = requests.get(url, headers=headers)\n", " get_processor_version_response.raise_for_status()\n", - " deployed_version = ''\n", - " for data in get_processor_version_response.json()['processorVersions']:\n", - " if data['name'] == version_name and data['state'] == 'DEPLOYED':\n", - " deployed_version = data['displayName']\n", + " deployed_version = \"\"\n", + " for data in get_processor_version_response.json()[\"processorVersions\"]:\n", + " if data[\"name\"] == version_name and data[\"state\"] == \"DEPLOYED\":\n", + " deployed_version = 
data[\"displayName\"]\n", " print(deployed_version)\n", " break\n", " return deployed_version\n", "\n", - "def get_processor_dataset_schema(processor_name : str) -> Dict[str,str]:\n", + "\n", + "def get_processor_dataset_schema(processor_name: str) -> Dict[str, str]:\n", " \"\"\"\n", " This function is used to get the processor dataset schema.\n", - " \n", - " Args: \n", + "\n", + " Args:\n", " processor_name (str) : This is the name of the processor for which you want to retrieve dataset schema.\n", - " \n", + "\n", " Returns:\n", " Dictionary representing JSON data of processor schema.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Getting processor dataset schema\")\n", " url = get_base_url(processor_name) + \"/dataset/datasetSchema\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " get_schema_response = requests.get(url, headers=headers)\n", " get_schema_response.raise_for_status()\n", " return get_schema_response.json()\n", "\n", - "def create_processor(project_id: str, location: str, processor_details: Dict[str,str] , kms_key_name: str = \"\") -> str:\n", + "\n", + "def create_processor(\n", + " project_id: str,\n", + " location: str,\n", + " processor_details: Dict[str, str],\n", + " kms_key_name: str = \"\",\n", + ") -> str:\n", " \"\"\"\n", " This function is used to create a processor in the destination project.\n", - " \n", + "\n", " Args:\n", " project_id (str): This is a string representing the ID of the project.\n", " location (str): This is a string representing the location of the project.\n", " processor_details (dictionary): This is a dictionary containing details about the processor being created.\n", - " kms_key_name (str): This is a string representing the Key Management Service (KMS) key name. 
\n", + " kms_key_name (str): This is a string representing the Key Management Service (KMS) key name.\n", " It has a default value of an empty string.\n", - " \n", - " Returns: \n", + "\n", + " Returns:\n", " A string representing the name of the created processor.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Create processor\")\n", " url = f\"https://{location}-documentai.googleapis.com/uiv1beta3/projects/{project_id}/locations/{location}/processors\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'} \n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " create_processor_request = {\n", - " \"type\": processor_details['type'],\n", - " \"displayName\": processor_details['displayName'] + \"_v2\", \n", + " \"type\": processor_details[\"type\"],\n", + " \"displayName\": processor_details[\"displayName\"] + \"_v2\",\n", " }\n", " # enable CMEK if kms_key_name not empty\n", " if kms_key_name:\n", - " create_processor_request['kms_key_name'] = kms_key_name\n", - " create_processor_response = requests.post(url, headers=headers, json=create_processor_request)\n", + " create_processor_request[\"kms_key_name\"] = kms_key_name\n", + " create_processor_response = requests.post(\n", + " url, headers=headers, json=create_processor_request\n", + " )\n", " create_processor_response.raise_for_status()\n", - " return create_processor_response.json()['name']\n", + " return create_processor_response.json()[\"name\"]\n", "\n", - "def add_processor_dataset(processor_name : str, dataset_gcs_uri : str, project_id : str) -> Dict[str,str]:\n", + "\n", + "def add_processor_dataset(\n", + " processor_name: str, dataset_gcs_uri: str, project_id: str\n", + ") -> Dict[str, str]:\n", " \"\"\"\n", " This function is used to add processor dataset into destination project.\n", - " \n", + "\n", " Args:\n", " processor_name (str): This is a string representing the name or identifier of the processor.\n", " dataset_gcs_uri (str): This is a string representing the URI of the dataset in Google Cloud Storage.\n", " project_id (str): This is a string representing the ID of the Document AI project.\n", - " \n", + "\n", " Returns:\n", " Return value would likely be a JSON object containing information about the operation status or result.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Add processor dataset\")\n", " # first check if bucket of dataset_gcs_uri exists\n", " create_destination_dataset_bucket(project_id, dataset_gcs_uri)\n", " url = get_base_url(processor_name) + \"/dataset\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " # dataset_string = {'gcsManagedConfig': {'gcsPrefix': {'gcsUriPrefix': 'gs://bachir_test'}}\n", - " update_dataset_request = {'gcsManagedConfig': {'gcsPrefix': {'gcsUriPrefix': dataset_gcs_uri}} ,'spannerIndexingConfig':{}}\n", - " add_dataset_response = requests.patch(url, headers=headers, json=update_dataset_request)\n", + " update_dataset_request = {\n", + " \"gcsManagedConfig\": {\"gcsPrefix\": {\"gcsUriPrefix\": dataset_gcs_uri}},\n", + " \"spannerIndexingConfig\": {},\n", + " }\n", + " add_dataset_response = requests.patch(\n", + " url, headers=headers, json=update_dataset_request\n", + " )\n", " add_dataset_response.raise_for_status()\n", - " add_dataset_result = get_operation_result(add_dataset_response.json()['name'])\n", + " add_dataset_result = get_operation_result(add_dataset_response.json()[\"name\"])\n", " return add_dataset_result\n", "\n", - 
"def update_processor_dataset_schema(processor_name : str, schema : Dict[str,str]) -> Dict[str,str]:\n", + "\n", + "def update_processor_dataset_schema(\n", + " processor_name: str, schema: Dict[str, str]\n", + ") -> Dict[str, str]:\n", " \"\"\"\n", " This function is responsible for updating the processor dataset schema in Document AI Project.\n", - " \n", + "\n", " Args:\n", " processor_name (str) : This is a string representing the name or identifier of the processor.\n", " schema (dictionary) : This is a dictionary containing the updated schema for the dataset.\n", - " \n", + "\n", " Returns:\n", " Dictionary representing JSON data likely to have information about the status of the schema update.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Updating processor dataset schema\")\n", " url = get_base_url(processor_name) + \"/dataset/datasetSchema\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " update_schema_response = requests.patch(url, headers=headers, json=schema)\n", " update_schema_response.raise_for_status()\n", " return update_schema_response.json()\n", "\n", - "def get_dataset_split_stats(processor_name : str) -> Dict[str,str]:\n", + "\n", + "def get_dataset_split_stats(processor_name: str) -> Dict[str, str]:\n", " \"\"\"\n", " This function retrieves statistics about dataset splits associated with a processor in a Document AI project.\n", - " \n", + "\n", " Args:\n", " processor_name (str) : This is a string representing the name or identifier of the processor.\n", - " \n", + "\n", " Returns:\n", " Dictionary representing JSON data contains information about the dataset split statistics.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Getting dataset split statistics\")\n", " url = get_base_url(processor_name) + \"/dataset:getAllDatasetSplitStats\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " get_dataset_split_stats_response = requests.get(url, headers=headers)\n", " get_dataset_split_stats_response.raise_for_status()\n", " return get_dataset_split_stats_response.json()\n", "\n", - "def list_processor_dataset_documents(processor_name: str, page_size: int = 50, next_page_token: str = None, dataset_split: str = None) -> Dict[str,str]:\n", + "\n", + "def list_processor_dataset_documents(\n", + " processor_name: str,\n", + " page_size: int = 50,\n", + " next_page_token: str = None,\n", + " dataset_split: str = None,\n", + ") -> Dict[str, str]:\n", " \"\"\"\n", " This function will list the processor dataset documents.\n", - " \n", + "\n", " Args:\n", " processor_name (str) : This is a string representing the name or identifier of the processor.\n", " page_size (int) : This parameter is optional and represents the number of documents to retrieve per page. If not provided, it defaults to 50.\n", " next_page_token (str) : This parameter is optional and is used for pagination. It represents a token that indicates which page of results to retrieve next.\n", " dataset_split (str) : This parameter is optional and represents a specific split of the dataset. If provided, it filters the documents based on this split type.\n", - " \n", + "\n", " Returns:\n", - " JSON content of the response which is about the listed documents. 
\n", + " JSON content of the response which is about the listed documents.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"List documents in processor dataset\")\n", " document_metadata = []\n", " url = get_base_url(processor_name) + \"/dataset:listDocuments\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " list_documents_request = {}\n", " if next_page_token:\n", - " list_documents_request['page_size'] = page_size\n", - " list_documents_request['page_token'] = next_page_token\n", + " list_documents_request[\"page_size\"] = page_size\n", + " list_documents_request[\"page_token\"] = next_page_token\n", " else:\n", - " list_documents_request['page_size'] = page_size\n", + " list_documents_request[\"page_size\"] = page_size\n", " if dataset_split:\n", - " list_documents_request['filter'] = f\"SplitType={dataset_split}\"\n", - " list_documents_response = requests.post(url, headers=headers, json=list_documents_request)\n", + " list_documents_request[\"filter\"] = f\"SplitType={dataset_split}\"\n", + " list_documents_response = requests.post(\n", + " url, headers=headers, json=list_documents_request\n", + " )\n", " list_documents_response.raise_for_status()\n", " return list_documents_response.json()\n", "\n", - "def get_document(processor_name : str, document_metadata : Dict[str,str]) -> str:\n", + "\n", + "def get_document(processor_name: str, document_metadata: Dict[str, str]) -> str:\n", " \"\"\"\n", " This function is used to extract a specific document from the corresponding processor name.\n", - " \n", + "\n", " Args:\n", " processor_name (str): This is a string representing the name or identifier of the processor.\n", " document_metadata (dictionary): This is a dictionary containing metadata about the document to retrieve.\n", - " \n", + "\n", " Returns:\n", " A string representing document ID.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Get document\")\n", " url = get_base_url(processor_name) + \"/dataset:getDocument\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", - " params = {'documentId.gcsManagedDocId.gcsUri': document_metadata['documentId']['gcsManagedDocId']['gcsUri']}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", + " params = {\n", + " \"documentId.gcsManagedDocId.gcsUri\": document_metadata[\"documentId\"][\n", + " \"gcsManagedDocId\"\n", + " ][\"gcsUri\"]\n", + " }\n", " get_document_response = requests.get(url, headers=headers, params=params)\n", " get_document_response.raise_for_status()\n", - " return get_document_response.json()['document']\n", + " return get_document_response.json()[\"document\"]\n", + "\n", "\n", - "def upload_document(destination_dataset_gcs_uri : str, display_name : str, document : Dict[str,str]) -> str:\n", + "def upload_document(\n", + " destination_dataset_gcs_uri: str, display_name: str, document: Dict[str, str]\n", + ") -> str:\n", " \"\"\"\n", " This function is used to upload document into a temporary GCS bucket.\n", - " \n", + "\n", " Args:\n", " destination_dataset_gcs_uri (str) : This is the GCS bucket path where the document will be copied over to the destination project dataset.\n", " display_name (str) : This is a string which contains display name of the document.\n", " document (dictionary) : This is a dictionary representing the content of the document.\n", - " \n", + "\n", " Returns:\n", - " A string representing the GCS URI. 
\n", + " A string representing the GCS URI.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(f\"Upload document to temporary GCS import location\")\n", " storage_client = storage.Client()\n", - " gcs_uri = destination_dataset_gcs_uri.strip('/') + '/import/' + display_name\n", + " gcs_uri = destination_dataset_gcs_uri.strip(\"/\") + \"/import/\" + display_name\n", " blob = storage.Blob.from_string(gcs_uri, storage_client)\n", - " blob.upload_from_string(json.dumps(document), content_type='application/json')\n", + " blob.upload_from_string(json.dumps(document), content_type=\"application/json\")\n", " return gcs_uri\n", - " \n", - "def remove_imported_document(gcs_uri : str) -> None:\n", + "\n", + "\n", + "def remove_imported_document(gcs_uri: str) -> None:\n", " \"\"\"\n", " This function is used to remove the imported documents from temporary bucket.\n", - " \n", + "\n", " Args:\n", - " gcs_uri (str) : This is the bucket from which documents needs to be removed. \n", + " gcs_uri (str) : This is the bucket from which documents needs to be removed.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Remove document from temporary GCS import location\")\n", " storage_client = storage.Client()\n", " blob = storage.Blob.from_string(gcs_uri, storage_client)\n", " blob.delete()\n", "\n", - "def migrate_documents(source_processor_name : str, destination_processor_name : str, destination_dataset_gcs_uri : str) -> None:\n", + "\n", + "def migrate_documents(\n", + " source_processor_name: str,\n", + " destination_processor_name: str,\n", + " destination_dataset_gcs_uri: str,\n", + ") -> None:\n", " \"\"\"\n", " This function is used to migrate documents from source processor to destination processor.\n", - " \n", + "\n", " Args:\n", " source_processor_name (str) : This is a String containing the source processor name of Document AI project.\n", " destination_processor_name (str) : This is a String containing the destination processor name of Document AI project.\n", " destination_dataset_gcs_uri (str) : The GCS bucket path which is used for the destination processor dataset, automatically created if it does not exist.\n", - " \n", + "\n", " Raise:\n", " ValueError : \"List document response is missing documentMetadata\"\n", " \"\"\"\n", - " \n", + "\n", " get_dataset_split_stats_response = get_dataset_split_stats(source_processor_name)\n", - " total_documents = sum(dataset_split_stat.get('datasetStats',{}).get('documentCount',0) for dataset_split_stat in get_dataset_split_stats_response['splitStats'])\n", + " total_documents = sum(\n", + " dataset_split_stat.get(\"datasetStats\", {}).get(\"documentCount\", 0)\n", + " for dataset_split_stat in get_dataset_split_stats_response[\"splitStats\"]\n", + " )\n", " print(total_documents)\n", - " progress_bar = tqdm(total=total_documents,desc=\"Migrating documents\",unit=\"document(s)\")\n", - " print(f\"Migrating {total_documents} documents\") \n", - " \n", + " progress_bar = tqdm(\n", + " total=total_documents, desc=\"Migrating documents\", unit=\"document(s)\"\n", + " )\n", + " print(f\"Migrating {total_documents} documents\")\n", + "\n", " counter = 0\n", " s = set()\n", - " for dataset_split in [\"DATASET_SPLIT_TEST\",\"DATASET_SPLIT_TRAIN\",\"DATASET_SPLIT_UNASSIGNED\"]:\n", - " total_documents = sum(dataset_split_stat.get('datasetStats',{}).get('documentCount',0) for dataset_split_stat in get_dataset_split_stats_response['splitStats'] if dataset_split_stat.get('type',\"\") == dataset_split)\n", - " print(f\" Migrating {total_documents} documents of dataset 
split type {dataset_split}\")\n", + " for dataset_split in [\n", + " \"DATASET_SPLIT_TEST\",\n", + " \"DATASET_SPLIT_TRAIN\",\n", + " \"DATASET_SPLIT_UNASSIGNED\",\n", + " ]:\n", + " total_documents = sum(\n", + " dataset_split_stat.get(\"datasetStats\", {}).get(\"documentCount\", 0)\n", + " for dataset_split_stat in get_dataset_split_stats_response[\"splitStats\"]\n", + " if dataset_split_stat.get(\"type\", \"\") == dataset_split\n", + " )\n", + " print(\n", + " f\" Migrating {total_documents} documents of dataset split type {dataset_split}\"\n", + " )\n", " next_page_token = None\n", " while True:\n", " out = Output()\n", " display(out)\n", - " with out: \n", - " list_documents_response = list_processor_dataset_documents(source_processor_name, next_page_token = next_page_token, dataset_split = dataset_split) \n", + " with out:\n", + " list_documents_response = list_processor_dataset_documents(\n", + " source_processor_name,\n", + " next_page_token=next_page_token,\n", + " dataset_split=dataset_split,\n", + " )\n", " clear_output()\n", " if not list_documents_response:\n", " break\n", - " if 'documentMetadata' in list_documents_response:\n", - " document_metadata_list = list_documents_response['documentMetadata']\n", + " if \"documentMetadata\" in list_documents_response:\n", + " document_metadata_list = list_documents_response[\"documentMetadata\"]\n", " else:\n", " raise ValueError(\"List document response is missing documentMetadata\")\n", " print(f\" Migrating batch of {len(document_metadata_list)} documents\")\n", " out = Output()\n", " display(out)\n", - " with out: \n", + " with out:\n", " gcs_documents = []\n", " for document_metadata in document_metadata_list:\n", " document = get_document(source_processor_name, document_metadata)\n", - " \n", + "\n", " if document_metadata[\"displayName\"] not in s:\n", - " gcs_uri = upload_document(destination_dataset_gcs_uri, document_metadata['displayName'], document)\n", - " gcs_document = { 'gcsUri': gcs_uri, 'mimeType': \"application/json\" }\n", - " gcs_documents.append(gcs_document) \n", + " gcs_uri = upload_document(\n", + " destination_dataset_gcs_uri,\n", + " document_metadata[\"displayName\"],\n", + " document,\n", + " )\n", + " gcs_document = {\n", + " \"gcsUri\": gcs_uri,\n", + " \"mimeType\": \"application/json\",\n", + " }\n", + " gcs_documents.append(gcs_document)\n", " s.add(document_metadata[\"displayName\"])\n", " else:\n", - " print(\"removed document as it is already present in the new processor\", dataset_split)\n", + " print(\n", + " \"removed document as it is already present in the new processor\",\n", + " dataset_split,\n", + " )\n", " counter += 1\n", " clear_output()\n", - " \n", - " \n", + "\n", " for gcs_document in gcs_documents:\n", " try:\n", - " remove_imported_document(gcs_document['gcsUri']) \n", + " remove_imported_document(gcs_document[\"gcsUri\"])\n", " clear_output()\n", " except:\n", - " print(\"file removal error\") \n", + " print(\"file removal error\")\n", " progress_bar.update(len(document_metadata_list))\n", " try:\n", - " next_page_token = list_documents_response['nextPageToken']\n", + " next_page_token = list_documents_response[\"nextPageToken\"]\n", " except KeyError:\n", " break\n", " except:\n", @@ -732,49 +835,66 @@ " print(\"len(set)= \", len(s))\n", " print(\"set = \", s)\n", "\n", - "def train_processor(destination_processor_name : str,version_display_name : str) -> Dict[str,str]:\n", + "\n", + "def train_processor(\n", + " destination_processor_name: str, version_display_name: str\n", + ") 
-> Dict[str, str]:\n", " \"\"\"\n", " This function is used to train the destination processor for the required version.\n", - " \n", + "\n", " Args:\n", " destination_processor_name (str) : This is the name of the destination processor for which you want to train the processor.\n", " version_display_name (str) : This is the name of the version for which you want to train the processsor.\n", - " \n", + "\n", " Returns:\n", " Dictionary of JSON data.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Training Processor\")\n", " url = get_base_url(destination_processor_name) + \"/processorVersions:train\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", - " train_processor_request = {'processorVersion': {'displayName': version_display_name}}\n", - " train_processor_response = requests.post(url, headers=headers, json=train_processor_request)\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", + " train_processor_request = {\n", + " \"processorVersion\": {\"displayName\": version_display_name}\n", + " }\n", + " train_processor_response = requests.post(\n", + " url, headers=headers, json=train_processor_request\n", + " )\n", " train_processor_response.raise_for_status()\n", " return train_processor_response\n", "\n", - "def deploy_processor(trained_processor_version : str) -> str:\n", + "\n", + "def deploy_processor(trained_processor_version: str) -> str:\n", " \"\"\"\n", " This function is used to deploy the processor after its usage inorder to avoid Quota Issues.\n", - " \n", + "\n", " Args:\n", " trained_processor_version (str) : This is a string having trained processor version.\n", - " \n", - " Returns: \n", + "\n", + " Returns:\n", " A String having the name of deployed processor.\n", " \"\"\"\n", - " \n", + "\n", " tqdm.write(\"Deploying Processor\")\n", " url = get_base_url(trained_processor_version) + \":deploy\"\n", - " headers = {'Authorization': f'Bearer {get_access_token()}'}\n", + " headers = {\"Authorization\": f\"Bearer {get_access_token()}\"}\n", " deploy_processor_response = requests.post(url, headers=headers)\n", " deploy_processor_response.raise_for_status()\n", - " deploy_processor_result = get_operation_result(deploy_processor_response.json()['name'])\n", + " deploy_processor_result = get_operation_result(\n", + " deploy_processor_response.json()[\"name\"]\n", + " )\n", " return deploy_processor_result\n", "\n", - "def migrate_processor(source_processor_name : str, destination_project_id : str, destination_processor_location : str, destination_dataset_gcs_uri : str,kms_key_name : str) -> None:\n", + "\n", + "def migrate_processor(\n", + " source_processor_name: str,\n", + " destination_project_id: str,\n", + " destination_processor_location: str,\n", + " destination_dataset_gcs_uri: str,\n", + " kms_key_name: str,\n", + ") -> None:\n", " \"\"\"\n", " This is the main function which we need to run for migration of processor from one project to another.\n", - " \n", + "\n", " Args:\n", " source_processor_name (str) : This is a String containing the source processor name of Document AI project.\n", " destination_project_id (str) : This is a String containing the destination project ID of the destination project.\n", @@ -782,22 +902,44 @@ " destination_dataset_gcs_uri (str) : The GCS bucket path which is used for the destination processor dataset, automatically created if it does not exist.\n", " kms_key_name (str) : This is a string representing the Key Management Service (KMS) key name.\n", " \"\"\"\n", - " \n", + "\n", " 
processor_details = get_processor_details(source_processor_name)\n", - " tqdm.write(f\"Migrating processor {processor_details['displayName']} of type {processor_details['type']}\")\n", - " destination_processor_name = create_processor(destination_project_id,destination_processor_location,processor_details)\n", - " tqdm.write(f\"Destination processor created with processor name {destination_processor_name}\")\n", - " add_processor_dataset(destination_processor_name, destination_dataset_gcs_uri, destination_project_id)\n", + " tqdm.write(\n", + " f\"Migrating processor {processor_details['displayName']} of type {processor_details['type']}\"\n", + " )\n", + " destination_processor_name = create_processor(\n", + " destination_project_id, destination_processor_location, processor_details\n", + " )\n", + " tqdm.write(\n", + " f\"Destination processor created with processor name {destination_processor_name}\"\n", + " )\n", + " add_processor_dataset(\n", + " destination_processor_name, destination_dataset_gcs_uri, destination_project_id\n", + " )\n", " schema = get_processor_dataset_schema(source_processor_name)\n", " update_processor_dataset_schema(destination_processor_name, schema)\n", - " create_destination_dataset_bucket(destination_project_id, destination_exported_dataset_gcs_uri)\n", - " move_exported_dataset(source_exported_dataset_gcs_uri, destination_exported_dataset_gcs_uri)\n", - " import_document_by_type(destination_processor_name, gcs_documents_train, 'DATASET_SPLIT_TRAIN')\n", - " import_document_by_type(destination_processor_name, gcs_documents_test, 'DATASET_SPLIT_TEST') \n", - " \n", - " tqdm.write(f\"Link to UI of migrated processor dataset: https://console.cloud.google.com/ai/document-ai/{'/'.join(destination_processor_name.split('/')[2:])}/dataset?project={destination_project_id}\")\n", - " version_display_name = get_processor_version_details(source_processor_name, processor_details['defaultProcessorVersion'])\n", - " trained_processor_response = train_processor(destination_processor_name,version_display_name)" + " create_destination_dataset_bucket(\n", + " destination_project_id, destination_exported_dataset_gcs_uri\n", + " )\n", + " move_exported_dataset(\n", + " source_exported_dataset_gcs_uri, destination_exported_dataset_gcs_uri\n", + " )\n", + " import_document_by_type(\n", + " destination_processor_name, gcs_documents_train, \"DATASET_SPLIT_TRAIN\"\n", + " )\n", + " import_document_by_type(\n", + " destination_processor_name, gcs_documents_test, \"DATASET_SPLIT_TEST\"\n", + " )\n", + "\n", + " tqdm.write(\n", + " f\"Link to UI of migrated processor dataset: https://console.cloud.google.com/ai/document-ai/{'/'.join(destination_processor_name.split('/')[2:])}/dataset?project={destination_project_id}\"\n", + " )\n", + " version_display_name = get_processor_version_details(\n", + " source_processor_name, processor_details[\"defaultProcessorVersion\"]\n", + " )\n", + " trained_processor_response = train_processor(\n", + " destination_processor_name, version_display_name\n", + " )" ] }, { diff --git a/incubator-tools/parsed_json_split_address/parsed_json_split_address.ipynb b/incubator-tools/parsed_json_split_address/parsed_json_split_address.ipynb index dbe0ed455..bd8770836 100644 --- a/incubator-tools/parsed_json_split_address/parsed_json_split_address.ipynb +++ b/incubator-tools/parsed_json_split_address/parsed_json_split_address.ipynb @@ -152,11 +152,10 @@ "output_path = (\n", " \"gs://xxxx_xxxxxxx_xxxxxxxx/Processed_Splitter/\" # path should end with '/'\n", ")\n", - 
"#Entity Name that needs to be splitted\n", + "# Entity Name that needs to be splitted\n", "entity_name = \"ship_to_address_line\"\n", "\n", "\n", - "\n", "input_storage_bucket_name = input_path.split(\"/\")[2]\n", "input_bucket_path_prefix = \"/\".join(input_path.split(\"/\")[3:])\n", "output_storage_bucket_name = output_path.split(\"/\")[2]\n", diff --git a/incubator-tools/schema_comparision/schema_comparision.ipynb b/incubator-tools/schema_comparision/schema_comparision.ipynb index 7137b9c6d..29d235824 100644 --- a/incubator-tools/schema_comparision/schema_comparision.ipynb +++ b/incubator-tools/schema_comparision/schema_comparision.ipynb @@ -135,23 +135,25 @@ "source": [ "import json\n", "\n", + "\n", "def load_schema(parser_type):\n", " # Map the parser types to their respective file names\n", " schema_files = {\n", - " 'I': 'invoice_schema.json',\n", - " 'P': 'purchase_order_schema.json',\n", - " 'C': 'contract_schema.json'\n", + " \"I\": \"invoice_schema.json\",\n", + " \"P\": \"purchase_order_schema.json\",\n", + " \"C\": \"contract_schema.json\",\n", " }\n", "\n", " file_name = schema_files.get(parser_type)\n", " if not file_name:\n", " raise ValueError(\"Invalid parser type\")\n", "\n", - " with open(file_name, 'r') as file:\n", + " with open(file_name, \"r\") as file:\n", " schemas = json.load(file)\n", " top_level_key = next(iter(schemas))\n", " return schemas[top_level_key]\n", "\n", + "\n", "Base_schema = load_schema(parser)\n", "# print(Base_schema)" ]