From 80c130b927ec87f3a8119bf20bff739e4e661a89 Mon Sep 17 00:00:00 2001 From: Deepak Moonat Date: Thu, 19 Oct 2023 13:28:14 +0530 Subject: [PATCH] Add utilities and modularize code --- ...hild_Entity_Tag_Using_Header_Keyword.ipynb | 726 ------------- .../Document_AI_Parser_Result_Merger.ipynb | 686 ------------- .../HITL Rejected Documents Tracking.ipynb} | 4 +- .../readme.md | 19 + .../Identifying_Poor_Performing_Docs.ipynb | 666 ++++++++++++ .../readme.md | 23 + .../Key Value Pair Entity Conversion.ipynb | 335 ++++++ .../readme.md | 18 + .../Key_Value_Pair_Entity_Conversion.ipynb | 459 --------- .../DocAI Parser Result Merger.ipynb | 608 +++++++++++ .../Parser Result Merger/readme.md | 29 + .../Pre Post Bounding Box Mismatch.ipynb | 394 +++++++ .../Pre Post Bounding Box Mismatch/readme.md | 56 + .../Pre and Post HITL Visualization.ipynb | 428 ++++++++ .../Pre Post HITL Visualization/readme.md | 26 + .../Pre_Post_HITL_Bounding_Box_Mismatch.ipynb | 813 --------------- .../Pre_and_Post_HITL_Visualization.ipynb | 962 ------------------ .../best-practices/README.md | 2 +- .../Removing Empty Bounding Boxes.ipynb | 324 ++++++ .../Removing Empty Bounding Boxes/readme.md | 21 + .../Removing_Empty_Bounding_Boxes.ipynb | 420 -------- .../best-practices/Utilities/readme.md | 33 + .../best-practices/Utilities/utilities.py | 712 +++++++++++++ 23 files changed, 3696 insertions(+), 4068 deletions(-) delete mode 100644 DocAI Incubator Tools/best-practices/Child_Entity_Tag_Using_Header_Keyword.ipynb delete mode 100644 DocAI Incubator Tools/best-practices/Document_AI_Parser_Result_Merger.ipynb rename DocAI Incubator Tools/best-practices/{HITL_Rejected_Documents_Tracking.ipynb => HITL Rejected Documents Tracking/HITL Rejected Documents Tracking.ipynb} (99%) create mode 100644 DocAI Incubator Tools/best-practices/HITL Rejected Documents Tracking/readme.md create mode 100644 DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/Identifying_Poor_Performing_Docs.ipynb create mode 100644 DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/readme.md create mode 100644 DocAI Incubator Tools/best-practices/Key Value Pair Entity Conversion/Key Value Pair Entity Conversion.ipynb create mode 100644 DocAI Incubator Tools/best-practices/Key Value Pair Entity Conversion/readme.md delete mode 100644 DocAI Incubator Tools/best-practices/Key_Value_Pair_Entity_Conversion.ipynb create mode 100644 DocAI Incubator Tools/best-practices/Parser Result Merger/DocAI Parser Result Merger.ipynb create mode 100644 DocAI Incubator Tools/best-practices/Parser Result Merger/readme.md create mode 100644 DocAI Incubator Tools/best-practices/Pre Post Bounding Box Mismatch/Pre Post Bounding Box Mismatch.ipynb create mode 100644 DocAI Incubator Tools/best-practices/Pre Post Bounding Box Mismatch/readme.md create mode 100644 DocAI Incubator Tools/best-practices/Pre Post HITL Visualization/Pre and Post HITL Visualization.ipynb create mode 100644 DocAI Incubator Tools/best-practices/Pre Post HITL Visualization/readme.md delete mode 100644 DocAI Incubator Tools/best-practices/Pre_Post_HITL_Bounding_Box_Mismatch.ipynb delete mode 100644 DocAI Incubator Tools/best-practices/Pre_and_Post_HITL_Visualization.ipynb create mode 100644 DocAI Incubator Tools/best-practices/Removing Empty Bounding Boxes/Removing Empty Bounding Boxes.ipynb create mode 100644 DocAI Incubator Tools/best-practices/Removing Empty Bounding Boxes/readme.md delete mode 100644 DocAI Incubator Tools/best-practices/Removing_Empty_Bounding_Boxes.ipynb 
create mode 100644 DocAI Incubator Tools/best-practices/Utilities/readme.md create mode 100644 DocAI Incubator Tools/best-practices/Utilities/utilities.py diff --git a/DocAI Incubator Tools/best-practices/Child_Entity_Tag_Using_Header_Keyword.ipynb b/DocAI Incubator Tools/best-practices/Child_Entity_Tag_Using_Header_Keyword.ipynb deleted file mode 100644 index 4f8992323..000000000 --- a/DocAI Incubator Tools/best-practices/Child_Entity_Tag_Using_Header_Keyword.ipynb +++ /dev/null @@ -1,726 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "fd54d653-9889-4360-b906-ec5d73384a31", - "metadata": {}, - "source": [ - "# Child Entity Tag Using Header Keyword" - ] - }, - { - "cell_type": "markdown", - "id": "7d9dbafb-8af8-4b93-a949-44468baaa57a", - "metadata": {}, - "source": [ - "* Author: docai-incubator@google.com" - ] - }, - { - "cell_type": "markdown", - "id": "8923889f-fc12-4dfb-a899-20afd9fcbd24", - "metadata": {}, - "source": [ - "## Disclaimer\n", - "\n", - "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.\t" - ] - }, - { - "cell_type": "markdown", - "id": "28425178-9c4e-4874-8d41-bdaafed22338", - "metadata": {}, - "source": [ - "## Purpose and Description" - ] - }, - { - "cell_type": "markdown", - "id": "c89800ca-a72e-4f4b-987d-2eb5bc719692", - "metadata": {}, - "source": [ - "This tool uses labeled json files in GCS bucket and header words as input and creates a new child entity tagging the values under the header keyword matching." - ] - }, - { - "cell_type": "markdown", - "id": "ee5e12b0-c231-4169-985e-ecd0bdf70708", - "metadata": {}, - "source": [ - "## Prerequisites" - ] - }, - { - "cell_type": "markdown", - "id": "75d3b6c1-0594-45d4-8231-2adc8bcd5aa9", - "metadata": {}, - "source": [ - "1. Vertex AI Notebook\n", - "2. Labeled json files in GCS Folder" - ] - }, - { - "cell_type": "markdown", - "id": "37c1e9af-e86d-497d-93a7-b5bfc0feb79e", - "metadata": {}, - "source": [ - "## Step by Step procedure " - ] - }, - { - "cell_type": "markdown", - "id": "e2d95dc5-7402-4d68-a0f9-e8806d9bffb6", - "metadata": {}, - "source": [ - "### 1. Input Details" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5a642209-19ed-4b50-be6e-61464b18884d", - "metadata": {}, - "outputs": [], - "source": [ - "# input details\n", - "Gcs_input_path = \"gs://xxxx/xxxx/xxxx/\"\n", - "Gcs_output_path = \"gs://xxxx/xxxx/xxxx/\"\n", - "list_total_amount = [\n", - " \"Total value\",\n", - " \"Amount\",\n", - " \"Nettowert\",\n", - " \"Nettowert in EUR\",\n", - " \"Wert\",\n", - " \"Importo\",\n", - " \"Nettobetrag\",\n", - " \"Extension\",\n", - " \"Net value\",\n", - " \"Ext. price\",\n", - " \"Extended Amt\",\n", - " \"Costo riga\",\n", - " \"Imp. 
Netto\",\n", - " \"Summe\",\n", - " \"Gesamtpreis\",\n", - " \"Gesamt\",\n", - " \"Gesamtgewicht\",\n", - " \"Betrag\",\n", - " \"Bedrag\",\n", - " \"Wartość\",\n", - " \"Wartość netto\",\n", - " \"Value\",\n", - " \"TOTAL\",\n", - " \"Line Total\",\n", - " \"Net\",\n", - " \"Net Amount\",\n", - " \"cost\",\n", - " \"Subtotal\",\n", - "]\n", - "project_id = \"xxxx-xxxx-xxx\"\n", - "total_amount_type = \"line_item/total_amount\"" - ] - }, - { - "cell_type": "markdown", - "id": "d52ae8d7-d250-4c2b-b64f-5818e9bbf5eb", - "metadata": {}, - "source": [ - "In the above input , the **list_total_amount** is the list of header words have to be used and the values under those headers will be tagged with child type **total_amount_type**" - ] - }, - { - "cell_type": "markdown", - "id": "88fc6767-f38b-4b2a-81f2-93c338cd9956", - "metadata": {}, - "source": [ - "![Alt Text](https://screenshot.googleplex.com/AofLjF6XqhZ4aEC.png)" - ] - }, - { - "cell_type": "markdown", - "id": "bd852f45-92ca-449a-bfa4-431f4fbe0621", - "metadata": {}, - "source": [ - "**THIS TOOL ONLY CREATES A CHILD ENTITY , TO GROUP THE CHILD ITEM TO PARENT ITEM USE [go/docai-line-items-improver-post-processing](go/docai-line-items-improver-post-processing)\n", - " AFTER TAGGING CHILD ITEM**" - ] - }, - { - "cell_type": "markdown", - "id": "049b9e6d-4e3a-4727-80f5-143a4181a1ae", - "metadata": {}, - "source": [ - "### 2. Run the Code" - ] - }, - { - "cell_type": "markdown", - "id": "fc0f48a8-d694-490d-b4be-d49d260a3158", - "metadata": {}, - "source": [ - "Copy the code provided in the sample code section and run the code to get the updated json files" - ] - }, - { - "cell_type": "markdown", - "id": "25cb7ced-5aca-413d-b3a7-236ee8569a16", - "metadata": {}, - "source": [ - "### 3. Output" - ] - }, - { - "cell_type": "markdown", - "id": "f4cb4da9-b541-4da4-b457-2d9674565426", - "metadata": {}, - "source": [ - "The items which are below the matched keyword will be tagged as entity name given" - ] - }, - { - "cell_type": "markdown", - "id": "a947c2ff-60e8-48da-bf0b-3f81ea6b3663", - "metadata": {}, - "source": [ - "## **Sample Code**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9413a9df-d7be-4dd3-8053-67b286a51efe", - "metadata": {}, - "outputs": [], - "source": [ - "import re\n", - "\n", - "from google.cloud import storage\n", - "\n", - "\n", - "def file_names(file_path):\n", - " \"\"\"This Function will load the bucket and get the list of files\n", - " in the gs path given\n", - " args: gs path\n", - " output: file names as list and dictionary with file names as keys and file path as values\n", - " \"\"\"\n", - "\n", - " from google.cloud import storage\n", - "\n", - " bucket = file_path.split(\"/\")[2]\n", - " file_names_list = []\n", - " file_dict = {}\n", - " storage_client = storage.Client()\n", - " source_bucket = storage_client.get_bucket(bucket)\n", - " filenames = [\n", - " filename.name for filename in list(\n", - " source_bucket.list_blobs(\n", - " prefix=((\"/\").join(file_path.split(\"/\")[3:]))))\n", - " ]\n", - " for i in range(len(filenames)):\n", - " x = filenames[i].split(\"/\")[-1]\n", - " if x != \"\":\n", - " file_names_list.append(x)\n", - " file_dict[x] = filenames[i]\n", - " return file_names_list, file_dict\n", - "\n", - "\n", - "def load_json(path):\n", - " import json\n", - "\n", - " import gcsfs\n", - "\n", - " gcs_file_system = gcsfs.GCSFileSystem(project=project_id)\n", - " with gcs_file_system.open(path) as f:\n", - " json_l = json.load(f)\n", - " return json_l\n", - "\n", - "\n", - 
"def get_page_wise_entities(json_dict):\n", - " \"\"\"Args: loaded json file\n", - " THIS FUNCTION GIVES THE ENTITIES SPEPERATED FROM EACH PAGE IN DICTIONARY FORMAT\n", - " RETURNS: {page: [entities]}\"\"\"\n", - "\n", - " entities_page = {}\n", - " for entity in json_dict[\"entities\"]:\n", - " page = 0\n", - " try:\n", - " if \"page\" in entity[\"pageAnchor\"][\"pageRefs\"][0].keys():\n", - " page = entity[\"pageAnchor\"][\"pageRefs\"][0][\"page\"]\n", - "\n", - " if page in entities_page.keys():\n", - " entities_page[page].append(entity)\n", - " else:\n", - " entities_page[page] = [entity]\n", - " except:\n", - " pass\n", - " return entities_page\n", - "\n", - "\n", - "def get_token(json_dict, page, text_anchors_check):\n", - " \"\"\"THIS FUNCITON USED LOADED JSON, PAGE NUMBER AND TEXT ANCHORS AS INPUT AND GIVES THE X AND Y COORDINATES\"\"\"\n", - "\n", - " temp_xy = {\"x\": [], \"y\": []}\n", - " min_x = \"\"\n", - " for token in json_dict[\"pages\"][page][\"tokens\"]:\n", - " text_anc = token[\"layout\"][\"textAnchor\"][\"textSegments\"]\n", - " for anc in text_anc:\n", - " try:\n", - " start_temp = int(anc[\"startIndex\"])\n", - " except:\n", - " start_temp = \"0\"\n", - " end_temp = int(anc[\"endIndex\"])\n", - "\n", - " for anc3 in text_anchors_check:\n", - " start_check = int(anc3[\"startIndex\"]) - 2\n", - " end_check = int(anc3[\"endIndex\"]) + 2\n", - " if (int(start_temp) >= start_check and end_temp <= end_check\n", - " and end_temp - int(start_temp) > 3):\n", - " normalized_vertices_temp = token[\"layout\"][\"boundingPoly\"][\n", - " \"normalizedVertices\"]\n", - " for ver_xy in normalized_vertices_temp:\n", - " try:\n", - " temp_xy[\"x\"].append(ver_xy[\"x\"])\n", - " temp_xy[\"y\"].append(ver_xy[\"y\"])\n", - " except:\n", - " pass\n", - "\n", - " try:\n", - " min_x = min(temp_xy[\"x\"])\n", - " except:\n", - " min_x = \"\"\n", - " try:\n", - " min_y = min(temp_xy[\"y\"])\n", - " except:\n", - " min_y = \"\"\n", - " try:\n", - " max_x = max(temp_xy[\"x\"])\n", - " except:\n", - " max_x = \"\"\n", - " try:\n", - " max_y = max(temp_xy[\"y\"])\n", - " except:\n", - " max_y = \"\"\n", - "\n", - " return {\"min_x\": min_x, \"min_y\": min_y, \"max_x\": max_x, \"max_y\": max_y}\n", - "\n", - "\n", - "def tag_ref_child_item(\n", - " json_dict,\n", - " page,\n", - " ent_min_dict,\n", - " consider_ent,\n", - " total_amount_type,\n", - " min_y_start,\n", - " max_stop_y,\n", - "):\n", - " \"\"\"THIS FUNCTION USED THE LOADED JSON, PAGE NUMBER , DICTIONARY OF HEADER KEYWORD AND VALUES AS X AND Y COORDINATES\n", - " AND THE STOP WORD Y COORDINATE\n", - "\n", - " ARGS: LOADED JSON, PAGE NUMBER, FIRST ENTITY TO BE TAGGED, STOP WORD Y COORDINATE\n", - "\n", - " RETURNS: LIST OF LINE ITEMS TAGGING FIRST ENTITY PROVIDED\n", - "\n", - " \"\"\"\n", - " consider_type = total_amount_type\n", - " line_items_temp = []\n", - " for token in json_dict[\"pages\"][page][\"tokens\"]:\n", - " line_item_ent = {\n", - " \"confidence\": 1,\n", - " \"mentionText\": \"\",\n", - " \"pageAnchor\": {\n", - " \"pageRefs\": [{\n", - " \"boundingPoly\": {\n", - " \"normalizedVertices\": []\n", - " },\n", - " \"page\": str(page)\n", - " }]\n", - " },\n", - " \"properties\": [],\n", - " \"textAnchor\": {\n", - " \"textSegments\": []\n", - " },\n", - " \"type\": \"line_item\",\n", - " }\n", - " sub_ent = {\n", - " \"confidence\": 1,\n", - " \"mentionText\": \"\",\n", - " \"pageAnchor\": {\n", - " \"pageRefs\": [{\n", - " \"boundingPoly\": {\n", - " \"normalizedVertices\": []\n", - " },\n", - " \"page\": str(page)\n", - 
" }]\n", - " },\n", - " \"textAnchor\": {\n", - " \"textSegments\": []\n", - " },\n", - " \"type\": \"\",\n", - " }\n", - " normalized_vertices = token[\"layout\"][\"boundingPoly\"]\n", - " try:\n", - " min_x = min(\n", - " vertex[\"x\"]\n", - " for vertex in normalized_vertices[\"normalizedVertices\"])\n", - " min_y = min(\n", - " vertex[\"y\"]\n", - " for vertex in normalized_vertices[\"normalizedVertices\"])\n", - " max_x = max(\n", - " vertex[\"x\"]\n", - " for vertex in normalized_vertices[\"normalizedVertices\"])\n", - " max_y = max(\n", - " vertex[\"y\"]\n", - " for vertex in normalized_vertices[\"normalizedVertices\"])\n", - " if (min_y > min_y_start\n", - " and min_x >= ent_min_dict[consider_ent][\"min_x\"] - 0.05\n", - " and max_x <= ent_min_dict[consider_ent][\"max_x\"] + 0.1\n", - " and max_y <= max_stop_y\n", - " and max_x > ent_min_dict[consider_ent][\"min_x\"]):\n", - " end_index = token[\"layout\"][\"textAnchor\"][\"textSegments\"][0][\n", - " \"endIndex\"]\n", - " start_index = token[\"layout\"][\"textAnchor\"][\"textSegments\"][0][\n", - " \"startIndex\"]\n", - " # pattern = re.compile(r'[^a-zA-Z]')\n", - " pattern_1 = re.compile(r\"[0-9\\s\\\\\\/]+\")\n", - " if (not bool(\n", - " pattern_1.search(\n", - " json_dict[\"text\"][int(start_index):int(end_index)].\n", - " replace(\" \", \"\").replace(\"\\n\", \"\"))) == False):\n", - " # float(json_dict['text'][int(start_index):int(end_index)].replace(\" \", \"\").replace(\"\\n\", \"\"))\n", - " # print(json_dict['text'][int(start_index):int(end_index)])\n", - " line_item_ent[\"mentionText\"] = json_dict[\"text\"][\n", - " int(start_index):int(end_index)]\n", - " line_item_ent[\"pageAnchor\"][\"pageRefs\"][0][\"boundingPoly\"][\n", - " \"normalizedVertices\"] = token[\"layout\"][\n", - " \"boundingPoly\"][\"normalizedVertices\"]\n", - " line_item_ent[\"textAnchor\"][\"textSegments\"] = token[\n", - " \"layout\"][\"textAnchor\"][\"textSegments\"]\n", - " sub_ent[\"mentionText\"] = json_dict[\"text\"][\n", - " int(start_index):int(end_index)]\n", - " sub_ent[\"pageAnchor\"][\"pageRefs\"][0][\"boundingPoly\"][\n", - " \"normalizedVertices\"] = token[\"layout\"][\n", - " \"boundingPoly\"][\"normalizedVertices\"]\n", - " sub_ent[\"textAnchor\"][\"textSegments\"] = token[\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"]\n", - " sub_ent[\"type\"] = consider_type\n", - " line_item_ent[\"properties\"].append(sub_ent)\n", - " line_items_temp.append(line_item_ent)\n", - " except:\n", - " pass\n", - " # print(line_items_temp)\n", - " same_y_ent = []\n", - " for dup in line_items_temp:\n", - " temp_same_y = {\n", - " \"mentionText\": \"\",\n", - " \"min_y\": \"\",\n", - " \"max_y\": \"\",\n", - " \"min_x\": \"\",\n", - " \"text_anc\": [],\n", - " }\n", - " temp_same_y[\"mentionText\"] = dup[\"mentionText\"]\n", - " temp_norm_same_y = dup[\"pageAnchor\"][\"pageRefs\"][0][\"boundingPoly\"]\n", - " temp_same_y[\"min_y\"] = min(\n", - " vertex[\"y\"] for vertex in temp_norm_same_y[\"normalizedVertices\"])\n", - " temp_same_y[\"max_y\"] = max(\n", - " vertex[\"y\"] for vertex in temp_norm_same_y[\"normalizedVertices\"])\n", - " temp_same_y[\"min_x\"] = min(\n", - " vertex[\"x\"] for vertex in temp_norm_same_y[\"normalizedVertices\"])\n", - " temp_same_y[\"text_anc\"] = dup[\"textAnchor\"][\"textSegments\"]\n", - " same_y_ent.append(temp_same_y)\n", - " same_y_ent\n", - " sorted_same_y_ent = sorted(same_y_ent, key=lambda x: x[\"min_y\"])\n", - " groups_same_y = []\n", - " if len(sorted_same_y_ent) != 0:\n", - " current_group = 
[sorted_same_y_ent[0]]\n", - " for i in range(1, len(sorted_same_y_ent)):\n", - " if sorted_same_y_ent[i][\"min_y\"] - current_group[-1][\n", - " \"min_y\"] < 0.005:\n", - " current_group.append(sorted_same_y_ent[i])\n", - " else:\n", - " groups_same_y.append(current_group)\n", - " current_group = [sorted_same_y_ent[i]]\n", - "\n", - " # Append the last group\n", - " groups_same_y.append(current_group)\n", - " min_x_diff_list = [[\n", - " abs(elem[\"min_x\"] - ent_min_dict[consider_ent][\"min_x\"])\n", - " for elem in lst\n", - " ] for lst in groups_same_y]\n", - "\n", - " selected_elements = [\n", - " min(\n", - " lst,\n", - " key=lambda elem: abs(elem[\"min_x\"] - ent_min_dict[consider_ent][\n", - " \"min_x\"]),\n", - " ) for lst in groups_same_y\n", - " ]\n", - " if len(groups_same_y) != 0:\n", - " for group in groups_same_y:\n", - " for element in selected_elements:\n", - " for dup3 in group:\n", - " if dup3[\"text_anc\"] == element[\"text_anc\"]:\n", - " group.remove(dup3)\n", - " for group in groups_same_y:\n", - " for dup4 in group:\n", - " for e5 in line_items_temp:\n", - " if e5[\"textAnchor\"][\"textSegments\"] == dup4[\"text_anc\"]:\n", - " line_items_temp.remove(e5)\n", - "\n", - " return line_items_temp\n", - "\n", - "\n", - "def total_amount_entities(json_dict, total_amount_type):\n", - " for ent2 in json_dict[\"entities\"]:\n", - " if \"properties\" in ent2.keys() and ent2[\"type\"] == \"line_item\":\n", - " for sub_ent2 in ent2[\"properties\"]:\n", - " if \"line_item\" in sub_ent2[\"type\"]:\n", - " consider_ent_type = \"line_item/total_amount\"\n", - " else:\n", - " consider_ent_type = \"total_amount\"\n", - "\n", - " if \"/\" in consider_ent_type:\n", - " if \"/\" in total_amount_type:\n", - " pass\n", - " else:\n", - " total_amount_type = \"line_item\" + \"/\" + total_amount_type\n", - " else:\n", - " if \"/\" in total_amount_type:\n", - " total_amount_type = total_amount_type.split(\"/\")[-1]\n", - " else:\n", - " pass\n", - "\n", - " page_wise_ent = get_page_wise_entities(json_dict)\n", - " previous_page_headers = \"\"\n", - " total_amount_entities = []\n", - "\n", - " for page, ent2 in page_wise_ent.items():\n", - " line_items_all = []\n", - " # print(page)\n", - " for entity in ent2:\n", - " if \"properties\" in entity.keys() and entity[\"type\"] == \"line_item\":\n", - " line_items_all.append(entity)\n", - " # print(len(line_items_all))\n", - " if line_items_all != []:\n", - " if len(line_items_all) > 1 or len(\n", - " line_items_all[0][\"properties\"]) > 2:\n", - " min_y_line = 1\n", - " max_y_line = 0\n", - " min_y_child = 1\n", - " min_y_child_Mt = \"\"\n", - " entity_mentiontext = \"\"\n", - " for line_item in line_items_all:\n", - " norm_ver = line_item[\"pageAnchor\"][\"pageRefs\"][0][\n", - " \"boundingPoly\"][\"normalizedVertices\"]\n", - " for ver in norm_ver:\n", - " min_y_temp = min(vertex[\"y\"] for vertex in norm_ver)\n", - " max_y_temp = max(vertex[\"y\"] for vertex in norm_ver)\n", - " if min_y_line > min_y_temp:\n", - " min_y_line = min_y_temp\n", - " entity_mentiontext = line_item[\"mentionText\"]\n", - " for child_ent in line_item[\"properties\"]:\n", - " norm_ver_child = child_ent[\"pageAnchor\"][\n", - " \"pageRefs\"][0][\"boundingPoly\"][\n", - " \"normalizedVertices\"]\n", - " for ver_child in norm_ver_child:\n", - " min_y_child_temp = min(\n", - " vertex[\"y\"]\n", - " for vertex in norm_ver_child)\n", - " if min_y_child > min_y_child_temp:\n", - " min_y_child = min_y_child_temp\n", - " try:\n", - " min_y_child_Mt = child_ent[\n", - " 
\"mentionText\"]\n", - " except:\n", - " pass\n", - " # print(child_ent)\n", - " if max_y_line < max_y_temp:\n", - " max_y_line = max_y_temp\n", - " else:\n", - " pass\n", - " # print(min_y_line,max_y_line)\n", - " check_text = \"\"\n", - " start_temp = 100000000\n", - " end_temp = 0\n", - " total_amount_textanc = {}\n", - " for token in json_dict[\"pages\"][int(page)][\"tokens\"]:\n", - " normalized_vertices = token[\"layout\"][\"boundingPoly\"]\n", - " try:\n", - " max_y_temp_token = max(\n", - " vertex[\"y\"] for vertex in\n", - " normalized_vertices[\"normalizedVertices\"])\n", - " min_y_temp_token = min(\n", - " vertex[\"y\"] for vertex in\n", - " normalized_vertices[\"normalizedVertices\"])\n", - " if (min_y_line >= max_y_temp_token - 0.02 and\n", - " abs(min_y_line - min_y_temp_token) <= 0.15):\n", - " end_index = token[\"layout\"][\"textAnchor\"][\n", - " \"textSegments\"][0][\"endIndex\"]\n", - " start_index = token[\"layout\"][\"textAnchor\"][\n", - " \"textSegments\"][0][\"startIndex\"]\n", - " check_text = (check_text + json_dict[\"text\"]\n", - " [int(start_index):int(end_index)])\n", - " if int(start_temp) > int(start_index):\n", - " start_temp = int(start_index)\n", - " if int(end_temp) < int(end_index):\n", - " end_temp = int(end_index)\n", - " except Exception as e:\n", - " pass\n", - " # print(e)\n", - "\n", - " for i in list_total_amount:\n", - " if i.lower() in check_text.lower():\n", - " # print(i)\n", - " matches = re.finditer(\n", - " i.lower(),\n", - " json_dict[\"text\"]\n", - " [int(start_temp):int(end_temp)].lower(),\n", - " )\n", - " starting_indices = [match.start() for match in matches]\n", - " start_index_temp1 = max(starting_indices)\n", - " # start_index_temp1=json_dict['text'][int(start_temp):int(end_temp)].lower().find(i.lower())\n", - " # print(start_index_temp1)\n", - " start_index_1 = start_index_temp1 + int(start_temp)\n", - " end_index_1 = start_index_1 + len(i)\n", - " total_amount_textanc[i] = [{\n", - " \"startIndex\":\n", - " str(start_index_1),\n", - " \"endIndex\":\n", - " str(end_index_1),\n", - " }]\n", - " # print(start_temp,end_temp)\n", - " # print(check_text)\n", - " final_key = \"\"\n", - " for k, v in total_amount_textanc.items():\n", - " if len(final_key) < len(k):\n", - " final_key = k\n", - " # print(total_amount_textanc)\n", - " # print(final_key)\n", - " if final_key != \"\":\n", - " total_amount_dict = {\n", - " \"total_amount\":\n", - " get_token(json_dict, int(page),\n", - " total_amount_textanc[final_key])\n", - " }\n", - " previous_page_headers = total_amount_dict\n", - " else:\n", - " total_amount_dict = previous_page_headers\n", - " if len(total_amount_dict) != 0:\n", - " total_amount_line_items = tag_ref_child_item(\n", - " json_dict,\n", - " int(page),\n", - " total_amount_dict,\n", - " \"total_amount\",\n", - " total_amount_type,\n", - " min_y_line,\n", - " max_y_line,\n", - " )\n", - " for item in total_amount_line_items:\n", - " total_amount_entities.append(item)\n", - " from pprint import pprint\n", - " # pprint(total_amount_entities)\n", - "\n", - " from pprint import pprint\n", - "\n", - " for total_en in total_amount_entities:\n", - " json_dict[\"entities\"].append(total_en)\n", - " # pprint(total_en)\n", - "\n", - " return json_dict\n", - "\n", - "\n", - "import json\n", - "from pprint import pprint\n", - "\n", - "import gcsfs\n", - "from tqdm import tqdm\n", - "\n", - "fs = gcsfs.GCSFileSystem(project_id)\n", - "file_names_list, file_dict = file_names(Gcs_input_path)\n", - "count = 0\n", - "issue_files = {}\n", - 
"for filename, filepath in tqdm(file_dict.items(), desc=\"Progress\"):\n", - " input_bucket_name = Gcs_input_path.split(\"/\")[2]\n", - " if \".json\" in filepath:\n", - " filepath = \"gs://\" + input_bucket_name + \"/\" + filepath\n", - " json_dict = load_json(filepath)\n", - " print(filepath)\n", - " try:\n", - " if json_dict[\"pages\"][0][\"tokens\"] != []:\n", - " try:\n", - " json_dict = total_amount_entities(json_dict,\n", - " total_amount_type)\n", - " fs.pipe(\n", - " Gcs_output_path + \"/\" + filename,\n", - " bytes(json.dumps(json_dict, ensure_ascii=False),\n", - " \"utf-8\"),\n", - " content_type=\"application/json\",\n", - " )\n", - " except:\n", - " issue_files[filepath] = \"Some issue with Json\"\n", - " fs.pipe(\n", - " Gcs_output_path + \"/\" + filename,\n", - " bytes(json.dumps(json_dict, ensure_ascii=False),\n", - " \"utf-8\"),\n", - " content_type=\"application/json\",\n", - " )\n", - " pass\n", - " else:\n", - " issue_files[filepath] = \"No Tokens\"\n", - " count = count + 1\n", - " fs.pipe(\n", - " Gcs_output_path + \"/\" + filename,\n", - " bytes(json.dumps(json_dict, ensure_ascii=False), \"utf-8\"),\n", - " content_type=\"application/json\",\n", - " )\n", - " except:\n", - " fs.pipe(\n", - " Gcs_output_path + \"/\" + filename,\n", - " bytes(json.dumps(json_dict, ensure_ascii=False), \"utf-8\"),\n", - " content_type=\"application/json\",\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "9eb8bfda-6814-45e8-8441-f212d2c80dd5", - "metadata": {}, - "source": [ - "After this tool , run [go/docai-line-items-improver-post-processing](go/docai-line-items-improver-post-processing) for grouping the line items with respect to the new child item created." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "539486c3-0047-489b-a347-d1c904d4d9e8", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "environment": { - "kernel": "python3", - "name": "common-cpu.m104", - "type": "gcloud", - "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/DocAI Incubator Tools/best-practices/Document_AI_Parser_Result_Merger.ipynb b/DocAI Incubator Tools/best-practices/Document_AI_Parser_Result_Merger.ipynb deleted file mode 100644 index 01f0d5646..000000000 --- a/DocAI Incubator Tools/best-practices/Document_AI_Parser_Result_Merger.ipynb +++ /dev/null @@ -1,686 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "707aac26-5f83-4dfe-a9c3-73f9eb34dea4", - "metadata": {}, - "source": [ - "# Document AI Parser Result Merger" - ] - }, - { - "cell_type": "markdown", - "id": "fff479c9-a573-48c7-bd26-d9c13cf5be7f", - "metadata": {}, - "source": [ - "* Author: docai-incubator@google.com" - ] - }, - { - "cell_type": "markdown", - "id": "ff5efc16-e389-48af-90d6-5d99019a1059", - "metadata": {}, - "source": [ - "## Disclaimer\n", - "\n", - "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. 
No guarantees of performance are implied.\t" - ] - }, - { - "cell_type": "markdown", - "id": "47eed16a-e66d-4c76-860f-16f08ae45867", - "metadata": {}, - "source": [ - "## Objective\n", - "Document AI Parser Result Merger is a tool built using Python programming language. Its purpose is to address the issue of merging the two or more resultant json files of Document AI processors. This document highlights the working of the tool(script) and its requirements. The documents usually contain multiple pages. There are 2 use cases by which this solution can be operated. \n", - "### Case 1: Different documents, parser results json merger (Default).\n", - " * Case 1 deals when we are using two or multiple parser output Jsons are from different documents\n", - " * To Enable this case the flag should be ‘1’\n", - "### Case 2: Same document, different parsers json merger(Added functionality).\n", - " * Case 2 deals when we are using two or multiple parser outputs from the same document.\n", - " * To Enable this case the flag should be ‘2’" - ] - }, - { - "cell_type": "markdown", - "id": "44ef6ea3-79e0-42a0-a2bb-84ece51bff74", - "metadata": {}, - "source": [ - "## Prerequisites" - ] - }, - { - "cell_type": "markdown", - "id": "b64bc02d-ec9b-42b6-b3b0-ed70fa6c3808", - "metadata": {}, - "source": [ - "This tool requires the following services:\n", - "\n", - " * Google Jupyter Notebook or Colab.\n", - " * Google Cloud Storage \n", - " * DocumentAI processor and JSON files\n", - " \n", - "Google Jupyter Notebook or Colab is used for running the python notebook file. Cloud Storage Buckets have the input files to this script. The multiple input files are the json files which are the result of a Document AI processor (for eg., Bank Statement Parser). These json files include multiple pages in its document. After the script executes, the output file is a single merged json file stored in the output bucket path." - ] - }, - { - "cell_type": "markdown", - "id": "e4b4964c-d228-4aad-a6fb-346465791fe7", - "metadata": {}, - "source": [ - "## Workflow overview\n", - " " - ] - }, - { - "cell_type": "markdown", - "id": "60a6f212-d41a-4ae7-b374-1c97ffb03931", - "metadata": {}, - "source": [ - "![](https://screenshot.googleplex.com/9F5qLEtZJ4Kdj8m.png)\n", - "\n", - "The above diagram shows the flow diagram of the tool. As highlighted there are input and output GCP buckets and there is a python script which processes the request. The input bucket holds the multiple json files which need to be merged into a single file and this is achieved by the python script. This script accepts the input json files and prompts users to switch between the default case-1 or the case-2 mode as highlighted in the previous sections. Finally there is an output GCP bucket to store the single merged file. " - ] - }, - { - "cell_type": "markdown", - "id": "3155aca4-1aeb-4a22-a0b7-a3e9b43e69c0", - "metadata": {}, - "source": [ - "## Script walkthrough\n", - "Insights and details about the script are explained in detail as follows.\n", - "1. Config file Creation\n", - " Run the below code and create a config.ini file for providing input." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "17bdf742-c286-4735-bae5-91eb5c1a1ab0", - "metadata": {}, - "outputs": [], - "source": [ - "import configparser\n", - "\n", - "config = configparser.ConfigParser()\n", - "# Add the structure to the file we will create\n", - "config.add_section(\"Parameters\")\n", - "config.set(\"Parameters\", \"project_id\", \"xxxx-xxxx-xxxx\")\n", - "config.set(\"Parameters\", \"Input_Multiple_jsons_URI\", \"gs://\")\n", - "config.set(\"Parameters\", \"Output_Multiple_jsons_URI\", \"gs://\")\n", - "config.set(\"Parameters\", \"Name_output_Json\", \"merged_json\")\n", - "config.set(\"Parameters\", \"merger_type_flag(1-for different docs,2-same doc)\",\n", - " \"1\")\n", - "\n", - "# Write the new structure to the new file\n", - "with open(r\"configfile.ini\", \"w\") as configfile:\n", - " config.write(configfile)" - ] - }, - { - "cell_type": "markdown", - "id": "758afb38-cede-4042-b9e1-9c847eef818f", - "metadata": {}, - "source": [ - "2. Input Details : Entering Project details in Config files:" - ] - }, - { - "cell_type": "markdown", - "id": "444df062-df9e-48c0-9f96-fe2789f3e1f0", - "metadata": {}, - "source": [ - "![](https://screenshot.googleplex.com/7DTVnTRHPQUgLBG.png)\n", - "\n", - "\n", - "Once the config.ini file is created with the above step 1 , open the config.ini file and enter the input details specific to your project and GCP bucket paths. As shown in the diagram above, the following parameters are to be entered.\n", - " * **project_id:** provide your GCP project ID\n", - " * **input_multiple_jsons_uri:** provide the uri link of folder containing the input files\n", - " * **output_multiple_jsons_uri:** provide the folder name of the output file which gets generated post execution of the script.\n", - " * **Name_output_json:** enter a name for the generated file which is saved in the output bucket.\n", - " * **merger_type_flag(1-for different docs,2-same doc) :** based on user need, values 1 or 2 can be provided as mentioned in the earlier part of this document.\n", - "\n", - " - Case 1 deals when we are using two or multiple parser output Jsons are from different documents\n", - "\n", - " - Case 2 deals when we are using two or multiple parser outputs from the same document.\n" - ] - }, - { - "cell_type": "markdown", - "id": "1dba5180-f972-4d5e-9802-974885efe2d4", - "metadata": {}, - "source": [ - "## 3. 
Run the below code.\n", - "\n", - "Use the below code and Run all the cells (Update the Path parameter if it is not available in the current working directory)\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4ffdf37c-332f-4ba5-a55f-349d2cccb432", - "metadata": {}, - "outputs": [], - "source": [ - "import configparser\n", - "import gc\n", - "import json\n", - "# importing Libraries\n", - "import os\n", - "import re\n", - "import urllib.request\n", - "from datetime import datetime\n", - "from pprint import pprint\n", - "\n", - "import gcsfs\n", - "import pandas as pd\n", - "from google.api_core.client_options import ClientOptions\n", - "from google.api_core.exceptions import InternalServerError, RetryError\n", - "from google.cloud import storage\n", - "\n", - "# Getting Input from config file\n", - "Path = \"configfile.ini\" # Enter the path of config file\n", - "\n", - "config = configparser.ConfigParser()\n", - "config.read(Path)\n", - "project_id = config.get(\"Parameters\", \"project_id\")\n", - "input_multiple_jsons_uri = config.get(\"Parameters\", \"input_multiple_jsons_uri\")\n", - "JSON_DIRECTORY_PATH_OUTPUT = config.get(\"Parameters\",\n", - " \"Output_Multiple_jsons_URI\")\n", - "output_file_name = config.get(\"Parameters\", \"Name_output_Json\")\n", - "merger_type_flag = config.get(\n", - " \"Parameters\", \"merger_type_flag(1-for different docs,2-same doc)\")\n", - "\n", - "\n", - "# Functions\n", - "### CASE - 1\n", - "def merger(doc_first, doc_second):\n", - " doc_merged = {}\n", - "\n", - " ### Entities ###\n", - " for entity in doc_second[\"entities\"]:\n", - " try:\n", - " # print(\"\\n\\n+++++++++ PAGE ANCHOR +++++++++\")\n", - " for x in range(0, len(entity[\"pageAnchor\"][\"pageRefs\"])):\n", - " try:\n", - " entity[\"pageAnchor\"][\"pageRefs\"][x][\"page\"] = str(\n", - " int(entity[\"pageAnchor\"][\"pageRefs\"][x][\"page\"]) +\n", - " len(doc_first[\"pages\"]))\n", - " except:\n", - " entity[\"pageAnchor\"][\"pageRefs\"][x][\"page\"] = str(\n", - " len(doc_first[\"pages\"]))\n", - "\n", - " except:\n", - " pass\n", - "\n", - " # print(\"--- Properties ---\")\n", - "\n", - " try:\n", - " for x in range(0, len(entity[\"properties\"])):\n", - " for xx in range(\n", - " 0,\n", - " len(entity[\"properties\"][x][\"pageAnchor\"]\n", - " [\"pageRefs\"])):\n", - " try:\n", - " entity[\"properties\"][x][\"pageAnchor\"][\"pageRefs\"][xx][\n", - " \"page\"] = int(entity[\"properties\"][x][\"pageAnchor\"]\n", - " [\"pageRefs\"][xx][\"page\"]) + len(\n", - " doc_first[\"pages\"])\n", - " except:\n", - " entity[\"properties\"][x][\"pageAnchor\"][\"pageRefs\"][xx][\n", - " \"page\"] = len(doc_first[\"pages\"])\n", - " except:\n", - " pass\n", - "\n", - " # print(\"+++++++++ TEXT ANCHOR +++++++++\")\n", - "\n", - " try:\n", - " textAnchor = entity[\"textAnchor\"]\n", - " for y in range(0, len(entity[\"textAnchor\"][\"textSegments\"])):\n", - " entity[\"textAnchor\"][\"textSegments\"][y][\"endIndex\"] = int(\n", - " entity[\"textAnchor\"][\"textSegments\"][y][\"endIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " try:\n", - " entity[\"textAnchor\"][\"textSegments\"][y][\n", - " \"startIndex\"] = int(\n", - " entity[\"textAnchor\"][\"textSegments\"][y]\n", - " [\"startIndex\"]) + len(doc_first[\"text\"])\n", - " except: # if startIndex is absent\n", - " entity[\"textAnchor\"][\"textSegments\"][y][\n", - " \"startIndex\"] = len(doc_first[\"text\"])\n", - " except:\n", - " pass\n", - "\n", - " # print(\"--- Properties ---\")\n", - "\n", - " try:\n", - " for y 
in range(0, len(entity[\"properties\"])):\n", - " for yy in range(\n", - " 0,\n", - " len(entity[\"properties\"][y][\"textAnchor\"]\n", - " [\"textSegments\"])):\n", - " entity[\"properties\"][y][\"textAnchor\"][\"textSegments\"][yy][\n", - " \"endIndex\"] = int(\n", - " entity[\"properties\"][y][\"textAnchor\"]\n", - " [\"textSegments\"][yy][\"endIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " try:\n", - " entity[\"properties\"][y][\"textAnchor\"][\"textSegments\"][\n", - " yy][\"startIndex\"] = int(\n", - " entity[\"properties\"][y][\"textAnchor\"]\n", - " [\"textSegments\"][yy][\"startIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " except: # if startIndex is absent\n", - " entity[\"properties\"][y][\"textAnchor\"][\"textSegments\"][\n", - " yy][\"startIndex\"] = len(doc_first[\"text\"])\n", - " except:\n", - " pass\n", - "\n", - " doc_merged[\"entities\"] = doc_first[\"entities\"] + doc_second[\"entities\"]\n", - "\n", - " ### Page\n", - " ### Page No increment in second doc\n", - " for pg in doc_second[\"pages\"]:\n", - " print(pg[\"pageNumber\"])\n", - " pg[\"pageNumber\"] = int(pg[\"pageNumber\"]) + len(doc_first[\"pages\"])\n", - " print(\"\\t\", pg[\"pageNumber\"])\n", - "\n", - " ### Page\n", - " ### page . blocks . layout . textanchor . textsegment\n", - " for pg in range(0, len(doc_second[\"pages\"])):\n", - " for pg_ in range(0, len(doc_second[\"pages\"][pg][\"blocks\"])):\n", - " for pg_textSegment in range(\n", - " 0,\n", - " len(doc_second[\"pages\"][pg][\"blocks\"][pg_][\"layout\"]\n", - " [\"textAnchor\"][\"textSegments\"]),\n", - " ):\n", - " doc_second[\"pages\"][pg][\"blocks\"][pg_][\"layout\"][\"textAnchor\"][\n", - " \"textSegments\"][pg_textSegment][\"endIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"blocks\"][pg_][\"layout\"]\n", - " [\"textAnchor\"][\"textSegments\"][pg_textSegment]\n", - " [\"endIndex\"]) + len(doc_first[\"text\"])\n", - " try:\n", - " doc_second[\"pages\"][pg][\"blocks\"][pg_][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"startIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"blocks\"][pg_]\n", - " [\"layout\"][\"textAnchor\"][\"textSegments\"]\n", - " [pg_textSegment][\"startIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " except:\n", - " doc_second[\"pages\"][pg][\"blocks\"][pg_][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"startIndex\"] = len(doc_first[\"text\"])\n", - "\n", - " ### page . layout . textanchor . 
textsegment\n", - " for pg in range(0, len(doc_second[\"pages\"])):\n", - " # print(\"----\")\n", - " # print(doc_second['pages'][pg]['layout']['textAnchor']['textSegments'])\n", - " for pg_textSegment in range(\n", - " 0,\n", - " len(doc_second[\"pages\"][pg][\"layout\"][\"textAnchor\"]\n", - " [\"textSegments\"])):\n", - " doc_second[\"pages\"][pg][\"layout\"][\"textAnchor\"][\"textSegments\"][\n", - " pg_textSegment][\"endIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"layout\"][\"textAnchor\"]\n", - " [\"textSegments\"][pg_textSegment][\"endIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " try:\n", - " doc_second[\"pages\"][pg][\"layout\"][\"textAnchor\"][\n", - " \"textSegments\"][pg_textSegment][\"startIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"layout\"][\"textAnchor\"]\n", - " [\"textSegments\"][pg_textSegment][\"startIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " except:\n", - " doc_second[\"pages\"][pg][\"layout\"][\"textAnchor\"][\n", - " \"textSegments\"][pg_textSegment][\"startIndex\"] = len(\n", - " doc_first[\"text\"])\n", - "\n", - " ### page . lines . layout . textanchor . textsegment\n", - " for pg in range(0, len(doc_second[\"pages\"])):\n", - " for pg_line in range(0, len(doc_second[\"pages\"][pg][\"lines\"])):\n", - " for pg_textSegment in range(\n", - " 0,\n", - " len(doc_second[\"pages\"][pg][\"lines\"][pg_line][\"layout\"]\n", - " [\"textAnchor\"][\"textSegments\"]),\n", - " ):\n", - " doc_second[\"pages\"][pg][\"lines\"][pg_line][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"endIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"lines\"][pg_line][\"layout\"]\n", - " [\"textAnchor\"][\"textSegments\"][pg_textSegment]\n", - " [\"endIndex\"]) + len(doc_first[\"text\"])\n", - " try:\n", - " doc_second[\"pages\"][pg][\"lines\"][pg_line][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"startIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"lines\"][pg_line]\n", - " [\"layout\"][\"textAnchor\"][\"textSegments\"]\n", - " [pg_textSegment][\"startIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " except:\n", - " doc_second[\"pages\"][pg][\"lines\"][pg_line][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"startIndex\"] = len(doc_first[\"text\"])\n", - "\n", - " ### page . paragraph . layout . textanchor . 
textsegment\n", - " for pg in range(0, len(doc_second[\"pages\"])):\n", - " for pg_paragraph in range(0,\n", - " len(doc_second[\"pages\"][pg][\"paragraphs\"])):\n", - " for pg_textSegment in range(\n", - " 0,\n", - " len(doc_second[\"pages\"][pg][\"paragraphs\"][pg_paragraph]\n", - " [\"layout\"][\"textAnchor\"][\"textSegments\"]),\n", - " ):\n", - " doc_second[\"pages\"][pg][\"paragraphs\"][pg_paragraph][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"endIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"paragraphs\"][pg_paragraph]\n", - " [\"layout\"][\"textAnchor\"][\"textSegments\"]\n", - " [pg_textSegment][\"endIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " try:\n", - " doc_second[\"pages\"][pg][\"paragraphs\"][pg_paragraph][\n", - " \"layout\"][\"textAnchor\"][\"textSegments\"][\n", - " pg_textSegment][\"startIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"paragraphs\"]\n", - " [pg_paragraph][\"layout\"][\"textAnchor\"]\n", - " [\"textSegments\"][pg_textSegment]\n", - " [\"startIndex\"]) + len(doc_first[\"text\"])\n", - " except:\n", - " doc_second[\"pages\"][pg][\"paragraphs\"][pg_paragraph][\n", - " \"layout\"][\"textAnchor\"][\"textSegments\"][\n", - " pg_textSegment][\"startIndex\"] = len(\n", - " doc_first[\"text\"])\n", - "\n", - " ### page . tokens . layout . textanchor . textsegment\n", - " for pg in range(0, len(doc_second[\"pages\"])):\n", - " for pg_token in range(0, len(doc_second[\"pages\"][pg][\"tokens\"])):\n", - " for pg_textSegment in range(\n", - " 0,\n", - " len(doc_second[\"pages\"][pg][\"tokens\"][pg_token][\"layout\"]\n", - " [\"textAnchor\"][\"textSegments\"]),\n", - " ):\n", - " doc_second[\"pages\"][pg][\"tokens\"][pg_token][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"endIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"tokens\"][pg_token]\n", - " [\"layout\"][\"textAnchor\"][\"textSegments\"]\n", - " [pg_textSegment][\"endIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " try:\n", - " doc_second[\"pages\"][pg][\"tokens\"][pg_token][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"startIndex\"] = int(\n", - " doc_second[\"pages\"][pg][\"tokens\"][pg_token]\n", - " [\"layout\"][\"textAnchor\"][\"textSegments\"]\n", - " [pg_textSegment][\"startIndex\"]) + len(\n", - " doc_first[\"text\"])\n", - " except:\n", - " doc_second[\"pages\"][pg][\"tokens\"][pg_token][\"layout\"][\n", - " \"textAnchor\"][\"textSegments\"][pg_textSegment][\n", - " \"startIndex\"] = len(doc_first[\"text\"])\n", - "\n", - " doc_merged[\"pages\"] = doc_first[\"pages\"] + doc_second[\"pages\"]\n", - "\n", - " ### Text\n", - " doc_merged[\"text\"] = doc_first[\"text\"] + doc_second[\"text\"]\n", - "\n", - " ### shardInfo & uri\n", - " if \"shardInfo\" in doc_first:\n", - " doc_merged[\"shardInfo\"] = doc_first[\"shardInfo\"]\n", - " if \"uri\" in doc_first:\n", - " doc_merged[\"uri\"] = doc_first[\"uri\"]\n", - "\n", - " return doc_merged\n", - "\n", - "\n", - "### CASE -2\n", - "def SameDocDiffParser_merger(doc_first, doc_second):\n", - " doc_first[\"entities\"] = doc_first[\"entities\"] + doc_second[\"entities\"]\n", - " doc_merged = doc_first\n", - " return doc_merged\n", - "\n", - "\n", - "def file_names(file_path):\n", - " \"\"\"This Function will load the bucket and get the list of files\n", - " in the gs path given\n", - " args: gs path\n", - " output: file names as list and dictionary with file names as keys and file path as values\n", - " \"\"\"\n", - " bucket = 
file_path.split(\"/\")[2]\n", - " file_names_list = []\n", - " file_dict = {}\n", - " storage_client = storage.Client()\n", - " source_bucket = storage_client.get_bucket(bucket)\n", - " filenames = [\n", - " filename.name for filename in list(\n", - " source_bucket.list_blobs(\n", - " prefix=((\"/\").join(file_path.split(\"/\")[3:]))))\n", - " ]\n", - " for i in range(len(filenames)):\n", - " x = filenames[i].split(\"/\")[-1]\n", - " if x != \"\":\n", - " file_names_list.append(x)\n", - " file_dict[x] = filenames[i]\n", - " return file_names_list, file_dict\n", - "\n", - "\n", - "file_names_list, file_dict = file_names(input_multiple_jsons_uri)\n", - "\n", - "input_bucket_files = []\n", - "for fldrFile in file_names_list:\n", - " if fldrFile.endswith(\".json\"):\n", - " print(fldrFile)\n", - " input_bucket_files.append(fldrFile)\n", - "print(input_bucket_files)\n", - "\n", - "storage_client = storage.Client()\n", - "source_bucket = storage_client.get_bucket(\n", - " input_multiple_jsons_uri.split(\"/\")[2])\n", - "\n", - "fs = gcsfs.GCSFileSystem(project=project_id)\n", - "\n", - "### CASE - 1\n", - "if merger_type_flag == \"1\":\n", - " print(\">>> \\t Using Default Merger... \")\n", - "\n", - " if len(input_bucket_files) == 2: # For 2 docs\n", - " print(\"2 files...\")\n", - " print(input_bucket_files[0])\n", - " print(input_bucket_files[1])\n", - " doc_first = json.loads(\n", - " source_bucket.blob(file_dict[\n", - " input_bucket_files[0]]).download_as_string().decode(\"utf-8\"))\n", - " doc_second = json.loads(\n", - " source_bucket.blob(file_dict[\n", - " input_bucket_files[1]]).download_as_string().decode(\"utf-8\"))\n", - " x = merger(doc_first, doc_second)\n", - "\n", - " else: # For 2+ docs\n", - " print(\"more than 2 files....\")\n", - " print(input_bucket_files[0])\n", - " print(input_bucket_files[1])\n", - " doc_first = json.loads(\n", - " source_bucket.blob(file_dict[\n", - " input_bucket_files[0]]).download_as_string().decode(\"utf-8\"))\n", - " doc_second = json.loads(\n", - " source_bucket.blob(file_dict[\n", - " input_bucket_files[1]]).download_as_string().decode(\"utf-8\"))\n", - " x = merger(doc_first, doc_second)\n", - "\n", - " print(\"---------- 2+ Files ... -----------\")\n", - " for file in input_bucket_files[2:]: # skip first 2 files\n", - " print(file)\n", - " # doc_first = json.loads(bucket.blob(doc_merged).download_as_string().decode('utf-8'))\n", - " doc_second = json.loads(\n", - " source_bucket.blob(\n", - " file_dict[file]).download_as_string().decode(\"utf-8\"))\n", - " x = merger(x, doc_second)\n", - "\n", - "### CASE - 2\n", - "elif merger_type_flag == \"2\":\n", - " print(\">>> \\t Using Different Processor Result jsons merger... 
\")\n", - "\n", - " if len(input_bucket_files) == 2: # For 2 docs\n", - " print(\"2 files...\")\n", - " print(input_bucket_files[0])\n", - " print(input_bucket_files[1])\n", - " doc_first = json.loads(\n", - " source_bucket.blob(file_dict[\n", - " input_bucket_files[0]]).download_as_string().decode(\"utf-8\"))\n", - " doc_second = json.loads(\n", - " source_bucket.blob(file_dict[\n", - " input_bucket_files[1]]).download_as_string().decode(\"utf-8\"))\n", - " x = SameDocDiffParser_merger(doc_first, doc_second)\n", - "\n", - " else: # For 2+ docs\n", - " print(\"more than 2 files....\")\n", - " print(input_bucket_files[0])\n", - " print(input_bucket_files[1])\n", - " doc_first = json.loads(\n", - " source_bucket.blob(file_dict[\n", - " input_bucket_files[0]]).download_as_string().decode(\"utf-8\"))\n", - " doc_second = json.loads(\n", - " source_bucket.blob(file_dict[\n", - " input_bucket_files[1]]).download_as_string().decode(\"utf-8\"))\n", - " x = SameDocDiffParser_merger(doc_first, doc_second)\n", - "\n", - " print(\"---------- 2+ Files ... -----------\")\n", - " for file in input_bucket_files[2:]: # skip first 2 files\n", - " print(file)\n", - " # doc_first = json.loads(bucket.blob(doc_merged).download_as_string().decode('utf-8'))\n", - " doc_second = json.loads(\n", - " source_bucket.blob(\n", - " file_dict[file]).download_as_string().decode(\"utf-8\"))\n", - " x = SameDocDiffParser_merger(x, doc_second)\n", - "\n", - "else:\n", - " print(\"invalid input\")\n", - "\n", - "print(\"deleting ID under Entities\")\n", - "for z in x[\"entities\"]:\n", - " try:\n", - " print(z[\"id\"])\n", - " del z[\"id\"]\n", - " except:\n", - " pass\n", - "\n", - "print(\"deleting ID under Entities - properties\")\n", - "\n", - "for z in x[\"entities\"]:\n", - " # print(z)\n", - " try:\n", - " for a in z[\"properties\"]:\n", - " print(a[\"id\"])\n", - " del a[\"id\"]\n", - " except:\n", - " pass\n", - "merged_json_path = JSON_DIRECTORY_PATH_OUTPUT + \"/\" + output_file_name\n", - "fs.pipe(merged_json_path,\n", - " bytes(json.dumps(x), \"utf-8\"),\n", - " content_type=\"application/json\")\n", - "gc.collect()" - ] - }, - { - "cell_type": "markdown", - "id": "051b1c28-c998-4788-92e6-e9b50c671043", - "metadata": {}, - "source": [ - "## 4. Output \n", - "\n", - "The output of the tool is a **single json file**. Let's examine the outputs for each of the case types. We’ll consider 3 json docs for our experiment and examine the output formats.\n", - "\n", - "Consider following 3 input json files residing the input GCS Bucket: \n", - "\n", - "json_doc_merge / 0 / doc-0.json\n", - "json_doc_merge / 1 / doc-1.json\n", - "json_doc_merge / 2 / doc-2.json\n", - "\n", - "Upon running the script for both the cases, the below output details are observed as follows.\n", - "\n", - "### CASE - 1 Output \n", - "Let's suppose the three json files are from different documents (The parser used may be same or different )\n", - "In Case - 1, we observe in the output that the Pages and Entities count increases with the number of pages and entities present in the input files upon merging. The same applies for the and Text, the value is changed and texts are concatenated and stored as a single value for the Text key of the output file. 
" - ] - }, - { - "cell_type": "markdown", - "id": "f4288801-7881-4c5f-9ead-c7a55f298120", - "metadata": {}, - "source": [ - "| Input json files | Screenshot highlighting the number of entities and number of pages in each of the input json files | The output single merged json file |\n", - "|:----------------:|----------------------------------------------------------------------------------------------------|------------------------------------------------------------|\n", - "| **doc-0.json** | ![](https://screenshot.googleplex.com/7Cn7bf5HKA62omx.png) | ![](https://screenshot.googleplex.com/7zWP7zPZkLeZSra.png) |\n", - "| **doc-1.json** | ![](https://screenshot.googleplex.com/BMGMEcW3EFxWrRc.png) | |\n", - "| **doc-2.json** | ![](https://screenshot.googleplex.com/3wCEqP9i3Bm9dqB.png) | |\n", - "\n", - "**For example :** each json has 2 pages and 21 entities , the final output merged json has 6 pages and 63 entities." - ] - }, - { - "cell_type": "markdown", - "id": "f80cd3e7-ee35-4003-ab4a-094a7a935f16", - "metadata": { - "tags": [] - }, - "source": [ - "### CASE - 2 Output \n", - "\n", - "Let's suppose the three json files are from the single document and from different parser results.\n", - "\n", - "In Case - 2, we observe the pages count remains the same and there is only an increase in the count of Entities upon merging the multiple input json files. \n" - ] - }, - { - "cell_type": "markdown", - "id": "1b055a42-20dd-4b34-b624-89218224e7ea", - "metadata": {}, - "source": [ - "| Input json files | Screenshot highlighting the number of entities and number of pages in each of the input json files | The output single merged json file |\n", - "|:----------------:|----------------------------------------------------------------------------------------------------|------------------------------------------------------------|\n", - "| **doc-0.json** | ![](https://screenshot.googleplex.com/ZofmvdULKVFvZ9w.png) | ![](https://screenshot.googleplex.com/Bx2WNCxdcv3pN8p.png) |\n", - "| **doc-1.json** | ![](https://screenshot.googleplex.com/6fgDDEEtRaxNJ2N.png) | |\n", - "| **doc-2.json** | ![](https://screenshot.googleplex.com/BwYcWwMuT6byLTm.png) | |\n", - "\n", - "**For example :** each json has 2 pages and 21 entities , the final output merged json has 2 pages and 63 entities.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "37c85adc-ff16-494d-8e42-9b8da336778b", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "environment": { - "kernel": "python3", - "name": "common-cpu.m104", - "type": "gcloud", - "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/DocAI Incubator Tools/best-practices/HITL_Rejected_Documents_Tracking.ipynb b/DocAI Incubator Tools/best-practices/HITL Rejected Documents Tracking/HITL Rejected Documents Tracking.ipynb similarity index 99% rename from DocAI Incubator Tools/best-practices/HITL_Rejected_Documents_Tracking.ipynb rename to DocAI Incubator Tools/best-practices/HITL Rejected Documents Tracking/HITL Rejected Documents Tracking.ipynb index 4b2263dc8..fe7d22062 100644 --- a/DocAI Incubator 
Tools/best-practices/HITL_Rejected_Documents_Tracking.ipynb +++ b/DocAI Incubator Tools/best-practices/HITL Rejected Documents Tracking/HITL Rejected Documents Tracking.ipynb @@ -60,7 +60,9 @@ "cell_type": "code", "execution_count": null, "id": "a66c8067-7657-4873-99c0-f215d2b1fcea", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [], "source": [ "!pip install google-cloud-documentai" diff --git a/DocAI Incubator Tools/best-practices/HITL Rejected Documents Tracking/readme.md b/DocAI Incubator Tools/best-practices/HITL Rejected Documents Tracking/readme.md new file mode 100644 index 000000000..75c32d7fd --- /dev/null +++ b/DocAI Incubator Tools/best-practices/HITL Rejected Documents Tracking/readme.md @@ -0,0 +1,19 @@ +## Overview + +This Script is to oversee documents that the HITL system has declined. By accepting a set of Long Running Operation (LRO) IDs, it offers a glimpse into the documents that HITL has rejected based on those LROs. In addition to this, the Script also modifies and relocates processed JSON data to a predetermined GCS directory, now inclusive of a 'HITL_Status' entity. + +## Input Guidelines +**LRO List:** This is essentially a list of IDs that are generated once batch processing is completed. + +**GCS HITL Rejected Path:** This is the designated directory within GCS where you'd want your JSON data stored. Ensure that this path concludes with a forward slash ('/'). + +**Project ID:** This refers to the unique identifier of your project. + +**Location:** This is the geographical region or locale where the processing takes place, such as 'us' or 'eu'. + +## Output Details +Post-execution, the utility delivers outcomes in two distinct formats: + +**CSV Format:** A file named HITL_Status_Update.csv will be generated. This file will contain details like the file names, their HITL status, and, if applicable, reasons for their rejection by HITL. + +**JSON Format:** The processed JSON will now have an added entity, 'HITL_Status', and will be relocated to the previously specified GCS directory. \ No newline at end of file diff --git a/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/Identifying_Poor_Performing_Docs.ipynb b/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/Identifying_Poor_Performing_Docs.ipynb new file mode 100644 index 000000000..a2f5496e2 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/Identifying_Poor_Performing_Docs.ipynb @@ -0,0 +1,666 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Identifying Poor Performing Documents\n", + "\n", + "- Author: docai-incubator@google.com\n", + "\n", + "## Purpose and Description\n", + "\n", + "The goal is to automate identifying poorly performing documents for uptraining. The metric of \n", + "poorly performing documents by the number of missed critical fields. The script will work based on the following conditions :\n", + "\n", + "### 1. Input provided to the script\n", + "\n", + " a. Input bucket of labeled documents\n", + " \n", + " b. Output bucket for poorly performing documents\n", + " \n", + " c. Project and processor ID (and version) to call the specified processor\n", + " \n", + " c. List of critical fields. Script should validate before running that critical field names match schema names. If it does not match then the script should throw an error and request you to update input for critical fields to match schema.\n", + " \n", + " d. 
+
+## Output Details
+After execution, the utility produces two outputs:
+
+**CSV Format:** A file named HITL_Status_Update.csv is generated. It contains the file names, their HITL status, and, where applicable, the reasons HITL rejected them.
+
+**JSON Format:** Each processed JSON gains a 'HITL_Status' entity and is relocated to the GCS directory specified above.
\ No newline at end of file
diff --git a/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/Identifying_Poor_Performing_Docs.ipynb b/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/Identifying_Poor_Performing_Docs.ipynb
new file mode 100644
index 000000000..a2f5496e2
--- /dev/null
+++ b/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/Identifying_Poor_Performing_Docs.ipynb
@@ -0,0 +1,666 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Identifying Poor Performing Documents\n",
+    "\n",
+    "- Author: docai-incubator@google.com\n",
+    "\n",
+    "## Purpose and Description\n",
+    "\n",
+    "The goal is to automate the identification of poorly performing documents for uptraining. Poor performance is measured by the number of missed critical fields. The script works according to the following conditions:\n",
+    "\n",
+    "### 1. Input provided to the script\n",
+    "\n",
+    "   a. Input bucket of labeled documents\n",
+    "   \n",
+    "   b. Output bucket for poorly performing documents\n",
+    "   \n",
+    "   c. Project and processor ID (and version) used to call the specified processor\n",
+    "   \n",
+    "   d. List of critical fields. Before running, the script validates that the critical field names match the schema names. If they do not match, the script throws an error and asks you to update the critical fields input so that it matches the schema.\n",
+    "   \n",
+    "   e. Threshold above which a document is sent to the output bucket for poorly performing documents\n",
+    "   \n",
+    "### 2. Numerical substring matching condition\n",
+    "   \n",
+    "   a. The script runs documents through the specified processor and identifies poorly performing docs by comparing the critical fields of each document against the Ground Truth (GT).\n",
+    "   \n",
+    "   b. Numerical substring matching is optional and can be set per entity. If it is enabled, a prediction is not counted as a miss as long as the numerical substring is correct. For example, if the ground truth is “ID 5123” and the model predicts “5123”, it is not counted as a miss, because the prediction contains all the correct numerical digits.\n",
+    "   \n",
+    "### 3. Threshold logic to move poor performance documents to bucket\n",
+    "   \n",
+    "   a. The script outputs the worst performing documents according to a custom threshold. For example, documents that got more than 50% of critical fields wrong are included in the output. The threshold also accepts an integer count, e.g. any document with more than 5 missed critical fields is sent to the output bucket.\n",
+    "   \n",
+    "### 4. Output summary and stats file\n",
+    "   \n",
+    "   a. Outputs a list of misses by critical field in Sheets/CSV for each document that is sent to the output bucket.\n",
+    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Example output CSV of missed critical fields:\n",
+    "\n",
+    "| Document Names | Invoice_1 | Invoice_2 |\n",
+    "| --- | --- | --- |\n",
+    "| # of missed Invoice_ID | 2 | 0 |\n",
+    "| # of missed Address | 1 | 1 |\n",
+    "| # of missed Taxes | 1 | 3 |\n",
+    "\n",
+    "Example input critical fields: \n",
+    "| Critical Fields | Numerical substring matching |\n",
+    "| --- | --- | \n",
+    "| Invoice_ID | Yes | \n",
+    "| Address | No | "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "1. Access to a Google Cloud project to create Document AI processors.\n",
+    "   - Permission on the Google project is needed to access Document AI processors.\n",
+    "2. Python: Jupyter notebook (Vertex AI) or Google Colab.\n",
+    "3. Critical fields list in a CSV file\n",
+    "4. Ground truth JSON files in GCS folders"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### NOTE ON INPUT DETAILS\n",
+    "\n",
+    "### The possible values for pctg_or_count_flag are:\n",
+    "\n",
+    "#### pctg_or_count_flag = count\n",
+    "#### pctg_or_count_flag = pctg\n",
+    "\n",
+    "1. If pctg_or_count_flag = count, a value for threshold_count must be provided and threshold_pctg should be 0. If the error count is greater than threshold_count, the predicted document is moved to the poor performance storage path.\n",
+    "\n",
+    "2. If pctg_or_count_flag = pctg, a value for threshold_pctg must be provided and threshold_count should be 0. If the error percentage is greater than threshold_pctg, the predicted document is moved to the poor performance storage path.\n",
+    "\n",
+    "### Critical Fields File\n",
+    "This file contains the list of required document entities along with a Yes/No flag per entity. The flag determines whether numerical substring matching is applied to that entity.\n"
+   ]
+  },
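+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For reference, a minimal `CriticalFields.csv` (no header row, as expected by the parsing logic in the `main()` function below) might look like the sample here. The entity names are placeholders and must match the processor schema; the second column is the Yes/No numerical substring matching flag.\n",
+    "\n",
+    "```\n",
+    "Invoice_ID,Yes\n",
+    "Address,No\n",
+    "Taxes,No\n",
+    "```"
+   ]
+  },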
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tool Operation Procedure\n",
+    "\n",
+    "### 1. Install required libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# operator, difflib, json and ast ship with the Python standard library\n",
+    "# and do not need to be (and cannot be) installed with pip.\n",
+    "%pip install google-cloud-documentai\n",
+    "%pip install google-cloud-storage\n",
+    "%pip install google-api-core\n",
+    "%pip install pandas\n",
+    "%pip install numpy\n",
+    "%pip install gcsfs\n",
+    "%pip install PyPDF2\n",
+    "%pip install Pillow"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 2. Import Packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# importing libraries\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import operator\n",
+    "import difflib\n",
+    "import json\n",
+    "import os\n",
+    "import sys  # used by main() to abort when a critical field is missing from the schema\n",
+    "import time\n",
+    "import gcsfs\n",
+    "from google.cloud import storage\n",
+    "from google.cloud import documentai_v1beta3 as documentai\n",
+    "from PIL import Image\n",
+    "from typing import Container, Iterable, Iterator, List, Mapping, Optional, Sequence, Tuple, Union\n",
+    "from PyPDF2 import PdfFileReader\n",
+    "import ast\n",
+    "import io\n",
+    "import re\n",
+    "import datetime\n",
+    "import utilities  # download utilities.py from the best-practices/Utilities folder and keep it next to this notebook\n",
+    "import warnings\n",
+    "warnings.filterwarnings('ignore')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 3. Input Details"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "## INPUT DETAILS\n",
+    "processor_ID='7fbb1ccb4dff7b3c' # processor ID based on which documents performance has to be checked\n",
+    "project_number='514064100333' # GCP Project number\n",
+    "processor_versionID='pretrained-invoice-v1.3-2022-07-15' # Processor version ID to use for testing\n",
+    "location='us' # location of processor created\n",
+    "project_id='rand-automl-project' # GCP project ID \n",
+    "GT_Output_URI = 'gs://scb_line_item_exp/SCB_Samples/groundtruth/' #GCS Bucket where the ground truth files are saved\n",
+    "output_folder_path_name='poor_performance_doc/' # Name of the folder which has to be created and where poor performance docs have to be moved\n",
+    "pctg_or_count_FLAG ='count' # criteria to decide the poor performance documents\n",
+    "threshold_count=1 # Threshold count\n",
+    "threshold_pctg=0 # Threshold percentage\n",
+    "critical_fields_csv = 'CriticalFields.csv' # path to csv file containing the list of critical fields"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### 4. 
Run the functions\n" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "def get_poor_perfoming_docs(master_df,pctg_or_count_FLAG,threshold_count,threshold_pctg,cf,output_folder_path_name,processed_predected_documents):\n", + " \"\"\"\n", + " Identify and upload poorly performing documents based on specified criteria.\n", + "\n", + " Args:\n", + " master_df (DataFrame): The master dataframe containing document data.\n", + " pctg_or_count_FLAG (str): The flag indicating whether to use 'count' or 'pctg' as the performance criterion.\n", + " threshold_count (int): The threshold count value for 'count' criterion.\n", + " threshold_pctg (float): The threshold percentage value for 'pctg' criterion.\n", + " cf (dict): A dictionary of Critical fields given in csv.\n", + " output_folder_path_name (str): The path where the output files will be stored.\n", + " processed_predected_documents (dict): A dictionary containing processed predicted documents.\n", + "\n", + " Returns:\n", + " dict: A dictionary containing performance statistics and file paths.\n", + "\n", + " Raises:\n", + " ValueError: If the 'pctg_or_count_FLAG' is not 'count' or 'pctg'.\n", + " \"\"\"\n", + " \n", + " def blob_upload(output_folder_path_name,file_name,json_upload):\n", + " \"\"\"\n", + " Upload a JSON document to a cloud storage bucket.\n", + "\n", + " Args:\n", + " output_folder_path_name (str): The path where the document will be stored.\n", + " file_name (str): The name of the file to be uploaded.\n", + " json_upload (Document): The JSON document to be uploaded.\n", + " \"\"\"\n", + " output_json_file = output_folder_path_name + 'poor_performance-'+str(file_name)\n", + " json_poor=documentai.Document.to_json(json_upload)\n", + " storage_client = storage.Client()\n", + " source_bucket = storage_client.bucket(GT_Output_URI.split('/')[2])\n", + " blob = source_bucket.blob(output_json_file)\n", + " blob.upload_from_string(data=bytes(json.dumps(json_poor),'utf-8'),content_type='application/json')\n", + " stats_json['poor_performed_doc'].append(output_json_file)\n", + " \n", + " stats_json = {}\n", + " stats_json['GT_file_path'] = GT_Output_URI\n", + " \n", + " time_stamp = datetime.datetime.now().strftime('%d_%m_%y-%H%M%S')\n", + " df_filtered=master_df.loc[(master_df['GTvsPredictedDifference'] == 'YES'),('File Name', 'GT Entity Type', 'GT_Output', 'Predicted_Output')]\n", + " df_filtered = df_filtered[df_filtered['GT Entity Type'].isin(cf.keys())]\n", + " df_group=df_filtered.groupby('GT Entity Type')['File Name'].value_counts().unstack().fillna(0)\n", + " summary_df = pd.DataFrame()\n", + " summary_df['GT Entity Type'] = cf.keys()\n", + " summary_df.reset_index(drop=True, inplace=True)\n", + " x = summary_df.join(df_group, on='GT Entity Type').fillna(0)\n", + " x.iloc[:,1:] = x.iloc[:, 1:].applymap(lambda x: int(x) if not pd.isnull(x) else x)\n", + " x.to_csv('analysis/summary_'+time_stamp+'.csv')\n", + " stats_json['analysis_summary_csv'] = 'analysis/summary_stats_'+time_stamp+'.csv'\n", + " stats_json['pctg_or_count_FLAG'] = pctg_or_count_FLAG\n", + " stats_json['threshold_count'] = threshold_count\n", + " stats_json['threshold_pctg'] = threshold_pctg\n", + " stats_json['poor_performed_doc'] = []\n", + " if( (pctg_or_count_FLAG=='count') and (threshold_pctg == 0) ):\n", + " for predicted_file in list(processed_predected_documents.keys()):\n", + " try:\n", + " if( sum((x[predicted_file])) >= threshold_count ):\n", + " 
blob_upload(output_folder_path_name,predicted_file,processed_predected_documents[predicted_file])\n", + " else:\n", + " print('not meeting the threshold count value')\n", + " except Exception as e:\n", + " print(e)\n", + " continue\n", + " elif((pctg_or_count_FLAG=='pctg') and (threshold_count == 0) ):\n", + " for predicted_file in list(processed_predected_documents.keys()):\n", + " for val in x[predicted_file].values:\n", + " try:\n", + " if( (val/sum(x[predicted_file].values) * 100) >= threshold_pctg): \n", + " blob_upload(output_folder_path_name,predicted_file,processed_predected_documents[predicted_file])\n", + " break\n", + " else:\n", + " print('not meeting the threshold pctg value')\n", + " except Exception as e:\n", + " print(e)\n", + " continue\n", + " else:print(\"Please check input 'pctg_or_count_flag'. Value should be either 'count' or 'pctg'.\")\n", + " from pprint import pprint\n", + " pprint(stats_json)\n", + " with open('summary_run_'+time_stamp+'.json', 'w') as fo:\n", + " fo.write(json.dumps(stats_json))\n", + " \n", + " return stats_json\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "min() arg is an empty sequence\n", + "min() arg is an empty sequence\n", + "min() arg is an empty sequence\n", + "min() arg is an empty sequence\n", + "min() arg is an empty sequence\n", + "min() arg is an empty sequence\n", + "min() arg is an empty sequence\n", + "min() arg is an empty sequence\n", + "min() arg is an empty sequence\n", + "'docai_scoring_output_client_11band12b_3686237892651273550_92_DTP_EXP_BIL_LC_INV_958151758472_ISS000_B39B76F7-CD00-4396-8579-C109EDB35CA7_7-0.json'\n", + "'docai_scoring_output_client_11band12b_3686237892651273550_104_DTP_EXP_COLN_INV_958130578848_ISS000_AE4D4916-9B2D-4D7B-A45D-28638CBC8F7E_10-0.json'\n", + "'docai_scoring_output_client_11band12b_3686237892651273550_112_DTP_EXP_COLN_INV_958130579990_ISS000_6058D335-D4FE-4AED-ADD1-FF9A405A90B1-0.json'\n", + "{'GT_file_path': 'gs://scb_line_item_exp/SCB_Samples/groundtruth/',\n", + " 'analysis_summary_csv': 'analysis/summary_stats_18_10_23-090132.csv',\n", + " 'pctg_or_count_FLAG': 'count',\n", + " 'poor_performed_doc': ['poor_performance_doc/poor_performance-docai_scoring_output_client_11band12b_3686237892651273550_96_DTP_EXP_BIL_LC_INV_958151761413_ISS000_D521D32E-DFB3-4888-9394-D8197A02DCBF_8_9-0.json',\n", + " 'poor_performance_doc/poor_performance-docai_scoring_output_client_11band12b_3686237892651273550_99_DTP_EXP_COLN_INV_958130573932_ISS000_1DDCF4CE-1054-452F-9903-685DF03C7ED2_7_8-0.json',\n", + " 'poor_performance_doc/poor_performance-docai_scoring_output_client_11band12b_3686237892651273550_140_DTP_EXP_COLN_INV_958130576911_ISS000_213F3E0A-E67D-4FCA-B41D-518F55D1E98D_8_9-0.json',\n", + " 'poor_performance_doc/poor_performance-docai_scoring_output_client_11band12b_3686237892651273550_118_DTP_EXP_COLN_INV_958130579099_ISS000_0033EF48-44C5-4572-9508-DA4842E3F960_9_10-0.json',\n", + " 'poor_performance_doc/poor_performance-docai_scoring_output_client_11band12b_3686237892651273550_161_DTP_EXP_COLN_INV_958130566548_ISS000_FB7C4EB9-E708-4576-AC76-BABBCEA873D3_9_10_11_12_13-0.json',\n", + " 'poor_performance_doc/poor_performance-docai_scoring_output_client_11band12b_3686237892651273550_142_DTP_EXP_COLN_INV_958130561794_ISS000_A67663BE-CA58-4F1F-AAE3-A724F7AB58C1-0.json'],\n", + " 'threshold_count': 1,\n", + " 'threshold_pctg': 0}\n" + ] + } + ], + "source": [ + "def main():\n", + " #Getting 
data from Critical fields.csv\n", + " with open(critical_fields_csv, 'r') as cf_file:\n", + " cf_data = cf_file.read()\n", + "\n", + " cf = {}\n", + " for field in cf_data.split('\\n'):\n", + " data = field.split(',')\n", + " cf[data[0].strip()] = data[1].strip()\n", + " #cf.append((field.split(',')))\n", + " num_substring_entities_list = []\n", + " for k, v in cf.items():\n", + " if(v.lower() == 'yes'):\n", + " num_substring_entities_list.append(k)\n", + "\n", + " GT_bucket=GT_Output_URI.split('/')[2]\n", + " try:\n", + " os.mkdir('analysis')\n", + " except:\n", + " pass\n", + "\n", + " storage_client = storage.Client()\n", + " source_bucket = storage_client.bucket(GT_bucket) # storage bucket name\n", + " source_blob = source_bucket.list_blobs(prefix='/'.join(GT_Output_URI.split('/')[3:-1]))\n", + "\n", + " list_of_files = []\n", + " for blob in source_blob:\n", + " if blob.name.endswith('.json'):\n", + " list_of_files.append('gs://'+GT_bucket+'/'+blob.name) \n", + "\n", + " document_schema=utilities.get_document_schema(location,project_number,processor_ID,processor_versionID)\n", + "\n", + " #Checking whether critical entities are available in schema \n", + " list_of_entities=[]\n", + " for entity_type in document_schema.entity_types:\n", + " for entity in entity_type.properties:\n", + " list_of_entities.append(entity.name)\n", + " for ent in cf.keys():\n", + " if ent not in list_of_entities:\n", + " print(\"Stop! Critical Field Entity {} NOT FOUND in Ground Truth Entities \\n Check the entities in each files and correct...\".format(ent))\n", + " sys.exit(1)\n", + "\n", + " master_df = pd.DataFrame(columns=['File Name', 'GT Entity Type', 'GT_Output', 'GT_bbox',\n", + " 'Predicted_Output', 'GTvsPredictedDifference', 'Predicted_bbox', 'Match',\n", + " 'Fuzzy Ratio', 'bbox_mismatch'])\n", + " processed_predected_documents={}\n", + "\n", + " for prcsd_file in list_of_files: \n", + " compare_merged = pd.DataFrame()\n", + " GT_json=utilities.documentai_json_proto_downloader(GT_bucket, ('/').join(prcsd_file.split('/')[3:]))\n", + " pdf_bytes,synthesiz_images=utilities.create_pdf_bytes_from_json(documentai.Document.to_dict(GT_json))\n", + " processor_result=utilities.process_document_sample(project_id=project_number,location=location,processor_id=processor_ID,pdf_bytes=pdf_bytes,processor_version=processor_versionID).document\n", + " temp_parsed_json=documentai.Document(processor_result)\n", + " compare_output_1,score = utilities.compare_pre_hitl_and_post_hitl_output(GT_json, temp_parsed_json) \n", + " compare_output_1.rename(columns = {'Entity Type':'GT Entity Type','Pre_HITL_Output':'GT_Output','Post_HITL_Output':'Predicted_Output','pre_bbox':'GT_bbox','post_bbox':'Predicted_bbox'}, inplace = True)\n", + " compare_output=compare_output_1.loc[:,['GT Entity Type','GT_Output','GT_bbox','Predicted_Output','Predicted_bbox','Fuzzy Ratio']]\n", + " compare_output.to_csv('2.csv')\n", + " column = [prcsd_file.split('/')[-1]] * compare_output.shape[0]\n", + " compare_output.insert(loc = 0,column = 'File Name',value = column)\n", + " compare_output.insert(loc=5,column = 'GTvsPredictedDifference',value = \" \")\n", + " for j in range(len(compare_output)):\n", + " if compare_output['Fuzzy Ratio'][j]!=1.0: #strict\n", + " #if logic - check if the entity value has numeric and update the column to No/Yes \n", + " for x in num_substring_entities_list:\n", + " GTo = compare_output[(compare_output['GT Entity Type'] == x) & (compare_output['GT_Output'] != compare_output['Predicted_Output'])]['GT_Output']\n", + " 
Labo = compare_output[(compare_output['GT Entity Type'] == x) & (compare_output['GT_Output'] != compare_output['Predicted_Output'])]['Predicted_Output']\n", + " if str(Labo).isdigit() and Labo in GTo:\n", + " compare_output.loc[ (compare_output['GT Entity Type'] == x),'GTvsPredictedDifference'] = 'NO'\n", + " if compare_output['GT_Output'][j]=='Entity not found.' and compare_output['Predicted_Output'][j]=='Entity not found.':\n", + " compare_output['GTvsPredictedDifference'][j]='NO'\n", + " else:\n", + " compare_output['GTvsPredictedDifference'][j]='YES'\n", + " else:\n", + " compare_output['GTvsPredictedDifference'][j]='NO'\n", + " for k in range(len(compare_output)):\n", + " if compare_output['Fuzzy Ratio'][k]!=1.0: #strict\n", + " change_GT_and_parsed=\"parsed json is diff from GT\"\n", + " break\n", + " else:\n", + " compare_output['GTvsPredictedDifference'][k]='NO'\n", + " processed_predected_documents[prcsd_file.split('/')[-1]] = temp_parsed_json\n", + " #compare_output['bbox_mismatch'] = compare_output['GT_bbox'] != compare_output['Predicted_bbox']\n", + " new_row=pd.Series([prcsd_file.split('/')[-1],\"parsed json\",\"is updated\",\"compared to GT\",\":\",np.nan,change_GT_and_parsed,''], index=compare_output.columns)\n", + " compare_output=compare_output.append(new_row,ignore_index= True)\n", + " frames = [compare_merged, compare_output]\n", + " compare_merged = pd.concat(frames)\n", + " master_df = pd.concat([master_df, compare_merged], ignore_index=True)\n", + "\n", + " stats_json=get_poor_perfoming_docs(master_df,pctg_or_count_FLAG,threshold_count,threshold_pctg,cf,output_folder_path_name,processed_predected_documents)\n", + "\n", + "main()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m104", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/readme.md b/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/readme.md new file mode 100644 index 000000000..c96406392 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Identifying Poor Performing Docs/readme.md @@ -0,0 +1,23 @@ +## Introduction +Our primary objective is to seamlessly automate the process of pinpointing documents that underperform, in order to facilitate their uptraining. A document's performance is quantified based on its count of missed crucial fields. This script operates according to the following specifications: + +## Input Specifications +**Labeled Document Bucket:** The source containing labeled documents. + +**Destination Bucket for Underperformers:** Where the poorly performing documents will be placed. + +**Project and Processor Details:** Needed to invoke the desired processor. This includes the project ID, processor ID, and its version. + +**Critical Fields List:** The script should first confirm that the names of these fields align with the schema. 
Discrepancies result in errors, prompting an update to the critical fields' input for congruence with the schema. + +**Performance Threshold:** Determines when a document is deemed underperforming and should be transferred to the designated bucket. + +## Numerical Substring Matching Criterion +**Processor-driven Document Evaluation:** The script processes documents using a designated processor. It identifies underperformers by assessing each document's critical fields against the Ground Truth (GT). + +**optional Numerical Substring Matching:** Can be activated per entity. If this feature is on, as long as the numerical subset is accurate, the processor doesn't mark it as an oversight. For instance, if GT shows “ID 5123” and the prediction is “5123”, it isn't considered an error. The script acknowledges it as correct as long as the substring with the right numerical digits is detected. + +## Logic for Relocating Underperforming Documents Based on Thresholds +**Output of Most Underperforming Documents:** The script outputs the worst-performing documents based on a custom-defined threshold. For example, documents that incorrectly identify over 50% of crucial fields are included in the output. The script can also recognize and process integer values; e.g., any document missing more than 5 crucial fields will be sent to the output bucket. +## Summary & Statistics of Output +**Missed Field List:** The script outputs a detailed list, either in sheets or CSV format, specifying misses in the critical fields for each document that's been transferred to the output bucket. \ No newline at end of file diff --git a/DocAI Incubator Tools/best-practices/Key Value Pair Entity Conversion/Key Value Pair Entity Conversion.ipynb b/DocAI Incubator Tools/best-practices/Key Value Pair Entity Conversion/Key Value Pair Entity Conversion.ipynb new file mode 100644 index 000000000..7635f9a32 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Key Value Pair Entity Conversion/Key Value Pair Entity Conversion.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "33e81a43-1c3e-4ba6-8ddf-cf1371e031ca", + "metadata": {}, + "source": [ + "# Key Value Pair Entity Conversion" + ] + }, + { + "cell_type": "markdown", + "id": "357b28b9-4bc9-4c05-b144-b9bd7f08ea69", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "43d9299e-97fd-4bb9-813e-e3a26c29daa6", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.\t" + ] + }, + { + "cell_type": "markdown", + "id": "0f0dab58-25b4-4710-ad18-abdc98e0fae5", + "metadata": {}, + "source": [ + "## Purpose and Description" + ] + }, + { + "cell_type": "markdown", + "id": "dba9362f-09bd-4481-b354-ebcde6133fe9", + "metadata": {}, + "source": [ + "This tool uses Form parser JSON files (Parsed from a processor) from the GCS bucket as input, converts the key/value pair to the entities and stores it to the GCS bucket as JSON files." + ] + }, + { + "cell_type": "markdown", + "id": "f8fe7a56-d471-427f-a0ff-a7c6c44d1b3c", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "1. Vertex AI Notebook\n", + "2. 
Labeled json files in GCS Folder"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9955cea1-8f9f-413b-ad0b-d082f7f554e3",
+   "metadata": {},
+   "source": [
+    "## Step by Step procedure "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "255d46a4-f49c-4627-9ea0-c252b50554ed",
+   "metadata": {},
+   "source": [
+    "### 1. Setup Input Variables"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c916f6ec-0329-4d1a-a0df-fd7e5dd431be",
+   "metadata": {},
+   "source": [
+    "\n",
+    " * **PROJECT_ID:** provide your GCP project ID (Optional)\n",
+    " * **bucket_name:** provide the bucket name \n",
+    " * **formparser_path:** provide the folder name of the JSONs that were parsed with the form parser.\n",
+    " * **output_path:** provide the folder name where the output JSONs will be saved.\n",
+    " * **entity_synonyms_list:** Add the entity names in place of \"Entity_1\", \"Entity_2\" and add the synonyms related to each entity in place of \"Entity_1_synonyms_1\" and so on. Multiple entities with their synonyms can be added to the list.\n",
+    "\n",
+    "    [{\"Entity_1\":[\"Entity_1_synonyms_1\",\"Entity_1_synonyms_2\",\"Entity_1_synonyms_3\"]},{\"Entity_2\":[\"Entity_2_synonyms_1\",\"Entity_2_synonyms_2\",\"Entity_2_synonyms_3\"]}]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f9bc87bb-44f7-44d2-b69d-6fd59071279b",
+   "metadata": {},
+   "source": [
+    "### 2. Output"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "707cc815-35e2-44bb-a767-48baeb9422be",
+   "metadata": {},
+   "source": [
+    "The converted JSON files are written to the GCS path provided in the script through the **output_path** variable. "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bf831c2e-45b5-474f-b3cd-044d37ec7964",
+   "metadata": {},
+   "source": [
+    "![](https://screenshot.googleplex.com/43tgnEWB3HXSRpt.png)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "95b79b16-c50d-4e73-95f4-223c38a5188e",
+   "metadata": {},
+   "source": [
+    "### 3. 
Sample Code" + ] + }, + { + "cell_type": "markdown", + "id": "7f68a4db-78f5-421b-8bc9-84c4c06839bd", + "metadata": { + "tags": [] + }, + "source": [ + "#### importing necessary modules" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "76f25301-742f-4dbc-83f4-06bf57219eaf", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import re\n", + "from io import BytesIO\n", + "from pathlib import Path\n", + "from utilities import *\n", + "# import gcsfs\n", + "import google.auth\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "from google.cloud import storage\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "id": "e4e6b074-57ce-45f8-8a86-044f3e938bf0", + "metadata": {}, + "source": [ + "#### Setup the required inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "732eec2a-3da1-4759-bb34-ecdddb5a3a50", + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT_ID = \"xxx-xxx-xxx\" # your project id\n", + "bucket_name = \"xxxxxx\" # bucket name\n", + "\n", + "formparser_path = \"xxx/xxxxxxx/xxxxxx\" # path of the form parser output without bucket name\n", + "output_path = \"xxxx/xxxxxxxx/xxxxx\" # output path for this script without bucket name\n", + "\n", + "entity_synonyms_list = [{\"Bill_to\":[\"Bill To:\",\"Bill To\",\"BillTo\"]},\n", + " {\"Due_date\":[\"Due Date:\",\"Due Date\",\"DueDate\"]}] #example" + ] + }, + { + "cell_type": "markdown", + "id": "a4cef02c-b4e6-4233-b04f-cf0b98a5045d", + "metadata": {}, + "source": [ + "#### Execute the code" + ] + }, + { + "cell_type": "code", + "execution_count": 171, + "id": "b65904ff-89d1-4229-b375-47aadd51781f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Status : 100%|██████████| 2/2 [00:01<00:00, 1.14it/s]\n" + ] + } + ], + "source": [ + "def list_blobs(bucket_name):\n", + " \"\"\"This function will give the list of files in a bucket \n", + " args: gcs bucket name\n", + " output: list of files\"\"\"\n", + " from google.cloud import storage\n", + " blob_list = []\n", + " storage_client = storage.Client()\n", + " blobs = storage_client.list_blobs(bucket_name)\n", + " for blob in blobs:\n", + " blob_list.append(blob.name)\n", + " return blob_list\n", + "\n", + "\n", + "def store_blob(document, file_name: str):\n", + " \"\"\"\n", + " Store files in cloud storage.\n", + " \"\"\"\n", + "\n", + " storage_client = storage.Client()\n", + " process_result_bucket = storage_client.get_bucket(bucket_name)\n", + " document_blob = storage.Blob(name=str(Path(output_path, file_name)),\n", + " bucket=process_result_bucket)\n", + " document_blob.upload_from_string(document,\n", + " content_type=\"application/json\")\n", + "\n", + "\n", + "def entity_synonyms(old_entity: str):\n", + " \"\"\"\n", + " To check for any synonyms for the entites and replace.\n", + " \"\"\"\n", + " for item in entity_synonyms_list:\n", + " synonym_list = list(map(str.lower,[*item.values()][0]))\n", + " if old_entity.lower() in synonym_list:\n", + " return [*item][0]\n", + "\n", + " # if entity does not match with any synonyms, will return none.\n", + " return \"\"\n", + "\n", + "\n", + "def entity_data(formField_data, page_number: int):\n", + " \"\"\"\n", + " Function to create entity objects with some cleaning.\n", + " \"\"\"\n", + " # Cleaning the entity name\n", + " key_name = (re.sub(r\"[^\\w\\s]\",\"\", formField_data.field_name.text_anchor.content.replace(\" \", \"\").strip()))\n", + " # checking for entity 
synonyms\n", + " key_name = entity_synonyms(key_name)\n", + " #initializing new entity \n", + " entity = documentai.Document.Entity()\n", + " \n", + " if key_name:\n", + " entity.confidence = formField_data.field_value.confidence\n", + " entity.mention_text = formField_data.field_value.text_anchor.content\n", + " page_ref=entity.page_anchor.PageRef()\n", + " page_ref.bounding_poly.normalized_vertices.extend(formField_data.field_value.bounding_poly.normalized_vertices)\n", + " page_ref.page = page_number\n", + " entity.page_anchor.page_refs.append(page_ref)\n", + " entity.text_anchor = formField_data.field_value.text_anchor\n", + " entity.type = key_name\n", + " return entity\n", + " else:\n", + " return {}\n", + "\n", + "\n", + "def convert_kv_entities(file: str):\n", + " \"\"\"\n", + " Function to convert form parser key value to entities.\n", + " \"\"\"\n", + " # initializing entities list\n", + " file.entities = []\n", + " \n", + " for page_number, page_data in enumerate(file.pages):\n", + " for formField_number, formField_data in enumerate(\n", + " getattr(page_data,\"form_fields\", [])):\n", + " \n", + " # get the element and push it to the entities array\n", + " entity_obj = entity_data(formField_data, page_number)\n", + " if entity_obj:\n", + " file.entities.append(entity_obj)\n", + " # removing the form parser data\n", + " for page in file.pages:\n", + " del page.form_fields\n", + " del page.tables\n", + " \n", + " \n", + " \n", + "\n", + " return file\n", + "\n", + "\n", + "def main():\n", + " \"\"\"\n", + " Main function to call helper functions\n", + " \"\"\"\n", + " # fetching all the files\n", + " files = list(file_names(f\"gs://{bucket_name}/{formparser_path}\")[1].values())\n", + " for file in tqdm(files, desc=\"Status : \"):\n", + " # converting key value to entites\n", + " \n", + " entity_json = convert_kv_entities(documentai_json_proto_downloader(bucket_name,file))\n", + " \n", + " # storing the json\n", + " store_blob(documentai.Document.to_json(entity_json), file.split(\"/\")[-1])\n", + "\n", + "\n", + "# calling main function\n", + "main()" + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m104", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DocAI Incubator Tools/best-practices/Key Value Pair Entity Conversion/readme.md b/DocAI Incubator Tools/best-practices/Key Value Pair Entity Conversion/readme.md new file mode 100644 index 000000000..e813f0ca2 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Key Value Pair Entity Conversion/readme.md @@ -0,0 +1,18 @@ +## Overview +This utility is tailored to process Form parser JSON files, which are initially parsed using a processor. These files, sourced from the GCS bucket, are transformed from a key/value structure into entities. Once transformed, the tool saves them back into the GCS bucket in JSON format. + +## Configuration +To make the tool operational, configure the following input parameters: + +**PROJECT_ID:** Input your GCP project ID. This is optional. 
+ +**bucket_name:** Specify the name of the bucket from where the files will be sourced. + +**formparser_path:** Designate the folder containing JSON files parsed with the form parser. + +**output_path:** Identify the folder where the processed JSON files will be saved. + +**entity_synonyms_list:** This list is structured to map entities with their synonyms. Replace placeholders like "Entity_1", "Entity_2" with the actual entity names. Similarly, replace "Entity_1_synonyms_1", "Entity_1_synonyms_2", etc., with the related synonyms for the respective entities. + +## Output +Once the tool completes its operation, the transformed JSON files will be saved in the GCS directory specified by the output_path variable. \ No newline at end of file diff --git a/DocAI Incubator Tools/best-practices/Key_Value_Pair_Entity_Conversion.ipynb b/DocAI Incubator Tools/best-practices/Key_Value_Pair_Entity_Conversion.ipynb deleted file mode 100644 index 1cad87190..000000000 --- a/DocAI Incubator Tools/best-practices/Key_Value_Pair_Entity_Conversion.ipynb +++ /dev/null @@ -1,459 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "33e81a43-1c3e-4ba6-8ddf-cf1371e031ca", - "metadata": {}, - "source": [ - "# Key Value Pair Entity Conversion" - ] - }, - { - "cell_type": "markdown", - "id": "357b28b9-4bc9-4c05-b144-b9bd7f08ea69", - "metadata": {}, - "source": [ - "* Author: docai-incubator@google.com" - ] - }, - { - "cell_type": "markdown", - "id": "43d9299e-97fd-4bb9-813e-e3a26c29daa6", - "metadata": {}, - "source": [ - "## Disclaimer\n", - "\n", - "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.\t" - ] - }, - { - "cell_type": "markdown", - "id": "0f0dab58-25b4-4710-ad18-abdc98e0fae5", - "metadata": {}, - "source": [ - "## Purpose and Description" - ] - }, - { - "cell_type": "markdown", - "id": "dba9362f-09bd-4481-b354-ebcde6133fe9", - "metadata": {}, - "source": [ - "This tool uses Form parser JSON files (Parsed from a processor) from the GCS bucket as input, converts the key/value pair to the entities and stores it to the GCS bucket as JSON files." - ] - }, - { - "cell_type": "markdown", - "id": "f8fe7a56-d471-427f-a0ff-a7c6c44d1b3c", - "metadata": {}, - "source": [ - "## Prerequisites\n", - "\n", - "1. Vertex AI Notebook\n", - "2. Labeled json files in GCS Folder" - ] - }, - { - "cell_type": "markdown", - "id": "9955cea1-8f9f-413b-ad0b-d082f7f554e3", - "metadata": {}, - "source": [ - "## Step by Step procedure " - ] - }, - { - "cell_type": "markdown", - "id": "a40e912e-79a2-4e4a-9ad7-86d4325c4c8e", - "metadata": {}, - "source": [ - "### 1. Config file Creation" - ] - }, - { - "cell_type": "markdown", - "id": "e4472e18-ad97-486c-a8d2-c62c7026c265", - "metadata": {}, - "source": [ - "Run the below code and create a config.ini file for providing input." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71087bd5-66ef-4cee-b3d4-8c2d3bbb2037", - "metadata": {}, - "outputs": [], - "source": [ - "import configparser\n", - "\n", - "config = configparser.ConfigParser()\n", - "config_path = \"config.ini\" # Enter the path of config file\n", - "# Add the structure to the file we will create\n", - "config.add_section(\"Entities_synonyms\")\n", - "config.set(\"Entities_synonyms\", \"entity1\",\n", - " \"key_synonym1, key_synonym2, key_synonym3\")\n", - "config.set(\"Entities_synonyms\", \"entity2\",\n", - " \"key_synonym1, key_synonym2, key_synonym3\")\n", - "# Write the new structure to the new file\n", - "with open(config_path, \"w\") as configfile:\n", - " config.write(configfile)" - ] - }, - { - "cell_type": "markdown", - "id": "019e148d-1ca8-4fad-a12c-7c725e9e148a", - "metadata": {}, - "source": [ - "### 2. Input Details" - ] - }, - { - "cell_type": "markdown", - "id": "74ed06f0-3a95-45f6-80dd-bed793b130be", - "metadata": {}, - "source": [ - "Once config.ini file is created with the above step , enter the input in the config file : " - ] - }, - { - "cell_type": "markdown", - "id": "c916f6ec-0329-4d1a-a0df-fd7e5dd431be", - "metadata": {}, - "source": [ - "entity1 = key_synonym1, key_synonym2, key_synonym3" - ] - }, - { - "cell_type": "markdown", - "id": "0d83c1f6-9c8c-4a13-8ed6-c375d102eeaa", - "metadata": {}, - "source": [ - "![Alt text](https://screenshot.googleplex.com/BevguHAbpRKYQdX.png)" - ] - }, - { - "cell_type": "markdown", - "id": "044c1ec6-c9e6-4ef0-8a23-0f6fbf0138d9", - "metadata": {}, - "source": [ - "Here add the entity name in place of entity1 and add the synonyms related to the entity in place of key_synonym separated by comma(,). Add multiple entities with their synonyms in the next line.\n", - "\n", - "**Example :** \n", - "Address = AddressName, AddressName1, AddressLine\n", - "InvoiceNumber = Invoice,InvoiceNo\n", - "PaymentDate = SNC, SNCs, SNC1" - ] - }, - { - "cell_type": "markdown", - "id": "dfb2369e-4ae7-4675-854d-9d320acb5b62", - "metadata": {}, - "source": [ - "### 3.Run the Code" - ] - }, - { - "cell_type": "markdown", - "id": "71d8f883-0287-4fb6-9680-eeafa259a67f", - "metadata": {}, - "source": [ - " a. Copy the code provided in this document, Enter the path of Config file " - ] - }, - { - "cell_type": "markdown", - "id": "53914f2d-33b7-4801-bd04-82b51468783a", - "metadata": {}, - "source": [ - "![](https://screenshot.googleplex.com/9gpHncm7kNGibvw.png)" - ] - }, - { - "cell_type": "markdown", - "id": "ac1c7f11-79b2-4993-b91d-50f02ddb50fa", - "metadata": {}, - "source": [ - " b. Update the project id, form parser output path, GCS bucket name and the GCP output for the labeled entities Jsons." - ] - }, - { - "cell_type": "markdown", - "id": "223c02ff-18ee-4da5-8e72-d8ff909b898b", - "metadata": {}, - "source": [ - "![](https://screenshot.googleplex.com/BfBAVjwW2zoxhxT.png)" - ] - }, - { - "cell_type": "markdown", - "id": "f9bc87bb-44f7-44d2-b69d-6fd59071279b", - "metadata": {}, - "source": [ - "### 4. Output" - ] - }, - { - "cell_type": "markdown", - "id": "707cc815-35e2-44bb-a767-48baeb9422be", - "metadata": {}, - "source": [ - "We get the converted Json in the GCS path which is provided in the script with the variable name **output_path**. 
" - ] - }, - { - "cell_type": "markdown", - "id": "bf831c2e-45b5-474f-b3cd-044d37ec7964", - "metadata": {}, - "source": [ - "![](https://screenshot.googleplex.com/43tgnEWB3HXSRpt.png)" - ] - }, - { - "cell_type": "markdown", - "id": "95b79b16-c50d-4e73-95f4-223c38a5188e", - "metadata": {}, - "source": [ - "### 5. Sample Code" - ] - }, - { - "cell_type": "markdown", - "id": "7f68a4db-78f5-421b-8bc9-84c4c06839bd", - "metadata": { - "tags": [] - }, - "source": [ - "#### importing necessary modules" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "76f25301-742f-4dbc-83f4-06bf57219eaf", - "metadata": {}, - "outputs": [], - "source": [ - "import json\n", - "import re\n", - "from io import BytesIO\n", - "from pathlib import Path\n", - "\n", - "import gcsfs\n", - "import google.auth\n", - "from google.cloud import documentai_v1beta3 as documentai\n", - "from google.cloud import storage\n", - "from tqdm import tqdm" - ] - }, - { - "cell_type": "markdown", - "id": "e4e6b074-57ce-45f8-8a86-044f3e938bf0", - "metadata": {}, - "source": [ - "#### Setup the required inputs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "732eec2a-3da1-4759-bb34-ecdddb5a3a50", - "metadata": {}, - "outputs": [], - "source": [ - "PROJECT_ID = “XXXX-XXXX-XXXX” # your project id\n", - "bucket_name = \"ZZZZ-ZZZZ\" # bucket name\n", - "\n", - "credentials, _ = google.auth.default()\n", - "fileSystem =gcsfs.GCSFileSystem(project=PROJECT_ID, token=credentials) \n", - "formparser_path = \"kv_entites_conversion/test_script\" # path of the form parser output\n", - "output_path = \"kv_entites_conversion/test_script/output\" # output path for this script\n", - "config_path = \"/path/to/config.ini\n", - "config = configparser.ConfigParser()\n", - "config.optionxform = str\n", - "config.read(config_path)" - ] - }, - { - "cell_type": "markdown", - "id": "a4cef02c-b4e6-4233-b04f-cf0b98a5045d", - "metadata": {}, - "source": [ - "#### Execute the code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b65904ff-89d1-4229-b375-47aadd51781f", - "metadata": {}, - "outputs": [], - "source": [ - "def get_file(file_path: str):\n", - " \"\"\"\n", - " To read files from cloud storage.\n", - " \"\"\"\n", - " file_object = json.loads(fileSystem.cat(file_path))\n", - " return file_object\n", - "\n", - "\n", - "def store_blob(document, file_name: str):\n", - " \"\"\"\n", - " Store files in cloud storage.\n", - " \"\"\"\n", - " storage_client = storage.Client()\n", - " process_result_bucket = storage_client.get_bucket(bucket_name)\n", - " document_blob = storage.Blob(name=str(Path(output_path, file_name)),\n", - " bucket=process_result_bucket)\n", - " document_blob.upload_from_string(json.dumps(document),\n", - " content_type=\"application/json\")\n", - " # print(f\"File Saved : {file_name}.\")\n", - "\n", - "\n", - "def entity_synonyms(old_entity: str):\n", - " \"\"\"\n", - " To check for any synonyms for the entites and replace.\n", - " \"\"\"\n", - " entities_synonyms = config.items(\"Entities_synonyms\")\n", - " for item in entities_synonyms:\n", - " synonym_list = [i.lower().strip() for i in item[1].split(\",\")]\n", - " if old_entity.lower() in synonym_list:\n", - " return item[0]\n", - "\n", - " # if entity does not match with any synonyms, will return entity as it is.\n", - " return \"\"\n", - "\n", - "\n", - "def entity_data(formField_data: dict, page_number: int):\n", - " \"\"\"\n", - " Function to create entity objects with some cleaning.\n", - " \"\"\"\n", - " # Cleaning the 
entity name\n", - " key_name = (re.sub(\n", - " r\"[^\\w\\s]\",\n", - " \"\", formField_data[\"fieldName\"][\"textAnchor\"][\"content\"]).replace(\n", - " \" \", \"\").strip())\n", - " # checking for entity synonyms\n", - " key_name = entity_synonyms(key_name)\n", - " if key_name:\n", - " entity_dict = {\n", - " \"confidence\": formField_data[\"fieldValue\"][\"confidence\"],\n", - " \"mentionText\":\n", - " formField_data[\"fieldValue\"][\"textAnchor\"][\"content\"],\n", - " \"pageAnchor\": {\n", - " \"pageRefs\": [{\n", - " \"boundingPoly\":\n", - " formField_data[\"fieldValue\"][\"boundingPoly\"],\n", - " \"page\":\n", - " page_number,\n", - " }]\n", - " },\n", - " \"textAnchor\": formField_data[\"fieldValue\"][\"textAnchor\"],\n", - " \"type\": key_name,\n", - " }\n", - "\n", - " return entity_dict\n", - " else:\n", - " return {}\n", - "\n", - "\n", - "def convert_kv_entities(file: str):\n", - " \"\"\"\n", - " Function to convert form parser key value to entities.\n", - " \"\"\"\n", - " # get the file object\n", - " file = get_file(file)\n", - " # initializing entities list\n", - " file[\"entities\"] = []\n", - "\n", - " for page_number, page_data in enumerate(file[\"pages\"]):\n", - " for formField_number, formField_data in enumerate(\n", - " page_data.get(\"formFields\", [])):\n", - " # get the element and push it to the entities array\n", - " entity_obj = entity_data(formField_data, page_number)\n", - " if entity_obj:\n", - " file[\"entities\"].append(entity_obj)\n", - " # removing the form parser data\n", - " for i in range(len(file[\"pages\"])):\n", - " if \"formFields\" in file[\"pages\"][i].keys():\n", - " del file[\"pages\"][i][\"formFields\"]\n", - " if \"tables\" in file[\"pages\"][i].keys():\n", - " del file[\"pages\"][i][\"tables\"]\n", - "\n", - " return file\n", - "\n", - "\n", - "def main():\n", - " \"\"\"\n", - " Main function to call helper functions\n", - " \"\"\"\n", - " # fetching all the files\n", - " files = [\n", - " i for i in fileSystem.ls(bucket_name + \"/\" + formparser_path)\n", - " if i.endswith(\".json\")\n", - " ]\n", - " for file in tqdm.tqdm(files, desc=\"Status : \"):\n", - " # converting key value to entites\n", - " entity_json = convert_kv_entities(file)\n", - "\n", - " # storing the json\n", - " file_name = file.split(\"/\")[-1]\n", - " store_blob(entity_json, file_name)\n", - "\n", - "\n", - "# calling main function\n", - "main()" - ] - }, - { - "cell_type": "markdown", - "id": "365d37a6-62cd-4741-a0a2-214b08d91f9b", - "metadata": {}, - "source": [ - "### What’s next ?" 
- ] - }, - { - "cell_type": "markdown", - "id": "d82f2e94-2279-487b-9431-b479ddc9270d", - "metadata": {}, - "source": [ - "[go/docai-parsed-json-split-address-lines](go/docai-parsed-json-split-address-lines)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9577c16f-bfdc-4e38-a4c6-8d11c4e87ffb", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "environment": { - "kernel": "python3", - "name": "common-cpu.m104", - "type": "gcloud", - "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/DocAI Incubator Tools/best-practices/Parser Result Merger/DocAI Parser Result Merger.ipynb b/DocAI Incubator Tools/best-practices/Parser Result Merger/DocAI Parser Result Merger.ipynb new file mode 100644 index 000000000..5470c3c86 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Parser Result Merger/DocAI Parser Result Merger.ipynb @@ -0,0 +1,608 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "707aac26-5f83-4dfe-a9c3-73f9eb34dea4", + "metadata": {}, + "source": [ + "# Document AI Parser Result Merger" + ] + }, + { + "cell_type": "markdown", + "id": "fff479c9-a573-48c7-bd26-d9c13cf5be7f", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "ff5efc16-e389-48af-90d6-5d99019a1059", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.\t" + ] + }, + { + "cell_type": "markdown", + "id": "47eed16a-e66d-4c76-860f-16f08ae45867", + "metadata": {}, + "source": [ + "## Objective\n", + "Document AI Parser Result Merger is a tool built using Python programming language. Its purpose is to address the issue of merging the two or more resultant json files of Document AI processors. This document highlights the working of the tool(script) and its requirements. The documents usually contain multiple pages. There are 2 use cases by which this solution can be operated. 
\n", + "### Case 1: Different documents, parser results json merger (Default).\n", + " * Case 1 deals when we are using two or multiple parser output Jsons are from different documents\n", + " * To Enable this case the flag should be ‘1’\n", + "### Case 2: Same document, different parsers json merger(Added functionality).\n", + " * Case 2 deals when we are using two or multiple parser outputs from the same document.\n", + " * To Enable this case the flag should be ‘2’" + ] + }, + { + "cell_type": "markdown", + "id": "44ef6ea3-79e0-42a0-a2bb-84ece51bff74", + "metadata": {}, + "source": [ + "## Prerequisites" + ] + }, + { + "cell_type": "markdown", + "id": "b64bc02d-ec9b-42b6-b3b0-ed70fa6c3808", + "metadata": {}, + "source": [ + "This tool requires the following services:\n", + "\n", + " * Google Jupyter Notebook or Colab.\n", + " * Google Cloud Storage \n", + " * DocumentAI processor and JSON files\n", + " \n", + "Google Jupyter Notebook or Colab is used for running the python notebook file. Cloud Storage Buckets have the input files to this script. The multiple input files are the json files which are the result of a Document AI processor (for eg., Bank Statement Parser). These json files include multiple pages in its document. After the script executes, the output file is a single merged json file stored in the output bucket path." + ] + }, + { + "cell_type": "markdown", + "id": "e4b4964c-d228-4aad-a6fb-346465791fe7", + "metadata": {}, + "source": [ + "## Workflow overview\n", + " " + ] + }, + { + "cell_type": "markdown", + "id": "60a6f212-d41a-4ae7-b374-1c97ffb03931", + "metadata": {}, + "source": [ + "![](https://screenshot.googleplex.com/9F5qLEtZJ4Kdj8m.png)\n", + "\n", + "The above diagram shows the flow diagram of the tool. As highlighted there are input and output GCP buckets and there is a python script which processes the request. The input bucket holds the multiple json files which need to be merged into a single file and this is achieved by the python script. This script accepts the input json files and prompts users to switch between the default case-1 or the case-2 mode as highlighted in the previous sections. Finally there is an output GCP bucket to store the single merged file. " + ] + }, + { + "cell_type": "markdown", + "id": "3155aca4-1aeb-4a22-a0b7-a3e9b43e69c0", + "metadata": {}, + "source": [ + "## Script walkthrough\n", + "Insights and details about the script are explained in detail as follows." + ] + }, + { + "cell_type": "markdown", + "id": "1dba5180-f972-4d5e-9802-974885efe2d4", + "metadata": {}, + "source": [ + "## 1. Import Modules/Packages\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "a73b8fd9-547b-4db9-8a00-5628ceac6034", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import re\n", + "from typing import Dict, List, Tuple, Union\n", + "\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "from google.cloud import storage\n", + "from google.cloud.documentai_v1beta3 import Document" + ] + }, + { + "cell_type": "markdown", + "id": "758afb38-cede-4042-b9e1-9c847eef818f", + "metadata": { + "tags": [] + }, + "source": [ + "## 2. 
Input Details : Entering Project details in below variables" + ] + }, + { + "cell_type": "markdown", + "id": "444df062-df9e-48c0-9f96-fe2789f3e1f0", + "metadata": {}, + "source": [ + "\n", + " * **PROJECT_ID:** provide your GCP project ID (Optional)\n", + " * **INPUT_MULTIPLE_JSONS_URI:** provide the uri link of folder containing the input files (ends with \"/\")\n", + " * **JSON_DIRECTORY_PATH_OUTPUT:** provide the folder name of the output file(ends with \"/\") which gets generated post execution of the script.\n", + " * **OUTPUT_FILE_NAME:** enter a name for the generated file which is saved in the output bucket.\n", + " * **MERGER_TYPE_FLAG:** based on user need, values 1 or 2 can be provided as mentioned in the earlier part of this document.\n", + "\n", + " - Case 1 deals when we are using two or multiple parser output Jsons are from different documents\n", + "\n", + " - Case 2 deals when we are using two or multiple parser outputs from the same document.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "17bdf742-c286-4735-bae5-91eb5c1a1ab0", + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT_ID = \"xxxx-xxxx\" # Optional\n", + "INPUT_MULTIPLE_JSONS_URI = \"gs://xxxx/xxxx/\" # ends with \"/\"\n", + "JSON_DIRECTORY_PATH_OUTPUT = \"gs://xxxx/xxxx/\" # ends with \"/\"\n", + "OUTPUT_FILE_NAME = \"merged_file.json\"\n", + "MERGER_TYPE_FLAG = 1 # 1-for different docs, 2-same doc default=1" + ] + }, + { + "cell_type": "markdown", + "id": "79be40da-86a7-46c3-8135-6b1e384438b6", + "metadata": {}, + "source": [ + "## 3. Run the below code.\n", + "\n", + "Use the below code and Run all the cells (Update the Path parameter if it is not available in the current working directory)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "4ffdf37c-332f-4ba5-a55f-349d2cccb432", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "def split_gcs_folder(path: str) -> Tuple[str, str]:\n", + " \"\"\"\n", + " This function splits gcs uri to 2 parts\n", + " 1. gcs bucket\n", + " 2. 
file path after bucket\n", + " \"\"\"\n", + " \n", + " pattern = re.compile(\"gs://(?P.*?)/(?P.*)\")\n", + " uri = re.match(pattern, path)\n", + " return uri.group(\"bucket\"), uri.group(\"files_dir\")\n", + "\n", + "\n", + "def file_names(bucket: str,\n", + " files_dir_prefix: str) -> Tuple[List[str], Dict[str, str]]:\n", + " \"\"\"This Function will load the bucket and get the list of files\n", + " in the gs path given\n", + " \"\"\"\n", + "\n", + " filenames_list = []\n", + " filenames_dict = {}\n", + " storage_client = storage.Client()\n", + " bucket = storage_client.get_bucket(bucket)\n", + " blobs = bucket.list_blobs(prefix=files_dir_prefix)\n", + " filenames = [blob.name for blob in blobs]\n", + " for filename in filenames:\n", + " file = filename.split(\"/\")[-1]\n", + " if file:\n", + " filenames_list.append(file)\n", + " filenames_dict[file] = filename\n", + " return filenames_list, filenames_dict\n", + "\n", + "\n", + "def list_json_files(filenames: List[str]) -> List[str]:\n", + " \"\"\"\n", + " Takes filenames and return JSON files only as list\n", + " \"\"\"\n", + "\n", + " json_files = []\n", + " for filename in filenames:\n", + " if filename.endswith(\".json\"):\n", + " json_files.append(filename)\n", + " return json_files\n", + "\n", + "\n", + "def assign_indexes(layout: Union[Document.Entity, Document.Page.Layout],\n", + " text: str) -> None:\n", + " \"\"\"\n", + " It will assign new index values to start_index and end_index for respective class object\n", + " \"\"\"\n", + "\n", + " for text_segment in layout.text_anchor.text_segments:\n", + " text_segment.end_index = int(text_segment.end_index) + len(text)\n", + " text_segment.start_index = int(text_segment.start_index) + len(text)\n", + "\n", + "\n", + "def assign_page_ref_page(entity: documentai.Document.Entity,\n", + " doc_first: documentai.Document) -> None:\n", + " \"\"\"\n", + " It will accumulate page count for entities-page_anchor-page_refs\n", + " \"\"\"\n", + " \n", + " for page_ref in entity.page_anchor.page_refs:\n", + " page_ref.page = str(int(page_ref.page) + len(doc_first.pages))\n", + "\n", + "\n", + "### CASE - 1\n", + "def different_doc_merger(\n", + " doc_first: documentai.Document,\n", + " doc_second: documentai.Document) -> documentai.Document:\n", + " \"\"\"\n", + " This function takes two documentai.Document objects and merges them as one\n", + " \"\"\"\n", + " \n", + " doc_merged = documentai.Document()\n", + "\n", + " ### Entities ###\n", + " for entity in doc_second.entities:\n", + " assign_indexes(entity, doc_first.text) # entity-textanchors\n", + " assign_page_ref_page(entity, doc_first) # entity-pageanchors\n", + " for prop in entity.properties: # entity properties\n", + " assign_indexes(prop, doc_first.text)\n", + " assign_page_ref_page(prop, doc_first)\n", + " doc_merged.entities = list(doc_first.entities) + list(doc_second.entities)\n", + "\n", + " # Pages\n", + " for page in doc_second.pages:\n", + " print(page.page_number, end=\" \")\n", + " page.page_number = int(page.page_number) + len(\n", + " doc_first.pages) # Page Number increment in second doc\n", + " print(\" \", page.page_number)\n", + "\n", + " # page . layout . textanchor . 
textsegment\n", + " assign_indexes(page.layout, doc_first.text)\n", + "\n", + " for block in page.blocks:\n", + " assign_indexes(block.layout, doc_first.text)\n", + "\n", + " for paragraph in page.paragraphs:\n", + " assign_indexes(paragraph.layout, doc_first.text)\n", + "\n", + " for line in page.lines:\n", + " assign_indexes(line.layout, doc_first.text)\n", + "\n", + " for token in page.tokens:\n", + " assign_indexes(token.layout, doc_first.text)\n", + "\n", + " doc_merged.pages = list(doc_first.pages) + list(doc_second.pages)\n", + " doc_merged.text = doc_first.text + doc_second.text\n", + " doc_merged.shard_info = doc_second.shard_info\n", + " doc_merged.uri = doc_second.uri\n", + " return doc_merged\n", + "\n", + "\n", + "### CASE -2\n", + "def same_doc_diff_parser_merger(\n", + " doc_first: documentai.Document,\n", + " doc_second: documentai.Document) -> documentai.Document:\n", + " \"\"\"\n", + " This function merges the entities of two documentai.Document object as one\n", + " \"\"\"\n", + " \n", + " doc_first.entities = list(doc_first.entities) + list(doc_second.entities)\n", + " doc_first.uri = doc_second.uri\n", + " doc_first.text = doc_second.text\n", + " doc_first.pages = doc_second.pages\n", + " doc_first.shard_info = doc_second.shard_info\n", + " return doc_first\n", + "\n", + "\n", + "def iter_json_files(\n", + " bucket_obj: storage.Bucket,\n", + " input_bucket_files: List[str],\n", + " file_dict: Dict[str, str],\n", + " doc_merged: documentai.Document,\n", + " MERGER_TYPE_FLAG: int = 1,\n", + ") -> documentai.Document:\n", + " \"\"\"\n", + " It will iterate through all json files and merges each file to doc_merged parameter based on MERGER_TYPE_FLAG\n", + " \"\"\"\n", + " \n", + " func = (different_doc_merger if\n", + " (MERGER_TYPE_FLAG == 1) else same_doc_diff_parser_merger)\n", + " for file in input_bucket_files:\n", + " print(file)\n", + " doc_second = load_document_from_gcs(bucket_obj, file_dict[file])\n", + " doc_merged = func(doc_merged, doc_second)\n", + " return doc_merged\n", + "\n", + "\n", + "def delete_id(doc_merged: documentai.Document) -> documentai.Document:\n", + " \"\"\"\n", + " It will assign empty string to id property of Entity object\n", + " \"\"\"\n", + "\n", + " for entity in doc_merged.entities:\n", + " entity.id = \"\"\n", + " for prop in entity.properties:\n", + " prop.id = \"\"\n", + " return doc_merged\n", + "\n", + "\n", + "def load_document_from_gcs(bucket_obj: storage.Bucket,\n", + " filepath: str) -> documentai.Document:\n", + " \"\"\"\n", + " It will load json file from GCS filepath and returns documentai.Document object\n", + " \"\"\"\n", + " \n", + " data_str = bucket_obj.blob(filepath).download_as_string().decode(\"utf-8\")\n", + " document = documentai.Document.from_json(data_str)\n", + " return document\n", + "\n", + "\n", + "def merge_document_objects(\n", + " MERGER_TYPE_FLAG: int,\n", + " input_bucket: str,\n", + " input_bucket_files: List[str],\n", + " file_dict: Dict[str, str],\n", + ") -> documentai.Document:\n", + " \"\"\"\n", + " It will merges all json files from gcs folder based on MERGER_TYPE_FLAG and return merged documentai.Document object\n", + " \"\"\"\n", + " \n", + " if MERGER_TYPE_FLAG not in (2, \"2\"):\n", + " print(\"\\t\" * 5, \"Using Default Merger\")\n", + " MERGER_TYPE_FLAG = 1\n", + " else:\n", + " print(\"\\t\" * 5, \"Using Different Processor Result jsons merger\")\n", + " MERGER_TYPE_FLAG = 2\n", + " storage_client = storage.Client()\n", + " bucket_obj = storage_client.get_bucket(input_bucket)\n", + " if 
len(input_bucket_files) < 2:\n", + " raise AssertionError(\n", + " \"minimum number of files required are >= 2 to perform Merging.\")\n", + " print(\n", + " \"....more than 2 JSON files detected....\",\n", + " \"Process Started...\",\n", + " sep=\"\\n\",\n", + " )\n", + " doc_merged = documentai.Document()\n", + " if int(MERGER_TYPE_FLAG) == 1:\n", + " doc_merged = iter_json_files(bucket_obj,\n", + " input_bucket_files,\n", + " file_dict,\n", + " doc_merged,\n", + " MERGER_TYPE_FLAG=1)\n", + " elif int(MERGER_TYPE_FLAG) == 2:\n", + " doc_merged = iter_json_files(bucket_obj,\n", + " input_bucket_files,\n", + " file_dict,\n", + " doc_merged,\n", + " MERGER_TYPE_FLAG=2)\n", + " return doc_merged\n", + "\n", + "\n", + "def upload_doc_obj_to_gcs(doc_merged: documentai.Document, output_bucket: str,\n", + " merged_json_path: str) -> None:\n", + " \"\"\"\n", + " It will convert documentai.Document object to JSON and uploads to specified GCS uri path as JSON.\n", + " \"\"\"\n", + " \n", + " storage_client = storage.Client(output_bucket)\n", + " bucket_obj = storage_client.get_bucket(output_bucket)\n", + " blob = bucket_obj.blob(merged_json_path)\n", + " print(f\"Uploading file to gs://{output_bucket}/{merged_json_path} ...\")\n", + " blob.upload_from_string(\n", + " documentai.Document.to_json(\n", + " doc_merged,\n", + " use_integers_for_enums=False,\n", + " including_default_value_fields=False,\n", + " ))\n", + " print(\n", + " \"Entities count After Merging - \",\n", + " len(doc_merged.entities),\n", + " )\n", + " print(\n", + " \"Pages count After Merging - \",\n", + " len(doc_merged.pages),\n", + " )\n", + " blob.content_type = \"application/json\"\n", + " blob.update()\n", + " print(f\"Successfully uploaded Merged Documnet Object as JSON to GCS\")\n", + "\n", + "\n", + "def main(\n", + " INPUT_MULTIPLE_JSONS_URI: str,\n", + " JSON_DIRECTORY_PATH_OUTPUT: str,\n", + " OUTPUT_FILE_NAME: str,\n", + " MERGER_TYPE_FLAG: int,\n", + " PROJECT_ID: str = \"\",\n", + ") -> None:\n", + " print(\"Merging JSON's tool started\")\n", + " input_bucket, input_files_dir = split_gcs_folder(INPUT_MULTIPLE_JSONS_URI)\n", + " output_bucket, output_files_dir = split_gcs_folder(\n", + " JSON_DIRECTORY_PATH_OUTPUT)\n", + " output_files_dir = output_files_dir.strip(\"/\")\n", + " file_names_list, file_dict = file_names(input_bucket, input_files_dir)\n", + " print(\n", + " f\"Pulling list of JSON files from source GCS Path - {INPUT_MULTIPLE_JSONS_URI}\"\n", + " )\n", + " input_bucket_files = list_json_files(file_names_list)\n", + " doc_merged = merge_document_objects(MERGER_TYPE_FLAG, input_bucket,\n", + " input_bucket_files, file_dict)\n", + " print(\"Merging process completed...\")\n", + " print(\"Deleting id under Entities & Properties of Document Object...\")\n", + " delete_id(doc_merged)\n", + " merged_json_path = ((output_files_dir + \"/\" +\n", + " OUTPUT_FILE_NAME) if output_files_dir else\n", + " (OUTPUT_FILE_NAME))\n", + " upload_doc_obj_to_gcs(doc_merged, output_bucket, merged_json_path)\n", + " print(\"Process Completed.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "a5a4589a-f4e2-4d45-b89e-0b5f1b691bc6", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Merging JSON's tool started\n", + "Pulling list of JSON files from source GCS Path - gs://siddam_bucket_test/cde_processor_test/test/\n", + "\t\t\t\t\t Using Default Merger\n", + "....more than 2 JSON files detected....\n", + "Process Started...\n", + "InsuranceCard-7.json\n", + "1 
1\n", + "InsuranceCard_24.json\n", + "1 2\n", + "InsuranceCard_21.json\n", + "1 3\n", + "InsuranceCard_20.json\n", + "1 4\n", + "InsuranceCard_26.json\n", + "1 5\n", + "InsuranceCard_22.json\n", + "1 6\n", + "InsuranceCard_23.json\n", + "1 7\n", + "InsuranceCard_25.json\n", + "1 8\n", + "InsuranceCard-6.json\n", + "1 9\n", + "InsuranceCard-10.json\n", + "1 10\n", + "Merging process completed...\n", + "Deleting id under Entities & Properties of Document Object...\n", + "Uploading file to gs://siddam_bucket_test/cde_processor_test/merged_file.json ...\n", + "Entities count After Merging - 40\n", + "Pages count After Merging - 10\n", + "Successfully uploaded Merged Documnet Object as JSON to GCS\n", + "Process Completed.\n" + ] + } + ], + "source": [ + "main(\n", + " INPUT_MULTIPLE_JSONS_URI,\n", + " JSON_DIRECTORY_PATH_OUTPUT,\n", + " OUTPUT_FILE_NAME,\n", + " MERGER_TYPE_FLAG,\n", + " PROJECT_ID,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "051b1c28-c998-4788-92e6-e9b50c671043", + "metadata": {}, + "source": [ + "## 4. Output \n", + "\n", + "The output of the tool is a **single json file**. Let's examine the outputs for each of the case types. We’ll consider 3 json docs for our experiment and examine the output formats.\n", + "\n", + "Consider following 3 input json files residing the input GCS Bucket: \n", + "\n", + "json_doc_merge / 0 / doc-0.json\n", + "json_doc_merge / 1 / doc-1.json\n", + "json_doc_merge / 2 / doc-2.json\n", + "\n", + "Upon running the script for both the cases, the below output details are observed as follows.\n", + "\n", + "### CASE - 1 Output \n", + "Let's suppose the three json files are from different documents (The parser used may be same or different )\n", + "In Case - 1, we observe in the output that the Pages and Entities count increases with the number of pages and entities present in the input files upon merging. The same applies for the and Text, the value is changed and texts are concatenated and stored as a single value for the Text key of the output file. " + ] + }, + { + "cell_type": "markdown", + "id": "f4288801-7881-4c5f-9ead-c7a55f298120", + "metadata": {}, + "source": [ + "| Input json files | Screenshot highlighting the number of entities and number of pages in each of the input json files | The output single merged json file |\n", + "|:----------------:|----------------------------------------------------------------------------------------------------|------------------------------------------------------------|\n", + "| **doc-0.json** | ![](https://screenshot.googleplex.com/7Cn7bf5HKA62omx.png) | ![](https://screenshot.googleplex.com/7zWP7zPZkLeZSra.png) |\n", + "| **doc-1.json** | ![](https://screenshot.googleplex.com/BMGMEcW3EFxWrRc.png) | |\n", + "| **doc-2.json** | ![](https://screenshot.googleplex.com/3wCEqP9i3Bm9dqB.png) | |\n", + "\n", + "**For example :** each json has 2 pages and 21 entities , the final output merged json has 6 pages and 63 entities." + ] + }, + { + "cell_type": "markdown", + "id": "f80cd3e7-ee35-4003-ab4a-094a7a935f16", + "metadata": { + "tags": [] + }, + "source": [ + "### CASE - 2 Output \n", + "\n", + "Let's suppose the three json files are from the single document and from different parser results.\n", + "\n", + "In Case - 2, we observe the pages count remains the same and there is only an increase in the count of Entities upon merging the multiple input json files. 
\n" + ] + }, + { + "cell_type": "markdown", + "id": "1b055a42-20dd-4b34-b624-89218224e7ea", + "metadata": {}, + "source": [ + "| Input json files | Screenshot highlighting the number of entities and number of pages in each of the input json files | The output single merged json file |\n", + "|:----------------:|----------------------------------------------------------------------------------------------------|------------------------------------------------------------|\n", + "| **doc-0.json** | ![](https://screenshot.googleplex.com/ZofmvdULKVFvZ9w.png) | ![](https://screenshot.googleplex.com/Bx2WNCxdcv3pN8p.png) |\n", + "| **doc-1.json** | ![](https://screenshot.googleplex.com/6fgDDEEtRaxNJ2N.png) | |\n", + "| **doc-2.json** | ![](https://screenshot.googleplex.com/BwYcWwMuT6byLTm.png) | |\n", + "\n", + "**For example :** each json has 2 pages and 21 entities , the final output merged json has 2 pages and 63 entities.\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "089d2d80-0eb5-41a8-9af6-763559607999", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m104", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DocAI Incubator Tools/best-practices/Parser Result Merger/readme.md b/DocAI Incubator Tools/best-practices/Parser Result Merger/readme.md new file mode 100644 index 000000000..1c881cde8 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Parser Result Merger/readme.md @@ -0,0 +1,29 @@ +## Introduction +The Document AI Parser Result Merger is a utility developed in Python to seamlessly merge multiple resultant JSON files produced by Document AI processors. Given that documents typically span multiple pages, this tool provides two distinct use cases for merging these JSON files. This README elaborates on the tool's functionality and its operational prerequisites. + +## Use Cases +**Case 1:** Merging JSON Results from Different Documents (Default) +This scenario applies when merging JSON outputs stemming from distinct documents. +To activate this mode, set the flag to 1. +**Case 2:** Merging JSON Results from the Same Document (Enhanced Functionality) +This scenario is pertinent when integrating multiple JSON outputs that are all derived from a single document. +To initiate this mode, set the flag to 2. +Workflow Overview +### The tool's workflow is structured as follows: + +**Input GCP Bucket:** This container holds the multitude of JSON files that are targeted for merging. + +**Processing with Python Script:** This script is designed to accept and process the input JSON files. It also facilitates the user's transition between the default (Case 1) and enhanced (Case 2) modes, based on the modes detailed above. + +**Output GCP Bucket:** The unified, merged JSON file is saved into this GCP bucket post-processing. 
+ +## Outputs +### CASE 1 Output: +Considering that three JSON files emerge from varying documents (regardless of whether the parser is identical): + +In this mode, the cumulative count of both Pages and Entities grows proportionately with the count in the input files. The Text values from each file are integrated and presented as a singular value within the Text key of the output JSON. + +### CASE 2 Output: +Assuming that the trio of JSON files originate from one document but differ in parser results: + +In this mode, the Pages count remains constant. However, there's an augmentation in the Entities count as the input JSON files merge. For instance, if every individual JSON contains 2 pages and 21 entities, the consolidated output JSON will display 2 pages but a total of 63 entities. \ No newline at end of file diff --git a/DocAI Incubator Tools/best-practices/Pre Post Bounding Box Mismatch/Pre Post Bounding Box Mismatch.ipynb b/DocAI Incubator Tools/best-practices/Pre Post Bounding Box Mismatch/Pre Post Bounding Box Mismatch.ipynb new file mode 100644 index 000000000..1834c1332 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Pre Post Bounding Box Mismatch/Pre Post Bounding Box Mismatch.ipynb @@ -0,0 +1,394 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "ea6835d4-cda8-44c5-873f-5687dc564ff9", + "metadata": {}, + "source": [ + "# PRE - POST HITL Bounding Box Mismatch " + ] + }, + { + "cell_type": "markdown", + "id": "2fcf8e7b-1374-4a4a-9b63-2f04e76d63cd", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "58b31e10-66da-4e9c-bea3-22be7c232ad6", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.\t" + ] + }, + { + "cell_type": "markdown", + "id": "790ad613-2fe3-4e0f-a5f5-2217497f5329", + "metadata": {}, + "source": [ + "## Purpose of the script\n", + "\n", + "This is a Pre- and Post-HITL comparison tool which detects two kinds of issues - Parser issues and OCR issues.\n", + "The output contains a summary json file which shows basic stats and the count of OCR and Parser issues for the entities present in each document, along with corresponding analysis csv files.\n", + "\n", + " * **Parser issue :** This issue is attributed to the parser when the bounding box does not cover the text region completely and hence the required text was not captured fully. The user accesses the HITL worker UI, adjusts the bounding box to include the text region and saves. The script highlights such cases.\n", + "\n", + " * **OCR issue :** This issue is attributed to OCR when the bounding box covers the whole text region and yet the expected text was not captured completely. The script highlights such cases." + ] + }, + { + "cell_type": "markdown", + "id": "4f1352d4-1a7a-4232-9f1c-c3e4fa47ab7a", + "metadata": {}, + "source": [ + "## Prerequisites\n", + " * Vertex AI Notebook\n", + " * Google Cloud Storage bucket\n", + " * Pre HITL and Post HITL Json files (filename should be same) in GCS Folders\n", + " * DocumentAI and HITL" + ] + }, + { + "cell_type": "markdown", + "id": "a5f1381d-f69a-4dca-a65b-eae04761c40d", + "metadata": { + "tags": [] + }, + "source": [ + "## Step by Step procedure \n", + "### 1. 
Setup the required inputs\n", + "#### Execute the below code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7ec15989-4a5e-4882-a9cb-bae8b5480361", + "metadata": {}, + "outputs": [], + "source": [ + "project_id = \"\"\n", + "pre_HITL_output_URI = \"gs:///\"\n", + "post_HITL_output_URI = \"gs:///\"" + ] + }, + { + "cell_type": "markdown", + "id": "d2035fbe-b6fa-4c51-9b76-40e87c38c850", + "metadata": {}, + "source": [ + " * **project_id**: provide the project id \n", + " * **Pre_HITL_Output_URI:** provide the gcs path of pre HITL jsons (processed jsons) \n", + " * **Post_HITL_Output_URI:** provide the gcs path of post HITL jsons (jsons processed through HITL) \n", + "\n", + "**NOTE:** The name of a Post-HITL json will not be the same as the original file name by default. This has to be updated manually before using this tool." + ] + }, + { + "cell_type": "markdown", + "id": "0e98fc7d-4e80-4e49-a2ba-60851864ff0b", + "metadata": {}, + "source": [ + "## 2. Output\n", + "A result summary table is produced which highlights the count of parser and OCR issues for each file. The table contains details of pre and post HITL entity changes and whether bounding box coordinates were mismatched after HITL processing. The screenshots below showcase a parser or OCR issue.\n", + "\n", + "![](https://screenshot.googleplex.com/6S47qFm5SjP8eMC.png)\n", + "![](https://screenshot.googleplex.com/6HyQwucSQPZR4ii.png)\n", + "\n", + "A summary json file is generated which highlights the count of bounding box mismatches, OCR and Parser errors, and the analysis path to the result table for each of the processed files.\n", + "\n", + "![](https://screenshot.googleplex.com/55R5NKSuVYmyP9H.png)\n", + "\n", + "Entity-wise analysis for each file can be found in the csv files under the analysis/ folder.\n", + "\n", + "![](https://screenshot.googleplex.com/BKd5QCidEJac9Jy.png)\n", + "\n", + "**Table columns:**\n", + "\n", + "The result output table has the following columns:\n", + " * File Name : name of the file\n", + " * Entity Type : type of the entity \n", + " * Pre_HITL_Output : entity text before HITL \n", + " * Pre_HITL_bbox : entity bounding box coordinates before HITL\n", + " * Post_HITL_Output : entity text after HITL \n", + " * Hitl_update : whether there was a HITL update for that particular entity\n", + " * Post_HITL_bbox : entity bounding box coordinates after HITL\n", + " * Fuzzy Ratio : text match %\n", + " * Bbox_mismatch : whether the bounding box coordinates are mismatched\n", + " * OCR issue : whether it is classified as an OCR Issue\n", + " * Parser issue : whether it is classified as a Parser Issue\n" + ] + }, + { + "cell_type": "markdown", + "id": "abe86372-45ba-443b-818c-0b4a75fee96c", + "metadata": { + "tags": [] + }, + "source": [ + "## Notebook Script\n", + "\n", + "**Install the below libraries before executing the script** \\\n", + "If you encounter an error while importing libraries, please verify that you have installed them."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e1daa5fc-1259-4c5c-a0d3-e81d81b78637", + "metadata": {}, + "outputs": [], + "source": [ + "!pip install google-cloud-documentai\n", + "!pip install PyPDF2" + ] + }, + { + "cell_type": "markdown", + "id": "63a18e66-dd30-4f7e-9236-cc04a371fd6b", + "metadata": {}, + "source": [ + "**Script**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8751be86-e5d3-49e2-a38c-bf362cb5dab9", + "metadata": {}, + "outputs": [], + "source": [ + "import ast\n", + "import configparser\n", + "import difflib\n", + "import io\n", + "import json\n", + "import operator\n", + "import os\n", + "import re\n", + "import time\n", + "from collections.abc import Container, Iterable, Iterator, Mapping, Sequence\n", + "from typing import List, Optional, Tuple, Union\n", + "\n", + "import numpy as np\n", + "# Import the libraries\n", + "import pandas as pd\n", + "from google.cloud import documentai_v1beta3, storage\n", + "from PIL import Image\n", + "from PyPDF2 import PdfFileReader\n", + "\n", + "pd.options.mode.chained_assignment = None # default='warn'\n", + "import datetime\n", + "import json\n", + "import os\n", + "import utilities\n", + "\n", + "\n", + "now = str(datetime.datetime.now())\n", + "now = re.sub(r\"\\W+\", \"\", now)\n", + "\n", + "print(\"Creating temporary buckets\")\n", + "pre_HITL_bucket_name_temp = \"pre_hitl_output\" + \"_\" + now\n", + "post_HITL_bucket_name_temp = \"post_hitl_output_temp\" + \"_\" + now\n", + "# bucket name and prefix\n", + "pre_HITL_bucket = pre_HITL_output_URI.split(\"/\")[2]\n", + "post_HITL_bucket = post_HITL_output_URI.split(\"/\")[2]\n", + "# getting all files and copying to temporary folder\n", + "\n", + "try:\n", + " utilities.check_create_bucket(pre_HITL_bucket_name_temp)\n", + " utilities.check_create_bucket(post_HITL_bucket_name_temp)\n", + "except Exception as e:\n", + " print(\"unable to create bucket because of exception : \", e)\n", + "\n", + "try:\n", + " pre_HITL_output_files, pre_HITL_output_dict = utilities.file_names(\n", + " pre_HITL_output_URI)\n", + " # print(pre_HITL_output_files,pre_HITL_output_dict)\n", + " post_HITL_output_files, post_HITL_output_dict = utilities.file_names(\n", + " post_HITL_output_URI)\n", + " # print(post_HITL_output_files,post_HITL_output_dict)\n", + " print(\"copying files to temporary bucket\")\n", + " for i in pre_HITL_output_files:\n", + " utilities.copy_blob(pre_HITL_bucket, pre_HITL_output_dict[i],\n", + " pre_HITL_bucket_name_temp, i)\n", + " for i in post_HITL_output_files:\n", + " utilities.copy_blob(post_HITL_bucket, post_HITL_output_dict[i],\n", + " post_HITL_bucket_name_temp, i)\n", + " pre_HITL_files_list = utilities.list_blobs(pre_HITL_bucket_name_temp)\n", + " post_HITL_files_list = utilities.list_blobs(post_HITL_bucket_name_temp)\n", + "except Exception as e:\n", + " print(\"unable to get list of files in buckets because : \", e)\n", + "# processing the files and saving the files in temporary GCP bucket\n", + "relation_dict, non_relation_dict = utilities.matching_files_two_buckets(\n", + " pre_HITL_bucket_name_temp, post_HITL_bucket_name_temp)\n", + "\n", + "time_stamp = datetime.datetime.now().strftime(\"%d_%m_%y-%H%M%S\")\n", + "filename_error_count_dict = {}\n", + "\n", + "compare_merged = pd.DataFrame()\n", + "accuracy_docs = []\n", + "print(\"comparing the PRE-HITL Jsons and POST-HITL jsons ....Wait for Summary \")\n", + "for i in relation_dict:\n", + " # print(\"***** i : \", i)\n", + " pre_HITL_json = 
utilities.documentai_json_proto_downloader(pre_HITL_bucket_name_temp, i)\n", + " post_HITL_json = utilities.documentai_json_proto_downloader(post_HITL_bucket_name_temp,\n", + " relation_dict[i])\n", + " # print('pre_HITL_json : ', pre_HITL_json)\n", + " # print('post_HITL_json : ', post_HITL_json)\n", + " compare_output = utilities.compare_pre_hitl_and_post_hitl_output(\n", + " pre_HITL_json, post_HITL_json)[0]\n", + " # Rename columns\n", + " compare_output = compare_output.rename(columns={\n", + " 'pre_bbox': 'Pre_HITL_bbox',\n", + " 'post_bbox': 'Post_HITL_bbox'\n", + " })\n", + "\n", + " # Drop unwanted columns\n", + " compare_output = compare_output.drop(['page1', 'page2'], axis=1)\n", + "\n", + " # print('compare_output :',compare_output)\n", + " # display(compare_output)\n", + " column = [relation_dict[i]] * compare_output.shape[0]\n", + " # print(\"++++column++++\")\n", + " # print(column)\n", + " compare_output.insert(loc=0, column=\"File Name\", value=column)\n", + "\n", + " compare_output.insert(loc=5, column=\"hitl_update\", value=\" \")\n", + " for j in range(len(compare_output)):\n", + " if compare_output[\"Fuzzy Ratio\"][j] != 1.0: # strict\n", + " if (compare_output[\"Pre_HITL_Output\"][j] == \"Entity not found.\"\n", + " and compare_output[\"Post_HITL_Output\"][j]\n", + " == \"Entity not found.\"):\n", + " compare_output[\"hitl_update\"][j] = \"NO\"\n", + " else:\n", + " compare_output[\"hitl_update\"][j] = \"YES\"\n", + " else:\n", + " compare_output[\"hitl_update\"][j] = \"NO\"\n", + " for k in range(len(compare_output)):\n", + " if compare_output[\"Fuzzy Ratio\"][k] != 1.0: # strict\n", + " hitl_update = \"HITL UPDATED\"\n", + " break\n", + " else:\n", + " compare_output[\"hitl_update\"][k] = \"NO\"\n", + "\n", + " ##\n", + " compare_output[\"bbox_mismatch\"] = (compare_output[\"Pre_HITL_bbox\"]\n", + " != compare_output[\"Post_HITL_bbox\"])\n", + "\n", + " # OCR Issue\n", + " compare_output[\"OCR Issue\"] = \"No\"\n", + " # compare_output.loc[(compare_output['Pre_HITL_Output'] != compare_output['Post_HITL_Output']), 'OCR Issue'] = 'Yes' # & cordinates are same\n", + " compare_output.loc[\n", + " (compare_output[\"Pre_HITL_Output\"] !=\n", + " compare_output[\"Post_HITL_Output\"])\n", + " &\n", + " (compare_output[\"Pre_HITL_bbox\"] == compare_output[\"Post_HITL_bbox\"]),\n", + " \"OCR Issue\",\n", + " ] = \"Yes\"\n", + "\n", + " # Parser Issue\n", + " compare_output[\"Parser Issue\"] = \"No\"\n", + " compare_output.loc[\n", + " (compare_output[\"hitl_update\"] == \"YES\")\n", + " & (compare_output[\"bbox_mismatch\"] == True),\n", + " \"Parser Issue\",\n", + " ] = \"Yes\" # & cordinates are different\n", + " try:\n", + " compare_merged.loc[\n", + " (compare_merged[\"Post_HITL_Output\"] == \"Entity not found.\")\n", + " | (compare_merged[\"Pre_HITL_Output\"] == \"Entity not found.\"),\n", + " \"Parser Issue\",\n", + " ] = \"Yes\"\n", + " except:\n", + " pass\n", + "\n", + " ## global dict : no of parser error / file\n", + " temp = {}\n", + " temp[\"bbox_mismatch\"] = len(\n", + " compare_output[compare_output[\"bbox_mismatch\"] == True])\n", + "\n", + " temp[\"OCR_issue\"] = len(\n", + " compare_output.loc[(compare_output[\"Pre_HITL_Output\"] !=\n", + " compare_output[\"Post_HITL_Output\"])\n", + " & (compare_output[\"Pre_HITL_bbox\"] ==\n", + " compare_output[\"Post_HITL_bbox\"])])\n", + " temp[\"Parser_issue\"] = len(\n", + " compare_output.loc[(compare_output[\"hitl_update\"] == \"YES\")\n", + " & (compare_output[\"bbox_mismatch\"] == True)])\n", + " 
temp[\"output_file\"] = \"analysis_\" + time_stamp + \"/\" + i.replace(\n", + " \"json\", \"csv\")\n", + "\n", + " filename_error_count_dict[i] = temp\n", + "\n", + " new_row = pd.Series(\n", + " [\n", + " i,\n", + " \"Entities\",\n", + " \"are updated\",\n", + " \"by HITL\",\n", + " \":\",\n", + " np.nan,\n", + " hitl_update,\n", + " \"\",\n", + " \"\",\n", + " \"\",\n", + " \"\",\n", + " \"\",\n", + " ],\n", + " index=compare_output.columns,\n", + " )\n", + " compare_output = compare_output.append(new_row, ignore_index=True)\n", + " frames = [compare_merged, compare_output]\n", + " compare_merged = pd.concat(frames)\n", + "\n", + "with open(\"summary_\" + time_stamp + \".json\", \"w\") as ofile:\n", + " ofile.write(json.dumps(filename_error_count_dict))\n", + "\n", + "for x in relation_dict:\n", + " # print(x)\n", + " file_out = compare_merged[compare_merged[\"File Name\"] == x]\n", + " try:\n", + " os.mkdir(\"analysis_\" + time_stamp)\n", + " except:\n", + " pass\n", + " file_out.to_csv(\"analysis_\" + time_stamp + \"/\" + x.replace(\"json\", \"csv\"))\n", + "\n", + "utilities.bucket_delete(pre_HITL_bucket_name_temp)\n", + "utilities.bucket_delete(post_HITL_bucket_name_temp)" + ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m104", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DocAI Incubator Tools/best-practices/Pre Post Bounding Box Mismatch/readme.md b/DocAI Incubator Tools/best-practices/Pre Post Bounding Box Mismatch/readme.md new file mode 100644 index 000000000..7c0f70cb9 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Pre Post Bounding Box Mismatch/readme.md @@ -0,0 +1,56 @@ +## Purpose of the Script + +This tool is a comparison utility script designed to detect two primary issues: Parser issue and OCR issue. The output generated by the tool consists of a summary JSON file that provides basic stats and the count of OCR and Parser issues for entities present in each document. Additionally, analysis CSV files are also produced. + +### Issues Defined: + +- **Parser issue:** + Identified when the bounding box fails to encompass the entire text region, resulting in incomplete text capture. When users access the HITL worker UI, they adjust the bounding box to cover the entire text region and save their changes. This script highlights such discrepancies. + +- **OCR issue:** + Recognized when the bounding box does cover the entire text region, but the resultant text is not captured fully. These cases are flagged by the script. + +### Inputs + +- **project_id:** Provide the specific project ID. + +- **Pre_HITL_Output_URI:** Input the GCS path containing pre-HITL processed JSONs. + +- **Post_HITL_Output_URI:** Input the GCS path of post-HITL processed JSONs (those processed through HITL). + +> **NOTE:** By default, the name of the Post-HITL JSON will differ from the original file name. It's essential to update this manually before utilizing the tool. + +## Output Details + +A result summary table is produced that distinctly highlights the count of both parser and OCR issues for each file. 
This table provides insights into pre and post-HITL entity modifications, and whether any bounding box coordinate mismatches emerged after post-HITL processing. Supporting images would demonstrate either the parser or OCR issue (as mentioned). + +A summary JSON file is also generated, emphasizing counts of bounding box mismatches, OCR and Parser errors, and an analysis path to the result table for each of the processed files. + +For a granular analysis of each file, refer to the CSV files located in the `analysis/` folder. + +### Table Structure: + +The result output table is structured with the following columns: + +- **File Name:** Displays the name of the file. + +- **Entity Type:** Designates the entity type. + +- **Pre_HITL_Output:** Shows the entity text before HITL intervention. + +- **Pre_HITL_bbox:** Lists the bounding box coordinates pre-HITL. + +- **Post_HITL_Output:** Represents the entity text post-HITL. + +- **Hitl_update:** Indicates if a HITL update was applied to the particular entity. + +- **Post_HITL_bbox:** Details the bounding box coordinates post-HITL. + +- **Fuzzy Ratio:** Demonstrates the percentage match of the text. + +- **Bbox_mismatch:** Flags instances where bounding box coordinates didn't match. + +- **OCR issue:** Denotes if the issue was classified as an OCR Issue. + +- **Parser issue:** Specifies if the issue was recognized as a Parser Issue. + diff --git a/DocAI Incubator Tools/best-practices/Pre Post HITL Visualization/Pre and Post HITL Visualization.ipynb b/DocAI Incubator Tools/best-practices/Pre Post HITL Visualization/Pre and Post HITL Visualization.ipynb new file mode 100644 index 000000000..c3de20c70 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Pre Post HITL Visualization/Pre and Post HITL Visualization.ipynb @@ -0,0 +1,428 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "12ccafbd-d094-4604-9f85-975f709f7038", + "metadata": {}, + "source": [ + "# Pre and Post HITL Visualization" + ] + }, + { + "cell_type": "markdown", + "id": "370b84a9-28f8-4e8b-9dd4-8b8363e59282", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "270ddad0-5d76-4dd2-a863-74c3eb4effba", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.\t" + ] + }, + { + "cell_type": "markdown", + "id": "351f231a-bd30-4ee3-a617-c12bf2261d57", + "metadata": {}, + "source": [ + "## Purpose of the script\n", + "This tool uses Pre-HITL JSON files (Parsed from a processor) and Post HITL JSON files(Updated through HITL) from GCS bucket as input, compares the Json files and differences are shown in an Excel with bounding boxes added images.\n" + ] + }, + { + "cell_type": "markdown", + "id": "30079506-4bc3-462b-aecb-dd95a1f1958a", + "metadata": { + "tags": [] + }, + "source": [ + "## Prerequisite\n", + " * Vertex AI Notebook\n", + " * Pre HITL and Post HITL Json files (filename should be same) in GCS Folders\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "id": "418be9ca-916f-41a1-9196-800401bddd9d", + "metadata": { + "tags": [] + }, + "source": [ + "## Step by Step procedure \n", + "### 1. 
Setup the required inputs\n", + "#### Execute the below code" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cb227a56-08d7-4feb-82fb-c52dadf6dece", + "metadata": {}, + "outputs": [], + "source": [ + "project_id = \"\"\n", + "pre_HITL_output_URI = \"gs:///\"\n", + "post_HITL_output_URI = \"gs:///\"" + ] + }, + { + "cell_type": "markdown", + "id": "2d561b9d-d183-41f3-9d08-337a4e6c0bb8", + "metadata": {}, + "source": [ + " * **project_id**: provide the project id \n", + " * **Pre_HITL_Output_URI:** provide the gcs path of pre HITL jsons (processed jsons) \n", + " * **Post_HITL_Output_URI:** provide the gcs path of post HITL jsons (Jsons processed through HITL) \n", + "\n", + "**NOTE:** The Name of Post-HITL Json will not be the same as the original file name by default. This has to be updated manually before using this tool." + ] + }, + { + "cell_type": "markdown", + "id": "ceae5bee-4d39-460b-a7ef-5071e4d140ca", + "metadata": {}, + "source": [ + "**2. Output** \n", + "\n", + "The output of the tool will be in an Excel format showing the entities which are updated in HITL and unchanged as well with images of labeled docs (both pre and post HITL).\n", + "\n", + "The Excel sheet which is created will have a summary of all the file files in “Consolidated_Data” and comparison in a separate sheet for each file.\n", + "\n", + "Each Excel sheet created will have a batch of 20 files in it.\n", + "\n", + "![](https://screenshot.googleplex.com/6nL7E3hrRSEi6ST.png)\n", + "\n", + "The Excel file will have all the details of Pre-HITL text, Post-HITL text and whether the entity is updated in HITL in the form YES and NO as shown below .\n", + "\n", + "![](https://screenshot.googleplex.com/8wqPTMyUY5ASKZA.png)\n", + "\n", + "There will be a list of documents for which either the required confidence threshold is met or no HITL output is created yet is updated as “NO POST HITL OUTPUT AVAILABLE” at the end of excel in consolidated sheets.\n", + "\n", + "![](https://screenshot.googleplex.com/8tpFZsVfFdTBoKA.png)\n", + "\n", + "\n", + "Blue Bounding Box⇒ Entities in Pre-HITL Json\n", + "Red Bounding Box⇒ Entities updated in HITL\n", + "Green Bounding Box⇒ Entities deleted in HITL( Entities which are detected by parser are deleted in HITL)\n", + "\n", + "**Bounding box color coding in images**\n", + "\n", + "![](https://screenshot.googleplex.com/9aph7w2N2vywPFP.png)" + ] + }, + { + "cell_type": "markdown", + "id": "8c8a8f42-9d73-40f1-a356-39de0853cf33", + "metadata": {}, + "source": [ + "Pre Post Bounding Box Mismatch\n", + "**Sample Code**" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6f5352a2-cc8c-480e-8336-0e886b4cac76", + "metadata": {}, + "outputs": [], + "source": [ + "#pip install below libraries for one time\n", + "#!pip install configparser\n", + "#!pip install google.cloud\n", + "#!pip install ast\n", + "#!pip install openpyxl\n", + "\n", + "# installing libraries\n", + "import pandas as pd\n", + "import operator\n", + "import difflib\n", + "import json\n", + "import os\n", + "import pandas as pd\n", + "import time\n", + "import numpy as np\n", + "from google.cloud import storage\n", + "from google.cloud import documentai_v1beta3\n", + "from PIL import Image\n", + "from typing import Container, Iterable, Iterator, List, Mapping, Optional, Sequence, Tuple, Union\n", + "from PyPDF2 import PdfFileReader\n", + "import configparser\n", + "import ast\n", + "import numpy\n", + "import io\n", + "import re\n", + "import cv2\n", + "from PIL import Image, 
ImageDraw\n", + "import openpyxl\n", + "from google.cloud import documentai_v1beta3 as documentai\n", + "\n", + "\n", + "import utilities\n", + "\n", + "pd.options.mode.chained_assignment = None # default='warn'\n", + "\n", + "def find_excel_name():\n", + " i=1\n", + " excel_file_name='HITL_VISUAL'+str(i)+'.xlsx'\n", + " comapare_analysis=compare_merged.drop(['pre_bbox','post_bbox','page1','page2'],axis=1)\n", + " try:\n", + " workbook = openpyxl.load_workbook(excel_file_name)\n", + " num_sheets = len(workbook.sheetnames)\n", + " #print(num_sheets)\n", + " if num_sheets > 20: \n", + " excel_file='HITL_VISUAL'+str(i+1)+'.xlsx'\n", + " comapare_analysis.to_excel(excel_file,sheet_name='Consolidated_Data')\n", + " else:\n", + " excel_file='HITL_VISUAL'+str(i)+'.xlsx'\n", + " except FileNotFoundError:\n", + " excel_file='HITL_VISUAL'+str(i)+'.xlsx'\n", + " comapare_analysis.to_excel(excel_file,sheet_name='Consolidated_Data')\n", + " return excel_file\n", + "\n", + "def get_visualization_excel(pre_HITL_output_URI,compare_merged,relation_dict):\n", + "\n", + " #compare_merged.to_excel(\"HITL_VISUAL1.xlsx\",sheet_name='Consolidated_Data')\n", + " pre_HITL_bucket=pre_HITL_output_URI.split(\"/\")[2]\n", + " pre_HITL_output_files,pre_HITL_output_dict=utilities.file_names(pre_HITL_output_URI)\n", + " for file in pre_HITL_output_dict:\n", + " excel_file=find_excel_name()\n", + " df=compare_merged.drop(['pre_bbox','post_bbox','page1','page2'],axis=1)\n", + " if file in relation_dict.keys():\n", + " df_file=df[df['File Name']==file]\n", + " with pd.ExcelWriter(excel_file,engine='openpyxl', mode='a') as writer: \n", + " df_file.to_excel(writer, sheet_name=str(file))\n", + "\n", + " # path=\"gs://\"+pre_HITL_bucket+'/'+pre_HITL_output_dict[file]\n", + " GT_json = utilities.documentai_json_proto_downloader(pre_HITL_bucket,pre_HITL_output_dict[file])\n", + " pdf_bytes,synthesized_images=utilities.create_pdf_bytes_from_json(documentai.Document.to_dict(GT_json))\n", + " list_bbox_no={}\n", + " list_bbox_yes_changed={}\n", + " list_bbox_yes_old={}\n", + " for row in compare_merged.values:\n", + " if row[0]==file:\n", + " if row[8]=='NO':\n", + " if type(row[4])==list and row[4]!=[]:\n", + " try:\n", + " if row[6] in list_bbox_no.keys():\n", + " list_bbox_no[row[6]].append(row[4]) \n", + " else:\n", + " list_bbox_no[row[6]]=[row[4]]\n", + " #print({row[6]:row[4]})\n", + " except:\n", + " pass\n", + " elif row[8]=='YES':\n", + " if type(row[5])==list and row[5]!=[]:\n", + " try:\n", + " if row[7] in list_bbox_yes_changed.keys():\n", + " list_bbox_yes_changed[row[7]].append(row[5])\n", + " else:\n", + " list_bbox_yes_changed[row[7]]=[row[5]]\n", + "\n", + " except:\n", + " pass\n", + " elif type(row[4])==list and row[4]!=[]:\n", + " if row[6] in list_bbox_yes_old.keys():\n", + " list_bbox_yes_old[row[6]].append(row[4]) \n", + " else:\n", + " list_bbox_yes_old[row[6]]=[row[4]]\n", + "\n", + " open_cv_image={}\n", + " for i in range(len(synthesized_images)):\n", + " open_cv_image[i] = numpy.array(synthesized_images[i].convert('RGB')) \n", + " #print(list_bbox_yes_changed)\n", + " img_list=[]\n", + " list_bbox_no = {str(key): value for key, value in list_bbox_no.items()}\n", + " list_bbox_yes_changed = {str(key): value for key, value in list_bbox_yes_changed.items()}\n", + " list_bbox_yes_old = {str(key): value for key, value in list_bbox_yes_old.items()}\n", + "\n", + " for i in range(len(open_cv_image)):\n", + " size=open_cv_image[i].shape\n", + " try:\n", + " for bbox in list_bbox_no[str(i)]:\n", + " x1 = 
int(bbox[0]*size[1])\n", + " x2 = int(bbox[2]*size[1])\n", + " y1 = int(bbox[1]*size[0])\n", + " y2 = int(bbox[3]*size[0])\n", + " cv2.rectangle(open_cv_image[i], (x1, y1), (x2, y2), (0,0,255), 2)\n", + " except:\n", + " pass\n", + " try:\n", + " for bbox in list_bbox_yes_changed[str(i)]:\n", + " x1 = int(bbox[0]*size[1])\n", + " x2 = int(bbox[2]*size[1])\n", + " y1 = int(bbox[1]*size[0])\n", + " y2 = int(bbox[3]*size[0])\n", + " cv2.rectangle(open_cv_image[i], (x1, y1), (x2, y2), (255,0,0), 2)\n", + " except:\n", + " pass\n", + " try:\n", + " for bbox in list_bbox_yes_old[str(i)]:\n", + " x1 = int(bbox[0]*size[1])\n", + " x2 = int(bbox[2]*size[1])\n", + " y1 = int(bbox[1]*size[0])\n", + " y2 = int(bbox[3]*size[0])\n", + " cv2.rectangle(open_cv_image[i], (x1, y1), (x2, y2), (0,255,0), 2)\n", + " except:\n", + " pass\n", + "\n", + "\n", + " img1 = Image.fromarray(open_cv_image[i])\n", + " import openpyxl\n", + "\n", + " workbook = openpyxl.load_workbook(excel_file)\n", + " worksheet = workbook[str(file)]\n", + "\n", + " img1.save(f\"open_cv_image[i].png\", \"PNG\")\n", + " img = openpyxl.drawing.image.Image(f\"open_cv_image[i].png\")\n", + " img.anchor='K'+str(1+int(i)*50)\n", + " worksheet.add_image(img)\n", + " img.width = 500\n", + " img.height = 700\n", + " workbook.save(excel_file)\n", + "\n", + "\n", + "try:\n", + " #creating temperary buckets\n", + " import datetime\n", + " now = str(datetime.datetime.now())\n", + " now = re.sub('\\W+','', now)\n", + "\n", + " print(\"Creating temporary buckets\")\n", + " pre_HITL_bucket_name_temp = 'pre_hitl_output'+\"_\"+now\n", + " post_HITL_bucket_name_temp = 'post_hitl_output_temp'+\"_\"+now\n", + " #bucket name and prefix\n", + " pre_HITL_bucket=pre_HITL_output_URI.split(\"/\")[2]\n", + " post_HITL_bucket=post_HITL_output_URI.split(\"/\")[2]\n", + " #getting all files and copying to temporary folder\n", + "\n", + " try:\n", + " utilities.check_create_bucket(pre_HITL_bucket_name_temp)\n", + " utilities.check_create_bucket(post_HITL_bucket_name_temp)\n", + " except Exception as e:\n", + " print(\"unable to create bucket because of exception : \",e)\n", + "\n", + " try:\n", + " pre_HITL_output_files,pre_HITL_output_dict=utilities.file_names(pre_HITL_output_URI)\n", + " post_HITL_output_files,post_HITL_output_dict=utilities.file_names(post_HITL_output_URI)\n", + " print(\"copying files to temporary bucket\")\n", + " for i in pre_HITL_output_files:\n", + " utilities.copy_blob(pre_HITL_bucket,pre_HITL_output_dict[i],pre_HITL_bucket_name_temp,i) \n", + " for i in post_HITL_output_files:\n", + " utilities.copy_blob(post_HITL_bucket,post_HITL_output_dict[i],post_HITL_bucket_name_temp,i)\n", + " pre_HITL_files_list=utilities.list_blobs(pre_HITL_bucket_name_temp)\n", + " post_HITL_files_list=utilities.list_blobs(post_HITL_bucket_name_temp)\n", + " except Exception as e:\n", + " print(\"unable to get list of files in buckets because : \",e)\n", + " #processing the files and saving the files in temporary gCP bucket\n", + " relation_dict ,non_relation_dict= utilities.matching_files_two_buckets(pre_HITL_bucket_name_temp, post_HITL_bucket_name_temp)\n", + " compare_merged = pd.DataFrame()\n", + " accuracy_docs=[]\n", + " print(\"comparing the PRE-HITL Jsons and POST-HITL jsons ....Wait for Summary \")\n", + " for i in relation_dict:\n", + " pre_HITL_json = utilities.documentai_json_proto_downloader(pre_HITL_bucket_name_temp, i)\n", + " post_HITL_json = utilities.documentai_json_proto_downloader(post_HITL_bucket_name_temp, relation_dict[i])\n", + " 
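As a minimal illustration of the coordinate handling used above: Document AI page anchors are normalized to [0, 1], so each box is scaled by the page's pixel width and height before `cv2.rectangle` is called. The sketch below assumes a locally available page image and an invented `[x_min, y_min, x_max, y_max]` box; it is not the notebook's exact drawing routine, just the same scaling idea in isolation.

```python
import cv2
import numpy as np
from PIL import Image


def draw_normalized_bbox(image: Image.Image, bbox, color=(255, 0, 0), thickness=2):
    """Draw one normalized [x_min, y_min, x_max, y_max] box on a PIL image.

    The color is an RGB tuple because the array is built from a PIL RGB image,
    matching how the notebook converts synthesized pages before drawing.
    """
    frame = np.array(image.convert("RGB"))
    height, width = frame.shape[:2]
    x1, y1 = int(bbox[0] * width), int(bbox[1] * height)
    x2, y2 = int(bbox[2] * width), int(bbox[3] * height)
    cv2.rectangle(frame, (x1, y1), (x2, y2), color, thickness)
    return Image.fromarray(frame)


# Hypothetical usage: "page.png" and the box coordinates are placeholders.
page = Image.open("page.png")
annotated = draw_normalized_bbox(page, [0.12, 0.30, 0.48, 0.34], color=(0, 0, 255))
annotated.save("page_annotated.png")
```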
compare_output = utilities.compare_pre_hitl_and_post_hitl_output(pre_HITL_json, post_HITL_json)[0]\n", + " column = [relation_dict[i]] * compare_output.shape[0]\n", + " compare_output.insert(loc = 0,\n", + " column = 'File Name',\n", + " value = column)\n", + "\n", + " compare_output.insert(loc=8,column = 'hitl_update',value = \" \")\n", + " for j in range(len(compare_output)):\n", + " if compare_output['Fuzzy Ratio'][j]!=1.0:\n", + " if compare_output['Pre_HITL_Output'][j]=='Entity not found.' and compare_output['Post_HITL_Output'][j]=='Entity not found.':\n", + " compare_output['hitl_update'][j]='NO'\n", + " else:\n", + " compare_output['hitl_update'][j]='YES'\n", + " else:\n", + " compare_output['hitl_update'][j]='NO'\n", + " for k in range(len(compare_output)):\n", + " if compare_output['Fuzzy Ratio'][k]!=1.0:\n", + " hitl_update=\"HITL UPDATED\"\n", + " break\n", + " else:\n", + " compare_output['hitl_update'][k]='NO'\n", + " frames = [compare_merged, compare_output]\n", + " compare_merged = pd.concat(frames)\n", + " try: \n", + " utilities.bucket_delete(pre_HITL_bucket_name_temp)\n", + " print(\"Deleting temperary buckets created\")\n", + " utilities.bucket_delete(post_HITL_bucket_name_temp)\n", + " except:\n", + " pass\n", + " compare_merged.drop([\"Match\",\"Fuzzy Ratio\"],axis=1,inplace=True)\n", + "\n", + " def highlight(s):\n", + " if s.hitl_update=='YES':\n", + " return ['background-color: yellow'] * len(s)\n", + " else:\n", + " return ['background-color: white'] * len(s)\n", + "\n", + " for k in non_relation_dict: \n", + " new_row=pd.Series([k,\"-\",\"-\",\"-\",\"\",\"\",\"\",\"\",non_relation_dict[k]], index=compare_merged.columns)\n", + " compare_merged=compare_merged.append(new_row,ignore_index= True)\n", + " comapare_analysis1=compare_merged.drop(['pre_bbox','post_bbox','page1','page2'],axis=1)\n", + " #comapare_analysis1.to_csv('compare_analysis.csv')\n", + " entity_change=compare_merged.loc[compare_merged['hitl_update'] == 'YES']\n", + " compare_merged_style=compare_merged.style.apply(highlight, axis=1)\n", + " import traceback\n", + " try:\n", + " print(\"HITL Comparision excel is getting prepared\")\n", + " get_visualization_excel(pre_HITL_output_URI,compare_merged,relation_dict)\n", + " print(\"Completed creating the HITL Comparision Excel\") \n", + " except Exception as e:\n", + " print(\"Unable to create HITL comparison excel because of:\", e)\n", + " print(traceback.format_exc())\n", + "except Exception as e:\n", + " try:\n", + " utilities.bucket_delete(pre_HITL_bucket_name_temp)\n", + " utilities.bucket_delete(post_HITL_bucket_name_temp)\n", + " print(\"unable to process the file : \",e)\n", + " except:\n", + " print(\"unable to process the file : \",e)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95823f2c-a91b-4bb9-85aa-5d90f17fff05", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m104", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DocAI Incubator Tools/best-practices/Pre Post HITL 
Visualization/readme.md b/DocAI Incubator Tools/best-practices/Pre Post HITL Visualization/readme.md new file mode 100644 index 000000000..d54842343 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Pre Post HITL Visualization/readme.md @@ -0,0 +1,26 @@ +## Purpose of the Script + +This tool is specifically designed to compare Pre-HITL JSON files (those parsed from a processor) and Post-HITL JSON files (those updated via HITL) sourced from a GCS bucket. The differences between the JSON files are presented in an Excel sheet, complete with images that feature bounding boxes. + +## Output Details + +The tool generates its output in Excel format. This Excel output delineates the entities that underwent HITL updates and those that remained unchanged. It also provides images of labeled documents captured both before and after HITL processing. + +The Excel workbook produced is structured with a "Consolidated_Data" sheet summarizing all processed files, alongside individual comparison sheets for each file. + +Notably, each generated Excel sheet will contain a batch of 20 files. + +Within the Excel file, the following details are outlined: +- Pre-HITL text +- Post-HITL text +- An indication of whether the entity underwent an update during HITL, represented as 'YES' or 'NO'. + +For any documents that either meet the requisite confidence threshold or lack a HITL output, the notation “NO POST HITL OUTPUT AVAILABLE” is appended at the conclusion of the consolidated sheets within the Excel workbook. + +## Bounding Box Color Coding in Images + +The tool utilizes color-coded bounding boxes in the output images to represent specific data: +- **Blue Bounding Box:** Represents entities found in the Pre-HITL JSON. +- **Red Bounding Box:** Denotes entities that were updated during the HITL process. +- **Green Bounding Box:** Indicates entities deleted during the HITL process. Specifically, these are entities originally detected by the parser but removed during HITL. + diff --git a/DocAI Incubator Tools/best-practices/Pre_Post_HITL_Bounding_Box_Mismatch.ipynb b/DocAI Incubator Tools/best-practices/Pre_Post_HITL_Bounding_Box_Mismatch.ipynb deleted file mode 100644 index ac8d7f76b..000000000 --- a/DocAI Incubator Tools/best-practices/Pre_Post_HITL_Bounding_Box_Mismatch.ipynb +++ /dev/null @@ -1,813 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "ea6835d4-cda8-44c5-873f-5687dc564ff9", - "metadata": {}, - "source": [ - "# PRE - POST HITL Bounding Box Mismatch " - ] - }, - { - "cell_type": "markdown", - "id": "2fcf8e7b-1374-4a4a-9b63-2f04e76d63cd", - "metadata": {}, - "source": [ - "* Author: docai-incubator@google.com" - ] - }, - { - "cell_type": "markdown", - "id": "58b31e10-66da-4e9c-bea3-22be7c232ad6", - "metadata": {}, - "source": [ - "## Disclaimer\n", - "\n", - "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. 
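Relating to the workbook layout described in the Pre Post HITL Visualization readme above (a `Consolidated_Data` summary sheet plus one sheet per file, batched at roughly 20 file sheets per workbook), the following is a simplified sketch under the assumption that the comparison DataFrame carries a `File Name` column. The demo rows, file names, and helper name are placeholders, not the tool's actual output.

```python
import pandas as pd


def write_visualization_workbooks(compare_merged: pd.DataFrame, sheets_per_book: int = 20):
    """Split the comparison frame into workbooks of at most `sheets_per_book` file sheets."""
    file_names = list(dict.fromkeys(compare_merged["File Name"]))  # preserve order, drop duplicates
    for start in range(0, len(file_names), sheets_per_book):
        batch = file_names[start:start + sheets_per_book]
        path = f"HITL_VISUAL{start // sheets_per_book + 1}.xlsx"
        with pd.ExcelWriter(path, engine="openpyxl") as writer:
            # Summary of every file in this workbook, then one detail sheet per file.
            compare_merged[compare_merged["File Name"].isin(batch)].to_excel(
                writer, sheet_name="Consolidated_Data", index=False)
            for name in batch:
                compare_merged[compare_merged["File Name"] == name].to_excel(
                    writer, sheet_name=str(name)[:31], index=False)  # Excel caps sheet names at 31 chars


# Hypothetical usage with placeholder data:
demo = pd.DataFrame({"File Name": ["doc1.json", "doc2.json"],
                     "Entity Type": ["invoice_id", "total_amount"],
                     "hitl_update": ["YES", "NO"]})
write_visualization_workbooks(demo)
```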
No guarantees of performance are implied.\t" - ] - }, - { - "cell_type": "markdown", - "id": "790ad613-2fe3-4e0f-a5f5-2217497f5329", - "metadata": {}, - "source": [ - "## Purpose of the script\n", - "\n", - "Pre and POST HITL comparison tool which detect two issues - Parser issue and OCR issue.\n", - "And the result output contains a summary json file which shows basic stats, count of the OCR and Parser issues for entities present in each document and corresponding analysis csv files.\n", - "\n", - " * **Parser issue :** This issue is identified with the parser when the bounding box is not covering the text region completely and hence the required text was not captured completely. The user accesses HITL worker UI and adjusts the bounding box to include the text region and save. The script highlight such cases\n", - "\n", - " * **OCR issue :** This issue is identified with the parser when the bounding box covers the whole text region and as result the expected text was not captured completely. The script highlight such cases." - ] - }, - { - "cell_type": "markdown", - "id": "4f1352d4-1a7a-4232-9f1c-c3e4fa47ab7a", - "metadata": {}, - "source": [ - "## Prerequisites\n", - " * Vertex AI Notebook\n", - " * Google Cloud Storage bucket\n", - " * Pre HITL and Post HITL Json files (filename should be same) in GCS Folders\n", - " * DocumentAI and HITL" - ] - }, - { - "cell_type": "markdown", - "id": "a5f1381d-f69a-4dca-a65b-eae04761c40d", - "metadata": {}, - "source": [ - "## Step by Step procedure \n", - "### 1. Config file Creation\n", - " **Config file Creation** \\\n", - " Run the below code and create a config.ini file for providing input." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "39f484e8-82df-478c-a06c-5e087b24c5fb", - "metadata": {}, - "outputs": [], - "source": [ - "import configparser\n", - "\n", - "config = configparser.ConfigParser()\n", - "# Add the structure to the file we will create\n", - "config.add_section(\"Parameters\")\n", - "config.set(\"Parameters\", \"project_id\", \"xxxx-xxxxxx-xxxxxx\")\n", - "config.set(\"Parameters\", \"Pre_HITL_Output_URI\", \"gs://\")\n", - "config.set(\"Parameters\", \"Post_HITL_Output_URI\", \"gs://\")\n", - "# Write the new structure to the new file\n", - "with open(r\"configfile.ini\", \"w\") as configfile:\n", - " config.write(configfile)" - ] - }, - { - "cell_type": "markdown", - "id": "1cf9b882-aafc-494b-8772-322fe90c59e2", - "metadata": { - "tags": [] - }, - "source": [ - "## 2. Input Details\n", - "\n", - "Once config.ini file is created with the above step , enter the input in the config file with necessary details as below \n", - " * **project_id**: provide the project id \n", - " * **Pre_HITL_Output_URI:** provide the gcs path of pre HITL jsons (processed jsons) \n", - " * **Post_HITL_Output_URI:** provide the gcs path of post HITL jsons (Jsons processed through HITL) \n", - " \n", - "![](https://screenshot.googleplex.com/4hLVtBgjU4vJeCo.png)\n", - "\n", - "**NOTE:** The Name of Post-HITL Json will not be the same as the original file name by default. This has to be updated manually before using this tool." - ] - }, - { - "cell_type": "markdown", - "id": "1eb704ae-f7dc-4bce-b8b4-8d56cf401f7e", - "metadata": {}, - "source": [ - "## 3. Run the Code\n", - "Copy the code provided in this document, Enter the path of the Config file and Run without any edits. The complete notebook script is found in the last section of this document. 
The output is the summary of entities updated through HITL which has the comparison of pre and post HITL jsons and count of Parser or OCR issue per document.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "adc3910d-5802-4bfa-b4f9-5300bbe5b9b2", - "metadata": {}, - "outputs": [], - "source": [ - "import configparser\n", - "\n", - "# input\n", - "Path = \"configfile.ini\" # Enter the path of config file\n", - "config = configparser.ConfigParser()\n", - "config.read(Path)\n", - "\n", - "project_id = config.get(\"Parameters\", \"project_id\")\n", - "pre_HITL_output_URI = config.get(\"Parameters\", \"pre_hitl_output_uri\")\n", - "post_HITL_output_URI = config.get(\"Parameters\", \"post_hitl_output_uri\")" - ] - }, - { - "cell_type": "markdown", - "id": "0e98fc7d-4e80-4e49-a2ba-60851864ff0b", - "metadata": {}, - "source": [ - "## 4. Output\n", - "Result summary table is obtained which highlight the count of parser and ocr issues for each file. The result table contain details related to pre and post HITL entity changes, whether there were bounding box coordinates mismatched upon post HITL processing. The below screenshots showcases the parser or ocr issue.\n", - "\n", - "![](https://screenshot.googleplex.com/6S47qFm5SjP8eMC.png)\n", - "![](https://screenshot.googleplex.com/6HyQwucSQPZR4ii.png)\n", - "\n", - "Summary json file is generated which highlight count of bounding box mismatches, OCR and Parser errors and analysis path to result table for each of the processed files.\n", - "\n", - "![](https://screenshot.googleplex.com/55R5NKSuVYmyP9H.png)\n", - "\n", - "Entity wise analysis for each file can be observed in the following csv files under analysis/ folder.\n", - "\n", - "![](https://screenshot.googleplex.com/BKd5QCidEJac9Jy.png)\n", - "\n", - "**Table columns:**\n", - "\n", - "The result output table has following columns and its details are as follows:\n", - " * File Name : name of the file\n", - " * Entity Type : type of the entity \n", - " * Pre_HITL_Output : entity text before HITL \n", - " * Pre_HITL_bbox : entity bounding box coordinates before HITL\n", - " * Post_HITL_Output : entity text before HITL \n", - " * Hitl_update : if there was HITL update for that particular entity\n", - " * Post_HITL_bbox : entity bounding box coordinates after HITL\n", - " * Fuzzy Ratio : text match %\n", - " * Bbox_mismatch : if the bounding box coordinates are mismatched\n", - " * OCR issue : represents if its classified as OCR Issue\n", - " * Parser issue : represents if its classified as Parser Issue\n" - ] - }, - { - "cell_type": "markdown", - "id": "abe86372-45ba-443b-818c-0b4a75fee96c", - "metadata": { - "tags": [] - }, - "source": [ - "## Notebook Script\n", - "\n", - "**Install the below libraries before executing the script** \\\n", - "If you encounter an error while importing libraries, please verify that you have installed them." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e1daa5fc-1259-4c5c-a0d3-e81d81b78637", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install google-cloud-documentai\n", - "!pip install PyPDF2" - ] - }, - { - "cell_type": "markdown", - "id": "63a18e66-dd30-4f7e-9236-cc04a371fd6b", - "metadata": {}, - "source": [ - "**Script**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8751be86-e5d3-49e2-a38c-bf362cb5dab9", - "metadata": {}, - "outputs": [], - "source": [ - "import ast\n", - "import configparser\n", - "import difflib\n", - "import io\n", - "import json\n", - "import operator\n", - "import os\n", - "import re\n", - "import time\n", - "from collections.abc import Container, Iterable, Iterator, Mapping, Sequence\n", - "from typing import List, Optional, Tuple, Union\n", - "\n", - "import gcsfs\n", - "import numpy as np\n", - "# Import the libraries\n", - "import pandas as pd\n", - "from google.cloud import documentai_v1beta3, storage\n", - "from PIL import Image\n", - "from PyPDF2 import PdfFileReader\n", - "\n", - "pd.options.mode.chained_assignment = None # default='warn'\n", - "import datetime\n", - "import json\n", - "import os\n", - "\n", - "# input\n", - "Path = \"configfile.ini\" # Enter the path of config file\n", - "config = configparser.ConfigParser()\n", - "config.read(Path)\n", - "\n", - "project_id = config.get(\"Parameters\", \"project_id\")\n", - "pre_HITL_output_URI = config.get(\"Parameters\", \"pre_hitl_output_uri\")\n", - "post_HITL_output_URI = config.get(\"Parameters\", \"post_hitl_output_uri\")\n", - "\n", - "\n", - "# checking whether bucket exists else create temperary bucket\n", - "def check_create_bucket(bucket_name):\n", - " \"\"\"This Function is to create a temperary bucket\n", - " for storing the processed files\n", - " args: name of bucket\"\"\"\n", - "\n", - " storage_client = storage.Client()\n", - " try:\n", - " bucket = storage_client.get_bucket(bucket_name)\n", - " print(f\"Bucket {bucket_name} already exists.\")\n", - " except:\n", - " bucket = storage_client.create_bucket(bucket_name)\n", - " print(f\"Bucket {bucket_name} created.\")\n", - " return bucket\n", - "\n", - "\n", - "def bucket_delete(bucket_name):\n", - " print(\"Deleting bucket : \", bucket_name)\n", - " \"\"\"This function deltes the bucket and used for deleting the temporary\n", - " bucket\n", - " args: bucket name\"\"\"\n", - " storage_client = storage.Client()\n", - " try:\n", - " bucket = storage_client.get_bucket(bucket_name)\n", - " bucket.delete(force=True)\n", - " except:\n", - " pass\n", - "\n", - "\n", - "def file_names(file_path):\n", - " \"\"\"This Function will load the bucket and get the list of files\n", - " in the gs path given\n", - " args: gs path\n", - " output: file names as list and dictionary with file names as keys and file path as values\n", - " \"\"\"\n", - " bucket = file_path.split(\"/\")[2]\n", - " file_names_list = []\n", - " file_dict = {}\n", - " storage_client = storage.Client()\n", - " source_bucket = storage_client.get_bucket(bucket)\n", - " filenames = [\n", - " filename.name for filename in list(\n", - " source_bucket.list_blobs(\n", - " prefix=((\"/\").join(file_path.split(\"/\")[3:]))))\n", - " ]\n", - " for i in range(len(filenames)):\n", - " x = filenames[i].split(\"/\")[-1]\n", - " if x != \"\":\n", - " file_names_list.append(x)\n", - " file_dict[x] = filenames[i]\n", - " return file_names_list, file_dict\n", - "\n", - "\n", - "# list\n", - "def list_blobs(bucket_name):\n", - " \"\"\"This 
function will give the list of files in a bucket\n", - " args: gcs bucket name\n", - " output: list of files\"\"\"\n", - " blob_list = []\n", - " storage_client = storage.Client()\n", - " blobs = storage_client.list_blobs(bucket_name)\n", - " for blob in blobs:\n", - " blob_list.append(blob.name)\n", - " return blob_list\n", - "\n", - "\n", - "# Bucket operations\n", - "def relation_dict_generator(pre_hitl_output_bucket, post_hitl_output_bucket):\n", - " \"\"\"This Function will check the files from pre_hitl_output_bucket and post_hitl_output_bucket\n", - " and finds the json with same names(relation)\"\"\"\n", - " pre_hitl_bucket_blobs = list_blobs(pre_hitl_output_bucket)\n", - " post_hitl_bucket_blobs = list_blobs(post_hitl_output_bucket)\n", - "\n", - " relation_dict = {}\n", - " non_relation_dict = {}\n", - " for i in pre_hitl_bucket_blobs:\n", - " for j in post_hitl_bucket_blobs:\n", - " matched_score = difflib.SequenceMatcher(None, i, j).ratio()\n", - " print(\"matched_score : \", matched_score)\n", - " if (\n", - " matched_score == 1\n", - " ): # 0.9 This is for file name. pre and post hitl json files are to be same\n", - " relation_dict[i] = j\n", - " else:\n", - " non_relation_dict[i] = \"NO POST HITL OUTPUT AVAILABLE\"\n", - " # print(i)\n", - " for i in relation_dict:\n", - " if i in non_relation_dict.keys():\n", - " del non_relation_dict[i]\n", - " print(\"relation_dict = \", relation_dict)\n", - " print(\"non_relation_dict = \", non_relation_dict)\n", - " return relation_dict, non_relation_dict\n", - "\n", - "\n", - "def blob_downloader(bucket_name, blob_name):\n", - " \"\"\"This Function is used to download the files from gcs bucket\"\"\"\n", - " storage_client = storage.Client()\n", - " bucket = storage_client.bucket(bucket_name)\n", - " blob = bucket.blob(blob_name)\n", - " contents = blob.download_as_string()\n", - " return json.loads(contents.decode())\n", - "\n", - "\n", - "def copy_blob(bucket_name, blob_name, destination_bucket_name,\n", - " destination_blob_name):\n", - " \"\"\"This Method will copy files from one bucket(or folder) to another\"\"\"\n", - " storage_client = storage.Client()\n", - " source_bucket = storage_client.bucket(bucket_name)\n", - " source_blob = source_bucket.blob(blob_name)\n", - " destination_bucket = storage_client.bucket(destination_bucket_name)\n", - " blob_copy = source_bucket.copy_blob(source_blob, destination_bucket,\n", - " destination_blob_name)\n", - "\n", - "\n", - "def bbox_maker(boundingPoly):\n", - " x_list = []\n", - " y_list = []\n", - " for i in boundingPoly:\n", - " x_list.append(i[\"x\"])\n", - " y_list.append(i[\"y\"])\n", - " bbox = [min(x_list), min(y_list), max(x_list), max(y_list)]\n", - " return bbox\n", - "\n", - "\n", - "def JsonToDataframe(data):\n", - " \"\"\"Returns entities in dataframe format\"\"\"\n", - " df = pd.DataFrame(columns=[\"type\", \"mentionText\", \"bbox\"])\n", - "\n", - " if \"entities\" not in data.keys():\n", - " return df\n", - "\n", - " for entity in data[\"entities\"]:\n", - " if \"properties\" in entity and len(entity[\"properties\"]) > 0:\n", - " for sub_entity in entity[\"properties\"]:\n", - " if \"type\" in sub_entity:\n", - " try:\n", - " boundingPoly = sub_entity[\"pageAnchor\"][\"pageRefs\"][0][\n", - " \"boundingPoly\"][\"normalizedVertices\"]\n", - " bbox = bbox_maker(boundingPoly)\n", - " # bbox = [boundingPoly[0]['x'], boundingPoly[0]['y'], boundingPoly[2]['x'], boundingPoly[2]['y']]\n", - " df.loc[len(df.index)] = [\n", - " sub_entity[\"type\"],\n", - " 
sub_entity[\"mentionText\"],\n", - " bbox,\n", - " ]\n", - " except KeyError:\n", - " if \"mentionText\" in sub_entity:\n", - " df.loc[len(df.index)] = [\n", - " sub_entity[\"type\"],\n", - " sub_entity[\"mentionText\"],\n", - " [],\n", - " ]\n", - " else:\n", - " df.loc[len(df.index)] = [\n", - " sub_entity[\"type\"],\n", - " \"Entity not found.\",\n", - " [],\n", - " ]\n", - " elif \"type\" in entity:\n", - " try:\n", - " boundingPoly = entity[\"pageAnchor\"][\"pageRefs\"][0][\n", - " \"boundingPoly\"][\"normalizedVertices\"]\n", - " bbox = bbox_maker(boundingPoly)\n", - " # bbox = [boundingPoly[0]['x'], boundingPoly[0]['y'], boundingPoly[2]['x'], boundingPoly[2]['y']]\n", - " df.loc[len(\n", - " df.index)] = [entity[\"type\"], entity[\"mentionText\"], bbox]\n", - " except KeyError:\n", - " if \"mentionText\" in entity:\n", - " df.loc[len(df.index)] = [\n", - " entity[\"type\"], entity[\"mentionText\"], []\n", - " ]\n", - " else:\n", - " df.loc[len(\n", - " df.index)] = [entity[\"type\"], \"Entity not found.\", []]\n", - " return df\n", - "\n", - "\n", - "def RemoveRow(df, entity):\n", - " \"\"\"Drops the entity passed from the dataframe\"\"\"\n", - " return df[df[\"type\"] != entity]\n", - "\n", - "\n", - "def FindMatch(entity_file1, df_file2):\n", - " \"\"\"Finds the matching entity from the dataframe using\n", - " the area of IOU between bboxes reference\n", - " \"\"\"\n", - " bbox_file1 = entity_file1[2]\n", - " # Entity not present in json file\n", - " if not bbox_file1:\n", - " return None\n", - "\n", - " # filtering entities with the same name\n", - " df_file2 = df_file2[df_file2[\"type\"] == entity_file1[0]]\n", - "\n", - " # calculating IOU values for the entities\n", - " index_iou_pairs = []\n", - " for index, entity_file2 in enumerate(df_file2.values):\n", - " if entity_file2[2]:\n", - " iou = BBIntersectionOverUnion(bbox_file1, entity_file2[2])\n", - " index_iou_pairs.append((index, iou))\n", - "\n", - " # choose entity with highest IOU, IOU should be atleast > 0.5\n", - " matched_index = None\n", - " for index_iou in sorted(index_iou_pairs,\n", - " key=operator.itemgetter(1),\n", - " reverse=True):\n", - " if index_iou[1] > 0.2: # 0.5\n", - " matched_index = df_file2.index[index_iou[0]]\n", - " break\n", - " return matched_index\n", - "\n", - "\n", - "def BBIntersectionOverUnion(box1, box2):\n", - " \"\"\"Calculates the area of IOU between two bounding boxes\"\"\"\n", - " print(\"++ BBIntersectionOverUnion ++\")\n", - " x1 = max(box1[0], box2[0])\n", - " y1 = max(box1[1], box2[1])\n", - " x2 = min(box1[2], box2[2])\n", - " y2 = min(box1[3], box2[3])\n", - "\n", - " inter_area = abs(max((x2 - x1, 0)) * max((y2 - y1), 0))\n", - " if inter_area == 0:\n", - " return 0\n", - " box1_area = abs((box1[2] - box1[0]) * (box1[3] - box1[1]))\n", - " box2_area = abs((box2[2] - box2[0]) * (box2[3] - box2[1]))\n", - " iou = inter_area / float(box1_area + box2_area - inter_area)\n", - "\n", - " return iou\n", - "\n", - "\n", - "def GetMatchRatio(values):\n", - " file1_value = values[1]\n", - " file2_value = values[3]\n", - " if file1_value == \"Entity not found.\" or file2_value == \"Entity not found.\":\n", - " return 0\n", - " else:\n", - " return difflib.SequenceMatcher(a=file1_value, b=file2_value).ratio()\n", - "\n", - "\n", - "def compare_pre_hitl_and_post_hitl_output(file1, file2):\n", - " \"\"\"Compares the entities between two files and returns\n", - " the results in a dataframe\n", - " \"\"\"\n", - " print(\"== compare_pre_hitl_and_post_hitl_output ==\")\n", - " df_file1 = 
JsonToDataframe(file1)\n", - " df_file2 = JsonToDataframe(file2)\n", - " file1_entities = [entity[0] for entity in df_file1.values]\n", - " print(file1_entities, \"\\n\")\n", - " file2_entities = [entity[0] for entity in df_file2.values]\n", - " print(file2_entities)\n", - "\n", - " # find entities which are present only once in both files\n", - " # these entities will be matched directly\n", - " common_entities = set(file1_entities).intersection(set(file2_entities))\n", - " exclude_entities = []\n", - " for entity in common_entities:\n", - " print(\"entity -- : \", entity)\n", - " if file1_entities.count(entity) > 1 or file2_entities.count(\n", - " entity) > 1:\n", - " exclude_entities.append(entity)\n", - "\n", - " print(\"exclude_entities : \", exclude_entities)\n", - " for entity in exclude_entities:\n", - " common_entities.remove(entity)\n", - " df_compare = pd.DataFrame(columns=[\n", - " \"Entity Type\",\n", - " \"Pre_HITL_Output\",\n", - " \"Pre_HITL_bbox\",\n", - " \"Post_HITL_Output\",\n", - " \"Post_HITL_bbox\",\n", - " ])\n", - " print(\"df_compare:--- \\n\", df_compare)\n", - " for entity in common_entities:\n", - " value1 = df_file1[df_file1[\"type\"] == entity].iloc[0][\"mentionText\"]\n", - " value2 = df_file2[df_file2[\"type\"] == entity].iloc[0][\"mentionText\"]\n", - " bbox1 = df_file1[df_file1[\"type\"] == entity].iloc[0][\"bbox\"]\n", - " bbox2 = df_file2[df_file2[\"type\"] == entity].iloc[0][\"bbox\"]\n", - " df_compare.loc[len(\n", - " df_compare.index)] = [entity, value1, bbox1, value2, bbox2]\n", - "\n", - " # common entities are removed from df_file1 and df_file2\n", - " df_file1 = RemoveRow(df_file1, entity)\n", - " df_file2 = RemoveRow(df_file2, entity)\n", - "\n", - " # remaining entities are matched comparing the area of IOU across them\n", - " mentionText2 = pd.Series(dtype=str)\n", - " bbox2 = pd.Series(dtype=str)\n", - " for index, row in enumerate(df_file1.values):\n", - " matched_index = FindMatch(row, df_file2)\n", - " if matched_index != None:\n", - " mentionText2.loc[index] = df_file2.loc[matched_index][1]\n", - " bbox2.loc[index] = df_file2.loc[matched_index][2]\n", - " df_file2 = df_file2.drop(matched_index)\n", - " else:\n", - " mentionText2.loc[index] = \"Entity not found.\"\n", - " bbox2.loc[index] = \"bbox not found\"\n", - "\n", - " df_file1[\"mentionText2\"] = mentionText2.values\n", - " df_file1[\"bbox2\"] = bbox2.values\n", - " # df_file1 = df_file1.drop(['bbox'], axis=1)\n", - " df_file1.rename(\n", - " columns={\n", - " \"type\": \"Entity Type\",\n", - " \"mentionText\": \"Pre_HITL_Output\",\n", - " \"bbox\": \"Pre_HITL_bbox\",\n", - " \"mentionText2\": \"Post_HITL_Output\",\n", - " \"bbox2\": \"Post_HITL_bbox\",\n", - " },\n", - " inplace=True,\n", - " )\n", - " df_compare = df_compare._append(df_file1, ignore_index=True)\n", - "\n", - " # adding entities which are present in file2 but not in file1\n", - " for row in df_file2.values:\n", - " df_compare.loc[len(df_compare.index)] = [\n", - " row[0],\n", - " \"Entity not found.\",\n", - " \"bbox not present\",\n", - " row[1],\n", - " row[2],\n", - " ]\n", - "\n", - " # df_compare['Match'] = df_compare['Ground Truth Text'] == df_compare['Output Text']\n", - " match_array = []\n", - " for i in range(0, len(df_compare)):\n", - " match_string = \"\"\n", - " if (df_compare.iloc[i][\"Pre_HITL_Output\"] == \"Entity not found.\" and\n", - " df_compare.iloc[i][\"Post_HITL_Output\"] == \"Entity not found.\"):\n", - " match_string = \"TN\"\n", - " elif (df_compare.iloc[i][\"Pre_HITL_Output\"] != 
\"Entity not found.\" and\n", - " df_compare.iloc[i][\"Post_HITL_Output\"] == \"Entity not found.\"):\n", - " match_string = \"FN\"\n", - " elif (df_compare.iloc[i][\"Pre_HITL_Output\"] == \"Entity not found.\" and\n", - " df_compare.iloc[i][\"Post_HITL_Output\"] != \"Entity not found.\"):\n", - " match_string = \"FP\"\n", - " elif (df_compare.iloc[i][\"Pre_HITL_Output\"] != \"Entity not found.\" and\n", - " df_compare.iloc[i][\"Post_HITL_Output\"] != \"Entity not found.\"):\n", - " if (df_compare.iloc[i][\"Pre_HITL_Output\"] == df_compare.iloc[i]\n", - " [\"Post_HITL_Output\"]):\n", - " match_string = \"TP\"\n", - " else:\n", - " match_string = \"FP\"\n", - " else:\n", - " match_string = \"Something went Wrong.\"\n", - "\n", - " match_array.append(match_string)\n", - "\n", - " df_compare[\"Match\"] = match_array\n", - "\n", - " df_compare[\"Fuzzy Ratio\"] = df_compare.apply(GetMatchRatio, axis=1)\n", - " if list(df_compare.index):\n", - " score = df_compare[\"Fuzzy Ratio\"].sum() / len(df_compare.index)\n", - " else:\n", - " score = 0\n", - "\n", - " print(\"match_array\")\n", - " print(match_array)\n", - " return df_compare, score\n", - "\n", - "\n", - "# Execute the below code\n", - "\n", - "pre_HITL_output_URI = config.get(\"Parameters\", \"pre_hitl_output_uri\")\n", - "post_HITL_output_URI = config.get(\"Parameters\", \"post_hitl_output_uri\")\n", - "# print(pre_HITL_output_URI)\n", - "# print(post_HITL_output_URI)\n", - "\n", - "# creating temperary buckets\n", - "import datetime\n", - "\n", - "now = str(datetime.datetime.now())\n", - "now = re.sub(r\"\\W+\", \"\", now)\n", - "\n", - "print(\"Creating temporary buckets\")\n", - "pre_HITL_bucket_name_temp = \"pre_hitl_output\" + \"_\" + now\n", - "post_HITL_bucket_name_temp = \"post_hitl_output_temp\" + \"_\" + now\n", - "# bucket name and prefix\n", - "pre_HITL_bucket = pre_HITL_output_URI.split(\"/\")[2]\n", - "post_HITL_bucket = post_HITL_output_URI.split(\"/\")[2]\n", - "# getting all files and copying to temporary folder\n", - "\n", - "try:\n", - " check_create_bucket(pre_HITL_bucket_name_temp)\n", - " check_create_bucket(post_HITL_bucket_name_temp)\n", - "except Exception as e:\n", - " print(\"unable to create bucket because of exception : \", e)\n", - "\n", - "try:\n", - " pre_HITL_output_files, pre_HITL_output_dict = file_names(\n", - " pre_HITL_output_URI)\n", - " # print(pre_HITL_output_files,pre_HITL_output_dict)\n", - " post_HITL_output_files, post_HITL_output_dict = file_names(\n", - " post_HITL_output_URI)\n", - " # print(post_HITL_output_files,post_HITL_output_dict)\n", - " print(\"copying files to temporary bucket\")\n", - " for i in pre_HITL_output_files:\n", - " copy_blob(pre_HITL_bucket, pre_HITL_output_dict[i],\n", - " pre_HITL_bucket_name_temp, i)\n", - " for i in post_HITL_output_files:\n", - " copy_blob(post_HITL_bucket, post_HITL_output_dict[i],\n", - " post_HITL_bucket_name_temp, i)\n", - " pre_HITL_files_list = list_blobs(pre_HITL_bucket_name_temp)\n", - " post_HITL_files_list = list_blobs(post_HITL_bucket_name_temp)\n", - "except Exception as e:\n", - " print(\"unable to get list of files in buckets because : \", e)\n", - "# processing the files and saving the files in temporary GCP bucket\n", - "fs = gcsfs.GCSFileSystem(project_id)\n", - "relation_dict, non_relation_dict = relation_dict_generator(\n", - " pre_HITL_bucket_name_temp, post_HITL_bucket_name_temp)\n", - "\n", - "time_stamp = datetime.datetime.now().strftime(\"%d_%m_%y-%H%M%S\")\n", - "filename_error_count_dict = {}\n", - "\n", - "compare_merged 
= pd.DataFrame()\n", - "accuracy_docs = []\n", - "print(\"comparing the PRE-HITL Jsons and POST-HITL jsons ....Wait for Summary \")\n", - "for i in relation_dict:\n", - " # print(\"***** i : \", i)\n", - " pre_HITL_json = blob_downloader(pre_HITL_bucket_name_temp, i)\n", - " post_HITL_json = blob_downloader(post_HITL_bucket_name_temp,\n", - " relation_dict[i])\n", - " # print('pre_HITL_json : ', pre_HITL_json)\n", - " # print('post_HITL_json : ', post_HITL_json)\n", - " compare_output = compare_pre_hitl_and_post_hitl_output(\n", - " pre_HITL_json, post_HITL_json)[0]\n", - " # print('compare_output :',compare_output)\n", - " column = [relation_dict[i]] * compare_output.shape[0]\n", - " # print(\"++++column++++\")\n", - " # print(column)\n", - " compare_output.insert(loc=0, column=\"File Name\", value=column)\n", - "\n", - " compare_output.insert(loc=5, column=\"hitl_update\", value=\" \")\n", - " for j in range(len(compare_output)):\n", - " if compare_output[\"Fuzzy Ratio\"][j] != 1.0: # strict\n", - " if (compare_output[\"Pre_HITL_Output\"][j] == \"Entity not found.\"\n", - " and compare_output[\"Post_HITL_Output\"][j]\n", - " == \"Entity not found.\"):\n", - " compare_output[\"hitl_update\"][j] = \"NO\"\n", - " else:\n", - " compare_output[\"hitl_update\"][j] = \"YES\"\n", - " else:\n", - " compare_output[\"hitl_update\"][j] = \"NO\"\n", - " for k in range(len(compare_output)):\n", - " if compare_output[\"Fuzzy Ratio\"][k] != 1.0: # strict\n", - " hitl_update = \"HITL UPDATED\"\n", - " break\n", - " else:\n", - " compare_output[\"hitl_update\"][k] = \"NO\"\n", - "\n", - " ##\n", - " compare_output[\"bbox_mismatch\"] = (compare_output[\"Pre_HITL_bbox\"]\n", - " != compare_output[\"Post_HITL_bbox\"])\n", - "\n", - " # OCR Issue\n", - " compare_output[\"OCR Issue\"] = \"No\"\n", - " # compare_output.loc[(compare_output['Pre_HITL_Output'] != compare_output['Post_HITL_Output']), 'OCR Issue'] = 'Yes' # & cordinates are same\n", - " compare_output.loc[\n", - " (compare_output[\"Pre_HITL_Output\"] !=\n", - " compare_output[\"Post_HITL_Output\"])\n", - " &\n", - " (compare_output[\"Pre_HITL_bbox\"] == compare_output[\"Post_HITL_bbox\"]),\n", - " \"OCR Issue\",\n", - " ] = \"Yes\"\n", - "\n", - " # Parser Issue\n", - " compare_output[\"Parser Issue\"] = \"No\"\n", - " compare_output.loc[\n", - " (compare_output[\"hitl_update\"] == \"YES\")\n", - " & (compare_output[\"bbox_mismatch\"] == True),\n", - " \"Parser Issue\",\n", - " ] = \"Yes\" # & cordinates are different\n", - " # compare_output.loc[\n", - " # ((compare_output['hitl_update'] == 'YES') & (compare_output['bbox_mismatch'] == True))\n", - " # &\n", - " # (compare_output['Pre_HITL_bbox'] != compare_output['Post_HITL_bbox']), 'Parser Issue'\n", - " # ] = 'Yes'\n", - "\n", - " # Parser Issue - entity not found cases | skip if both are 'Entity not found'\n", - " try:\n", - " compare_merged.loc[\n", - " (compare_merged[\"Post_HITL_Output\"] == \"Entity not found.\")\n", - " | (compare_merged[\"Pre_HITL_Output\"] == \"Entity not found.\"),\n", - " \"Parser Issue\",\n", - " ] = \"Yes\"\n", - " except:\n", - " pass\n", - "\n", - " ## global dict : no of parser error / file\n", - " temp = {}\n", - " temp[\"bbox_mismatch\"] = len(\n", - " compare_output[compare_output[\"bbox_mismatch\"] == True])\n", - "\n", - " temp[\"OCR_issue\"] = len(\n", - " compare_output.loc[(compare_output[\"Pre_HITL_Output\"] !=\n", - " compare_output[\"Post_HITL_Output\"])\n", - " & (compare_output[\"Pre_HITL_bbox\"] ==\n", - " compare_output[\"Post_HITL_bbox\"])])\n", - " 
temp[\"Parser_issue\"] = len(\n", - " compare_output.loc[(compare_output[\"hitl_update\"] == \"YES\")\n", - " & (compare_output[\"bbox_mismatch\"] == True)])\n", - " temp[\"output_file\"] = \"analysis_\" + time_stamp + \"/\" + i.replace(\n", - " \"json\", \"csv\")\n", - "\n", - " filename_error_count_dict[i] = temp\n", - "\n", - " new_row = pd.Series(\n", - " [\n", - " i,\n", - " \"Entities\",\n", - " \"are updated\",\n", - " \"by HITL\",\n", - " \":\",\n", - " np.nan,\n", - " hitl_update,\n", - " \"\",\n", - " \"\",\n", - " \"\",\n", - " \"\",\n", - " \"\",\n", - " ],\n", - " index=compare_output.columns,\n", - " )\n", - " compare_output = compare_output._append(new_row, ignore_index=True)\n", - " frames = [compare_merged, compare_output]\n", - " compare_merged = pd.concat(frames)\n", - "\n", - "with open(\"summary_\" + time_stamp + \".json\", \"w\") as ofile:\n", - " ofile.write(json.dumps(filename_error_count_dict))\n", - "\n", - "for x in relation_dict:\n", - " # print(x)\n", - " file_out = compare_merged[compare_merged[\"File Name\"] == x]\n", - " try:\n", - " os.mkdir(\"analysis_\" + time_stamp)\n", - " except:\n", - " pass\n", - " file_out.to_csv(\"analysis_\" + time_stamp + \"/\" + x.replace(\"json\", \"csv\"))\n", - "\n", - "bucket_delete(pre_HITL_bucket_name_temp)\n", - "bucket_delete(post_HITL_bucket_name_temp)" - ] - } - ], - "metadata": { - "environment": { - "kernel": "python3", - "name": "common-cpu.m104", - "type": "gcloud", - "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/DocAI Incubator Tools/best-practices/Pre_and_Post_HITL_Visualization.ipynb b/DocAI Incubator Tools/best-practices/Pre_and_Post_HITL_Visualization.ipynb deleted file mode 100644 index 623d9825f..000000000 --- a/DocAI Incubator Tools/best-practices/Pre_and_Post_HITL_Visualization.ipynb +++ /dev/null @@ -1,962 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "12ccafbd-d094-4604-9f85-975f709f7038", - "metadata": {}, - "source": [ - "# Pre and Post HITL Visualization" - ] - }, - { - "cell_type": "markdown", - "id": "370b84a9-28f8-4e8b-9dd4-8b8363e59282", - "metadata": {}, - "source": [ - "* Author: docai-incubator@google.com" - ] - }, - { - "cell_type": "markdown", - "id": "270ddad0-5d76-4dd2-a863-74c3eb4effba", - "metadata": {}, - "source": [ - "## Disclaimer\n", - "\n", - "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. 
No guarantees of performance are implied.\t" - ] - }, - { - "cell_type": "markdown", - "id": "351f231a-bd30-4ee3-a617-c12bf2261d57", - "metadata": {}, - "source": [ - "## Purpose of the script\n", - "This tool uses Pre-HITL JSON files (Parsed from a processor) and Post HITL JSON files(Updated through HITL) from GCS bucket as input, compares the Json files and differences are shown in an Excel with bounding boxes added images.\n" - ] - }, - { - "cell_type": "markdown", - "id": "30079506-4bc3-462b-aecb-dd95a1f1958a", - "metadata": { - "tags": [] - }, - "source": [ - "## Prerequisite\n", - " * Vertex AI Notebook\n", - " * Pre HITL and Post HITL Json files (filename should be same) in GCS Folders\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "id": "418be9ca-916f-41a1-9196-800401bddd9d", - "metadata": {}, - "source": [ - "## Step by Step procedure \n", - "\n", - "**1. Config file Creation** \\\n", - " Run the below code and create a config.ini file for providing input.\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b8f46bad-fc9d-4439-9284-67265b519b58", - "metadata": {}, - "outputs": [], - "source": [ - "import configparser\n", - "\n", - "config = configparser.ConfigParser()\n", - "# Add the structure to the file we will create\n", - "config.add_section(\"Parameters\")\n", - "config.set(\"Parameters\", \"project_id\", \"xxxx-xxxx-xxxx\")\n", - "config.set(\"Parameters\", \"Pre_HITL_Output_URI\", \"gs://\")\n", - "config.set(\"Parameters\", \"Post_HITL_Output_URI\", \"gs://\")\n", - "# Write the new structure to the new file\n", - "with open(r\"configfile.ini\", \"w\") as configfile:\n", - " config.write(configfile)" - ] - }, - { - "cell_type": "markdown", - "id": "364c5f84-6284-4c39-a422-51c1f1109028", - "metadata": {}, - "source": [ - "**2. Input Details** \n", - "\n", - "Once **config.ini** file is created with the above step , enter the input in the config file with necessary details as below\n", - " * project_id: provide the project id\n", - " * Pre_HITL_Output_URI: provide the gcs path of pre HITL jsons (processed jsons)\n", - " * Post_HITL_Output_URI: provide the gcs path of post HITL jsons (Jsons processed thru HITL)\n", - " \n", - "![](https://screenshot.googleplex.com/7DMhDW8d5GZnUBG.png)\n", - "\n", - "**NOTE:** The Name of Post-HITL Json will not be the same as the original file name by default. This has to be updated manually before using this tool." - ] - }, - { - "cell_type": "markdown", - "id": "6793a405-73d5-466c-bb7a-10f219bbd6dc", - "metadata": {}, - "source": [ - "**3. Run the Code**\n", - "\n", - "Copy the code provided in this document, Enter the path of Config file and Run without any edits\n", - "![](https://screenshot.googleplex.com/BP8v3wHicSEs6xr.png)" - ] - }, - { - "cell_type": "markdown", - "id": "ceae5bee-4d39-460b-a7ef-5071e4d140ca", - "metadata": {}, - "source": [ - "**4. 
Output** \n", - "\n", - "The output of the tool will be in an Excel format showing the entities which are updated in HITL and unchanged as well with images of labeled docs (both pre and post HITL).\n", - "\n", - "The Excel sheet which is created will have a summary of all the file files in “Consolidated_Data” and comparison in a separate sheet for each file.\n", - "\n", - "Each Excel sheet created will have a batch of 20 files in it.\n", - "\n", - "![](https://screenshot.googleplex.com/6nL7E3hrRSEi6ST.png)\n", - "\n", - "The Excel file will have all the details of Pre-HITL text, Post-HITL text and whether the entity is updated in HITL in the form YES and NO as shown below .\n", - "\n", - "![](https://screenshot.googleplex.com/8wqPTMyUY5ASKZA.png)\n", - "\n", - "There will be a list of documents for which either the required confidence threshold is met or no HITL output is created yet is updated as “NO POST HITL OUTPUT AVAILABLE” at the end of excel in consolidated sheets.\n", - "\n", - "![](https://screenshot.googleplex.com/8tpFZsVfFdTBoKA.png)\n", - "\n", - "\n", - "Blue Bounding Box⇒ Entities in Pre-HITL Json\n", - "Red Bounding Box⇒ Entities updated in HITL\n", - "Green Bounding Box⇒ Entities deleted in HITL( Entities which are detected by parser are deleted in HITL)\n", - "\n", - "**Bounding box color coding in images**\n", - "\n", - "![](https://screenshot.googleplex.com/9aph7w2N2vywPFP.png)" - ] - }, - { - "cell_type": "markdown", - "id": "8c8a8f42-9d73-40f1-a356-39de0853cf33", - "metadata": {}, - "source": [ - "## **Sample Code**" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6f5352a2-cc8c-480e-8336-0e886b4cac76", - "metadata": {}, - "outputs": [], - "source": [ - "# pip install below libraries for one time\n", - "\n", - "#!pip install configparser\n", - "#!pip install google.cloud\n", - "#!pip install ast\n", - "#!pip install openpyxl\n", - "\n", - "import ast\n", - "import configparser\n", - "import difflib\n", - "import io\n", - "import json\n", - "import operator\n", - "import os\n", - "import re\n", - "import time\n", - "from collections.abc import Container, Iterable, Iterator, Mapping, Sequence\n", - "from typing import List, Optional, Tuple, Union\n", - "\n", - "import cv2\n", - "import gcsfs\n", - "import numpy\n", - "import numpy as np\n", - "import openpyxl\n", - "# installing libraries\n", - "import pandas as pd\n", - "from google.cloud import documentai_v1beta3, storage\n", - "from PIL import Image, ImageDraw\n", - "from PyPDF2 import PdfFileReader\n", - "\n", - "pd.options.mode.chained_assignment = None # default='warn'\n", - "\n", - "# input\n", - "Path = \"configfile.ini\" # Enter the path of config file\n", - "config = configparser.ConfigParser()\n", - "config.read(Path)\n", - "\n", - "project_id = config.get(\"Parameters\", \"project_id\")\n", - "pre_HITL_output_URI = config.get(\"Parameters\", \"pre_hitl_output_uri\")\n", - "post_HITL_output_URI = config.get(\"Parameters\", \"post_hitl_output_uri\")\n", - "\n", - "# FUNCTIONS\n", - "\n", - "\n", - "# checking whether bucket exists else create temperary bucket\n", - "def check_create_bucket(bucket_name):\n", - " \"\"\"This Function is to create a temperary bucket\n", - " for storing the processed files\n", - " args: name of bucket\"\"\"\n", - "\n", - " storage_client = storage.Client()\n", - " try:\n", - " bucket = storage_client.get_bucket(bucket_name)\n", - " print(f\"Bucket {bucket_name} already exists.\")\n", - " except:\n", - " bucket = storage_client.create_bucket(bucket_name)\n", - 
" print(f\"Bucket {bucket_name} created.\")\n", - " return bucket\n", - "\n", - "\n", - "def bucket_delete(bucket_name):\n", - " \"\"\"This function deltes the bucket and used for deleting the temporary\n", - " bucket\n", - " args: bucket name\"\"\"\n", - " storage_client = storage.Client()\n", - " try:\n", - " bucket = storage_client.get_bucket(bucket_name)\n", - " bucket.delete(force=True)\n", - " except:\n", - " pass\n", - "\n", - "\n", - "def file_names(file_path):\n", - " \"\"\"This Function will load the bucket and get the list of files\n", - " in the gs path given\n", - " args: gs path\n", - " output: file names as list and dictionary with file names as keys and file path as values\n", - " \"\"\"\n", - " bucket = file_path.split(\"/\")[2]\n", - " file_names_list = []\n", - " file_dict = {}\n", - " storage_client = storage.Client()\n", - " source_bucket = storage_client.get_bucket(bucket)\n", - " filenames = [\n", - " filename.name for filename in list(\n", - " source_bucket.list_blobs(\n", - " prefix=((\"/\").join(file_path.split(\"/\")[3:]))))\n", - " ]\n", - " for i in range(len(filenames)):\n", - " x = filenames[i].split(\"/\")[-1]\n", - " if x != \"\":\n", - " file_names_list.append(x)\n", - " file_dict[x] = filenames[i]\n", - " return file_names_list, file_dict\n", - "\n", - "\n", - "# list\n", - "def list_blobs(bucket_name):\n", - " \"\"\"This function will give the list of files in a bucket\n", - " args: gcs bucket name\n", - " output: list of files\"\"\"\n", - " blob_list = []\n", - " storage_client = storage.Client()\n", - " blobs = storage_client.list_blobs(bucket_name)\n", - " for blob in blobs:\n", - " blob_list.append(blob.name)\n", - " return blob_list\n", - "\n", - "\n", - "# Bucket operations\n", - "def relation_dict_generator(pre_hitl_output_bucket, post_hitl_output_bucket):\n", - " \"\"\"This Function will check the files from pre_hitl_output_bucket and post_hitl_output_bucket\n", - " and finds the json with same names(relation)\"\"\"\n", - " pre_hitl_bucket_blobs = list_blobs(pre_hitl_output_bucket)\n", - " post_hitl_bucket_blobs = list_blobs(post_hitl_output_bucket)\n", - "\n", - " relation_dict = {}\n", - " non_relation_dict = {}\n", - " for i in pre_hitl_bucket_blobs:\n", - " for j in post_hitl_bucket_blobs:\n", - " matched_score = difflib.SequenceMatcher(None, i, j).ratio()\n", - " if matched_score > 0.9:\n", - " relation_dict[i] = j\n", - " else:\n", - " non_relation_dict[i] = \"NO POST HITL OUTPUT AVAILABLE\"\n", - " # print(i)\n", - " for i in relation_dict:\n", - " if i in non_relation_dict.keys():\n", - " del non_relation_dict[i]\n", - "\n", - " return relation_dict, non_relation_dict\n", - "\n", - "\n", - "def blob_downloader(bucket_name, blob_name):\n", - " \"\"\"This Function is used to download the files from gcs bucket\"\"\"\n", - " storage_client = storage.Client()\n", - " bucket = storage_client.bucket(bucket_name)\n", - " blob = bucket.blob(blob_name)\n", - " contents = blob.download_as_string()\n", - " return json.loads(contents.decode())\n", - "\n", - "\n", - "def copy_blob(bucket_name, blob_name, destination_bucket_name,\n", - " destination_blob_name):\n", - " \"\"\"This Method will copy files from one bucket(or folder) to another\"\"\"\n", - " storage_client = storage.Client()\n", - " source_bucket = storage_client.bucket(bucket_name)\n", - " source_blob = source_bucket.blob(blob_name)\n", - " destination_bucket = storage_client.bucket(destination_bucket_name)\n", - " blob_copy = source_bucket.copy_blob(source_blob, destination_bucket,\n", - " 
destination_blob_name)\n", - "\n", - "\n", - "def bbox_maker(boundingPoly):\n", - " x_list = []\n", - " y_list = []\n", - " for i in boundingPoly:\n", - " x_list.append(i[\"x\"])\n", - " y_list.append(i[\"y\"])\n", - " bbox = [min(x_list), min(y_list), max(x_list), max(y_list)]\n", - " return bbox\n", - "\n", - "\n", - "def JsonToDataframe(data):\n", - " \"\"\"Returns entities in dataframe format\"\"\"\n", - " df = pd.DataFrame(columns=[\"type\", \"mentionText\", \"bbox\", \"page\"])\n", - "\n", - " if \"entities\" not in data.keys():\n", - " return df\n", - "\n", - " for entity in data[\"entities\"]:\n", - " if \"properties\" in entity and len(entity[\"properties\"]) > 0:\n", - " for sub_entity in entity[\"properties\"]:\n", - " if \"type\" in sub_entity:\n", - " try:\n", - " boundingPoly = sub_entity[\"pageAnchor\"][\"pageRefs\"][0][\n", - " \"boundingPoly\"][\"normalizedVertices\"]\n", - " bbox = bbox_maker(boundingPoly)\n", - " # page=sub_entity['pageAnchor']['pageRefs'][0]['page']\n", - " # bbox = [boundingPoly[0]['x'], boundingPoly[0]['y'], boundingPoly[2]['x'], boundingPoly[2]['y']]\n", - " # df.loc[len(df.index)] = [sub_entity['type'], sub_entity['mentionText'], bbox]\n", - " try:\n", - " page = sub_entity[\"pageAnchor\"][\"pageRefs\"][0][\n", - " \"page\"]\n", - " df.loc[len(df.index)] = [\n", - " sub_entity[\"type\"],\n", - " sub_entity[\"mentionText\"],\n", - " bbox,\n", - " page,\n", - " ]\n", - " except KeyError:\n", - " df.loc[len(df.index)] = [\n", - " sub_entity[\"type\"],\n", - " sub_entity[\"mentionText\"],\n", - " bbox,\n", - " \"0\",\n", - " ]\n", - " except KeyError:\n", - " if \"mentionText\" in sub_entity:\n", - " df.loc[len(df.index)] = [\n", - " sub_entity[\"type\"],\n", - " sub_entity[\"mentionText\"],\n", - " [],\n", - " \"no\",\n", - " ]\n", - " else:\n", - " df.loc[len(df.index)] = [\n", - " sub_entity[\"type\"],\n", - " \"Entity not found.\",\n", - " [],\n", - " \"no\",\n", - " ]\n", - " elif \"type\" in entity:\n", - " try:\n", - " boundingPoly = entity[\"pageAnchor\"][\"pageRefs\"][0][\n", - " \"boundingPoly\"][\"normalizedVertices\"]\n", - " bbox = bbox_maker(boundingPoly)\n", - " # bbox = [boundingPoly[0]['x'], boundingPoly[0]['y'], boundingPoly[2]['x'], boundingPoly[2]['y']]\n", - " # df.loc[len(df.index)] = [entity['type'], entity['mentionText'], bbox]\n", - " try:\n", - " page = entity[\"pageAnchor\"][\"pageRefs\"][0][\"page\"]\n", - " df.loc[len(df.index)] = [\n", - " entity[\"type\"],\n", - " entity[\"mentionText\"],\n", - " bbox,\n", - " page,\n", - " ]\n", - " except KeyError:\n", - " df.loc[len(df.index)] = [\n", - " entity[\"type\"],\n", - " entity[\"mentionText\"],\n", - " bbox,\n", - " \"0\",\n", - " ]\n", - "\n", - " except KeyError:\n", - " if \"mentionText\" in entity:\n", - " df.loc[len(df.index)] = [\n", - " entity[\"type\"],\n", - " entity[\"mentionText\"],\n", - " [],\n", - " \"no\",\n", - " ]\n", - " else:\n", - " df.loc[len(df.index)] = [\n", - " entity[\"type\"],\n", - " \"Entity not found.\",\n", - " [],\n", - " \"no\",\n", - " ]\n", - " return df\n", - "\n", - "\n", - "def RemoveRow(df, entity):\n", - " \"\"\"Drops the entity passed from the dataframe\"\"\"\n", - " return df[df[\"type\"] != entity]\n", - "\n", - "\n", - "def FindMatch(entity_file1, df_file2):\n", - " \"\"\"Finds the matching entity from the dataframe using\n", - " the area of IOU between bboxes reference\n", - " \"\"\"\n", - " bbox_file1 = entity_file1[2]\n", - " # Entity not present in json file\n", - " if not bbox_file1:\n", - " return None\n", - "\n", - " # 
filtering entities with the same name\n", - " df_file2 = df_file2[df_file2[\"type\"] == entity_file1[0]]\n", - "\n", - " # calculating IOU values for the entities\n", - " index_iou_pairs = []\n", - " for index, entity_file2 in enumerate(df_file2.values):\n", - " if entity_file2[2]:\n", - " iou = BBIntersectionOverUnion(bbox_file1, entity_file2[2])\n", - " index_iou_pairs.append((index, iou))\n", - "\n", - " # choose entity with highest IOU, IOU should be atleast > 0.5\n", - " matched_index = None\n", - " for index_iou in sorted(index_iou_pairs,\n", - " key=operator.itemgetter(1),\n", - " reverse=True):\n", - " if index_iou[1] > 0.5:\n", - " matched_index = df_file2.index[index_iou[0]]\n", - " break\n", - " return matched_index\n", - "\n", - "\n", - "def BBIntersectionOverUnion(box1, box2):\n", - " \"\"\"Calculates the area of IOU between two bounding boxes\"\"\"\n", - " x1 = max(box1[0], box2[0])\n", - " y1 = max(box1[1], box2[1])\n", - " x2 = min(box1[2], box2[2])\n", - " y2 = min(box1[3], box2[3])\n", - "\n", - " inter_area = abs(max((x2 - x1, 0)) * max((y2 - y1), 0))\n", - " if inter_area == 0:\n", - " return 0\n", - " box1_area = abs((box1[2] - box1[0]) * (box1[3] - box1[1]))\n", - " box2_area = abs((box2[2] - box2[0]) * (box2[3] - box2[1]))\n", - " iou = inter_area / float(box1_area + box2_area - inter_area)\n", - "\n", - " return iou\n", - "\n", - "\n", - "def GetMatchRatio(values):\n", - " file1_value = values[1]\n", - " file2_value = values[2]\n", - " if file1_value == \"Entity not found.\" or file2_value == \"Entity not found.\":\n", - " return 0\n", - " else:\n", - " return difflib.SequenceMatcher(a=file1_value, b=file2_value).ratio()\n", - "\n", - "\n", - "def compare_pre_hitl_and_post_hitl_output(file1, file2):\n", - " \"\"\"Compares the entities between two files and returns\n", - " the results in a dataframe\n", - " \"\"\"\n", - " df_file1 = JsonToDataframe(file1)\n", - " df_file2 = JsonToDataframe(file2)\n", - " # df_file1.to_csv(\"1.csv\")\n", - " # df_file2.to_csv(\"2.csv\")\n", - " file1_entities = [entity[0] for entity in df_file1.values]\n", - " file2_entities = [entity[0] for entity in df_file2.values]\n", - "\n", - " # find entities which are present only once in both files\n", - " # these entities will be matched directly\n", - " common_entities = set(file1_entities).intersection(set(file2_entities))\n", - " exclude_entities = []\n", - " for entity in common_entities:\n", - " if file1_entities.count(entity) > 1 or file2_entities.count(\n", - " entity) > 1:\n", - " exclude_entities.append(entity)\n", - " for entity in exclude_entities:\n", - " common_entities.remove(entity)\n", - " df_compare = pd.DataFrame(columns=[\n", - " \"Entity Type\",\n", - " \"Pre_HITL_Output\",\n", - " \"Post_HITL_Output\",\n", - " \"pre_bbox\",\n", - " \"post_bbox\",\n", - " \"page1\",\n", - " \"page2\",\n", - " ])\n", - " for entity in common_entities:\n", - " value1 = df_file1[df_file1[\"type\"] == entity].iloc[0][\"mentionText\"]\n", - " value2 = df_file2[df_file2[\"type\"] == entity].iloc[0][\"mentionText\"]\n", - " pre_bbox = df_file1[df_file1[\"type\"] == entity].iloc[0][\"bbox\"]\n", - " post_bbox = df_file2[df_file2[\"type\"] == entity].iloc[0][\"bbox\"]\n", - " page1 = df_file1[df_file1[\"type\"] == entity].iloc[0][\"page\"]\n", - " page2 = df_file2[df_file2[\"type\"] == entity].iloc[0][\"page\"]\n", - " df_compare.loc[len(df_compare.index)] = [\n", - " entity,\n", - " value1,\n", - " value2,\n", - " pre_bbox,\n", - " post_bbox,\n", - " page1,\n", - " page2,\n", - " ]\n", - " # 
common entities are removed from df_file1 and df_file2\n", - " df_file1 = RemoveRow(df_file1, entity)\n", - " df_file2 = RemoveRow(df_file2, entity)\n", - "\n", - " # remaining entities are matched comparing the area of IOU across them\n", - " mentionText2 = pd.Series(dtype=str)\n", - " bbox2 = pd.Series(dtype=object)\n", - " bbox1 = pd.Series(dtype=object)\n", - " page_1 = pd.Series(dtype=object)\n", - " page_2 = pd.Series(dtype=object)\n", - "\n", - " for index, row in enumerate(df_file1.values):\n", - " matched_index = FindMatch(row, df_file2)\n", - " if matched_index != None:\n", - " mentionText2.loc[index] = df_file2.loc[matched_index][1]\n", - " bbox2.loc[index] = df_file2.loc[matched_index][2]\n", - " bbox1.loc[index] = row[2]\n", - " page_2.loc[index] = df_file2.loc[matched_index][3]\n", - " page_1.loc[index] = row[3]\n", - " df_file2 = df_file2.drop(matched_index)\n", - " else:\n", - " mentionText2.loc[index] = \"Entity not found.\"\n", - " bbox2.loc[index] = \"Entity not found.\"\n", - " bbox1.loc[index] = row[2]\n", - " page_1.loc[index] = row[3]\n", - " page_2.loc[index] = \"no\"\n", - "\n", - " df_file1[\"mentionText2\"] = mentionText2.values\n", - " df_file1[\"bbox2\"] = bbox2.values\n", - " df_file1[\"bbox1\"] = bbox1.values\n", - " df_file1[\"page_1\"] = page_1.values\n", - " df_file1[\"page_2\"] = page_2.values\n", - "\n", - " df_file1 = df_file1.drop([\"bbox\"], axis=1)\n", - " df_file1 = df_file1.drop([\"page\"], axis=1)\n", - " df_file1.rename(\n", - " columns={\n", - " \"type\": \"Entity Type\",\n", - " \"mentionText\": \"Pre_HITL_Output\",\n", - " \"mentionText2\": \"Post_HITL_Output\",\n", - " \"bbox1\": \"pre_bbox\",\n", - " \"bbox2\": \"post_bbox\",\n", - " \"page_1\": \"page1\",\n", - " \"page_2\": \"page2\",\n", - " },\n", - " inplace=True,\n", - " )\n", - " df_compare = df_compare.append(df_file1, ignore_index=True)\n", - " # adding entities which are present in file2 but not in file1\n", - " for row in df_file2.values:\n", - " df_compare.loc[len(df_compare.index)] = [\n", - " row[0],\n", - " \"Entity not found.\",\n", - " row[1],\n", - " \"[]\",\n", - " row[2],\n", - " \"[]\",\n", - " row[3],\n", - " ]\n", - "\n", - " # df_compare['Match'] = df_compare['Ground Truth Text'] == df_compare['Output Text']\n", - " match_array = []\n", - " for i in range(0, len(df_compare)):\n", - " match_string = \"\"\n", - " if (df_compare.iloc[i][\"Pre_HITL_Output\"] == \"Entity not found.\" and\n", - " df_compare.iloc[i][\"Post_HITL_Output\"] == \"Entity not found.\"):\n", - " match_string = \"TN\"\n", - " elif (df_compare.iloc[i][\"Pre_HITL_Output\"] != \"Entity not found.\" and\n", - " df_compare.iloc[i][\"Post_HITL_Output\"] == \"Entity not found.\"):\n", - " match_string = \"FN\"\n", - " elif (df_compare.iloc[i][\"Pre_HITL_Output\"] == \"Entity not found.\" and\n", - " df_compare.iloc[i][\"Post_HITL_Output\"] != \"Entity not found.\"):\n", - " match_string = \"FP\"\n", - " elif (df_compare.iloc[i][\"Pre_HITL_Output\"] != \"Entity not found.\" and\n", - " df_compare.iloc[i][\"Post_HITL_Output\"] != \"Entity not found.\"):\n", - " if (df_compare.iloc[i][\"Pre_HITL_Output\"] == df_compare.iloc[i]\n", - " [\"Post_HITL_Output\"]):\n", - " match_string = \"TP\"\n", - " else:\n", - " match_string = \"FP\"\n", - " else:\n", - " match_string = \"Something went Wrong.\"\n", - "\n", - " match_array.append(match_string)\n", - "\n", - " df_compare[\"Match\"] = match_array\n", - "\n", - " df_compare[\"Fuzzy Ratio\"] = df_compare.apply(GetMatchRatio, axis=1)\n", - " if 
list(df_compare.index):\n", - " score = df_compare[\"Fuzzy Ratio\"].sum() / len(df_compare.index)\n", - " else:\n", - " score = 0\n", - " return df_compare, score\n", - "\n", - "\n", - "def create_pdf_bytes(path):\n", - " \"\"\"THis Function will create pdf bytes from the image\n", - " content of the ground truth JSONS which will be used for processing of files\n", - " args: gs path of json file\n", - " output : pdf bytes\"\"\"\n", - "\n", - " def decode_image(image_bytes: bytes) -> Image.Image:\n", - " with io.BytesIO(image_bytes) as image_file:\n", - " image = Image.open(image_file)\n", - " image.load()\n", - " return image\n", - "\n", - " def create_pdf_from_images(images: Sequence[Image.Image]) -> bytes:\n", - " \"\"\"Creates a PDF from a sequence of images.\n", - "\n", - " The PDF will contain 1 page per image, in the same order.\n", - "\n", - " Args:\n", - " images: A sequence of images.\n", - "\n", - " Returns:\n", - " The PDF bytes.\n", - " \"\"\"\n", - " if not images:\n", - " raise ValueError(\"At least one image is required to create a PDF\")\n", - "\n", - " # PIL PDF saver does not support RGBA images\n", - " images = [\n", - " image.convert(\"RGB\") if image.mode == \"RGBA\" else image\n", - " for image in images\n", - " ]\n", - "\n", - " with io.BytesIO() as pdf_file:\n", - " images[0].save(pdf_file,\n", - " save_all=True,\n", - " append_images=images[1:],\n", - " format=\"PDF\")\n", - " return pdf_file.getvalue()\n", - "\n", - " d = documentai_v1beta3.Document\n", - " document = d.from_json(fs.cat(path))\n", - " synthesized_images = []\n", - " for i in range(len(document.pages)):\n", - " synthesized_images.append(decode_image(\n", - " document.pages[i].image.content))\n", - " pdf_bytes = create_pdf_from_images(synthesized_images)\n", - "\n", - " return pdf_bytes, synthesized_images\n", - "\n", - "\n", - "def find_excel_name():\n", - " i = 1\n", - " excel_file_name = \"HITL_VISUAL\" + str(i) + \".xlsx\"\n", - " comapare_analysis = compare_merged.drop(\n", - " [\"pre_bbox\", \"post_bbox\", \"page1\", \"page2\"], axis=1)\n", - " try:\n", - " workbook = openpyxl.load_workbook(excel_file_name)\n", - " num_sheets = len(workbook.sheetnames)\n", - " # print(num_sheets)\n", - " if num_sheets > 20:\n", - " excel_file = \"HITL_VISUAL\" + str(i + 1) + \".xlsx\"\n", - " comapare_analysis.to_excel(excel_file,\n", - " sheet_name=\"Consolidated_Data\")\n", - " else:\n", - " excel_file = \"HITL_VISUAL\" + str(i) + \".xlsx\"\n", - " except FileNotFoundError:\n", - " excel_file = \"HITL_VISUAL\" + str(i) + \".xlsx\"\n", - " comapare_analysis.to_excel(excel_file, sheet_name=\"Consolidated_Data\")\n", - " return excel_file\n", - "\n", - "\n", - "def get_visualization_excel(pre_HITL_output_URI, compare_merged,\n", - " relation_dict):\n", - " # compare_merged.to_excel(\"HITL_VISUAL1.xlsx\",sheet_name='Consolidated_Data')\n", - " pre_HITL_bucket = pre_HITL_output_URI.split(\"/\")[2]\n", - " pre_HITL_output_files, pre_HITL_output_dict = file_names(\n", - " pre_HITL_output_URI)\n", - "\n", - " for file in pre_HITL_output_dict:\n", - " excel_file = find_excel_name()\n", - " df = compare_merged.drop([\"pre_bbox\", \"post_bbox\", \"page1\", \"page2\"],\n", - " axis=1)\n", - " if file in relation_dict.keys():\n", - " df_file = df[df[\"File Name\"] == file]\n", - " with pd.ExcelWriter(excel_file, engine=\"openpyxl\",\n", - " mode=\"a\") as writer:\n", - " df_file.to_excel(writer, sheet_name=str(file))\n", - "\n", - " path = \"gs://\" + pre_HITL_bucket + \"/\" + pre_HITL_output_dict[file]\n", - " 
pdf_bytes, synthesized_images = create_pdf_bytes(path)\n", - " list_bbox_no = {}\n", - " list_bbox_yes_changed = {}\n", - " list_bbox_yes_old = {}\n", - " for row in compare_merged.values:\n", - " if row[0] == file:\n", - " if row[8] == \"NO\":\n", - " if type(row[4]) == list and row[4] != []:\n", - " try:\n", - " if row[6] in list_bbox_no.keys():\n", - " list_bbox_no[row[6]].append(row[4])\n", - " else:\n", - " list_bbox_no[row[6]] = [row[4]]\n", - " # print({row[6]:row[4]})\n", - " except:\n", - " pass\n", - " elif row[8] == \"YES\":\n", - " if type(row[5]) == list and row[5] != []:\n", - " try:\n", - " if row[7] in list_bbox_yes_changed.keys():\n", - " list_bbox_yes_changed[row[7]].append(\n", - " row[5])\n", - " else:\n", - " list_bbox_yes_changed[row[7]] = [row[5]]\n", - "\n", - " except:\n", - " pass\n", - " elif type(row[4]) == list and row[4] != []:\n", - " if row[6] in list_bbox_yes_old.keys():\n", - " list_bbox_yes_old[row[6]].append(row[4])\n", - " else:\n", - " list_bbox_yes_old[row[6]] = [row[4]]\n", - "\n", - " open_cv_image = {}\n", - " for i in range(len(synthesized_images)):\n", - " open_cv_image[i] = numpy.array(\n", - " synthesized_images[i].convert(\"RGB\"))\n", - " # print(list_bbox_yes_changed)\n", - " img_list = []\n", - " for i in range(len(open_cv_image)):\n", - " size = open_cv_image[i].shape\n", - " try:\n", - " for bbox in list_bbox_no[str(i)]:\n", - " x1 = int(bbox[0] * size[1])\n", - " x2 = int(bbox[2] * size[1])\n", - " y1 = int(bbox[1] * size[0])\n", - " y2 = int(bbox[3] * size[0])\n", - " # print(bbox[0]*size[0])\n", - " cv2.rectangle(open_cv_image[i], (x1, y1), (x2, y2),\n", - " (0, 0, 255), 2)\n", - " # cv2.putText(open_cv_image[i],'He',(x1,y1),font,2,(255,255,255),1)\n", - "\n", - " except:\n", - " pass\n", - " try:\n", - " for bbox in list_bbox_yes_changed[str(i)]:\n", - " x1 = int(bbox[0] * size[1])\n", - " x2 = int(bbox[2] * size[1])\n", - " y1 = int(bbox[1] * size[0])\n", - " y2 = int(bbox[3] * size[0])\n", - " cv2.rectangle(open_cv_image[i], (x1, y1), (x2, y2),\n", - " (255, 0, 0), 2)\n", - " except:\n", - " pass\n", - " try:\n", - " for bbox in list_bbox_yes_old[str(i)]:\n", - " x1 = int(bbox[0] * size[1])\n", - " x2 = int(bbox[2] * size[1])\n", - " y1 = int(bbox[1] * size[0])\n", - " y2 = int(bbox[3] * size[0])\n", - " cv2.rectangle(open_cv_image[i], (x1, y1), (x2, y2),\n", - " (0, 255, 0), 2)\n", - " except:\n", - " pass\n", - "\n", - " img1 = Image.fromarray(open_cv_image[i])\n", - " # img_list.append(img)\n", - " # img.save(file+str(i)+'.png')\n", - " # img.show()\n", - " import openpyxl\n", - "\n", - " workbook = openpyxl.load_workbook(excel_file)\n", - " worksheet = workbook[str(file)]\n", - "\n", - " img1.save(f\"open_cv_image[i].png\", \"PNG\")\n", - " img = openpyxl.drawing.image.Image(f\"open_cv_image[i].png\")\n", - " # if len(open_cv_image)>0:\n", - " # for i in open_cv_image:\n", - " # img.anchor = 'K'+str(int(i)*200)\n", - " img.anchor = \"K\" + str(1 + int(i) * 50)\n", - " worksheet.add_image(img)\n", - " img.width = 500\n", - " img.height = 700\n", - " workbook.save(excel_file)\n", - "\n", - "\n", - "# Execute the below code\n", - "\n", - "pre_HITL_output_URI = config.get(\"Parameters\", \"pre_hitl_output_uri\")\n", - "post_HITL_output_URI = config.get(\"Parameters\", \"post_hitl_output_uri\")\n", - "\n", - "try:\n", - " # creating temperary buckets\n", - " import datetime\n", - "\n", - " now = str(datetime.datetime.now())\n", - " now = re.sub(r\"\\W+\", \"\", now)\n", - "\n", - " print(\"Creating temporary buckets\")\n", - " 
pre_HITL_bucket_name_temp = \"pre_hitl_output\" + \"_\" + now\n", - " post_HITL_bucket_name_temp = \"post_hitl_output_temp\" + \"_\" + now\n", - " # bucket name and prefix\n", - " pre_HITL_bucket = pre_HITL_output_URI.split(\"/\")[2]\n", - " post_HITL_bucket = post_HITL_output_URI.split(\"/\")[2]\n", - " # getting all files and copying to temporary folder\n", - "\n", - " try:\n", - " check_create_bucket(pre_HITL_bucket_name_temp)\n", - " check_create_bucket(post_HITL_bucket_name_temp)\n", - " except Exception as e:\n", - " print(\"unable to create bucket because of exception : \", e)\n", - "\n", - " try:\n", - " pre_HITL_output_files, pre_HITL_output_dict = file_names(\n", - " pre_HITL_output_URI)\n", - " post_HITL_output_files, post_HITL_output_dict = file_names(\n", - " post_HITL_output_URI)\n", - " print(\"copying files to temporary bucket\")\n", - " for i in pre_HITL_output_files:\n", - " copy_blob(pre_HITL_bucket, pre_HITL_output_dict[i],\n", - " pre_HITL_bucket_name_temp, i)\n", - " for i in post_HITL_output_files:\n", - " copy_blob(\n", - " post_HITL_bucket,\n", - " post_HITL_output_dict[i],\n", - " post_HITL_bucket_name_temp,\n", - " i,\n", - " )\n", - " pre_HITL_files_list = list_blobs(pre_HITL_bucket_name_temp)\n", - " post_HITL_files_list = list_blobs(post_HITL_bucket_name_temp)\n", - " except Exception as e:\n", - " print(\"unable to get list of files in buckets because : \", e)\n", - " # processing the files and saving the files in temporary gCP bucket\n", - " fs = gcsfs.GCSFileSystem(project_id)\n", - " relation_dict, non_relation_dict = relation_dict_generator(\n", - " pre_HITL_bucket_name_temp, post_HITL_bucket_name_temp)\n", - " compare_merged = pd.DataFrame()\n", - " accuracy_docs = []\n", - " print(\n", - " \"comparing the PRE-HITL Jsons and POST-HITL jsons ....Wait for Summary \"\n", - " )\n", - " for i in relation_dict:\n", - " pre_HITL_json = blob_downloader(pre_HITL_bucket_name_temp, i)\n", - " post_HITL_json = blob_downloader(post_HITL_bucket_name_temp,\n", - " relation_dict[i])\n", - " compare_output = compare_pre_hitl_and_post_hitl_output(\n", - " pre_HITL_json, post_HITL_json)[0]\n", - " column = [relation_dict[i]] * compare_output.shape[0]\n", - " # print(column)\n", - " compare_output.insert(loc=0, column=\"File Name\", value=column)\n", - "\n", - " compare_output.insert(loc=8, column=\"hitl_update\", value=\" \")\n", - " for j in range(len(compare_output)):\n", - " if compare_output[\"Fuzzy Ratio\"][j] != 1.0:\n", - " if (compare_output[\"Pre_HITL_Output\"][j] == \"Entity not found.\"\n", - " and compare_output[\"Post_HITL_Output\"][j]\n", - " == \"Entity not found.\"):\n", - " compare_output[\"hitl_update\"][j] = \"NO\"\n", - " else:\n", - " compare_output[\"hitl_update\"][j] = \"YES\"\n", - " else:\n", - " compare_output[\"hitl_update\"][j] = \"NO\"\n", - " for k in range(len(compare_output)):\n", - " if compare_output[\"Fuzzy Ratio\"][k] != 1.0:\n", - " hitl_update = \"HITL UPDATED\"\n", - " break\n", - " else:\n", - " compare_output[\"hitl_update\"][k] = \"NO\"\n", - "\n", - " # new_row=pd.Series([i,\"Entities\",\"are updated\",\"by HITL\",\":\",np.nan,hitl_update], index=compare_output.columns)\n", - " # compare_output=compare_output.append(new_row,ignore_index= True)\n", - " frames = [compare_merged, compare_output]\n", - " compare_merged = pd.concat(frames)\n", - " try:\n", - " bucket_delete(pre_HITL_bucket_name_temp)\n", - " print(\"Deleting temperary buckets created\")\n", - " bucket_delete(post_HITL_bucket_name_temp)\n", - " except:\n", - " pass\n", 
- " compare_merged.drop([\"Match\", \"Fuzzy Ratio\"], axis=1, inplace=True)\n", - "\n", - " def highlight(s):\n", - " if s.hitl_update == \"YES\":\n", - " return [\"background-color: yellow\"] * len(s)\n", - " else:\n", - " return [\"background-color: white\"] * len(s)\n", - "\n", - " for k in non_relation_dict:\n", - " new_row = pd.Series(\n", - " [k, \"-\", \"-\", \"-\", \"\", \"\", \"\", \"\", non_relation_dict[k]],\n", - " index=compare_merged.columns,\n", - " )\n", - " compare_merged = compare_merged.append(new_row, ignore_index=True)\n", - " comapare_analysis1 = compare_merged.drop(\n", - " [\"pre_bbox\", \"post_bbox\", \"page1\", \"page2\"], axis=1)\n", - " # comapare_analysis1.to_csv('compare_analysis.csv')\n", - " entity_change = compare_merged.loc[compare_merged[\"hitl_update\"] == \"YES\"]\n", - " compare_merged_style = compare_merged.style.apply(highlight, axis=1)\n", - " try:\n", - " print(\"HITL Comparision excel is getting prepared\")\n", - " get_visualization_excel(pre_HITL_output_URI, compare_merged,\n", - " relation_dict)\n", - " print(\"Completed creating the HITL Comparision Excel\")\n", - " except Exception as e:\n", - " print(\"Unable to create HITL comparision excel because of :\", e)\n", - "except Exception as e:\n", - " try:\n", - " bucket_delete(pre_HITL_bucket_name_temp)\n", - " bucket_delete(post_HITL_bucket_name_temp)\n", - " print(\"unable to process the file : \", e)\n", - " except:\n", - " print(\"unable to process the file : \", e)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95823f2c-a91b-4bb9-85aa-5d90f17fff05", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "environment": { - "kernel": "python3", - "name": "common-cpu.m104", - "type": "gcloud", - "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/DocAI Incubator Tools/best-practices/README.md b/DocAI Incubator Tools/best-practices/README.md index 34ea8b4f1..4ecc75cf5 100644 --- a/DocAI Incubator Tools/best-practices/README.md +++ b/DocAI Incubator Tools/best-practices/README.md @@ -1,4 +1,4 @@ -# Notebooks +# Folders * Key_Value_Pair_Entity_Conversion diff --git a/DocAI Incubator Tools/best-practices/Removing Empty Bounding Boxes/Removing Empty Bounding Boxes.ipynb b/DocAI Incubator Tools/best-practices/Removing Empty Bounding Boxes/Removing Empty Bounding Boxes.ipynb new file mode 100644 index 000000000..5189fcefd --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Removing Empty Bounding Boxes/Removing Empty Bounding Boxes.ipynb @@ -0,0 +1,324 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "72bc511c-2282-4d2b-a2d4-28d25c07ffa2", + "metadata": { + "tags": [] + }, + "source": [ + "# DocAI - Script for Removing Empty Bounding Boxes" + ] + }, + { + "cell_type": "markdown", + "id": "778291f7-ed93-4f0f-abc9-c9f82245cf23", + "metadata": {}, + "source": [ + "* Author: docai-incubator@google.com" + ] + }, + { + "cell_type": "markdown", + "id": "2c92559f-6a39-4318-b8ae-064325723cc7", + "metadata": {}, + "source": [ + "## Disclaimer\n", + "\n", + "This tool is not supported by the Google engineering team or product team. 
It is provided and supported on a best-effort basis by the DocAI Incubator Team. No guarantees of performance are implied.\t" + ] + }, + { + "cell_type": "markdown", + "id": "ce8e2b95-c07f-40ac-bb50-93b20b143ded", + "metadata": {}, + "source": [ + "## Purpose of the Script" + ] + }, + { + "cell_type": "markdown", + "id": "c5a2c845-2df8-41f4-9e3d-59c559a3c4b4", + "metadata": {}, + "source": [ + "\n", + "The purpose of this document is to provide instructions and a Python script for removing empty bounding boxes from a labeled JSON file. The script identifies and removes any bounding boxes (entities) in the JSON file that do not contain any mentionText or textAnchors, streamlining the labeling process and improving the accuracy of the labeling data.\n" + ] + }, + { + "cell_type": "markdown", + "id": "42fa7cdd-9654-42c2-8612-ea779a20a48a", + "metadata": {}, + "source": [ + "## Prerequisites" + ] + }, + { + "cell_type": "markdown", + "id": "728a861b-21e4-4300-aef8-650cb390a271", + "metadata": {}, + "source": [ + "1. Python : Jupyter notebook (Vertex AI) \n", + "2. Service account permissions in projects." + ] + }, + { + "cell_type": "markdown", + "id": "59fa8919-7096-4b92-8604-a4a6daa224f9", + "metadata": {}, + "source": [ + "## Installation Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "60803eb8-6452-49f2-992c-08465e55b770", + "metadata": {}, + "source": [ + "The script consists of Python code. It can be loaded and run via: \n", + "1. Upload the IPYNB file or copy the code to the Vertex Notebook and follow the operation procedure. \\\n", + "**NOTE:** Don’t Execute the Script with Processor Dataset Path. Export the dataset to json and then use that bucket as an input." + ] + }, + { + "cell_type": "markdown", + "id": "6993aead-9bb6-45a4-8fba-c3be04fd3322", + "metadata": {}, + "source": [ + "## Operation Procedure" + ] + }, + { + "cell_type": "markdown", + "id": "1d18824a-111d-44b4-8df5-c8aadff24202", + "metadata": {}, + "source": [ + "### 1. Import the modules" + ] + }, + { + "cell_type": "markdown", + "id": "b7ec98b1-f8fa-4bab-b2cb-5bd49b4c6c38", + "metadata": {}, + "source": [ + "**Note :** external modules are used so they need to be installed. To install run these commands : " + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "1d0f9226-89cd-4fca-8090-7303b48df66d", + "metadata": {}, + "outputs": [], + "source": [ + "# !pip install gcsfs\n", + "# !pip install google-cloud\n", + "import json\n", + "from pathlib import Path\n", + "\n", + "import gcsfs\n", + "import google.auth\n", + "import pandas as pd\n", + "from google.cloud import storage\n", + "from tqdm import tqdm\n", + "from google.cloud import documentai_v1beta3 as documentai" + ] + }, + { + "cell_type": "markdown", + "id": "63d8482e-0450-498c-940c-e775037ef907", + "metadata": {}, + "source": [ + "### 2. Setup the required inputs" + ] + }, + { + "cell_type": "markdown", + "id": "5c6e1955-4667-49b8-b6fa-5eb06546d7e1", + "metadata": {}, + "source": [ + "* **PROJECT_ID** - Your Google project id or name\n", + "* **BUCKET_NAME** - Name of the bucket\n", + "* **INPUT_FOLDER_PATH** - The path of the folder containing the JSON files to be processed, without the bucket name.\n", + "* **OUTPUT_FOLDER_PATH** - The path of the folder where the JSON files need to be stored after process, without the bucket * name.\n", + "\n", + "**Note :** Both Input and output paths should be in the same bucket. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bbbc4b67-fbae-4e7b-8518-ddde8b2f30c5", + "metadata": {}, + "outputs": [], + "source": [ + "PROJECT_ID = \"rand-automl-project\"\n", + "BUCKET_NAME = \"accenture_line_items_samples\"\n", + "INPUT_FOLDER_PATH = \"output/output/2839778604252110189/0\" # Path without bucket name\n", + "OUTPUT_FOLDER_PATH = \"output_atul/output/\" # Path without bucket name\n", + "credentials, _ = google.auth.default()\n", + "fs = gcsfs.GCSFileSystem(project=PROJECT_ID, token=credentials)" + ] + }, + { + "cell_type": "markdown", + "id": "5b5b0743-d63b-4960-a46d-f2ffd9bcd252", + "metadata": {}, + "source": [ + "### 3. Execute the code" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1d9e2908-2566-49b4-b5e0-a8b005b8821c", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "No. of files : 1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "100%|██████████| 1/1 [00:00<00:00, 1.62it/s]\n" + ] + } + ], + "source": [ + "def get_file(file_path: str) -> documentai.Document:\n", + " \"\"\"\n", + " To read files from cloud storage.\n", + " \"\"\"\n", + " file_object = fs.cat(file_path)\n", + " doc = documentai.Document.from_json(file_object) # JSON to DocumentProto Format\n", + " return doc\n", + "\n", + "def store_blob(document, file: str):\n", + " \"\"\"\n", + " Store files in cloud storage.\n", + " \"\"\"\n", + " storage_client = storage.Client()\n", + " result_bucket = storage_client.get_bucket(BUCKET_NAME)\n", + " document_blob = storage.Blob(name=str(file), bucket=result_bucket)\n", + " document_blob.upload_from_string(documentai.Document.to_json(document),\n", + " content_type=\"application/json\")\n", + "def main():\n", + " logs = pd.DataFrame(columns=[\"FileName\"])\n", + "\n", + " files = [\n", + " i for i in fs.find(f\"{BUCKET_NAME}/{INPUT_FOLDER_PATH}\")\n", + " if i.endswith(\".json\")\n", + " ]\n", + " document_files_list = [get_file(i) for i in files]\n", + " print(\"No. 
of files : \", len(files))\n", + "\n", + " for index in tqdm(range(len(files))):\n", + " file_name = files[index].split(\"/\", 1)[-1]\n", + " output_file_name = file_name.replace(INPUT_FOLDER_PATH,OUTPUT_FOLDER_PATH)\n", + " is_updated = False\n", + " doc = document_files_list[index]\n", + " # print(doc)\n", + " sub_log = pd.DataFrame(columns=[file_name])\n", + " # for i in reversed(range(len(doc[\"entities\"]))):\n", + " # entity = doc[\"entities\"][i]\n", + " if doc.entities:\n", + " for entity in doc.entities:\n", + " if not entity.mention_text:\n", + " sub_log = sub_log.append({file_name: entity.type},\n", + " ignore_index=True)\n", + " doc.entities.remove(entity)\n", + " is_updated = True\n", + " continue\n", + " else:\n", + " if entity.properties and entity.mention_text.strip():\n", + " for sub_entity in entity.properties:\n", + " if (sub_entity.mention_text):\n", + " if (sub_entity.mention_text.strip() == \"\"):\n", + " sub_log = sub_log.append({file_name:sub_entity.type},ignore_index=True)\n", + " entity.properties.remove(sub_entity)\n", + " is_updated = True\n", + " continue\n", + " elif (not sub_entity.mention_text):\n", + " sub_log = sub_log.append({file_name:sub_entity.type},ignore_index=True)\n", + " entity.properties.remove(sub_entity) \n", + " is_updated = True\n", + " continue\n", + " if (not sub_entity.text_anchor):\n", + " sub_log = sub_log.append({file_name:sub_entity.type},ignore_index=True)\n", + " entity.properties.remove(sub_entity) \n", + " is_updated = True\n", + " continue\n", + " elif (sub_entity.text_anchor):\n", + " if (not sub_entity.text_anchor.text_segments):\n", + " sub_log = sub_log.append({file_name:sub_entity.type},ignore_index=True)\n", + " entity.properties.remove(sub_entity) \n", + " is_updated = True\n", + " continue\n", + " elif (len(sub_entity.text_anchor.text_segments) < 1):\n", + " sub_log = sub_log.append({file_name:sub_entity.type},ignore_index=True)\n", + " entity.properties.remove(sub_entity) \n", + " is_updated = True\n", + " continue\n", + " else:\n", + " print(\"Entities missing : \", files[index])\n", + " # if is_updated:\n", + " store_blob(doc, output_file_name)\n", + " if not sub_log.empty:\n", + " logs = pd.concat([logs, sub_log], axis=1)\n", + " # logs.drop(\"FileName\", axis=1, inplace=True)\n", + " logs.to_csv(\"output.csv\", index=False)\n", + "\n", + "\n", + "main()" + ] + }, + { + "cell_type": "markdown", + "id": "e5345184-bb87-4b08-bda0-0bb7e7eca45e", + "metadata": {}, + "source": [ + "## Output File" + ] + }, + { + "cell_type": "markdown", + "id": "ef59951f-930c-4c7c-8df5-e2738b0d8ff8", + "metadata": {}, + "source": [ + "The script deletes all bounding boxes (entities) in the JSON file that do not contain any mentionText or textAnchors, and overwrites the file. The script will also create a CSV file containing a list of deleted entities." 
+ ] + } + ], + "metadata": { + "environment": { + "kernel": "python3", + "name": "common-cpu.m104", + "type": "gcloud", + "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/DocAI Incubator Tools/best-practices/Removing Empty Bounding Boxes/readme.md b/DocAI Incubator Tools/best-practices/Removing Empty Bounding Boxes/readme.md new file mode 100644 index 000000000..93c1b3054 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Removing Empty Bounding Boxes/readme.md @@ -0,0 +1,21 @@ + +## Purpose of the Script +This Python script are designed to streamline the labeling process of JSON files by identifying and removing empty bounding boxes. Specifically, the script eliminates bounding boxes (or entities) that lack `mentionText` or `textAnchors`, thus enhancing the accuracy of the labeling data. + +## Inputs + +- **PROJECT_ID**: Your Google project ID or name. +- **BUCKET_NAME**: The name of the bucket where your JSON files are stored. +- **INPUT_FOLDER_PATH**: Path to the folder containing the JSON files you wish to process. Do not include the bucket name in this path. +- **OUTPUT_FOLDER_PATH**: Path to the folder where you'd like to store the processed JSON files. Again, exclude the bucket name from this path. + +**Note**: Ensure that both the input and output paths reside within the same bucket. + +## Output +Upon execution, the script performs the following: + +1. Identifies and deletes all bounding boxes (or entities) in the JSON file that are devoid of `mentionText` or `textAnchors`. +2. Overwrites the original JSON file with the cleaned version. +3. Generates a CSV file that lists all the entities it removed. + +By using this script, you can ensure a more refined and accurate dataset by eliminating unnecessary or empty entities. diff --git a/DocAI Incubator Tools/best-practices/Removing_Empty_Bounding_Boxes.ipynb b/DocAI Incubator Tools/best-practices/Removing_Empty_Bounding_Boxes.ipynb deleted file mode 100644 index f3793d6ad..000000000 --- a/DocAI Incubator Tools/best-practices/Removing_Empty_Bounding_Boxes.ipynb +++ /dev/null @@ -1,420 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "72bc511c-2282-4d2b-a2d4-28d25c07ffa2", - "metadata": {}, - "source": [ - "# DocAI - Script for Removing Empty Bounding Boxes" - ] - }, - { - "cell_type": "markdown", - "id": "778291f7-ed93-4f0f-abc9-c9f82245cf23", - "metadata": {}, - "source": [ - "* Author: docai-incubator@google.com" - ] - }, - { - "cell_type": "markdown", - "id": "2c92559f-6a39-4318-b8ae-064325723cc7", - "metadata": {}, - "source": [ - "## Disclaimer\n", - "\n", - "This tool is not supported by the Google engineering team or product team. It is provided and supported on a best-effort basis by the DocAI Incubator Team. 
No guarantees of performance are implied.\t" - ] - }, - { - "cell_type": "markdown", - "id": "ce8e2b95-c07f-40ac-bb50-93b20b143ded", - "metadata": {}, - "source": [ - "## Purpose of the Script" - ] - }, - { - "cell_type": "markdown", - "id": "c5a2c845-2df8-41f4-9e3d-59c559a3c4b4", - "metadata": {}, - "source": [ - "\n", - "The purpose of this document is to provide instructions and a Python script for removing empty bounding boxes from a labeled JSON file. The script identifies and removes any bounding boxes (entities) in the JSON file that do not contain any mentionText or textAnchors, streamlining the labeling process and improving the accuracy of the labeling data.\n" - ] - }, - { - "cell_type": "markdown", - "id": "42fa7cdd-9654-42c2-8612-ea779a20a48a", - "metadata": {}, - "source": [ - "## Prerequisites" - ] - }, - { - "cell_type": "markdown", - "id": "728a861b-21e4-4300-aef8-650cb390a271", - "metadata": {}, - "source": [ - "1. Python : Jupyter notebook (Vertex AI) \n", - "2. Service account permissions in projects." - ] - }, - { - "cell_type": "markdown", - "id": "59fa8919-7096-4b92-8604-a4a6daa224f9", - "metadata": {}, - "source": [ - "## Installation Procedure" - ] - }, - { - "cell_type": "markdown", - "id": "60803eb8-6452-49f2-992c-08465e55b770", - "metadata": {}, - "source": [ - "The script consists of Python code. It can be loaded and run via: \n", - "1. Upload the IPYNB file or copy the code to the Vertex Notebook and follow the operation procedure. \\\n", - "**NOTE:** Don’t Execute the Script with Processor Dataset Path. Export the dataset to json and then use that bucket as an input." - ] - }, - { - "cell_type": "markdown", - "id": "6993aead-9bb6-45a4-8fba-c3be04fd3322", - "metadata": {}, - "source": [ - "## Operation Procedure" - ] - }, - { - "cell_type": "markdown", - "id": "1d18824a-111d-44b4-8df5-c8aadff24202", - "metadata": {}, - "source": [ - "### 1. Import the modules" - ] - }, - { - "cell_type": "markdown", - "id": "b7ec98b1-f8fa-4bab-b2cb-5bd49b4c6c38", - "metadata": {}, - "source": [ - "**Note :** external modules are used so they need to be installed. To install run these commands : " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f321c289-fb5f-4fbe-901b-5e5725f71b1a", - "metadata": {}, - "outputs": [], - "source": [ - "!pip install gcsfs\n", - "!pip install google-cloud\n", - "import json\n", - "from pathlib import Path\n", - "\n", - "import gcsfs\n", - "import google.auth\n", - "import pandas as pd\n", - "from google.cloud import storage\n", - "from tqdm import tqdm" - ] - }, - { - "cell_type": "markdown", - "id": "63d8482e-0450-498c-940c-e775037ef907", - "metadata": {}, - "source": [ - "### 2. Setup the required inputs" - ] - }, - { - "cell_type": "markdown", - "id": "5c6e1955-4667-49b8-b6fa-5eb06546d7e1", - "metadata": {}, - "source": [ - "* **PROJECT_ID** - Your Google project id or name\n", - "* **BUCKET_NAME** - Name of the bucket\n", - "* **INPUT_FOLDER_PATH** - The path of the folder containing the JSON files to be processed, without the bucket name.\n", - "* **OUTPUT_FOLDER_PATH** - The path of the folder where the JSON files need to be stored after process, without the bucket * name.\n", - "\n", - "**Note :** Both Input and output paths should be in the same bucket. 
" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bbbc4b67-fbae-4e7b-8518-ddde8b2f30c5", - "metadata": {}, - "outputs": [], - "source": [ - "PROJECT_ID = \"xxxxxx-xxxxxx-xxxxx\"\n", - "BUCKET_NAME = \"xxxxxxx\"\n", - "INPUT_FOLDER_PATH = \"xxxxxxxx/xxxxxxxxx/xxxxxx\" # Path without bucket name\n", - "OUTPUT_FOLDER_PATH = \"xxxxxxxx/xxxxxxx/xxxxxxx/xxx\" # Path without bucket name\n", - "credentials, _ = google.auth.default()\n", - "fs = gcsfs.GCSFileSystem(project=PROJECT_ID, token=credentials)" - ] - }, - { - "cell_type": "markdown", - "id": "5b5b0743-d63b-4960-a46d-f2ffd9bcd252", - "metadata": {}, - "source": [ - "### 3. Execute the code" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3e06c32a-74d9-497a-907b-2a37265a0984", - "metadata": {}, - "outputs": [], - "source": [ - "def get_file(file_path: str):\n", - " \"\"\"\n", - " To read files from cloud storage.\n", - " \"\"\"\n", - " file_object = json.loads(fs.cat(file_path))\n", - " return file_object\n", - "\n", - "\n", - "def store_blob(document, file: str):\n", - " \"\"\"\n", - " Store files in cloud storage.\n", - " \"\"\"\n", - " storage_client = storage.Client()\n", - " result_bucket = storage_client.get_bucket(BUCKET_NAME)\n", - " document_blob = storage.Blob(name=str(file), bucket=result_bucket)\n", - " document_blob.upload_from_string(json.dumps(document),\n", - " content_type=\"application/json\")\n", - "\n", - "\n", - "def main():\n", - " logs = pd.DataFrame(columns=[\"FileName\"])\n", - "\n", - " files = [\n", - " i for i in fs.find(f\"{BUCKET_NAME}/{INPUT_FOLDER_PATH}\")\n", - " if i.endswith(\".json\")\n", - " ]\n", - " json_files_list = [get_file(i) for i in files]\n", - " print(\"No. of files : \", len(files))\n", - "\n", - " for index in tqdm(range(len(files))):\n", - " file_name = files[index].split(\"/\", 1)[-1]\n", - " output_file_name = file_name.replace(INPUT_FOLDER_PATH,\n", - " OUTPUT_FOLDER_PATH)\n", - " is_updated = False\n", - " json_content = json_files_list[index]\n", - " sub_log = pd.DataFrame(columns=[file_name])\n", - " if \"mime_type\" in json_content.keys():\n", - " mention_text_key = \"mention_text\"\n", - " text_anchor_key = \"text_anchor\"\n", - " text_segment_key = \"text_segments\"\n", - " else:\n", - " mention_text_key = \"mentionText\"\n", - " text_anchor_key = \"textAnchor\"\n", - " text_segment_key = \"textSegments\"\n", - " if \"entities\" in json_content.keys():\n", - " for i in reversed(range(len(json_content[\"entities\"]))):\n", - " entity = json_content[\"entities\"][i]\n", - " if mention_text_key not in entity.keys():\n", - " sub_log = sub_log.append({file_name: entity[\"type\"]},\n", - " ignore_index=True)\n", - " del json_content[\"entities\"][i]\n", - " is_updated = True\n", - " continue\n", - " else:\n", - " if (\"properties\" in json_content[\"entities\"][i].keys()\n", - " and entity[mention_text_key].strip()):\n", - " for j in range(\n", - " len(json_content[\"entities\"][i][\"properties\"])\n", - " - 1, -1, -1):\n", - " if (mention_text_key in json_content[\"entities\"][i]\n", - " [\"properties\"][j].keys()):\n", - " if (json_content[\"entities\"][i][\"properties\"]\n", - " [j][mention_text_key].strip() == \"\"):\n", - " sub_log = sub_log.append(\n", - " {\n", - " file_name:\n", - " json_content[\"entities\"][i]\n", - " [\"properties\"][j][\"type\"]\n", - " },\n", - " ignore_index=True,\n", - " )\n", - " del json_content[\"entities\"][i][\n", - " \"properties\"][j]\n", - " is_updated = True\n", - " continue\n", - " elif 
(mention_text_key\n", - " not in json_content[\"entities\"][i]\n", - " [\"properties\"][j].keys()):\n", - " sub_log = sub_log.append(\n", - " {\n", - " file_name:\n", - " json_content[\"entities\"][i]\n", - " [\"properties\"][j][\"type\"]\n", - " },\n", - " ignore_index=True,\n", - " )\n", - " del json_content[\"entities\"][i][\"properties\"][\n", - " j]\n", - " is_updated = True\n", - " continue\n", - " if (text_anchor_key not in json_content[\"entities\"]\n", - " [i][\"properties\"][j].keys()):\n", - " sub_log = sub_log.append(\n", - " {\n", - " file_name:\n", - " json_content[\"entities\"][i]\n", - " [\"properties\"][j][\"type\"]\n", - " },\n", - " ignore_index=True,\n", - " )\n", - " del json_content[\"entities\"][i]\n", - " is_updated = True\n", - " continue\n", - " elif (text_anchor_key in json_content[\"entities\"]\n", - " [i][\"properties\"][j].keys()):\n", - " if (text_segment_key\n", - " not in json_content[\"entities\"][i]\n", - " [\"properties\"][j][text_anchor_key].keys()):\n", - " sub_log = sub_log.append(\n", - " {\n", - " file_name:\n", - " json_content[\"entities\"][i]\n", - " [\"properties\"][j][\"type\"]\n", - " },\n", - " ignore_index=True,\n", - " )\n", - " del json_content[\"entities\"][i][\n", - " \"properties\"][j]\n", - " is_updated = True\n", - " continue\n", - " elif (len(json_content[\"entities\"][i]\n", - " [\"properties\"][j][text_anchor_key]\n", - " [text_segment_key]) < 1):\n", - " sub_log = sub_log.append(\n", - " {\n", - " file_name:\n", - " json_content[\"entities\"][i]\n", - " [\"properties\"][j][\"type\"]\n", - " },\n", - " ignore_index=True,\n", - " )\n", - " del json_content[\"entities\"][i][\n", - " \"properties\"][j]\n", - " is_updated = True\n", - " continue\n", - "\n", - " elif not entity[mention_text_key].strip():\n", - " sub_log = sub_log.append({file_name: entity[\"type\"]},\n", - " ignore_index=True)\n", - " del json_content[\"entities\"][i]\n", - " is_updated = True\n", - " continue\n", - "\n", - " if text_anchor_key not in entity.keys():\n", - " sub_log = sub_log.append({file_name: entity[\"type\"]},\n", - " ignore_index=True)\n", - " del json_content[\"entities\"][i]\n", - " is_updated = True\n", - " continue\n", - " elif text_anchor_key in entity.keys():\n", - " if text_segment_key not in entity[text_anchor_key].keys():\n", - " sub_log = sub_log.append({file_name: entity[\"type\"]},\n", - " ignore_index=True)\n", - " del json_content[\"entities\"][i]\n", - " is_updated = True\n", - " continue\n", - " elif len(entity[text_anchor_key][text_segment_key]) < 1:\n", - " sub_log = sub_log.append({file_name: entity[\"type\"]},\n", - " ignore_index=True)\n", - " del json_content[\"entities\"][i]\n", - " is_updated = True\n", - " continue\n", - " else:\n", - " print(\"Entities missing : \", files[index])\n", - " # if is_updated:\n", - " store_blob(json_content, output_file_name)\n", - " if not sub_log.empty:\n", - " logs = pd.concat([logs, sub_log], axis=1)\n", - " logs.drop(\"FileName\", axis=1, inplace=True)\n", - " logs.to_csv(\"output.csv\", index=False)\n", - "\n", - "\n", - "main()" - ] - }, - { - "cell_type": "markdown", - "id": "e5345184-bb87-4b08-bda0-0bb7e7eca45e", - "metadata": {}, - "source": [ - "## Output File" - ] - }, - { - "cell_type": "markdown", - "id": "ef59951f-930c-4c7c-8df5-e2738b0d8ff8", - "metadata": {}, - "source": [ - "The script deletes all bounding boxes (entities) in the JSON file that do not contain any mentionText or textAnchors, and overwrites the file. 
The script will also create a CSV file containing a list of deleted entities." - ] - }, - { - "cell_type": "markdown", - "id": "c0caa1d1-783f-4c66-814f-945c402fe415", - "metadata": {}, - "source": [ - "## Reference Links" - ] - }, - { - "cell_type": "markdown", - "id": "465741c1-2f9a-4303-bdb4-bf688bb4c128", - "metadata": {}, - "source": [ - "Drive Link to IPYNB File : [empty_bounding_box_removal_script.ipynb](https://drive.google.com/file/d/1rQJUFCYYwpex8agJPDId01eow0T91MN7/view?usp=sharing)\n", - "\n", - "Sample CSV output File : [empty_entity_output.csv](https://drive.google.com/file/d/1lgJsyu0Wkttox2pO2f7ex4c0w0ec3vPA/view?usp=share_link)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "05007337-77b0-44b0-9010-4fb3b165f9a7", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "environment": { - "kernel": "python3", - "name": "common-cpu.m104", - "type": "gcloud", - "uri": "gcr.io/deeplearning-platform-release/base-cpu:m104" - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/DocAI Incubator Tools/best-practices/Utilities/readme.md b/DocAI Incubator Tools/best-practices/Utilities/readme.md new file mode 100644 index 000000000..a30d503b0 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Utilities/readme.md @@ -0,0 +1,33 @@ +# Utilities for Google Cloud Storage Operations + +The `utilities.py` script offers a suite of utility functions designed to simplify and streamline operations associated with Google Cloud Storage (GCS) and Google Cloud's DocumentAI. + +## Features + +1. **Retrieving File Names**: Quickly fetch a list of files from a given GCS path. +2. **Bucket Management**: Effortlessly check for the existence of a GCS bucket, create one if it doesn't exist, or delete it when done. +3. **File Listings**: Retrieve a list of all files (blobs) within a specified GCS bucket. +4. **File Matching**: Compare files between two GCS buckets to identify similar filenames. +5. **Document Conversion**: Download a file from GCS and convert it into a DocumentAI Document proto. +6. **Blob Operations**: Easily copy blobs (files or objects) from one GCS bucket to another. +and several other functions. + +## Dependencies + +The functions mainly rely on Google Cloud Python client libraries such as: +- `google.cloud.storage` +- `google.cloud.documentai_v1beta3` + +To utilize these utilities, ensure the appropriate packages are installed and you have set up authentication with GCS. + +## Usage + +To use the functions, simply import `utilities.py` into your script or notebook: + +```python +import utilities +``` + +## Note +For detailed functionality and parameters of each function, kindly refer to the docstrings within utilities.py. 
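
## Example

As an orientation aid, a few of these helpers can be chained into a typical inspection workflow. The snippet below is only a sketch: it assumes Application Default Credentials are configured and uses placeholder bucket and folder names.

```python
import utilities

# List exported Document JSON files under a GCS prefix (placeholder path).
file_list, file_paths = utilities.file_names("gs://my-bucket/exported-dataset/")

# Download the first file and parse it into a DocumentAI Document proto.
first_file = file_list[0]
doc = utilities.documentai_json_proto_downloader("my-bucket", file_paths[first_file])

# Flatten its entities (type, mention text, bounding box, page) into a DataFrame.
df = utilities.JsonToDataframe(doc)
print(df.head())
```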
+ diff --git a/DocAI Incubator Tools/best-practices/Utilities/utilities.py b/DocAI Incubator Tools/best-practices/Utilities/utilities.py new file mode 100644 index 000000000..a6f8a0e19 --- /dev/null +++ b/DocAI Incubator Tools/best-practices/Utilities/utilities.py @@ -0,0 +1,712 @@ +# --- +# jupyter: +# jupytext: +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.14.5 +# kernelspec: +# display_name: Python 3 +# language: python +# name: python3 +# --- + +from google.cloud import storage +import ast +import configparser +import difflib +import io +import json +import operator +import os +import re +import time +from collections.abc import Container, Iterable, Iterator, Mapping, Sequence +from typing import List, Optional, Tuple, Union +import numpy as np +# Import the libraries +import pandas as pd +from google.cloud import documentai_v1beta3, storage +from PIL import Image +from PyPDF2 import PdfFileReader + +pd.options.mode.chained_assignment = None # default='warn' +import datetime +import json +import os +import utilities + +def file_names(GS_File_path): + """ + Retrieves the list of files from a given Google Cloud Storage path. + + Args: + GS_File_path (str): The Google Cloud Storage path in the format "gs:////". + + Returns: + tuple: A tuple containing two elements: + 1. List of filenames present in the specified path. + 2. Dictionary with filenames as keys and their respective full paths in the bucket as values. + """ + + from google.cloud import storage + + bucket = GS_File_path.split("/")[2] + file_names_list = [] + file_dict = {} + + storage_client = storage.Client() + source_bucket = storage_client.get_bucket(bucket) + + filenames = [filename.name for filename in list(source_bucket.list_blobs(prefix=(('/').join(GS_File_path.split('/')[3:]))))] + + for i in range(len(filenames)): + x = filenames[i].split('/')[-1] + if x: + file_names_list.append(x) + file_dict[x] = filenames[i] + + return file_names_list, file_dict + +def check_create_bucket(bucket_name): + """ + Checks if a specified Google Cloud Storage bucket exists. If not, it creates one. + Primarily used for creating a temporary bucket to store processed files. + + Args: + bucket_name (str): The name of the bucket to check or create. + + Returns: + google.cloud.storage.bucket.Bucket: The bucket object corresponding to the provided bucket name. + """ + + from google.cloud import storage + + storage_client = storage.Client() + try: + bucket = storage_client.get_bucket(bucket_name) + print(f"Bucket {bucket_name} already exists.") + except: + bucket = storage_client.create_bucket(bucket_name) + print(f"Bucket {bucket_name} created.") + + return bucket + + +def bucket_delete(bucket_name): + """ + Deletes a specified Google Cloud Storage bucket. + Primarily used for deleting temporary buckets after their purpose is served. + + Args: + bucket_name (str): The name of the bucket to be deleted. + + Returns: + None. If the bucket exists, it will be deleted. If it doesn't exist or an error occurs, the function will silently pass. + """ + + print("Deleting bucket:", bucket_name) + from google.cloud import storage + + storage_client = storage.Client() + try: + bucket = storage_client.get_bucket(bucket_name) + bucket.delete(force=True) + except: + pass + + +def list_blobs(bucket_name): + """ + Retrieves a list of filenames (blobs) from a specified Google Cloud Storage bucket. + + Args: + bucket_name (str): The name of the bucket from which to retrieve the list of filenames. 
+ + Returns: + list: A list containing the names of all files (blobs) present in the specified bucket. + """ + + from google.cloud import storage + + blob_list = [] + storage_client = storage.Client() + blobs = storage_client.list_blobs(bucket_name) + + for blob in blobs: + blob_list.append(blob.name) + + return blob_list + + +def matching_files_two_buckets(bucket_1, bucket_2): + import difflib + """ + Compares the files from two Google Cloud Storage buckets to find files with similar names. + + Args: + bucket_1 (str): Name of the first GCS bucket. + bucket_2 (str): Name of the second GCS bucket. + + Returns: + tuple: A tuple containing two dictionaries: + 1. matched_files_dict: Dictionary with filenames from bucket_1 as keys and corresponding similar filenames from bucket_2 as values. + 2. non_matched_files_dict: Dictionary with filenames from bucket_1 as keys and a message indicating no similar file was found in bucket_2 as values. + """ + + bucket_1_blobs = list_blobs(bucket_1) + bucket_2_blobs = list_blobs(bucket_2) + + matched_files_dict = {} + non_matched_files_dict = {} + + for i in bucket_1_blobs: + for j in bucket_2_blobs: + matched_score = difflib.SequenceMatcher(None, i, j).ratio() + + print('matched_score:', matched_score) + if matched_score >= 0.8: + matched_files_dict[i] = j + else: + non_matched_files_dict[i] = "No parsed output available" + + for i in matched_files_dict: + if i in non_matched_files_dict.keys(): + del non_matched_files_dict[i] + + print('matched_files_dict =', matched_files_dict) + print('non_matched_files_dict =', non_matched_files_dict) + + return matched_files_dict, non_matched_files_dict + + +def documentai_json_proto_downloader(bucket_name, blob_name_with_prefix_path): + """ + Downloads a file from a specified Google Cloud Storage bucket and converts it into a DocumentAI Document proto. + + Args: + bucket_name (str): The name of the GCS bucket from which to download the file. + blob_name_with_prefix_path (str): The full path (prefix) to the JSON blob in the bucket. + + Returns: + documentai.Document: A DocumentAI Document proto representation of the downloaded JSON. + """ + + from google.cloud import documentai_v1beta3 as documentai + from google.cloud import storage + + storage_client = storage.Client() + bucket = storage_client.bucket(bucket_name) + blob = bucket.blob(blob_name_with_prefix_path) + + contents = blob.download_as_string() + doc = documentai.Document.from_json(contents.decode()) + + return doc + + + +def copy_blob( + bucket_name, blob_name, destination_bucket_name, destination_blob_name): + """ + Copies a blob (file/object) from one GCP storage bucket to another. + + Args: + bucket_name (str): Name of the source bucket. + blob_name (str): Name of the blob (file/object) in the source bucket to be copied. + destination_bucket_name (str): Name of the destination bucket. + destination_blob_name (str): Desired name for the blob in the destination bucket. + + Output: + None. The blob is copied to the destination bucket with the specified name. + """ + from google.cloud import storage + storage_client = storage.Client() + source_bucket = storage_client.bucket(bucket_name) + source_blob = source_bucket.blob(blob_name) + destination_bucket = storage_client.bucket(destination_bucket_name) + blob_copy = source_bucket.copy_blob( + source_blob, destination_bucket, destination_blob_name + ) + + +def JsonToDataframe(data): + """ + Converts a loaded DocumentAI proto JSON into a pandas DataFrame. 
+
+
+def JsonToDataframe(data):
+    """
+    Converts a DocumentAI Document proto (for example, the output of
+    documentai_json_proto_downloader) into a pandas DataFrame of its entities.
+
+    Args:
+        data (documentai.Document): A DocumentAI Document proto object.
+
+    Returns:
+        pandas.DataFrame: A DataFrame with columns ['type_', 'mention_text', 'bbox', 'page'].
+                          'type_' column indicates the type of entity.
+                          'mention_text' column contains the text of the entity or its property.
+                          'bbox' column contains bounding box coordinates.
+                          'page' column indicates the page number where the entity is found.
+    """
+
+    import pandas as pd
+
+    df = pd.DataFrame(columns=['type_', 'mention_text', 'bbox', 'page'])
+
+    try:
+        for entity in data.entities:
+            # First, we'll assume it doesn't have properties
+            has_properties = False
+
+            # Check for subentities (properties)
+            try:
+                for subentity in entity.properties:
+                    has_properties = True  # Mark that we found properties
+                    bbox = []
+                    try:
+                        bound_poly = subentity.page_anchor.page_refs
+                        coordinates_xy = bound_poly[0].bounding_poly.normalized_vertices
+                        x_1 = [xy.x for xy in coordinates_xy]
+                        y_1 = [xy.y for xy in coordinates_xy]
+
+                        try:
+                            page = subentity.page_anchor.page_refs[0].page
+                        except Exception:
+                            page = 0
+                        bbox = [round(min(x_1), 8), round(min(y_1), 8), round(max(x_1), 8), round(max(y_1), 8)]
+                        df.loc[len(df.index)] = [subentity.type_, subentity.mention_text, bbox, page]
+                    except Exception:
+                        continue
+
+            except Exception:
+                continue
+
+            # If no properties were found for the entity, add it to the dataframe
+            if not has_properties:
+                try:
+                    bbox = []
+                    bound_poly = entity.page_anchor.page_refs
+                    coordinates_xy = bound_poly[0].bounding_poly.normalized_vertices
+                    x_1 = [xy.x for xy in coordinates_xy]
+                    y_1 = [xy.y for xy in coordinates_xy]
+                    try:
+                        page = entity.page_anchor.page_refs[0].page
+                    except Exception:
+                        page = 0
+
+                    bbox = [round(min(x_1), 8), round(min(y_1), 8), round(max(x_1), 8), round(max(y_1), 8)]
+                    df.loc[len(df.index)] = [entity.type_, entity.mention_text, bbox, page]
+                except Exception:
+                    continue
+
+        return df
+    except Exception:
+        return df
+
+
+def blob_downloader(bucket_name, blob_name):
+    """
+    Downloads a JSON file from a specified Google Cloud Storage bucket and loads it as a Python dictionary.
+
+    Args:
+        bucket_name (str): The name of the GCS bucket from which to download the file.
+        blob_name (str): The name or full path to the JSON blob in the bucket.
+
+    Returns:
+        dict: A dictionary representation of the downloaded JSON file.
+    """
+
+    import json
+
+    from google.cloud import storage
+
+    storage_client = storage.Client()
+    bucket = storage_client.bucket(bucket_name)
+    blob = bucket.blob(blob_name)
+
+    contents = blob.download_as_string()
+    return json.loads(contents.decode())
+
+
+def bbox_maker(boundingPoly):
+    """
+    Converts a bounding polygon (list of coordinates) into a bounding box represented by
+    the minimum and maximum x and y values.
+
+    Args:
+        boundingPoly (list of dicts): A list of coordinates where each coordinate is a dictionary
+                                      with "x" and "y" keys. Example: [{"x": 0.5, "y": 0.5}, ...]
+
+    Returns:
+        list: A list representing the bounding box in the format [min_x, min_y, max_x, max_y].
+    """
+
+    x_list = []
+    y_list = []
+
+    for i in boundingPoly:
+        x_list.append(i["x"])
+        y_list.append(i["y"])
+
+    bbox = [min(x_list), min(y_list), max(x_list), max(y_list)]
+
+    return bbox
+
+
+def RemoveRow(df, entity):
+    """
+    Removes rows from a DataFrame where the "type_" column matches the specified entity.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame from which rows need to be removed.
+        entity (str): The entity value that should be used to identify rows to be removed.
+
+    Returns:
+        pandas.DataFrame: A DataFrame with rows removed where the "type_" column matches the specified entity.
+    """
+
+    return df[df["type_"] != entity]
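+
+# Illustrative sketch (hypothetical bucket/blob names): turn a downloaded Document
+# proto into the entity DataFrame used by the matching helpers below.
+def _example_entities_dataframe():
+    """Sketch: build the ['type_', 'mention_text', 'bbox', 'page'] DataFrame."""
+    doc = documentai_json_proto_downloader("parsed-output-bucket", "results/doc-0.json")
+    df = JsonToDataframe(doc)
+    # Optionally drop one entity type before further comparison.
+    df = RemoveRow(df, "invoice_id")
+    print(df.head())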
+
+
+def FindMatch(entity_file1, df_file2):
+    """
+    Identifies a matching entity from a DataFrame (`df_file2`) based on the Intersection Over Union (IOU)
+    of bounding boxes with respect to a given entity (`entity_file1`).
+
+    Args:
+        entity_file1 (list): A list containing entity details from the first file.
+                             It should have the format [type_, mention_text, bbox, ...].
+        df_file2 (pandas.DataFrame): The input DataFrame containing entity details from the second file.
+
+    Returns:
+        int or None: The index of the matching entity from `df_file2` if found, otherwise None.
+
+    Note:
+        The IOU is computed with `BBIntersectionOverUnion`, defined below.
+    """
+
+    import operator
+
+    bbox_file1 = entity_file1[2]
+
+    # Entity not present in json file
+    if not bbox_file1:
+        return None
+
+    # Filtering entities with the same name
+    df_file2 = df_file2[df_file2["type_"] == entity_file1[0]]
+
+    # Calculating IOU values for the entities
+    index_iou_pairs = []
+    for index, entity_file2 in df_file2.iterrows():
+        if entity_file2["bbox"]:
+            iou = BBIntersectionOverUnion(bbox_file1, entity_file2["bbox"])
+            index_iou_pairs.append((index, iou))
+
+    # Choose the entity with the highest IOU; the IOU must be > 0.2
+    matched_index = None
+    for index_iou in sorted(index_iou_pairs, key=operator.itemgetter(1), reverse=True):
+        if index_iou[1] > 0.2:  # Threshold
+            matched_index = index_iou[0]
+            break
+
+    return matched_index
+
+
+def BBIntersectionOverUnion(box1, box2):
+    """
+    Calculates the Intersection Over Union (IOU) between two bounding boxes.
+
+    The bounding boxes are represented as a list of coordinates: [x_min, y_min, x_max, y_max].
+
+    Args:
+        box1 (list[float]): Coordinates of the first bounding box.
+        box2 (list[float]): Coordinates of the second bounding box.
+
+    Returns:
+        float: The IOU between the two bounding boxes. A value between 0 (no overlap) and 1 (complete overlap).
+
+    Example:
+        box1 = [0.1, 0.1, 0.6, 0.6]
+        box2 = [0.5, 0.5, 1.0, 1.0]
+        iou = BBIntersectionOverUnion(box1, box2)
+    """
+
+    # Determine the coordinates of the intersection rectangle
+    x1 = max(box1[0], box2[0])
+    y1 = max(box1[1], box2[1])
+    x2 = min(box1[2], box2[2])
+    y2 = min(box1[3], box2[3])
+
+    # Calculate the area of the intersection rectangle
+    inter_area = max(0, x2 - x1) * max(0, y2 - y1)
+
+    # If there's no intersection, IOU is 0
+    if inter_area == 0:
+        return 0
+
+    # Calculate the area of each bounding box
+    box1_area = (box1[2] - box1[0]) * (box1[3] - box1[1])
+    box2_area = (box2[2] - box2[0]) * (box2[3] - box2[1])
+
+    # Calculate the IOU
+    iou = inter_area / float(box1_area + box2_area - inter_area)
+
+    return iou
+
+
+def GetMatchRatio(values):
+    """
+    Calculates the similarity ratio between two strings using SequenceMatcher.
+
+    Args:
+        values (list[str] or pandas.Series): A row whose second element (index 1) and
+                                             third element (index 2) are the strings to be compared.
+
+    Returns:
+        float: A ratio representing the similarity between the two strings.
+               A value between 0 (no similarity) and 1 (identical strings).
+
+    Example:
+        values = ["entity_type", "apple", "apples"]
+        ratio = GetMatchRatio(values)
+    """
+
+    import difflib
+
+    file1_value = values[1]
+    file2_value = values[2]
+
+    if file1_value == "Entity not found." or file2_value == "Entity not found.":
+        return 0
+    else:
+        return difflib.SequenceMatcher(a=file1_value, b=file2_value).ratio()
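+
+# Worked example for the IOU computation above (values checked by hand):
+# box1 = [0.1, 0.1, 0.6, 0.6] and box2 = [0.5, 0.5, 1.0, 1.0] overlap in a
+# 0.1 x 0.1 square, so IOU = 0.01 / (0.25 + 0.25 - 0.01) ~= 0.0204, which is
+# below the 0.2 threshold used in FindMatch.
+def _example_iou():
+    """Sketch: compute IOU for two normalized bounding boxes."""
+    box1 = [0.1, 0.1, 0.6, 0.6]
+    box2 = [0.5, 0.5, 1.0, 1.0]
+    print(BBIntersectionOverUnion(box1, box2))  # ~0.0204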
+
+
+def compare_pre_hitl_and_post_hitl_output(file1, file2):
+    """
+    Compares the entities between two documents and returns the results in a DataFrame.
+
+    Args:
+        file1, file2: DocumentAI Document proto objects (e.g. pre-HITL and post-HITL JSONs).
+
+    Returns:
+        tuple: The comparison DataFrame and an overall fuzzy-match score.
+    """
+
+    df_file1 = JsonToDataframe(file1)
+    df_file2 = JsonToDataframe(file2)
+    file1_entities = [entity[0] for entity in df_file1.values]
+    file2_entities = [entity[0] for entity in df_file2.values]
+
+    # Find entities which are present only once in both files;
+    # these entities are matched directly.
+    common_entities = set(file1_entities).intersection(set(file2_entities))
+    exclude_entities = []
+    for entity in common_entities:
+        if file1_entities.count(entity) > 1 or file2_entities.count(entity) > 1:
+            exclude_entities.append(entity)
+    for entity in exclude_entities:
+        common_entities.remove(entity)
+
+    df_compare = pd.DataFrame(
+        columns=['Entity Type', 'Pre_HITL_Output', 'Post_HITL_Output', 'pre_bbox', 'post_bbox', 'page1', 'page2']
+    )
+    for entity in common_entities:
+        value1 = df_file1[df_file1['type_'] == entity].iloc[0]['mention_text']
+        value2 = df_file2[df_file2['type_'] == entity].iloc[0]['mention_text']
+        pre_bbox = df_file1[df_file1['type_'] == entity].iloc[0]['bbox']
+        post_bbox = df_file2[df_file2['type_'] == entity].iloc[0]['bbox']
+        page1 = df_file1[df_file1['type_'] == entity].iloc[0]['page']
+        page2 = df_file2[df_file2['type_'] == entity].iloc[0]['page']
+        df_compare.loc[len(df_compare.index)] = [entity, value1, value2, pre_bbox, post_bbox, page1, page2]
+        # Common entities are removed from df_file1 and df_file2
+        df_file1 = RemoveRow(df_file1, entity)
+        df_file2 = RemoveRow(df_file2, entity)
+
+    # Remaining entities are matched by comparing the IOU of their bounding boxes
+    mention_text2 = pd.Series(dtype=str)
+    bbox2 = pd.Series(dtype=object)
+    bbox1 = pd.Series(dtype=object)
+    page_1 = pd.Series(dtype=object)
+    page_2 = pd.Series(dtype=object)
+
+    for index, row in enumerate(df_file1.values):
+        matched_index = FindMatch(row, df_file2)
+        if matched_index is not None:
+            mention_text2.loc[index] = df_file2.loc[matched_index][1]
+            bbox2.loc[index] = df_file2.loc[matched_index][2]
+            bbox1.loc[index] = row[2]
+            page_2.loc[index] = df_file2.loc[matched_index][3]
+            page_1.loc[index] = row[3]
+            df_file2 = df_file2.drop(matched_index)
+        else:
+            mention_text2.loc[index] = 'Entity not found.'
+            bbox2.loc[index] = 'Entity not found.'
+            bbox1.loc[index] = row[2]
+            page_1.loc[index] = row[3]
+            page_2.loc[index] = 'no'
+
+    df_file1['mention_text2'] = mention_text2.values
+    df_file1['bbox2'] = bbox2.values
+    df_file1['bbox1'] = bbox1.values
+    df_file1['page_1'] = page_1.values
+    df_file1['page_2'] = page_2.values
+
+    df_file1 = df_file1.drop(['bbox'], axis=1)
+    df_file1 = df_file1.drop(['page'], axis=1)
+    df_file1.rename(
+        columns={
+            'type_': 'Entity Type',
+            'mention_text': 'Pre_HITL_Output',
+            'mention_text2': 'Post_HITL_Output',
+            'bbox1': 'pre_bbox',
+            'bbox2': 'post_bbox',
+            'page_1': 'page1',
+            'page_2': 'page2',
+        },
+        inplace=True,
+    )
+    # DataFrame.append is deprecated in recent pandas releases; pd.concat is used instead.
+    df_compare = pd.concat([df_compare, df_file1], ignore_index=True)
+
+    # Adding entities which are present in file2 but not in file1
+    for row in df_file2.values:
+        df_compare.loc[len(df_compare.index)] = [row[0], 'Entity not found.', row[1], "[]", row[2], '[]', row[3]]
+
+    match_array = []
+    for i in range(0, len(df_compare)):
+        match_string = ''
+        if df_compare.iloc[i]['Pre_HITL_Output'] == 'Entity not found.' and df_compare.iloc[i]['Post_HITL_Output'] == 'Entity not found.':
+            match_string = 'TN'
+        elif df_compare.iloc[i]['Pre_HITL_Output'] != 'Entity not found.' and df_compare.iloc[i]['Post_HITL_Output'] == 'Entity not found.':
+            match_string = 'FN'
+        elif df_compare.iloc[i]['Pre_HITL_Output'] == 'Entity not found.' and df_compare.iloc[i]['Post_HITL_Output'] != 'Entity not found.':
+            match_string = 'FP'
+        elif df_compare.iloc[i]['Pre_HITL_Output'] != 'Entity not found.' and df_compare.iloc[i]['Post_HITL_Output'] != 'Entity not found.':
+            if df_compare.iloc[i]['Pre_HITL_Output'] == df_compare.iloc[i]['Post_HITL_Output']:
+                match_string = 'TP'
+            else:
+                match_string = 'FP'
+        else:
+            match_string = 'Something went wrong.'
+
+        match_array.append(match_string)
+
+    df_compare['Match'] = match_array
+
+    df_compare['Fuzzy Ratio'] = df_compare.apply(GetMatchRatio, axis=1)
+    if list(df_compare.index):
+        score = df_compare['Fuzzy Ratio'].sum() / len(df_compare.index)
+    else:
+        score = 0
+
+    return df_compare, score
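+
+# Illustrative sketch (hypothetical bucket/blob names): compare the pre-HITL parser
+# output with the post-HITL corrected document and inspect the per-entity matches.
+def _example_compare_pre_post_hitl():
+    """Sketch: build the comparison DataFrame and the overall fuzzy score."""
+    pre_doc = documentai_json_proto_downloader("pre-hitl-bucket", "doc-0.json")
+    post_doc = documentai_json_proto_downloader("post-hitl-bucket", "doc-0.json")
+    df_compare, score = compare_pre_hitl_and_post_hitl_output(pre_doc, post_doc)
+    print(df_compare[["Entity Type", "Match", "Fuzzy Ratio"]])
+    print("overall score:", score)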
+
+
+def get_document_schema(location, project_number, processor_ID, processor_versionID):
+    """
+    Fetches the document schema of a specific processor version from Google Cloud DocumentAI.
+
+    Args:
+        location (str): The location of the DocumentAI service ("eu" or "us").
+        project_number (str): The number representing the Google Cloud project.
+        processor_ID (str): The ID of the DocumentAI processor.
+        processor_versionID (str): The ID of the processor version.
+
+    Returns:
+        documentai.DocumentSchema: The document schema for the specified processor version.
+
+    Example:
+        schema = get_document_schema("eu", "123456", "processor123", "version123")
+    """
+
+    from google.cloud import documentai_v1beta3 as documentai
+
+    # Choose the endpoint based on the provided location.
+    if location == "eu":
+        opts = {"api_endpoint": "eu-documentai.googleapis.com"}
+    else:
+        opts = {"api_endpoint": "us-documentai.googleapis.com"}
+
+    # Initialize the DocumentAI client.
+    client = documentai.DocumentProcessorServiceClient(client_options=opts)
+
+    # Construct the request.
+    name = f"projects/{project_number}/locations/{location}/processors/{processor_ID}/processorVersions/{processor_versionID}"
+    request = documentai.GetProcessorVersionRequest(name=name)
+
+    # Fetch the processor version details.
+    response = client.get_processor_version(request=request)
+
+    # Extract and return the document schema.
+    return response.document_schema
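+
+# Illustrative sketch (hypothetical project and processor IDs): fetch the schema and
+# list its entity types. The entity_types/name fields are assumed from the
+# documentai_v1beta3 DocumentSchema proto.
+def _example_print_schema():
+    """Sketch: print the entity types defined for a processor version."""
+    schema = get_document_schema("us", "123456789", "processor123", "version123")
+    for entity_type in schema.entity_types:
+        print(entity_type.name)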
+
+
+def create_pdf_bytes_from_json(GT_json):
+    """
+    Creates PDF bytes from the image content of a ground truth Document JSON,
+    which can then be used to process the file again.
+
+    Args:
+        GT_json (dict): A ground truth Document JSON loaded as a dictionary.
+
+    Returns:
+        tuple: The PDF bytes and the list of synthesized page images.
+    """
+
+    import io
+    import json
+    from typing import Sequence
+
+    from google.cloud import documentai_v1beta3 as documentai
+    from PIL import Image
+
+    def decode_image(image_bytes: bytes) -> Image.Image:
+        with io.BytesIO(image_bytes) as image_file:
+            image = Image.open(image_file)
+            image.load()
+        return image
+
+    def create_pdf_from_images(images: Sequence[Image.Image]) -> bytes:
+        """Creates a PDF from a sequence of images.
+
+        The PDF will contain 1 page per image, in the same order.
+
+        Args:
+            images: A sequence of images.
+
+        Returns:
+            The PDF bytes.
+        """
+        if not images:
+            raise ValueError('At least one image is required to create a PDF')
+
+        # PIL PDF saver does not support RGBA images
+        images = [
+            image.convert('RGB') if image.mode == 'RGBA' else image
+            for image in images
+        ]
+
+        with io.BytesIO() as pdf_file:
+            images[0].save(
+                pdf_file, save_all=True, append_images=images[1:], format='PDF')
+            return pdf_file.getvalue()
+
+    document = documentai.Document.from_json(json.dumps(GT_json))
+    synthesized_images = []
+    for i in range(len(document.pages)):
+        synthesized_images.append(decode_image(document.pages[i].image.content))
+    pdf_bytes = create_pdf_from_images(synthesized_images)
+
+    return pdf_bytes, synthesized_images
+
+
+def process_document_sample(project_id: str, location: str, processor_id: str, pdf_bytes, processor_version: str):
+    """
+    Processes a document (provided as PDF bytes) with the specified processor version
+    and returns the processing result.
+    """
+
+    from google.cloud import documentai_v1beta3 as documentai
+
+    # You must set the api_endpoint if you use a location other than 'us'.
+    if location == "eu":
+        opts = {"api_endpoint": "eu-documentai.googleapis.com"}
+    else:
+        opts = {"api_endpoint": "us-documentai.googleapis.com"}
+
+    client = documentai.DocumentProcessorServiceClient(client_options=opts)
+    name = f"projects/{project_id}/locations/{location}/processors/{processor_id}/processorVersions/{processor_version}"
+
+    # The PDF bytes are sent inline as a raw document
+    image_content = pdf_bytes
+    document = {"content": image_content, "mime_type": "application/pdf"}
+
+    # Configure the process request
+    request = {"name": name, "raw_document": document, "skip_human_review": False}
+
+    # Recognizes text and entities in the PDF document
+    result = client.process_document(request=request)
+
+    return result
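+
+# Illustrative end-to-end sketch (hypothetical IDs and paths): rebuild a PDF from a
+# ground-truth JSON's page images and re-process it with a specific processor version.
+def _example_reprocess_ground_truth():
+    """Sketch: ground-truth JSON -> PDF bytes -> process_document_sample."""
+    gt_json = blob_downloader("ground-truth-bucket", "labeled/doc-0.json")
+    pdf_bytes, _images = create_pdf_bytes_from_json(gt_json)
+    result = process_document_sample(
+        project_id="my-project",
+        location="us",
+        processor_id="processor123",
+        pdf_bytes=pdf_bytes,
+        processor_version="pretrained-foo-v1.0",
+    )
+    print(result.document.text[:200])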