Integration with the Google Cloud Vision API (#2902)

This PR adds a third OCR provider, alongside Tesseract and Paddle: the [Google Cloud Vision API](https://cloud.google.com/vision). It can be used similarly to other OCR methods: set the `OCR_AGENT` environment variable to the path to the OCR module (`unstructured.partition.utils.ocr_models.google_vision_ocr.OCRAgentGoogleVision`). You also need to set the credentials to use Google APIs, for instance by setting the `GOOGLE_APPLICATION_CREDENTIALS` environment variable. --------- Co-authored-by: christinestraub <[email protected]>
Unstructured-IO · Apr 23, 2024 · abb0174 · abb0174
1 parent 05ff975
commit abb0174
Show file tree

Hide file tree

Showing 24 changed files with 261 additions and 36 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,9 +1,11 @@
-## 0.13.4-dev0
+## 0.13.4-dev1
 
 ### Enhancements
 
 ### Features
 
+* **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API.
+
 ### Fixes
 
 * **Remove ElementMetadata.section field.**. This field was unused, not populated by any partitioners.

diff --git a/requirements/base.txt b/requirements/base.txt
@@ -25,7 +25,7 @@ dataclasses-json==0.6.4
     # via -r ./base.in
 dataclasses-json-speakeasy==0.5.11
     # via unstructured-client
-emoji==2.11.0
+emoji==2.11.1
     # via -r ./base.in
 filetype==1.2.0
     # via -r ./base.in

diff --git a/requirements/dev.txt b/requirements/dev.txt
@@ -185,7 +185,7 @@ jupyterlab==4.1.6
     # via notebook
 jupyterlab-pygments==0.3.0
     # via nbconvert
-jupyterlab-server==2.26.0
+jupyterlab-server==2.27.0
     # via
     #   jupyterlab
     #   notebook

diff --git a/requirements/extra-paddleocr.txt b/requirements/extra-paddleocr.txt
@@ -53,7 +53,7 @@ idna==3.7
     # via
     #   -c ./base.txt
     #   requests
-imageio==2.34.0
+imageio==2.34.1
     # via
     #   imgaug
     #   scikit-image

diff --git a/requirements/extra-pdf-image.in b/requirements/extra-pdf-image.in
@@ -13,3 +13,4 @@ unstructured-inference==0.7.27
 # unstructured fork of pytesseract that provides an interface to allow for multiple output formats
 # from one tesseract call
 unstructured.pytesseract>=0.3.12
+google-cloud-vision
diff --git a/requirements/extra-pdf-image.txt b/requirements/extra-pdf-image.txt
@@ -6,6 +6,8 @@
 #
 antlr4-python3-runtime==4.9.3
     # via omegaconf
+cachetools==5.3.3
+    # via google-auth
 certifi==2024.2.2
     # via
     #   -c ././deps/constraints.txt
@@ -43,6 +45,24 @@ fsspec==2024.3.1
     # via
     #   huggingface-hub
     #   torch
+google-api-core[grpc]==2.18.0
+    # via google-cloud-vision
+google-auth==2.29.0
+    # via
+    #   google-api-core
+    #   google-cloud-vision
+google-cloud-vision==3.7.2
+    # via -r ./extra-pdf-image.in
+googleapis-common-protos==1.63.0
+    # via
+    #   google-api-core
+    #   grpcio-status
+grpcio==1.62.2
+    # via
+    #   google-api-core
+    #   grpcio-status
+grpcio-status==1.62.2
+    # via google-api-core
 huggingface-hub==0.22.2
     # via
     #   timm
@@ -147,11 +167,26 @@ pillow-heif==0.16.0
     # via -r ./extra-pdf-image.in
 portalocker==2.8.2
     # via iopath
+proto-plus==1.23.0
+    # via
+    #   google-api-core
+    #   google-cloud-vision
 protobuf==4.23.4
     # via
     #   -c ././deps/constraints.txt
+    #   google-api-core
+    #   google-cloud-vision
+    #   googleapis-common-protos
+    #   grpcio-status
     #   onnx
     #   onnxruntime
+    #   proto-plus
+pyasn1==0.6.0
+    # via
+    #   pyasn1-modules
+    #   rsa
+pyasn1-modules==0.4.0
+    # via google-auth
 pycocotools==2.0.7
     # via
     #   -c ././deps/constraints.txt
@@ -195,8 +230,11 @@ regex==2024.4.16
 requests==2.31.0
     # via
     #   -c ./base.txt
+    #   google-api-core
     #   huggingface-hub
     #   transformers
+rsa==4.9
+    # via google-auth
 safetensors==0.4.3
     # via
     #   timm

diff --git a/requirements/ingest/astra.txt b/requirements/ingest/astra.txt
@@ -46,9 +46,7 @@ hpack==4.0.0
 httpcore==1.0.5
     # via httpx
 httpx[http2]==0.27.0
-    # via
-    #   astrapy
-    #   httpx
+    # via astrapy
 hyperframe==6.0.1
     # via h2
 idna==3.7

diff --git a/requirements/ingest/azure.txt b/requirements/ingest/azure.txt
@@ -80,9 +80,7 @@ portalocker==2.8.2
 pycparser==2.22
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 requests==2.31.0
     # via
     #   -c ./ingest/../base.txt

diff --git a/requirements/ingest/box.txt b/requirements/ingest/box.txt
@@ -9,9 +9,7 @@ attrs==23.2.0
 boxfs==0.3.0
     # via -r ./ingest/box.in
 boxsdk[jwt]==3.9.2
-    # via
-    #   boxfs
-    #   boxsdk
+    # via boxfs
 certifi==2024.2.2
     # via
     #   -c ./ingest/../base.txt

diff --git a/requirements/ingest/chroma.txt b/requirements/ingest/chroma.txt
@@ -214,9 +214,7 @@ urllib3==1.26.18
     #   kubernetes
     #   requests
 uvicorn[standard]==0.29.0
-    # via
-    #   chromadb
-    #   uvicorn
+    # via chromadb
 uvloop==0.19.0
     # via uvicorn
 watchfiles==0.21.0

diff --git a/requirements/ingest/delta-table.txt b/requirements/ingest/delta-table.txt
@@ -4,7 +4,7 @@
 #
 #    pip-compile ./ingest/delta-table.in
 #
-deltalake==0.16.4
+deltalake==0.17.0
     # via -r ./ingest/delta-table.in
 fsspec==2024.3.1
     # via -r ./ingest/delta-table.in

diff --git a/requirements/ingest/embed-aws-bedrock.txt b/requirements/ingest/embed-aws-bedrock.txt
@@ -38,6 +38,8 @@ frozenlist==1.4.1
     # via
     #   aiohttp
     #   aiosignal
+greenlet==3.0.3
+    # via sqlalchemy
 idna==3.7
     # via
     #   -c ./ingest/../base.txt

diff --git a/requirements/ingest/embed-huggingface.txt b/requirements/ingest/embed-huggingface.txt
@@ -40,6 +40,8 @@ fsspec==2024.3.1
     # via
     #   huggingface-hub
     #   torch
+greenlet==3.0.3
+    # via sqlalchemy
 huggingface==0.0.1
     # via -r ./ingest/embed-huggingface.in
 huggingface-hub==0.22.2

diff --git a/requirements/ingest/embed-openai.txt b/requirements/ingest/embed-openai.txt
@@ -42,6 +42,8 @@ frozenlist==1.4.1
     # via
     #   aiohttp
     #   aiosignal
+greenlet==3.0.3
+    # via sqlalchemy
 h11==0.14.0
     # via httpcore
 httpcore==1.0.5

diff --git a/requirements/ingest/embed-vertexai.txt b/requirements/ingest/embed-vertexai.txt
@@ -42,7 +42,6 @@ frozenlist==1.4.1
     #   aiosignal
 google-api-core[grpc]==2.18.0
     # via
-    #   google-api-core
     #   google-cloud-aiplatform
     #   google-cloud-bigquery
     #   google-cloud-core
@@ -83,6 +82,8 @@ googleapis-common-protos[grpc]==1.63.0
     #   google-api-core
     #   grpc-google-iam-v1
     #   grpcio-status
+greenlet==3.0.3
+    # via sqlalchemy
 grpc-google-iam-v1==0.13.0
     # via google-cloud-resource-manager
 grpcio==1.62.2

diff --git a/requirements/ingest/github.txt b/requirements/ingest/github.txt
@@ -30,9 +30,7 @@ pycparser==2.22
 pygithub==2.3.0
     # via -r ./ingest/github.in
 pyjwt[crypto]==2.8.0
-    # via
-    #   pygithub
-    #   pyjwt
+    # via pygithub
 pynacl==1.5.0
     # via pygithub
 requests==2.31.0

diff --git a/requirements/ingest/onedrive.txt b/requirements/ingest/onedrive.txt
@@ -40,9 +40,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.22
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2024.1
     # via office365-rest-python-client
 requests==2.31.0

diff --git a/requirements/ingest/outlook.txt b/requirements/ingest/outlook.txt
@@ -34,9 +34,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.22
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2024.1
     # via office365-rest-python-client
 requests==2.31.0

diff --git a/requirements/ingest/qdrant.txt b/requirements/ingest/qdrant.txt
@@ -33,9 +33,7 @@ hpack==4.0.0
 httpcore==1.0.5
     # via httpx
 httpx[http2]==0.27.0
-    # via
-    #   httpx
-    #   qdrant-client
+    # via qdrant-client
 hyperframe==6.0.1
     # via h2
 idna==3.7
@@ -57,7 +55,7 @@ pydantic==2.7.0
     # via qdrant-client
 pydantic-core==2.18.1
     # via pydantic
-qdrant-client==1.8.2
+qdrant-client==1.9.0
     # via -r ./ingest/qdrant.in
 sniffio==1.3.1
     # via

diff --git a/requirements/ingest/sharepoint.txt b/requirements/ingest/sharepoint.txt
@@ -34,9 +34,7 @@ office365-rest-python-client==2.4.2
 pycparser==2.22
     # via cffi
 pyjwt[crypto]==2.8.0
-    # via
-    #   msal
-    #   pyjwt
+    # via msal
 pytz==2024.1
     # via office365-rest-python-client
 requests==2.31.0

diff --git a/test_unstructured/partition/pdf_image/test_ocr.py b/test_unstructured/partition/pdf_image/test_ocr.py
@@ -1,3 +1,4 @@
+from collections import namedtuple
 from unittest.mock import patch
 
 import numpy as np
@@ -19,6 +20,7 @@
 from unstructured.partition.utils.constants import (
     Source,
 )
+from unstructured.partition.utils.ocr_models.google_vision_ocr import OCRAgentGoogleVision
 from unstructured.partition.utils.ocr_models.paddle_ocr import OCRAgentPaddle
 from unstructured.partition.utils.ocr_models.tesseract_ocr import (
     OCRAgentTesseract,
@@ -192,6 +194,90 @@ def test_get_ocr_text_from_image_paddle(monkeypatch):
     assert ocr_text == "Hello\n\nWorld\n\n!"
 
 
+@pytest.fixture()
+def google_vision_text_annotation():
+    from google.cloud.vision import (
+        Block,
+        BoundingPoly,
+        Page,
+        Paragraph,
+        Symbol,
+        TextAnnotation,
+        Vertex,
+        Word,
+    )
+
+    breaks = TextAnnotation.DetectedBreak.BreakType
+    symbols_hello = [Symbol(text=c) for c in "Hello"] + [
+        Symbol(
+            property=TextAnnotation.TextProperty(
+                detected_break=TextAnnotation.DetectedBreak(type_=breaks.SPACE)
+            )
+        )
+    ]
+    symbols_world = [Symbol(text=c) for c in "World!"] + [
+        Symbol(
+            property=TextAnnotation.TextProperty(
+                detected_break=TextAnnotation.DetectedBreak(type_=breaks.LINE_BREAK)
+            )
+        )
+    ]
+    words = [Word(symbols=symbols_hello), Word(symbols=symbols_world)]
+    bounding_box = BoundingPoly(
+        vertices=[Vertex(x=0, y=0), Vertex(x=0, y=10), Vertex(x=10, y=10), Vertex(x=10, y=0)]
+    )
+    paragraphs = [Paragraph(words=words, bounding_box=bounding_box)]
+    blocks = [Block(paragraphs=paragraphs)]
+    pages = [Page(blocks=blocks)]
+    return TextAnnotation(text="Hello World!", pages=pages)
+
+
+@pytest.fixture()
+def google_vision_client(google_vision_text_annotation):
+    Response = namedtuple("Response", "full_text_annotation")
+
+    class FakeGoogleVisionClient:
+        def document_text_detection(self, image):
+            return Response(full_text_annotation=google_vision_text_annotation)
+
+    class OCRAgentFakeGoogleVision(OCRAgentGoogleVision):
+        def __init__(self):
+            self.client = FakeGoogleVisionClient()
+
+    return OCRAgentFakeGoogleVision()
+
+
+def test_get_ocr_from_image_google_vision(google_vision_client):
+    image = Image.new("RGB", (100, 100))
+
+    ocr_agent = google_vision_client
+    ocr_text = ocr_agent.get_text_from_image(image, ocr_languages="eng")
+
+    assert ocr_text == "Hello World!"
+
+
+def test_get_layout_from_image_google_vision(google_vision_client):
+    image = Image.new("RGB", (100, 100))
+
+    ocr_agent = google_vision_client
+    regions = ocr_agent.get_layout_from_image(image, ocr_languages="eng")
+    assert len(regions) == 1
+    assert regions[0].text == "Hello World!"
+    assert regions[0].source == Source.OCR_GOOGLEVISION
+    assert regions[0].bbox.x1 == 0
+    assert regions[0].bbox.y1 == 0
+    assert regions[0].bbox.x2 == 10
+    assert regions[0].bbox.y2 == 10
+
+
+def test_get_layout_elements_from_image_google_vision(google_vision_client):
+    image = Image.new("RGB", (100, 100))
+
+    ocr_agent = google_vision_client
+    layout_elements = ocr_agent.get_layout_elements_from_image(image, ocr_languages="eng")
+    assert len(layout_elements) == 1
+
+
 @pytest.fixture()
 def mock_ocr_regions():
     return [

diff --git a/unstructured/__version__.py b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.13.4-dev0"  # pragma: no cover
+__version__ = "0.13.4-dev1"  # pragma: no cover
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		__version__ = "0.13.4-dev0" # pragma: no cover
		__version__ = "0.13.4-dev1" # pragma: no cover