Skip to content

Commit

Permalink
Integration with the Google Cloud Vision API (#2902)
Browse files Browse the repository at this point in the history
This PR adds a third OCR provider, alongside Tesseract and Paddle: the
[Google Cloud Vision API](https://cloud.google.com/vision).

It can be used similarly to other OCR methods: set the `OCR_AGENT`
environment variable to the path to the OCR module
(`unstructured.partition.utils.ocr_models.google_vision_ocr.OCRAgentGoogleVision`).
You also need to set the credentials to use Google APIs, for instance by
setting the `GOOGLE_APPLICATION_CREDENTIALS` environment variable.

---------

Co-authored-by: christinestraub <[email protected]>
  • Loading branch information
dlozeve and christinestraub authored Apr 23, 2024
1 parent 05ff975 commit abb0174
Show file tree
Hide file tree
Showing 24 changed files with 261 additions and 36 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
## 0.13.4-dev0
## 0.13.4-dev1

### Enhancements

### Features

* **Add integration with the Google Cloud Vision API**. Adds a third OCR provider, alongside Tesseract and Paddle: the Google Cloud Vision API.

### Fixes

* **Remove ElementMetadata.section field**. This field was unused and was not populated by any partitioner.
Expand Down
2 changes: 1 addition & 1 deletion requirements/base.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ dataclasses-json==0.6.4
# via -r ./base.in
dataclasses-json-speakeasy==0.5.11
# via unstructured-client
emoji==2.11.0
emoji==2.11.1
# via -r ./base.in
filetype==1.2.0
# via -r ./base.in
Expand Down
2 changes: 1 addition & 1 deletion requirements/dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ jupyterlab==4.1.6
# via notebook
jupyterlab-pygments==0.3.0
# via nbconvert
jupyterlab-server==2.26.0
jupyterlab-server==2.27.0
# via
# jupyterlab
# notebook
Expand Down
2 changes: 1 addition & 1 deletion requirements/extra-paddleocr.txt
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ idna==3.7
# via
# -c ./base.txt
# requests
imageio==2.34.0
imageio==2.34.1
# via
# imgaug
# scikit-image
Expand Down
1 change: 1 addition & 0 deletions requirements/extra-pdf-image.in
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ unstructured-inference==0.7.27
# unstructured fork of pytesseract that provides an interface to allow for multiple output formats
# from one tesseract call
unstructured.pytesseract>=0.3.12
google-cloud-vision
38 changes: 38 additions & 0 deletions requirements/extra-pdf-image.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
#
antlr4-python3-runtime==4.9.3
# via omegaconf
cachetools==5.3.3
# via google-auth
certifi==2024.2.2
# via
# -c ././deps/constraints.txt
Expand Down Expand Up @@ -43,6 +45,24 @@ fsspec==2024.3.1
# via
# huggingface-hub
# torch
google-api-core[grpc]==2.18.0
# via google-cloud-vision
google-auth==2.29.0
# via
# google-api-core
# google-cloud-vision
google-cloud-vision==3.7.2
# via -r ./extra-pdf-image.in
googleapis-common-protos==1.63.0
# via
# google-api-core
# grpcio-status
grpcio==1.62.2
# via
# google-api-core
# grpcio-status
grpcio-status==1.62.2
# via google-api-core
huggingface-hub==0.22.2
# via
# timm
Expand Down Expand Up @@ -147,11 +167,26 @@ pillow-heif==0.16.0
# via -r ./extra-pdf-image.in
portalocker==2.8.2
# via iopath
proto-plus==1.23.0
# via
# google-api-core
# google-cloud-vision
protobuf==4.23.4
# via
# -c ././deps/constraints.txt
# google-api-core
# google-cloud-vision
# googleapis-common-protos
# grpcio-status
# onnx
# onnxruntime
# proto-plus
pyasn1==0.6.0
# via
# pyasn1-modules
# rsa
pyasn1-modules==0.4.0
# via google-auth
pycocotools==2.0.7
# via
# -c ././deps/constraints.txt
Expand Down Expand Up @@ -195,8 +230,11 @@ regex==2024.4.16
requests==2.31.0
# via
# -c ./base.txt
# google-api-core
# huggingface-hub
# transformers
rsa==4.9
# via google-auth
safetensors==0.4.3
# via
# timm
Expand Down
4 changes: 1 addition & 3 deletions requirements/ingest/astra.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,7 @@ hpack==4.0.0
httpcore==1.0.5
# via httpx
httpx[http2]==0.27.0
# via
# astrapy
# httpx
# via astrapy
hyperframe==6.0.1
# via h2
idna==3.7
Expand Down
4 changes: 1 addition & 3 deletions requirements/ingest/azure.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,9 +80,7 @@ portalocker==2.8.2
pycparser==2.22
# via cffi
pyjwt[crypto]==2.8.0
# via
# msal
# pyjwt
# via msal
requests==2.31.0
# via
# -c ./ingest/../base.txt
Expand Down
4 changes: 1 addition & 3 deletions requirements/ingest/box.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,7 @@ attrs==23.2.0
boxfs==0.3.0
# via -r ./ingest/box.in
boxsdk[jwt]==3.9.2
# via
# boxfs
# boxsdk
# via boxfs
certifi==2024.2.2
# via
# -c ./ingest/../base.txt
Expand Down
4 changes: 1 addition & 3 deletions requirements/ingest/chroma.txt
Original file line number Diff line number Diff line change
Expand Up @@ -214,9 +214,7 @@ urllib3==1.26.18
# kubernetes
# requests
uvicorn[standard]==0.29.0
# via
# chromadb
# uvicorn
# via chromadb
uvloop==0.19.0
# via uvicorn
watchfiles==0.21.0
Expand Down
2 changes: 1 addition & 1 deletion requirements/ingest/delta-table.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#
# pip-compile ./ingest/delta-table.in
#
deltalake==0.16.4
deltalake==0.17.0
# via -r ./ingest/delta-table.in
fsspec==2024.3.1
# via -r ./ingest/delta-table.in
Expand Down
2 changes: 2 additions & 0 deletions requirements/ingest/embed-aws-bedrock.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ frozenlist==1.4.1
# via
# aiohttp
# aiosignal
greenlet==3.0.3
# via sqlalchemy
idna==3.7
# via
# -c ./ingest/../base.txt
Expand Down
2 changes: 2 additions & 0 deletions requirements/ingest/embed-huggingface.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,8 @@ fsspec==2024.3.1
# via
# huggingface-hub
# torch
greenlet==3.0.3
# via sqlalchemy
huggingface==0.0.1
# via -r ./ingest/embed-huggingface.in
huggingface-hub==0.22.2
Expand Down
2 changes: 2 additions & 0 deletions requirements/ingest/embed-openai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ frozenlist==1.4.1
# via
# aiohttp
# aiosignal
greenlet==3.0.3
# via sqlalchemy
h11==0.14.0
# via httpcore
httpcore==1.0.5
Expand Down
3 changes: 2 additions & 1 deletion requirements/ingest/embed-vertexai.txt
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,6 @@ frozenlist==1.4.1
# aiosignal
google-api-core[grpc]==2.18.0
# via
# google-api-core
# google-cloud-aiplatform
# google-cloud-bigquery
# google-cloud-core
Expand Down Expand Up @@ -83,6 +82,8 @@ googleapis-common-protos[grpc]==1.63.0
# google-api-core
# grpc-google-iam-v1
# grpcio-status
greenlet==3.0.3
# via sqlalchemy
grpc-google-iam-v1==0.13.0
# via google-cloud-resource-manager
grpcio==1.62.2
Expand Down
4 changes: 1 addition & 3 deletions requirements/ingest/github.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,7 @@ pycparser==2.22
pygithub==2.3.0
# via -r ./ingest/github.in
pyjwt[crypto]==2.8.0
# via
# pygithub
# pyjwt
# via pygithub
pynacl==1.5.0
# via pygithub
requests==2.31.0
Expand Down
4 changes: 1 addition & 3 deletions requirements/ingest/onedrive.txt
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ office365-rest-python-client==2.4.2
pycparser==2.22
# via cffi
pyjwt[crypto]==2.8.0
# via
# msal
# pyjwt
# via msal
pytz==2024.1
# via office365-rest-python-client
requests==2.31.0
Expand Down
4 changes: 1 addition & 3 deletions requirements/ingest/outlook.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ office365-rest-python-client==2.4.2
pycparser==2.22
# via cffi
pyjwt[crypto]==2.8.0
# via
# msal
# pyjwt
# via msal
pytz==2024.1
# via office365-rest-python-client
requests==2.31.0
Expand Down
6 changes: 2 additions & 4 deletions requirements/ingest/qdrant.txt
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,7 @@ hpack==4.0.0
httpcore==1.0.5
# via httpx
httpx[http2]==0.27.0
# via
# httpx
# qdrant-client
# via qdrant-client
hyperframe==6.0.1
# via h2
idna==3.7
Expand All @@ -57,7 +55,7 @@ pydantic==2.7.0
# via qdrant-client
pydantic-core==2.18.1
# via pydantic
qdrant-client==1.8.2
qdrant-client==1.9.0
# via -r ./ingest/qdrant.in
sniffio==1.3.1
# via
Expand Down
4 changes: 1 addition & 3 deletions requirements/ingest/sharepoint.txt
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,7 @@ office365-rest-python-client==2.4.2
pycparser==2.22
# via cffi
pyjwt[crypto]==2.8.0
# via
# msal
# pyjwt
# via msal
pytz==2024.1
# via office365-rest-python-client
requests==2.31.0
Expand Down
86 changes: 86 additions & 0 deletions test_unstructured/partition/pdf_image/test_ocr.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from collections import namedtuple
from unittest.mock import patch

import numpy as np
Expand All @@ -19,6 +20,7 @@
from unstructured.partition.utils.constants import (
Source,
)
from unstructured.partition.utils.ocr_models.google_vision_ocr import OCRAgentGoogleVision
from unstructured.partition.utils.ocr_models.paddle_ocr import OCRAgentPaddle
from unstructured.partition.utils.ocr_models.tesseract_ocr import (
OCRAgentTesseract,
Expand Down Expand Up @@ -192,6 +194,90 @@ def test_get_ocr_text_from_image_paddle(monkeypatch):
assert ocr_text == "Hello\n\nWorld\n\n!"


@pytest.fixture()
def google_vision_text_annotation():
    """Build a minimal Google Vision ``TextAnnotation`` reading "Hello World!".

    The annotation holds one page with one block and one paragraph. The
    paragraph contains two words: "Hello" followed by a SPACE break symbol,
    and "World!" followed by a LINE_BREAK break symbol. The paragraph's
    bounding polygon is the square with corners (0, 0) and (10, 10).
    """
    # Imported lazily so the module can be collected without touching the
    # Google client library until a test actually requests this fixture.
    from google.cloud.vision import (
        Block,
        BoundingPoly,
        Page,
        Paragraph,
        Symbol,
        TextAnnotation,
        Vertex,
        Word,
    )

    break_kinds = TextAnnotation.DetectedBreak.BreakType

    def _break_symbol(kind):
        # A text-less symbol that only carries a detected-break property.
        detected = TextAnnotation.DetectedBreak(type_=kind)
        return Symbol(property=TextAnnotation.TextProperty(detected_break=detected))

    hello_symbols = [Symbol(text=ch) for ch in "Hello"] + [_break_symbol(break_kinds.SPACE)]
    world_symbols = [Symbol(text=ch) for ch in "World!"] + [
        _break_symbol(break_kinds.LINE_BREAK)
    ]

    corners = [
        Vertex(x=0, y=0),
        Vertex(x=0, y=10),
        Vertex(x=10, y=10),
        Vertex(x=10, y=0),
    ]
    paragraph = Paragraph(
        words=[Word(symbols=hello_symbols), Word(symbols=world_symbols)],
        bounding_box=BoundingPoly(vertices=corners),
    )
    page = Page(blocks=[Block(paragraphs=[paragraph])])
    return TextAnnotation(text="Hello World!", pages=[page])


@pytest.fixture()
def google_vision_client(google_vision_text_annotation):
    """Return an ``OCRAgentGoogleVision`` subclass instance backed by a fake client.

    The fake client's ``document_text_detection`` ignores its ``image``
    argument and always answers with the canned annotation supplied by the
    ``google_vision_text_annotation`` fixture.
    """
    Response = namedtuple("Response", ["full_text_annotation"])
    canned_response = Response(google_vision_text_annotation)

    class FakeGoogleVisionClient:
        """Stand-in for the Vision client exposing only the call used here."""

        def document_text_detection(self, image):
            return canned_response

    class OCRAgentFakeGoogleVision(OCRAgentGoogleVision):
        """Agent whose ``client`` attribute is the fake defined above."""

        def __init__(self):
            # The parent initializer is deliberately not invoked --
            # presumably it would build a real API client needing credentials.
            self.client = FakeGoogleVisionClient()

    return OCRAgentFakeGoogleVision()


def test_get_ocr_from_image_google_vision(google_vision_client):
    """``get_text_from_image`` yields the text of the canned annotation."""
    blank_image = Image.new("RGB", (100, 100))

    extracted_text = google_vision_client.get_text_from_image(
        blank_image, ocr_languages="eng"
    )

    assert extracted_text == "Hello World!"


def test_get_layout_from_image_google_vision(google_vision_client):
    """``get_layout_from_image`` yields one region for the single canned paragraph.

    The region is expected to carry the paragraph text, the Google Vision OCR
    source tag, and a bbox spanning (0, 0) to (10, 10).
    """
    blank_image = Image.new("RGB", (100, 100))

    regions = google_vision_client.get_layout_from_image(blank_image, ocr_languages="eng")

    assert len(regions) == 1
    region = regions[0]
    assert region.text == "Hello World!"
    assert region.source == Source.OCR_GOOGLEVISION
    box = region.bbox
    assert (box.x1, box.y1, box.x2, box.y2) == (0, 0, 10, 10)


def test_get_layout_elements_from_image_google_vision(google_vision_client):
    """``get_layout_elements_from_image`` yields exactly one layout element."""
    blank_image = Image.new("RGB", (100, 100))

    elements = google_vision_client.get_layout_elements_from_image(
        blank_image, ocr_languages="eng"
    )

    assert len(elements) == 1


@pytest.fixture()
def mock_ocr_regions():
return [
Expand Down
2 changes: 1 addition & 1 deletion unstructured/__version__.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.13.4-dev0" # pragma: no cover
__version__ = "0.13.4-dev1" # pragma: no cover
Loading

0 comments on commit abb0174

Please sign in to comment.