forked from evannaderi/Epic-Law
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathocr_pdf_parse.py
42 lines (32 loc) · 1.5 KB
/
ocr_pdf_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from google.api_core.client_options import ClientOptions
from google.cloud import documentai
# --- Document AI target ------------------------------------------------------
# Project ID used when building the processor resource name.
PROJECT_ID = "intrepid-alloy-401317"
# Processor location; the accepted format is 'us' or 'eu'.
LOCATION = "us"
# ID of the processor; it has to be created in the Cloud Console first.
PROCESSOR_ID = "a448e307b426c48a"

# --- Input document ----------------------------------------------------------
# Path of the local file, relative to the current working directory.
FILE_PATH = "documents/report.pdf"
# MIME type submitted together with the file. Supported file types are listed
# at https://cloud.google.com/document-ai/docs/file-types
MIME_TYPE = "application/pdf"
# The API endpoint host is derived from LOCATION, so the client talks to the
# "<location>-documentai.googleapis.com" host for the configured location.
_client_options = ClientOptions(
    api_endpoint=f"{LOCATION}-documentai.googleapis.com"
)
docai_client = documentai.DocumentProcessorServiceClient(
    client_options=_client_options
)

# Full resource name of the processor, of the form
#   projects/<project-id>/locations/<location>/processors/<processor-id>
# The processor itself has to be created in the Cloud Console beforehand.
RESOURCE_NAME = docai_client.processor_path(PROJECT_ID, LOCATION, PROCESSOR_ID)
# Read the whole input file into memory as raw bytes (binary mode, so the
# content is passed on exactly as stored on disk).
with open(FILE_PATH, "rb") as image:
    image_content = image.read()

# Wrap the bytes and their MIME type in a Document AI RawDocument.
raw_document = documentai.RawDocument(content=image_content, mime_type=MIME_TYPE)

# Build the request that targets the processor identified by RESOURCE_NAME.
request = documentai.ProcessRequest(name=RESOURCE_NAME, raw_document=raw_document)

# Submit the request through the Document AI client and keep the Document
# carried by the response; its `.text` attribute is used further below.
result = docai_client.process_document(request=request)
document_object = result.document
print("Document processing complete.")
# Persist the extracted text. The encoding is pinned to UTF-8 because the
# text may contain non-ASCII characters, and the platform default encoding
# (e.g. cp1252 on Windows) could raise UnicodeEncodeError or produce
# different bytes from one machine to another.
with open("output.txt", "w", encoding="utf-8") as file:
    file.write(document_object.text)

# Echo the same text to stdout.
print(f"Text: {document_object.text}")