-
Notifications
You must be signed in to change notification settings - Fork 105
/
extract_languages.py
76 lines (60 loc) · 1.93 KB
/
extract_languages.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Detect Languages from a Document
Works with any processor that outputs "detectedLanguage"
"""
from google.api_core.client_options import ClientOptions
from google.cloud import documentai_v1 as documentai
import pandas as pd
def online_process(
    project_id: str,
    location: str,
    processor_id: str,
    file_path: str,
    mime_type: str,
) -> documentai.Document:
    """
    Run a single local file through a Document AI processor (online/synchronous call).

    Args:
        project_id: Project ID used to build the processor resource name.
        location: Processor location; also used as the prefix of the API endpoint host.
        processor_id: Processor ID used to build the processor resource name.
        file_path: Path of the local file; it is read fully in binary mode.
        mime_type: MIME type sent together with the raw file content.

    Returns:
        The ``document`` attribute of the ``process_document`` response.
    """
    # The endpoint host is derived from the processor location.
    endpoint_options = ClientOptions(
        api_endpoint=f"{location}-documentai.googleapis.com"
    )
    client = documentai.DocumentProcessorServiceClient(
        client_options=endpoint_options
    )

    processor_name = client.processor_path(project_id, location, processor_id)

    with open(file_path, "rb") as source:
        payload = source.read()

    # Wrap the raw bytes and send them inline with the request.
    raw_document = documentai.RawDocument(content=payload, mime_type=mime_type)
    request = documentai.ProcessRequest(
        name=processor_name,
        raw_document=raw_document,
    )
    response = client.process_document(request=request)
    return response.document
# Processor and input settings; replace the placeholder values before running.
PROJECT_ID = "YOUR_PROJECT_ID"
LOCATION = "us"  # Format is 'us' or 'eu'
PROCESSOR_ID = "YOUR_PROCESSOR_ID"  # Create processor in Cloud Console
FILE_PATH = "multi_language.pdf"
MIME_TYPE = "application/pdf"

document = online_process(
    project_id=PROJECT_ID,
    location=LOCATION,
    processor_id=PROCESSOR_ID,
    file_path=FILE_PATH,
    mime_type=MIME_TYPE,
)
print("Document processing complete.")

# Build one row per detected language on each page; confidence is rendered
# as a whole-number percentage string.
extracted_languages = []
for page in document.pages:
    extracted_languages.extend(
        {
            "page_number": page.page_number,
            "language_code": language.language_code,
            "confidence": f"{language.confidence:.0%}",
        }
        for language in page.detected_languages
    )

# Load the rows into a Pandas DataFrame so they print in tabular form.
df = pd.DataFrame(extracted_languages)
print(df)