-
Notifications
You must be signed in to change notification settings - Fork 8
/
demo.py
49 lines (39 loc) · 983 Bytes
/
demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
from hotpdf import HotPdf
# Path of the PDF used by every step of this demo.
pdf_file_path = "test.pdf"

# Load pdf file into memory
hotpdf_document = HotPdf(pdf_file_path)

# Alternatively, you can also pass an opened pdf stream to be loaded.
# The constructor call belongs inside the ``with`` body so that the stream is
# still open when HotPdf receives it.
with open(pdf_file_path, "rb") as f:
    hotpdf_document_2 = HotPdf(f)

# Get number of pages
print(len(hotpdf_document.pages))

# Find text
text_occurences = hotpdf_document.find_text("foo")

# Find text and its full span
text_occurences_full_span = hotpdf_document.find_text("foo", take_span=True)

# The three region queries below use the same bounding box (x0, y0, x1, y1)
# on the same page, so their results can be compared side by side.
# NOTE(review): page=0 is presumably the first page (zero-based) -- confirm
# against the hotpdf documentation.

# Extract text in region
text_in_bbox = hotpdf_document.extract_text(
    x0=0,
    y0=0,
    x1=100,
    y1=10,
    page=0,
)

# Extract spans in region
spans_in_bbox = hotpdf_document.extract_spans(
    x0=0,
    y0=0,
    x1=100,
    y1=10,
    page=0,
)

# Extract spans text in region
spans_text_in_bbox = hotpdf_document.extract_spans_text(
    x0=0,
    y0=0,
    x1=100,
    y1=10,
    page=0,
)

# Extract full page text
full_page_text = hotpdf_document.extract_page_text(page=0)