-
Notifications
You must be signed in to change notification settings - Fork 1
/
PDF_pages4keywords_PyPDF2.py
98 lines (67 loc) · 2.72 KB
/
PDF_pages4keywords_PyPDF2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python
# coding: utf-8
# Script to read KEYWORDS from CSV, find them in PDF and store the page numbers in a dictionary of lists.
# CAUTION: depending on the quality of the ingested PDF files, PyPDF2 may not perform as reliably as other PDF packages in Python.
# Please make sure to try alternatives such as PDFtoTEXT as well.
# PyPDF2
# https://pythonhosted.org/PyPDF2/PageObject.html
# get INDEX words from CSV file
import csv
import pandas as pd
import numpy as np
# Location of the semicolon-separated keyword table (placeholder; edit before running).
CSV_FILE = '[PATH TO LOCAL CSV FILE]'

# Parse the table through an explicit UTF-8 text handle; bytes that cannot be
# decoded are dropped instead of raising.
with open(CSV_FILE, mode="r", encoding="utf-8", errors="ignore") as f:
    data = pd.read_csv(f, sep=";")

# Keep only the WORD column as an array, then print its size and a short preview.
words = data['WORD'].values
print(len(words))
print(words[:10])
# extract_doc_info.py
from PyPDF2 import PdfFileReader
from PyPDF2 import utils
import os
# Keywords to look for in every PDF (the WORD column read from the CSV above).
index_words = words
# open PDF file
def extract_information(filename, max_words=5):
    """Search one PDF for the index words and write the page hits to a text file.

    Prints the reader object, the document info and the page count, then checks
    every page's extracted text for each of the first ``max_words`` entries of
    the module-level ``index_words``. The result maps each word to a
    one-element list wrapping the list of 0-based page indices on which the
    word occurs; its ``str()`` form is written to the output text file.

    Args:
        filename: Path of the PDF file to read.
        max_words: How many leading entries of ``index_words`` to search for;
            ``None`` searches for all of them. Defaults to 5.

    Returns:
        None. A ``PdfReadError`` raised while reading the PDF is caught and
        reported as ``"error"`` on stdout; the output file is not written in
        that case. Errors from opening ``filename`` propagate to the caller.
    """
    with open(filename, 'rb') as f:
        # create dictionary of lists for final results
        content_all = {}
        # read PDF file
        try:
            # Reuse the handle opened above instead of letting the reader
            # open the same file a second time by name.
            pdf = PdfFileReader(f)
            print(pdf)
            information = pdf.getDocumentInfo()
            print(information)
            number_of_pages = pdf.getNumPages()
            print("Number of pages:", number_of_pages, "\n\n")

            # Page text does not depend on the keyword, so extract each page
            # once up front rather than once per keyword.
            page_texts = [
                pdf.getPage(n).extractText() for n in range(number_of_pages)
            ]

            # check index words page by page
            for word in index_words[:max_words]:
                print("TRYING TO FIND", word, ":")
                pages_found = []  # page results for this word
                for n, content in enumerate(page_texts):
                    print("Page to check:", n, "\n\n")
                    if word in content:
                        print("WORD FOUND:", word, "on page", n)
                        pages_found.append(n)
                # NOTE(review): the page list is wrapped in an outer list,
                # i.e. {word: [[pages]]}; kept as-is so the written output
                # keeps its existing shape.
                content_all[word] = [pages_found]

            # write dictionary of lists to new .TXT file
            # NOTE: the target path is fixed and opened in 'w' mode, so every
            # call replaces the file written by the previous call.
            with open('[PATH TO TXT FILE]', 'w', encoding="utf-8") as outfile:
                outfile.write(str(content_all))
        # exception handling for malformed PDFs
        except utils.PdfReadError:
            print("error")
# iterate through all PDF files in directory
if __name__ == '__main__':
    # Placeholder for the directory holding the PDF files; edit before running.
    path = '[PATH TO DIRECTORY WHERE PDF FILES ARE STORED'
    # os.listdir returns entries in arbitrary order; sort them so runs are
    # reproducible.
    for p in sorted(os.listdir(path)):
        filename = os.path.join(path, p)
        # Subdirectories cannot be opened as files and would abort the whole
        # run, so skip anything that is not a regular file.
        if not os.path.isfile(filename):
            continue
        print(filename)
        extract_information(filename)
    print("Done")