-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
85 lines (55 loc) · 2.13 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import datefinder
import pyPdf
import textract
import regex as re
def extract_date_strings(self, text, strict=False):
"""
Scans text for possible datetime strings and extracts them
source: also return the original date string
index: also return the indices of the date string in the text
strict: Strict mode will only return dates sourced with day, month, and year
"""
for match in self.DATE_REGEX.finditer(text):
match_str = match.group(0)
indices = match.span(0)
## Get individual group matches
captures = match.capturesdict()
time = captures.get('time')
digits = captures.get('digits')
digits_modifiers = captures.get('digits_modifiers')
days = captures.get('days')
months = captures.get('months')
timezones = captures.get('timezones')
delimiters = captures.get('delimiters')
time = captures.get('time)')
time_periods = captures.get('time_periods')
extra_tokens = captures.get('extra_tokens')
if strict:
complete = False
## 12-05-2015
if len(digits) == 3:
complete = True
## 19 February 2013 year 09:10
elif (len(months) == 1) and (len(digits) == 1 or len(digits) == 2):
complete = True
if not complete:
continue
## sanitize date string
## replace unhelpful whitespace characters with single whitespace
match_str = re.sub('[\n\t\s\xa0]+', ' ', match_str)
match_str = match_str.strip(self.STRIP_CHARS)
## Save sanitized source string
yield match_str, indices, captures
datefinder.DateFinder.extract_date_strings = extract_date_strings
# pdf = pyPdf.PdfFileReader(open('syllabus/econ101.pdf', "rb"))
# pdf_text = ''
# for page in pdf.pages:
# pdf_text += page.extractText()
pdf_text = textract.process("syllabus/econ101.pdf")
print pdf_text
file = open('test_input.txt', 'r')
dates = datefinder.find_dates(pdf_text, True, False, True) # text)
for date in dates:
# if date[0].year == 2018:
print date
# file.close()