-
Notifications
You must be signed in to change notification settings - Fork 34
/
titlecaps.py
68 lines (49 loc) · 1.65 KB
/
titlecaps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import logging
from . import nlp
class titlecaps(object):
"""
Documents sometimes have sentences that are entirely in uppercase. This is
commonly found in titles and abstracts of older documents. This class
identifies sentences where every word is uppercase, and returns the
document with these sentences converted to lowercase.
"""
def __init__(self, min_length=4):
"""
Initialize the parser.
Args:
min_length: Minimum sentence length, otherwise sentence is returned
untouched.
"""
self.logger = logging.getLogger(__name__)
self.min_length = min_length
def __call__(self, text):
"""
Runs the parser.
Args:
text: a string document
Returns:
doc2: a string document
"""
# Need to keep the parser for sentence detection
sents = nlp(text, disable=["tagger"]).sents
doc2 = []
for sent in sents:
line = sent.text
if not is_any_lowercase(line):
if len(line) > self.min_length:
line = line.lower()
doc2.append(line + sent[-1].whitespace_)
doc2 = "".join(doc2)
return doc2
def is_any_lowercase(sentence):
"""
Checks if any letter in a sentence is lowercase, return False if there
are no alpha characters.
Args:
tokens: A list of string
Returns:
boolean: True if any letter in any token is lowercase
"""
if any(x.isalpha() & (x == x.lower()) for x in sentence):
return True
return not any(x.isalpha() for x in sentence)