Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Merge pull request #202 from cmc333333/313-ignores
Browse files Browse the repository at this point in the history
Updates to IGNORE_DEFINITIONS_IN
  • Loading branch information
tadhg-ohiggins committed Feb 25, 2016
2 parents 44ae902 + d22f40c commit c8164ce
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 39 deletions.
53 changes: 34 additions & 19 deletions regparser/layer/terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def parent_of(self, node):

class Terms(Layer):
shorthand = 'terms'
STARTS_WITH_WORDCHAR = re.compile('^\w.*$')
ENDS_WITH_WORDCHAR = re.compile('^.*\w$')

def __init__(self, *args, **kwargs):
Layer.__init__(self, *args, **kwargs)
Expand Down Expand Up @@ -94,11 +96,16 @@ def applicable_terms(self, label):
def is_exclusion(self, term, node):
"""Some definitions are exceptions/exclusions of a previously
defined term. At the moment, we do not want to include these as they
would replace previous (correct) definitions."""
would replace previous (correct) definitions. We also remove terms
which are inside an instance of the IGNORE_DEFINITIONS_IN setting"""
applicable_terms = self.applicable_terms(node.label)
if term in applicable_terms:
regex = 'the term .?' + re.escape(term) + '.? does not include'
return bool(re.search(regex, node.text.lower()))
if re.search(regex, node.text.lower()):
return True
for start, end in self.ignored_offsets(node.label[0], node.text):
if term in node.text[start:end]:
return True
return False

def node_definitions(self, node, stack=None):
Expand Down Expand Up @@ -131,9 +138,7 @@ def process(self, node):
(term, ref) for term, ref in applicable_terms.iteritems()
if ref.label != node.label_id()]

exclusions = self.excluded_offsets(node.label_id(), node.text)
exclusions = self.per_regulation_ignores(
exclusions, node.label, node.text)
exclusions = self.excluded_offsets(node)

matches = self.calculate_offsets(node.text, term_list, exclusions)
for term, ref, offsets in matches:
Expand All @@ -146,26 +151,36 @@ def process(self, node):
def _word_matches(self, term, text):
"""Return the start and end indexes of the term within the text,
accounting for word boundaries"""
return [(match.start(), match.end()) for match in
re.finditer(r'\b' + re.escape(term) + r'\b', text)]

def per_regulation_ignores(self, exclusions, label, text):
cfr_part = label[0]
if settings.IGNORE_DEFINITIONS_IN.get(cfr_part):
for ignore_term in settings.IGNORE_DEFINITIONS_IN[cfr_part]:
exclusions.extend(self._word_matches(ignore_term, text))
return exclusions

def excluded_offsets(self, label, text):
# @todo - this is rather slow -- probably want to memoize the results
regex = re.escape(term)
if self.STARTS_WITH_WORDCHAR.match(term):
regex = r'\b' + regex
if self.ENDS_WITH_WORDCHAR.match(term):
regex += r'\b'
regex = re.compile(regex)
return [(match.start(), match.end())
for match in regex.finditer(text)]

def ignored_offsets(self, cfr_part, text):
    """Find spans of *text* covered by an "ignored" phrase.

    Phrases come from the IGNORE_DEFINITIONS_IN setting, combining the
    global 'ALL' entry with any entry specific to this CFR part. Returns
    a list of (start, end) offset pairs."""
    phrases = (settings.IGNORE_DEFINITIONS_IN.get('ALL', []) +
               settings.IGNORE_DEFINITIONS_IN.get(cfr_part, []))
    offsets = []
    for ignored_phrase in phrases:
        offsets.extend(self._word_matches(ignored_phrase, text))
    return offsets

def excluded_offsets(self, node):
"""We explicitly exclude certain chunks of text (for example, words
we are defining shouldn't have links appear within the defined
term.) More will be added in the future"""
exclusions = []
for reflist in self.scoped_terms.values():
exclusions.extend(
ref.position for ref in reflist if ref.label == label)
for ignore_term in settings.IGNORE_DEFINITIONS_IN['ALL']:
exclusions.extend(self._word_matches(ignore_term, text))
ref.position for ref in reflist
if ref.label == node.label_id())
exclusions.extend(self.ignored_offsets(node.label[0], node.text))
return exclusions

def calculate_offsets(self, text, applicable_terms, exclusions=[],
Expand Down
16 changes: 8 additions & 8 deletions settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,17 @@
'https://s3.amazonaws.com/images.federalregister.gov/' +
'%s/original.gif')

# list of strings: phrases which shouldn't be broken by definition links
IGNORE_DEFINITIONS_IN = {'ALL': []}
# Look in extensions for definition phrases to be excluded:
# dict: string->[string]: List of phrases which shouldn't contain defined
# terms. Keyed by CFR part or 'ALL'.
IGNORE_DEFINITIONS_IN = plugins.update_dictionary(
"eregs_ns.parser.term_definition_exclusions", IGNORE_DEFINITIONS_IN)
"eregs_ns.parser.term_ignores", {'ALL': []})

# List of strings: phrases which should be included as definition links
INCLUDE_DEFINITIONS_IN = {'ALL': []}
# Add include definitions from extensions:
# dict: string->[(string,string)]: List of phrases which *should* trigger a
# definition. Pair is of the form (term, context), where "context" refers to a
# substring match for a specific paragraph. e.g.
# ("bob", "text noting that it defines bob")
INCLUDE_DEFINITIONS_IN = plugins.update_dictionary(
"eregs_ns.parser.term_definitions", INCLUDE_DEFINITIONS_IN)
"eregs_ns.parser.term_definitions", {'ALL': []})

# list of modules implementing the __contains__ and __getitem__ methods
OVERRIDES_SOURCES = [
Expand Down
44 changes: 32 additions & 12 deletions tests/layer_terms_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@
class LayerTermTest(TestCase):
def setUp(self):
self.original_ignores = settings.IGNORE_DEFINITIONS_IN
settings.IGNORE_DEFINITIONS_IN = {'ALL': {}}
settings.IGNORE_DEFINITIONS_IN = {'ALL': []}

def tearDown(self):
    # Restore the module-global setting saved in setUp so this test
    # class does not leak IGNORE_DEFINITIONS_IN changes into other tests.
    settings.IGNORE_DEFINITIONS_IN = self.original_ignores

def test_is_exclusion(self):
"""There are certain indicators that a definition _should not_ be
considered the definition of that term. For example, exclusions to a
general definition should not replace the original. We can also
explicitly ignore chunks of text when finding definitions.."""
t = Terms(None)
n = Node('ex ex ex', label=['1111', '2'])
self.assertFalse(t.is_exclusion('ex', n))
Expand All @@ -33,6 +37,11 @@ def test_is_exclusion(self):
t.scoped_terms = {('1111',): [Ref('abc', '1', 0)]}
self.assertFalse(t.is_exclusion('ex', n))

settings.IGNORE_DEFINITIONS_IN['1111'] = ['phrase with abc in it']
self.assertFalse(t.is_exclusion('abc', n))
n.text = "Now the node has a phrase with abc in it, doesn't it?"
self.assertTrue(t.is_exclusion('abc', n))

def test_node_definitions_no_def(self):
"""Verify that none of the matchers match certain strings"""
t = Terms(None)
Expand Down Expand Up @@ -273,16 +282,22 @@ def test_excluded_offsets(self):
Ref('term', 'lablab', 4), Ref('other', 'lablab', 8),
Ref('more', 'nonnon', 1)
]
self.assertEqual([(4, 8), (8, 13)],
t.excluded_offsets('lablab', 'Some text'))
self.assertEqual([(1, 5)], t.excluded_offsets('nonnon', 'Other'))
self.assertEqual([], t.excluded_offsets('ababab', 'Ab ab ab'))
self.assertEqual(
[(4, 8), (8, 13)],
t.excluded_offsets(Node('Some text', label=['lablab'])))
self.assertEqual(
[(1, 5)],
t.excluded_offsets(Node('Other', label=['nonnon'])))
self.assertEqual(
[],
t.excluded_offsets(Node('Ab ab ab', label=['ababab'])))

def test_excluded_offsets_blacklist(self):
t = Terms(None)
t.scoped_terms['_'] = [Ref('bourgeois', '12-Q-2', 0)]
settings.IGNORE_DEFINITIONS_IN['ALL'] = ['bourgeois pig']
excluded = t.excluded_offsets('12-3', 'You are a bourgeois pig!')
excluded = t.excluded_offsets(Node('You are a bourgeois pig!',
label=['12', '3']))
self.assertEqual([(10, 23)], excluded)

def test_excluded_offsets_blacklist_per_reg(self):
Expand All @@ -294,17 +309,22 @@ def test_excluded_offsets_blacklist_per_reg(self):

settings.IGNORE_DEFINITIONS_IN['ALL'] = ['bourgeois pig']
settings.IGNORE_DEFINITIONS_IN['12'] = ['consumer price index']
exclusions = [(0, 4)]
excluded = t.per_regulation_ignores(
exclusions, ['12', '2'], 'There is a consumer price index')
self.assertEqual([(0, 4), (11, 31)], excluded)
excluded = t.excluded_offsets(
Node('There is a consumer price index', label=['12', '2']))
self.assertEqual([(11, 31)], excluded)

def test_excluded_offsets_blacklist_word_boundaries(self):
"""If an exclusion begins/ends with word characters, the searching
regex should make sure to only match on word boundaries"""
settings.IGNORE_DEFINITIONS_IN['ALL'] = ['shed act', '(phrase)']
t = Terms(None)
t.scoped_terms['_'] = [Ref('act', '28-6-d', 0)]
settings.IGNORE_DEFINITIONS_IN['ALL'] = ['shed act']
excluded = t.excluded_offsets('28-9', "That's a watershed act")
excluded = t.excluded_offsets(Node("That's a watershed act",
label=['28', '9']))
self.assertEqual([], excluded)
excluded = t.excluded_offsets(Node("This has a '(phrase)' in it",
label=['28', '9']))
self.assertNotEqual([], excluded)

def test_calculate_offsets(self):
applicable_terms = [('rock band', 'a'), ('band', 'b'), ('drum', 'c'),
Expand Down

0 comments on commit c8164ce

Please sign in to comment.