Skip to content
This repository has been archived by the owner on Apr 4, 2023. It is now read-only.

Commit

Permalink
Merge pull request #202 from cmc333333/313-ignores
Browse files Browse the repository at this point in the history
Updates to IGNORE_DEFINITIONS_IN
  • Loading branch information
tadhg-ohiggins committed Feb 25, 2016
2 parents 44ae902 + d22f40c commit c8164ce
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 39 deletions.
53 changes: 34 additions & 19 deletions regparser/layer/terms.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ def parent_of(self, node):

class Terms(Layer):
shorthand = 'terms'
STARTS_WITH_WORDCHAR = re.compile('^\w.*$')
ENDS_WITH_WORDCHAR = re.compile('^.*\w$')

def __init__(self, *args, **kwargs):
Layer.__init__(self, *args, **kwargs)
Expand Down Expand Up @@ -94,11 +96,16 @@ def applicable_terms(self, label):
def is_exclusion(self, term, node):
"""Some definitions are exceptions/exclusions of a previously
defined term. At the moment, we do not want to include these as they
would replace previous (correct) definitions."""
would replace previous (correct) definitions. We also remove terms
which are inside an instance of the IGNORE_DEFINITIONS_IN setting"""
applicable_terms = self.applicable_terms(node.label)
if term in applicable_terms:
regex = 'the term .?' + re.escape(term) + '.? does not include'
return bool(re.search(regex, node.text.lower()))
if re.search(regex, node.text.lower()):
return True
for start, end in self.ignored_offsets(node.label[0], node.text):
if term in node.text[start:end]:
return True
return False

def node_definitions(self, node, stack=None):
Expand Down Expand Up @@ -131,9 +138,7 @@ def process(self, node):
(term, ref) for term, ref in applicable_terms.iteritems()
if ref.label != node.label_id()]

exclusions = self.excluded_offsets(node.label_id(), node.text)
exclusions = self.per_regulation_ignores(
exclusions, node.label, node.text)
exclusions = self.excluded_offsets(node)

matches = self.calculate_offsets(node.text, term_list, exclusions)
for term, ref, offsets in matches:
Expand All @@ -146,26 +151,36 @@ def process(self, node):
def _word_matches(self, term, text):
"""Return the start and end indexes of the term within the text,
accounting for word boundaries"""
return [(match.start(), match.end()) for match in
re.finditer(r'\b' + re.escape(term) + r'\b', text)]

def per_regulation_ignores(self, exclusions, label, text):
cfr_part = label[0]
if settings.IGNORE_DEFINITIONS_IN.get(cfr_part):
for ignore_term in settings.IGNORE_DEFINITIONS_IN[cfr_part]:
exclusions.extend(self._word_matches(ignore_term, text))
return exclusions

def excluded_offsets(self, label, text):
# @todo - this is rather slow -- probably want to memoize the results
regex = re.escape(term)
if self.STARTS_WITH_WORDCHAR.match(term):
regex = r'\b' + regex
if self.ENDS_WITH_WORDCHAR.match(term):
regex += r'\b'
regex = re.compile(regex)
return [(match.start(), match.end())
for match in regex.finditer(text)]

def ignored_offsets(self, cfr_part, text):
    """Find spans of *text* covered by an "ignored" phrase.

    Phrases come from the IGNORE_DEFINITIONS_IN setting, combining the
    global 'ALL' entry with any entry specific to this CFR part. Returns
    a list of (start, end) offset pairs."""
    phrases = (settings.IGNORE_DEFINITIONS_IN.get('ALL', []) +
               settings.IGNORE_DEFINITIONS_IN.get(cfr_part, []))
    offsets = []
    for ignored_phrase in phrases:
        offsets.extend(self._word_matches(ignored_phrase, text))
    return offsets

def excluded_offsets(self, node):
"""We explicitly exclude certain chunks of text (for example, words
we are defining shouldn't have links appear within the defined
term.) More will be added in the future"""
exclusions = []
for reflist in self.scoped_terms.values():
exclusions.extend(
ref.position for ref in reflist if ref.label == label)
for ignore_term in settings.IGNORE_DEFINITIONS_IN['ALL']:
exclusions.extend(self._word_matches(ignore_term, text))
ref.position for ref in reflist
if ref.label == node.label_id())
exclusions.extend(self.ignored_offsets(node.label[0], node.text))
return exclusions

def calculate_offsets(self, text, applicable_terms, exclusions=[],
Expand Down
16 changes: 8 additions & 8 deletions settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,17 +63,17 @@
'https://s3.amazonaws.com/images.federalregister.gov/' +
'%s/original.gif')

# list of strings: phrases which shouldn't be broken by definition links
IGNORE_DEFINITIONS_IN = {'ALL': []}
# Look in extensions for definition phrases to be excluded:
# dict: string->[string]: List of phrases which shouldn't contain defined
# terms. Keyed by CFR part or 'ALL'.
IGNORE_DEFINITIONS_IN = plugins.update_dictionary(
"eregs_ns.parser.term_definition_exclusions", IGNORE_DEFINITIONS_IN)
"eregs_ns.parser.term_ignores", {'ALL': []})

# List of strings: phrases which should be included as definition links
INCLUDE_DEFINITIONS_IN = {'ALL': []}
# Add include definitions from extensions:
# dict: string->[(string,string)]: List of phrases which *should* trigger a
# definition. Pair is of the form (term, context), where "context" refers to a
# substring match for a specific paragraph. e.g.
# ("bob", "text noting that it defines bob")
INCLUDE_DEFINITIONS_IN = plugins.update_dictionary(
"eregs_ns.parser.term_definitions", INCLUDE_DEFINITIONS_IN)
"eregs_ns.parser.term_definitions", {'ALL': []})

# list of modules implementing the __contains__ and __getitem__ methods
OVERRIDES_SOURCES = [
Expand Down
44 changes: 32 additions & 12 deletions tests/layer_terms_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,16 @@
class LayerTermTest(TestCase):
def setUp(self):
self.original_ignores = settings.IGNORE_DEFINITIONS_IN
settings.IGNORE_DEFINITIONS_IN = {'ALL': {}}
settings.IGNORE_DEFINITIONS_IN = {'ALL': []}

def tearDown(self):
    # Restore the module-global setting saved in setUp so this test
    # class does not leak IGNORE_DEFINITIONS_IN changes into other tests.
    settings.IGNORE_DEFINITIONS_IN = self.original_ignores

def test_is_exclusion(self):
"""There are certain indicators that a definition _should not_ be
considered the definition of that term. For example, exclusions to a
general definition should not replace the original. We can also
explicitly ignore chunks of text when finding definitions.."""
t = Terms(None)
n = Node('ex ex ex', label=['1111', '2'])
self.assertFalse(t.is_exclusion('ex', n))
Expand All @@ -33,6 +37,11 @@ def test_is_exclusion(self):
t.scoped_terms = {('1111',): [Ref('abc', '1', 0)]}
self.assertFalse(t.is_exclusion('ex', n))

settings.IGNORE_DEFINITIONS_IN['1111'] = ['phrase with abc in it']
self.assertFalse(t.is_exclusion('abc', n))
n.text = "Now the node has a phrase with abc in it, doesn't it?"
self.assertTrue(t.is_exclusion('abc', n))

def test_node_definitions_no_def(self):
"""Verify that none of the matchers match certain strings"""
t = Terms(None)
Expand Down Expand Up @@ -273,16 +282,22 @@ def test_excluded_offsets(self):
Ref('term', 'lablab', 4), Ref('other', 'lablab', 8),
Ref('more', 'nonnon', 1)
]
self.assertEqual([(4, 8), (8, 13)],
t.excluded_offsets('lablab', 'Some text'))
self.assertEqual([(1, 5)], t.excluded_offsets('nonnon', 'Other'))
self.assertEqual([], t.excluded_offsets('ababab', 'Ab ab ab'))
self.assertEqual(
[(4, 8), (8, 13)],
t.excluded_offsets(Node('Some text', label=['lablab'])))
self.assertEqual(
[(1, 5)],
t.excluded_offsets(Node('Other', label=['nonnon'])))
self.assertEqual(
[],
t.excluded_offsets(Node('Ab ab ab', label=['ababab'])))

def test_excluded_offsets_blacklist(self):
t = Terms(None)
t.scoped_terms['_'] = [Ref('bourgeois', '12-Q-2', 0)]
settings.IGNORE_DEFINITIONS_IN['ALL'] = ['bourgeois pig']
excluded = t.excluded_offsets('12-3', 'You are a bourgeois pig!')
excluded = t.excluded_offsets(Node('You are a bourgeois pig!',
label=['12', '3']))
self.assertEqual([(10, 23)], excluded)

def test_excluded_offsets_blacklist_per_reg(self):
Expand All @@ -294,17 +309,22 @@ def test_excluded_offsets_blacklist_per_reg(self):

settings.IGNORE_DEFINITIONS_IN['ALL'] = ['bourgeois pig']
settings.IGNORE_DEFINITIONS_IN['12'] = ['consumer price index']
exclusions = [(0, 4)]
excluded = t.per_regulation_ignores(
exclusions, ['12', '2'], 'There is a consumer price index')
self.assertEqual([(0, 4), (11, 31)], excluded)
excluded = t.excluded_offsets(
Node('There is a consumer price index', label=['12', '2']))
self.assertEqual([(11, 31)], excluded)

def test_excluded_offsets_blacklist_word_boundaries(self):
"""If an exclusion begins/ends with word characters, the searching
regex should make sure to only match on word boundaries"""
settings.IGNORE_DEFINITIONS_IN['ALL'] = ['shed act', '(phrase)']
t = Terms(None)
t.scoped_terms['_'] = [Ref('act', '28-6-d', 0)]
settings.IGNORE_DEFINITIONS_IN['ALL'] = ['shed act']
excluded = t.excluded_offsets('28-9', "That's a watershed act")
excluded = t.excluded_offsets(Node("That's a watershed act",
label=['28', '9']))
self.assertEqual([], excluded)
excluded = t.excluded_offsets(Node("This has a '(phrase)' in it",
label=['28', '9']))
self.assertNotEqual([], excluded)

def test_calculate_offsets(self):
applicable_terms = [('rock band', 'a'), ('band', 'b'), ('drum', 'c'),
Expand Down

0 comments on commit c8164ce

Please sign in to comment.