
Regex phrase matcher #1312

Merged: 71 commits into master from regex_phrase_matcher on Sep 11, 2018
Commits (71)
803508b
working prototype of lookup section
twhughes Aug 13, 2018
c48b22b
ported to rasa_nlu training data
twhughes Aug 13, 2018
62ecff6
added tests
twhughes Aug 13, 2018
f5b8cbf
removed a print statement
twhughes Aug 13, 2018
0b3e267
added to docs
twhughes Aug 13, 2018
f34a588
replaced print() by logger.info()
twhughes Aug 13, 2018
b0d7d0e
fixed minor formatting issues
twhughes Aug 13, 2018
a8fe358
fixed pep errors
twhughes Aug 13, 2018
2f835f9
fixed some of the code climate warnings
twhughes Aug 13, 2018
9a65ccc
add lookup tables of all US street and cities
twhughes Aug 14, 2018
a158890
fixed pep8 errors
twhughes Aug 15, 2018
05e7c1b
explicitly remove empty regex strings
twhughes Aug 23, 2018
c64b71c
switch single quotes to double quotes
twhughes Aug 23, 2018
7823150
fixed a typo
twhughes Aug 23, 2018
4853ecb
removed empty lookup table elements by default
twhughes Aug 23, 2018
f5badd1
trying to fix codeclimate errors
twhughes Aug 23, 2018
6d53408
pesky codeclimate
twhughes Aug 23, 2018
14c9fbd
fixed new pep8 error
twhughes Aug 24, 2018
7c3bb39
Merge branch 'master' of https://github.com/rasaHQ/rasa_nlu into rege…
twhughes Aug 27, 2018
1bc3b19
Merge branch 'master' into regex_phrase_matcher
akelad Aug 29, 2018
fd7b446
Merge branch 'regex_phrase_matcher' of https://github.com/rasaHQ/rasa…
twhughes Aug 29, 2018
c4fee01
changed dataformat.rst
twhughes Aug 29, 2018
923307f
added warnings for large lookups. got rid of print_size arg
twhughes Aug 29, 2018
f4657c1
added word boundaries
twhughes Aug 29, 2018
4f2b51e
changed file IO stuff
twhughes Aug 29, 2018
0e6cdff
updated tests to have word boundaries
twhughes Aug 29, 2018
a9a809e
changed markdown example
twhughes Aug 29, 2018
450c615
fixed pep8 errors (line too long)
twhughes Aug 29, 2018
34473ac
fixed even more pep8 errors
twhughes Aug 29, 2018
94adcba
fixed more pep8 errors
twhughes Aug 29, 2018
a4e405d
added comment in dataformat example
twhughes Aug 30, 2018
16640f0
changed to debug logging
twhughes Aug 30, 2018
3cf1aed
made things newline-separated
twhughes Aug 30, 2018
30ba83d
Merge branch 'master' into regex_phrase_matcher
akelad Aug 30, 2018
ece424b
moved lookup table regex creation to regex featurizer
twhughes Aug 30, 2018
a49aed4
fixed akelas minor comments
twhughes Aug 30, 2018
bc465f3
changed docs
twhughes Aug 30, 2018
9c899a1
merge
twhughes Aug 30, 2018
a5e9e9e
merge
twhughes Aug 30, 2018
d744b50
removed unnecessary imports
twhughes Aug 30, 2018
adc4e7d
removed outdated training data test
twhughes Aug 30, 2018
2f22c31
fixed pep8
twhughes Aug 30, 2018
e9e5389
fixed pep8 again
twhughes Aug 30, 2018
255b857
blank line had whitespace..
twhughes Aug 30, 2018
669965c
added json direct elements
twhughes Sep 5, 2018
c4bef3d
working markdown
twhughes Sep 5, 2018
7772517
markdown works and passes tests:
twhughes Sep 5, 2018
1aed3dd
pep8
twhughes Sep 5, 2018
b291b9e
markdown writing
twhughes Sep 5, 2018
d0d7d91
merge
twhughes Sep 5, 2018
9144448
cleaning
twhughes Sep 5, 2018
cadff87
pep8 formatting
twhughes Sep 5, 2018
96d2082
spacing
twhughes Sep 5, 2018
6387f8c
added unicode checking for python2.7
twhughes Sep 6, 2018
a25553e
reverted some pep8 errors. got rid of unnecessary files
twhughes Sep 6, 2018
562d6a0
remove pytest cache
twhughes Sep 6, 2018
4d96f61
cleaned and made tests work
twhughes Sep 6, 2018
972add6
clear distinction between markdown and json in dataformat
twhughes Sep 11, 2018
fae32dd
added error handling for opening lookup table file
twhughes Sep 11, 2018
bf1e9e1
fixed indentation
twhughes Sep 11, 2018
2b5a674
got rid of multi-line comment
twhughes Sep 11, 2018
0f9f3d8
lookup_tables=None as default argument
twhughes Sep 11, 2018
dc3b90b
removed commented out pdb.set_trace()
twhughes Sep 11, 2018
9c59053
merged changelog
twhughes Sep 11, 2018
0bec88e
escape re special characters in lookup table elements added test
twhughes Sep 11, 2018
621ad79
Update CHANGELOG.rst
twhughes Sep 11, 2018
baadb4e
Update rasa.py
twhughes Sep 11, 2018
fa4448c
Update rasa.py
twhughes Sep 11, 2018
d64ce26
added suggestion on file exception
twhughes Sep 11, 2018
a4fe9bf
fixed syntax
twhughes Sep 11, 2018
06ced24
Merge branch 'regex_phrase_matcher' of https://github.com/rasaHQ/rasa…
twhughes Sep 11, 2018
1 change: 1 addition & 0 deletions CHANGELOG.rst
@@ -11,6 +11,7 @@ This project adheres to `Semantic Versioning`_ starting with version 0.7.0.

Added
-----
- ability to specify lookup tables in training data

Changed
-------
2 changes: 2 additions & 0 deletions data/test/lookup_tables/drinks.txt
@@ -0,0 +1,2 @@
mojito, lemonade, sweet berry wine
tea, club mate
21 changes: 21 additions & 0 deletions data/test/lookup_tables/lookup_table.json
@@ -0,0 +1,21 @@
{
"rasa_nlu_data": {
"lookup_tables": [
{
"name": "plates",
"file_path": "data/test/lookup_tables/plates.txt"
},
{
"name": "drinks",
"file_path": "data/test/lookup_tables/drinks.txt"
}
],
"common_examples": [
{
"text": "hey",
"intent": "greet",
"entities": []
}
]
}
}
11 changes: 11 additions & 0 deletions data/test/lookup_tables/lookup_table.md
@@ -0,0 +1,11 @@
## intent:restaurant_search
- i'm looking for a [sushi](food) place to eat
- I want to grab [tacos](food)
- I am searching for a [pizza](food) spot
- I would like to drink [sweet berry wine](beverage) with my meal

## lookup:plates
- data/test/lookup_tables/plates.txt

## lookup:drinks
- data/test/lookup_tables/drinks.txt
2 changes: 2 additions & 0 deletions data/test/lookup_tables/plates.txt
@@ -0,0 +1,2 @@
tacos, beef, mapo tofu
burrito, lettuce wrap
36 changes: 35 additions & 1 deletion docs/dataformat.rst
@@ -37,13 +37,16 @@ Examples are grouped by intent, and entities are annotated as markdown links.
## regex:zipcode
- [0-9]{5}

## lookup:streets
- path/to/streets.txt
Contributor:
I think we should maybe have a relevant intent example here, otherwise this lookup table makes no sense

Contributor Author (@twhughes, Aug 29, 2018):
what do you mean by this? Like a lookup table that would actually be used in the banking example? I updated it to a lookup table named accounts that has path path/to/accounts.txt

Contributor:
So what I meant is adding one/two lines of intent examples further up that would have some sentences where a lookup table could be relevant. I'm not sure it's relevant to the current example there is. So maybe add a new intent that would make good use of a lookup table. Does that make sense?

Contributor Author:
Yes, I think it makes sense. I changed it to a lookup table of accounts names and added a comment

<!-- lookup table of account names for improving entity extraction (savings, checking, ...) -->

So now I think it is relevant without having to add more examples. For example the synonym pink pig in the example is also not directly relevant. Let me know if you agree.


The training data for Rasa NLU is structured into different parts:
examples, synonyms, and regex features.
examples, synonyms, regex features, and lookup tables.

Synonyms will map extracted entities to the same name, for example mapping "my savings account" to simply "savings".
However, this only happens *after* the entities have been extracted, so you need to provide examples with the synonyms present so that Rasa can learn to pick them up.

Lookup tables may be specified as txt files containing comma-separated words or phrases. Upon loading the training data, these files are used to generate case-insensitive regex patterns that are added to the regex features.
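The loading step described here can be sketched as a small helper (hypothetical name `lookup_regex_from_file`, not the actual Rasa function; the `(?i)(a|b|...)` pattern shape matches what this PR's tests assert):

```python
import io


def lookup_regex_from_file(file_path):
    """Build a case-insensitive regex from a comma-separated lookup file."""
    elements = []
    with io.open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            # strip whitespace and drop empty entries
            elements += [e.strip() for e in line.split(",") if e.strip()]
    # (?i) makes the whole alternation case-insensitive
    return "(?i)(" + "|".join(elements) + ")"
```

Applied to the ``drinks.txt`` test file above, this would produce ``(?i)(mojito|lemonade|sweet berry wine|tea|club mate)``.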

JSON Format
-----------
@@ -58,6 +61,7 @@ The most important one is ``common_examples``.
"rasa_nlu_data": {
"common_examples": [],
"regex_features" : [],
"lookup_tables" : [],
"entity_synonyms": []
}
}
@@ -230,6 +234,36 @@ for these extractors. Currently, all intent classifiers make use of available re
training data!


Lookup Tables
-------------
Lookup tables in the form of external files can also be specified in the training data. The externally supplied lookup tables must be in a comma-separated format. For example, ``data/lookup_tables/streets.txt`` may contain

main street, washington ave, elm street, ...
Contributor:
Rather than streets, maybe just include one of the test files (plates or drinks)?

Contributor Author:
fixed.


And can be loaded in along with ``data/lookup_tables/cities.txt`` as:
Contributor:
not sure we really need to show how to load two, one should be enough

Contributor Author:
ok, changed


.. code-block:: json

{
"rasa_nlu_data": {
"lookup_tables": [
{
"name": "streets",
"file_path": "data/lookup_tables/streets.txt"
},
{
"name": "cities",
"file_path": "data/lookup_tables/cities.txt"
}
]
}
}

When lookup tables are supplied in training data, the contents are combined into a large, case-insensitive regex pattern that looks for exact matches in the training examples. These regexes match over multiple tokens, so ``main street`` would match ``meet me at 1223 main street at 5 pm`` as ``[0 0 0 0 1 1 0 0 0]``. These regexes are processed identically to the regular regex patterns directly specified in the training data. A few lookup tables for common entities are specified in ``rasa_nlu/data/lookups/``
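The multi-token matching described above can be sketched like this (illustrative only; the real feature extraction lives in the `RegexFeaturizer`):

```python
import re


def token_match_vector(pattern, text):
    """Mark each whitespace token that falls inside a regex match with 1."""
    tokens = text.split()
    flags = [0] * len(tokens)
    # record each token's character span in the original string
    spans, offset = [], 0
    for t in tokens:
        start = text.index(t, offset)
        spans.append((start, start + len(t)))
        offset = start + len(t)
    for m in re.finditer(pattern, text):
        for i, (s, e) in enumerate(spans):
            if s >= m.start() and e <= m.end():
                flags[i] = 1
    return flags


print(token_match_vector(r"(?i)(main street)",
                         "meet me at 1223 main street at 5 pm"))
# [0, 0, 0, 0, 1, 1, 0, 0, 0]
```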
Contributor:
wait are they? I don't see a rasa_nlu/data/lookups/ 😄

Contributor Author:
I'm still working on them.. may add later. For now will remove this comment.

Contributor:
Yeah I'm not sure we'll add them to the NLU repo, or host them elsewhere tbh


.. note::
For lookup tables to be effective, there must be a few examples of matches in your training data. Otherwise the model will not learn to use the lookup table match features.
Contributor:
I think we should probably add a warning of some sort here not to add gigantic lookup tables, basically a short summary of what you mentioned in Slack

Contributor Author:
added


Organization
------------

11 changes: 7 additions & 4 deletions rasa_nlu/training_data/formats/markdown.py
@@ -7,15 +7,16 @@
import logging

from rasa_nlu.training_data import Message, TrainingData
from rasa_nlu.training_data.util import check_duplicate_synonym
from rasa_nlu.training_data.util import check_duplicate_synonym, generate_lookup_regex
from rasa_nlu.utils import build_entity

from rasa_nlu.training_data.formats.readerwriter import TrainingDataReader, TrainingDataWriter

INTENT = "intent"
SYNONYM = "synonym"
REGEX = "regex"
available_sections = [INTENT, SYNONYM, REGEX]
LOOKUP = "lookup"
available_sections = [INTENT, SYNONYM, REGEX, LOOKUP]
ent_regex = re.compile(r'\[(?P<entity_text>[^\]]+)'
r'\]\((?P<entity>\w*?)'
r'(?:\:(?P<value>[^)]+))?\)') # [entity_text](entity_type(:entity_synonym)?)
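To see what `ent_regex` captures, a standalone reproduction of the pattern above against one of the markdown examples from this PR:

```python
import re

# same pattern as ent_regex: [entity_text](entity_type(:entity_synonym)?)
ent_regex = re.compile(r'\[(?P<entity_text>[^\]]+)'
                       r'\]\((?P<entity>\w*?)'
                       r'(?:\:(?P<value>[^)]+))?\)')

m = ent_regex.search("I want to grab [tacos](food)")
print(m.group("entity_text"), m.group("entity"), m.group("value"))
# tacos food None
```

With a synonym annotation such as ``[pink pig](animal:piggy)``, the optional third group would capture ``piggy`` as the value.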
@@ -48,7 +49,6 @@ def reads(self, s, **kwargs):
self._set_current_section(header[0], header[1])
else:
self._parse_item(line)

return TrainingData(self.training_examples, self.entity_synonyms, self.regex_features)

@staticmethod
@@ -81,8 +81,11 @@ def _parse_item(self, line):
self.training_examples.append(parsed)
elif self.current_section == SYNONYM:
self._add_synonym(item, self.current_title)
else:
elif self.current_section == REGEX:
self.regex_features.append({"name": self.current_title, "pattern": item})
elif self.current_section == LOOKUP:
lookup_regex = generate_lookup_regex(item)
Member:
this doesn't seem to append the lookup table to the training data object. if that object is dumped again, it will be missing the json entry for the lookup table

Contributor Author:
Right. It appends it as a regex pattern, however, on the next line. So while the json would not have the same lookup table path, it would instead still be saved as a regex. Would it make more sense to keep the lookup table as is and do the conversion to regex in the regex featurizer instead?

Member:
mhm so ideally we should be able to do this:

  • load from file
  • export loaded training data object to file
  • have the same file

so yes, if we can move that somewhere else, that would probably fix this.

Member:
this is important because otherwise the conversion between file formats isn't seamless anymore (so converting a markdown file to json format would expand the lookup table into the regex)

Contributor Author:
ok I think what I'll do is have training_data.lookup_tables be a list of file paths to lookup tables, the same way it's entered in the training data currently. Then, in the RegexFeaturizer I will just load the files, construct the regex patterns, and load these into RegexFeaturizer.known_patterns. Does this sound like a reasonable approach?

Member:
yes 👍

Contributor Author:
@tmbo ok I pushed some changes to implement this. 1. Updated the tests 2. Confirmed that it still retains the lookup table file_name format when dumping markdown or json. 3. Confirmed it works with train_test function.

self.regex_features.append({"name": self.current_title, "pattern": lookup_regex})

def _find_entities_in_training_example(self, example):
"""Extracts entities from a markdown intent example."""
20 changes: 19 additions & 1 deletion rasa_nlu/training_data/formats/rasa.py
@@ -10,7 +10,7 @@
from rasa_nlu.training_data.formats.readerwriter import (
JsonTrainingDataReader,
TrainingDataWriter)
from rasa_nlu.training_data.util import transform_entity_synonyms
from rasa_nlu.training_data.util import transform_entity_synonyms, generate_lookup_regex
from rasa_nlu.utils import json_to_string

logger = logging.getLogger(__name__)
@@ -27,6 +27,12 @@ def read_from_json(self, js, **kwargs):
entity_examples = data.get("entity_examples", [])
entity_synonyms = data.get("entity_synonyms", [])
regex_features = data.get("regex_features", [])
lookup_tables = data.get("lookup_tables", [])

# generates regexes from lookup tables and adds to regex features
lookup_regexes = [{'name': t['name'],
'pattern': generate_lookup_regex(t['file_path'])} for t in lookup_tables]
regex_features += lookup_regexes
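The comprehension above simply turns each lookup table entry into a regex-feature dict; schematically (with `generate_lookup_regex` stubbed out for illustration — the real helper, shown in util.py below, reads the file contents):

```python
def generate_lookup_regex(file_path):
    # stub for illustration only; the real helper builds the
    # alternation from the file at file_path
    return "(?i)(tacos|beef|mapo tofu|burrito|lettuce wrap)"


lookup_tables = [{"name": "plates",
                  "file_path": "data/test/lookup_tables/plates.txt"}]
regex_features = []
lookup_regexes = [{'name': t['name'],
                   'pattern': generate_lookup_regex(t['file_path'])}
                  for t in lookup_tables]
regex_features += lookup_regexes
# regex_features now holds one feature dict per lookup table,
# indistinguishable from hand-written regex_features entries
```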

entity_synonyms = transform_entity_synonyms(entity_synonyms)

@@ -119,6 +125,14 @@ def _rasa_nlu_data_schema():
}
}

lookup_table_schema = {
"type": "object",
"properties": {
"name": {"type": "string"},
"file_path": {"type": "string"},
}
}

return {
"type": "object",
"properties": {
@@ -140,6 +154,10 @@
"entity_examples": {
"type": "array",
"items": training_example_schema
},
"lookup_tables": {
"type": "array",
"items": lookup_table_schema
}
}
}
21 changes: 21 additions & 0 deletions rasa_nlu/training_data/util.py
@@ -6,6 +6,7 @@
from __future__ import unicode_literals

import logging
import sys

logger = logging.getLogger(__name__)

@@ -24,3 +25,23 @@ def check_duplicate_synonym(entity_synonyms, text, syn, context_str=""):
if text in entity_synonyms and entity_synonyms[text] != syn:
logger.warning("Found inconsistent entity synonyms while {0}, overwriting {1}->{2} "
"with {1}->{3} during merge".format(context_str, text, entity_synonyms[text], syn))


def generate_lookup_regex(file_path, print_data_size=True):
Contributor:
print_data_size isn't used anywhere, please remove it

Contributor Author:
ah yes, good catch. Originally I had this as an optional functionality but now just print to logger regardless.

"""creates a regex out of the contents of a lookup table file"""
lookup_elements = []
with open(file_path, 'r') as f:
Contributor:
io.open

Contributor Author:
fixed

for l in f.readlines():
Contributor:
for line in f:

Contributor Author:
fixed

new_elements = [e.strip() for e in l.split(',')]
Contributor:
i'm actually wondering if maybe we should just do a file where you only provide one word/phrase per line, rather than allowing new lines and comma separated?

Contributor Author:
The way it is now we can do either or a combination of both. I can't think of a good argument for restricting that. Figured that the user might have different delimiters and wanted to minimize the amount of pre-processing that needed to happen.

Contributor:
Yeah I just think it's a bit odd haha, if we tell the user to do one thing it might make things less confusing. If we're leaving this the way it is, then we definitely need to mention in the docs that you can either separate by newline or by comma or both :P @tmbo would still like your opinion on this though

Member:
I see the point of @twhughes, but I'd go for restricting it to newlines for the moment as well. If new use cases come up that require separation on other characters, we can still add it again.
The reason is that once we add it, we need to support it for the future, and I'd rather avoid needing to support features where we are unsure about the usefulness.

Contributor:
alright @twhughes once you've made that change i'll give this PR another review

Contributor Author:
@akelad ok I made the change to the code and docs. only accepts newline-separated lookups now (commas will be treated as part of the phrase)

if '' in new_elements:
new_elements.remove('')
lookup_elements += new_elements
regex_string = '(?i)(' + '|'.join(lookup_elements) + ')'
Contributor:
hmm, did we not decide on using word boundaries?

Contributor Author (@twhughes, Aug 29, 2018):
yes, I hadn't pushed the change yet.

Contributor Author:
fixed


"""log info about the lookup table"""
num_words = len(lookup_elements)
regex_size = sys.getsizeof(regex_string)
logger.info("found {} words in lookup table '{}'"
" with a size of {:.2e} bytes".format(num_words, file_path, regex_size))
twhughes marked this conversation as resolved.

return regex_string
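Pulling together the later review outcomes on this function (io.open, newline-separated entries only, re.escape on elements, word boundaries), the eventual shape of the helper is presumably close to the following sketch; it is not guaranteed to match the merged code line for line:

```python
import io
import re


def generate_lookup_regex(file_path):
    """Create a case-insensitive, word-bounded regex from a
    newline-separated lookup table file (one phrase per line)."""
    elements = []
    with io.open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            element = line.strip()
            if element:
                # escape regex metacharacters inside lookup phrases
                elements.append(re.escape(element))
    # \b on both sides so e.g. 'tea' does not fire inside 'teapot'
    return "(?i)(" + "|".join(r"\b" + e + r"\b" for e in elements) + ")"
```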
16 changes: 16 additions & 0 deletions tests/base/test_training_data.py
@@ -130,6 +130,22 @@ def test_markdown_single_sections():
'Chinese': 'chinese'}


def test_lookup_table_json():
td_lookup = training_data.load_data('data/test/lookup_tables/lookup_table.json')
assert td_lookup.regex_features[0]['name'] == 'drinks'
assert td_lookup.regex_features[0]['pattern'] == '(?i)(mojito|lemonade|sweet berry wine|tea|club mate)'
assert td_lookup.regex_features[1]['name'] == 'plates'
assert td_lookup.regex_features[1]['pattern'] == '(?i)(tacos|beef|mapo tofu|burrito|lettuce wrap)'


def test_lookup_table_md():
td_lookup = training_data.load_data('data/test/lookup_tables/lookup_table.md')
assert td_lookup.regex_features[0]['name'] == 'drinks'
assert td_lookup.regex_features[0]['pattern'] == '(?i)(mojito|lemonade|sweet berry wine|tea|club mate)'
assert td_lookup.regex_features[1]['name'] == 'plates'
assert td_lookup.regex_features[1]['pattern'] == '(?i)(tacos|beef|mapo tofu|burrito|lettuce wrap)'


def test_repeated_entities():
data = """
{