Support 'memory zones' for user memory management (#13621)
Add a context manager nlp.memory_zone(), which will begin
memory_zone() blocks on the vocab, string store, and potentially
other components.

Example usage:

```
with nlp.memory_zone():
    for doc in nlp.pipe(texts):
        do_something(doc)
# do_something(doc) <-- Invalid
```

Once the memory_zone() block exits, spaCy will free any shared
resources that were allocated for the text processing that occurred
within the memory zone. If you create Doc objects within a memory
zone, it's invalid to access them once the memory zone has expired.
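
The intended pattern is to copy anything you need out of the Doc
objects into plain Python data before the zone closes. A minimal
sketch, assuming an nlp object, a texts iterable, and a hypothetical
results list:

```
results = []
with nlp.memory_zone():
    for doc in nlp.pipe(texts):
        # Copy out plain strings; these stay valid after the zone closes.
        results.append([(ent.text, ent.label_) for ent in doc.ents])
# The Doc objects are invalid here, but results only holds plain strings.
```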

The purpose of this is that spaCy creates and stores Lexeme objects
in the Vocab that can be shared between multiple Doc objects, and it
also interns strings. Normally, spaCy can't know when all the Doc
objects using a Lexeme have gone out of scope, so new Lexemes
accumulate in the vocab, causing memory pressure.

Memory zones solve this problem by telling spaCy "okay none of the
documents allocated within this block will be accessed again". This
lets spaCy free all new Lexeme objects and other data that were
created during the block.
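
The effect on the Vocab is visible in the new tests below
(spacy/tests/vocab_vectors/test_memory_zone.py); roughly, a Lexeme
created inside the zone is gone once the zone closes:

```
from spacy.vocab import Vocab

vocab = Vocab()
with vocab.memory_zone():
    lex = vocab["horse"]       # transient Lexeme, valid inside the zone
    assert lex.text == "horse"
assert "horse" not in vocab    # the transient entry was freed with the zone
```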

The mechanism is general, so memory_zone() context managers can be
added to other components that could benefit from them, e.g. pipeline
components.

I experimented with adding memory zone support to the tokenizer as
well, for its cache. However, this seems unnecessarily complicated,
and it makes more sense to just put a limit on the cache size. This
lets spaCy benefit more from the cache's efficiency advantage,
because we can maintain a (bounded) cache even if only small batches
of documents are being processed.
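
The bound is exposed as a new max_cache_size argument on the
Tokenizer (defaulting to 10000 in the diff below). A minimal sketch
of setting it explicitly, assuming the English blank pipeline:

```
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = English()
# Cap the number of cached tokenization chunks for this tokenizer.
tokenizer = Tokenizer(nlp.vocab, max_cache_size=5000)
```
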
honnibal authored Sep 9, 2024
1 parent 608f65c commit 1b8d560
Showing 8 changed files with 232 additions and 49 deletions.
7 changes: 5 additions & 2 deletions spacy/strings.pxd
@@ -25,5 +25,8 @@ cdef class StringStore:
cdef vector[hash_t] keys
cdef public PreshMap _map

cdef const Utf8Str* intern_unicode(self, str py_string)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash)
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient)
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient)
cdef vector[hash_t] _transient_keys
cdef PreshMap _transient_map
cdef Pool _non_temp_mem
141 changes: 121 additions & 20 deletions spacy/strings.pyx
@@ -1,6 +1,10 @@
# cython: infer_types=True
# cython: profile=False
cimport cython

from contextlib import contextmanager
from typing import Iterator, List, Optional

from libc.stdint cimport uint32_t
from libc.string cimport memcpy
from murmurhash.mrmr cimport hash32, hash64
@@ -31,7 +35,7 @@ def get_string_id(key):
This function optimises for convenience over performance, so shouldn't be
used in tight loops.
"""
cdef hash_t str_hash
cdef hash_t str_hash
if isinstance(key, str):
if len(key) == 0:
return 0
@@ -45,8 +49,8 @@ def get_string_id(key):
elif _try_coerce_to_hash(key, &str_hash):
# Coerce the integral key to the expected primitive hash type.
# This ensures that custom/overloaded "primitive" data types
# such as those implemented by numpy are not inadvertently used
# downstream (as these are internally implemented as custom PyObjects
# such as those implemented by numpy are not inadvertently used
# downstream (as these are internally implemented as custom PyObjects
# whose comparison operators can incur a significant overhead).
return str_hash
else:
@@ -119,7 +123,9 @@ cdef class StringStore:
strings (iterable): A sequence of unicode strings to add to the store.
"""
self.mem = Pool()
self._non_temp_mem = self.mem
self._map = PreshMap()
self._transient_map = None
if strings is not None:
for string in strings:
self.add(string)
@@ -152,10 +158,13 @@ cdef class StringStore:
return SYMBOLS_BY_INT[str_hash]
else:
utf8str = <Utf8Str*>self._map.get(str_hash)
if utf8str is NULL and self._transient_map is not None:
utf8str = <Utf8Str*>self._transient_map.get(str_hash)
else:
# TODO: Raise an error instead
utf8str = <Utf8Str*>self._map.get(string_or_id)

if utf8str is NULL and self._transient_map is not None:
utf8str = <Utf8Str*>self._transient_map.get(str_hash)
if utf8str is NULL:
raise KeyError(Errors.E018.format(hash_value=string_or_id))
else:
@@ -175,10 +184,46 @@ cdef class StringStore:
else:
return self[key]

def add(self, string):
def __reduce__(self):
strings = list(self.non_transient_keys())
return (StringStore, (strings,), None, None, None)

def __len__(self) -> int:
"""The number of strings in the store.

RETURNS (int): The number of strings in the store.
"""
return self._keys.size() + self._transient_keys.size()
@contextmanager
def memory_zone(self, mem: Optional[Pool] = None) -> Pool:
"""Begin a block where all resources allocated during the block will
be freed at the end of it. If a resource was created within the
memory zone block, accessing it outside the block is invalid.
Behaviour of this invalid access is undefined. Memory zones should
not be nested.

The memory zone is helpful for services that need to process large
volumes of text with a defined memory budget.
"""
if mem is None:
mem = Pool()
self.mem = mem
self._transient_map = PreshMap()
yield mem
self.mem = self._non_temp_mem
self._transient_map = None
self._transient_keys.clear()
def add(self, string: str, allow_transient: bool = False) -> int:
"""Add a string to the StringStore.

string (str): The string to add.
allow_transient (bool): Allow the string to be stored in the 'transient'
map, which will be flushed at the end of the memory zone. Strings
encountered during arbitrary text processing should be added
with allow_transient=True, while labels and other strings used
internally should not.
RETURNS (uint64): The string's hash value.
"""
cdef hash_t str_hash
@@ -188,22 +233,26 @@ cdef class StringStore:
string = string.encode("utf8")
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
self._intern_utf8(string, len(string), &str_hash, allow_transient)
elif isinstance(string, bytes):
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
str_hash = hash_utf8(string, len(string))
self._intern_utf8(string, len(string), &str_hash)
self._intern_utf8(string, len(string), &str_hash, allow_transient)
else:
raise TypeError(Errors.E017.format(value_type=type(string)))
return str_hash
def __len__(self):
"""The number of strings in the store.
if string in SYMBOLS_BY_STR:
return SYMBOLS_BY_STR[string]
else:
return self._intern_str(string, allow_transient)

RETURNS (int): The number of strings in the store.
"""
return self.keys.size()
return self.keys.size() + self._transient_keys.size()
def __contains__(self, string_or_id not None):
"""Check whether a string or ID is in the store.
@@ -222,30 +271,70 @@ cdef class StringStore:
pass
else:
# TODO: Raise an error instead
return self._map.get(string_or_id) is not NULL

if self._map.get(string_or_id) is not NULL:
return True
elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
return True
else:
return False
if str_hash < len(SYMBOLS_BY_INT):
return True
else:
return self._map.get(str_hash) is not NULL
if self._map.get(str_hash) is not NULL:
return True
elif self._transient_map is not None and self._transient_map.get(string_or_id) is not NULL:
return True
else:
return False
def __iter__(self):
"""Iterate over the strings in the store, in order.

YIELDS (str): A string in the store.
"""
yield from self.non_transient_keys()
yield from self.transient_keys()
def non_transient_keys(self) -> Iterator[str]:
"""Iterate over the stored strings in insertion order.

RETURNS: A list of strings.
"""
cdef int i
cdef hash_t key
for i in range(self.keys.size()):
key = self.keys[i]
utf8str = <Utf8Str*>self._map.get(key)
yield decode_Utf8Str(utf8str)
# TODO: Iterate OOV here?
def __reduce__(self):
strings = list(self)
return (StringStore, (strings,), None, None, None)
def transient_keys(self) -> Iterator[str]:
if self._transient_map is None:
return []
for i in range(self._transient_keys.size()):
utf8str = <Utf8Str*>self._transient_map.get(self._transient_keys[i])
yield decode_Utf8Str(utf8str)
def values(self) -> List[int]:
"""Iterate over the stored strings hashes in insertion order.

RETURNS: A list of string hashs.
"""
cdef int i
hashes = [None] * self._keys.size()
for i in range(self._keys.size()):
hashes[i] = self._keys[i]
if self._transient_map is not None:
transient_hashes = [None] * self._transient_keys.size()
for i in range(self._transient_keys.size()):
transient_hashes[i] = self._transient_keys[i]
else:
transient_hashes = []
return hashes + transient_hashes
def to_disk(self, path):
"""Save the current state to a directory.

@@ -269,7 +358,7 @@ cdef class StringStore:
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
self.add(word, allow_transient=False)
return self
def to_bytes(self, **kwargs):
@@ -289,30 +378,42 @@ cdef class StringStore:
prev = list(self)
self._reset_and_load(strings)
for word in prev:
self.add(word)
self.add(word, allow_transient=False)
return self
def _reset_and_load(self, strings):
self.mem = Pool()
self._map = PreshMap()
self.keys.clear()
for string in strings:
self.add(string)
self.add(string, allow_transient=False)
cdef const Utf8Str* intern_unicode(self, str py_string):
cdef const Utf8Str* intern_unicode(self, str py_string, bint allow_transient):
# 0 means missing, but we don't bother offsetting the index.
cdef bytes byte_string = py_string.encode("utf8")
return self._intern_utf8(byte_string, len(byte_string), NULL)
return self._intern_utf8(byte_string, len(byte_string), NULL, allow_transient)
@cython.final
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash):
cdef const Utf8Str* _intern_utf8(self, char* utf8_string, int length, hash_t* precalculated_hash, bint allow_transient):
# TODO: This function's API/behaviour is an unholy mess...
# 0 means missing, but we don't bother offsetting the index.
cdef hash_t key = precalculated_hash[0] if precalculated_hash is not NULL else hash_utf8(utf8_string, length)
cdef Utf8Str* value = <Utf8Str*>self._map.get(key)
if value is not NULL:
return value
if allow_transient and self._transient_map is not None:
# If we've already allocated a transient string, and now we
# want to intern it permanently, we'll end up with the string
# in both places. That seems fine -- I don't see why we need
# to remove it from the transient map.
value = <Utf8Str*>self._transient_map.get(key)
if value is not NULL:
return value
value = _allocate(self.mem, <unsigned char*>utf8_string, length)
self._map.set(key, value)
self.keys.push_back(key)
if allow_transient and self._transient_map is not None:
self._transient_map.set(key, value)
self._transient_keys.push_back(key)
else:
self._map.set(key, value)
self.keys.push_back(key)
return value
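
For reference, a rough sketch of the behaviour the StringStore
changes above aim for, using only the names that appear in the diff:

```
from spacy.strings import StringStore

store = StringStore()
permanent = store.add("my_label")  # allow_transient defaults to False
with store.memory_zone():
    transient = store.add("horse", allow_transient=True)
    assert transient in store
assert transient not in store      # flushed when the zone closed
assert permanent in store
```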
36 changes: 36 additions & 0 deletions spacy/tests/vocab_vectors/test_memory_zone.py
@@ -0,0 +1,36 @@
from spacy.vocab import Vocab


def test_memory_zone_no_insertion():
vocab = Vocab()
with vocab.memory_zone():
pass
lex = vocab["horse"]
assert lex.text == "horse"


def test_memory_zone_insertion():
vocab = Vocab()
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
with vocab.memory_zone():
lex = vocab["horse"]
assert lex.text == "horse"
assert "dog" in vocab
assert "horse" not in vocab


def test_memory_zone_redundant_insertion():
"""Test that if we insert an already-existing word while
in the memory zone, it stays persistent"""
vocab = Vocab()
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
with vocab.memory_zone():
lex = vocab["horse"]
assert lex.text == "horse"
_ = vocab["dog"]
assert "dog" in vocab
assert "horse" not in vocab
4 changes: 1 addition & 3 deletions spacy/tokenizer.pxd
@@ -25,9 +25,7 @@ cdef class Tokenizer:
cdef PhraseMatcher _special_matcher
# TODO convert to bool in v4
cdef int _faster_heuristics
# TODO next one is unused and should be removed in v4
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int2
cdef public int max_cache_size

cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
12 changes: 9 additions & 3 deletions spacy/tokenizer.pyx
@@ -30,7 +30,7 @@ cdef class Tokenizer:
"""
def __init__(self, Vocab vocab, rules=None, prefix_search=None,
suffix_search=None, infix_finditer=None, token_match=None,
url_match=None, faster_heuristics=True):
url_match=None, faster_heuristics=True, max_cache_size=10000):
"""Create a `Tokenizer`, to create `Doc` objects given unicode text.
vocab (Vocab): A storage container for lexical types.
Expand All @@ -50,6 +50,7 @@ cdef class Tokenizer:
faster_heuristics (bool): Whether to restrict the final
Matcher-based pass for rules to those containing affixes or space.
Defaults to True.
max_cache_size (int): Maximum number of tokenization chunks to cache.
EXAMPLE:
>>> tokenizer = Tokenizer(nlp.vocab)
Expand All @@ -69,6 +70,7 @@ cdef class Tokenizer:
self._rules = {}
self._special_matcher = PhraseMatcher(self.vocab)
self._load_special_cases(rules)
self.max_cache_size = max_cache_size

@property
def token_match(self):
@@ -397,8 +399,9 @@ cdef class Tokenizer:
has_special, with_special_cases)
self._attach_tokens(tokens, span, &prefixes, &suffixes, has_special,
with_special_cases)
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)
if len(self._cache) < self.max_cache_size:
self._save_cached(&tokens.c[orig_size], orig_key, has_special,
tokens.length - orig_size)

cdef str _split_affixes(
self,
@@ -514,6 +517,9 @@ cdef class Tokenizer:
if n <= 0:
# avoid mem alloc of zero length
return 0
# Historically this check was mostly used to avoid caching
# chunks that had tokens owned by the Doc. Now that that's
# not a thing, I don't think we need this?
for i in range(n):
if self.vocab._by_orth.get(tokens[i].lex.orth) == NULL:
return 0
4 changes: 3 additions & 1 deletion spacy/vocab.pxd
@@ -41,7 +41,9 @@ cdef class Vocab:
cdef const TokenC* make_fused_token(self, substrings) except NULL

cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex) except -1
cdef int _add_lex_to_vocab(self, hash_t key, const LexemeC* lex, bint is_transient) except -1
cdef const LexemeC* _new_lexeme(self, Pool mem, str string) except NULL

cdef PreshMap _by_orth
cdef Pool _non_temp_mem
cdef vector[attr_t] _transient_orths