Make sure to limit the amount of cached files parso stores, fixes dav…
davidhalter committed Jan 5, 2020
1 parent 29b57d9 commit 2b0b093
Showing 2 changed files with 87 additions and 2 deletions.
37 changes: 35 additions & 2 deletions parso/cache.py
@@ -17,6 +17,21 @@
 
 LOG = logging.getLogger(__name__)
 
+_CACHED_FILE_MINIMUM_SURVIVAL = 60 * 10  # 10 minutes
+"""
+Cached files should survive at least a few minutes.
+"""
+
+_CACHED_SIZE_TRIGGER = 600
+"""
+This setting limits the amount of cached files. It's basically a way to start
+garbage collection.
+
+The reasoning for this limit being as big as it is, is the following:
+Numpy, Pandas, Matplotlib and Tensorflow together use about 500 files. This
+makes Jedi use ~500mb of memory. Since we might want a bit more than those few
+libraries, we just increase it a bit.
+"""
 
 _PICKLE_VERSION = 32
 """
@@ -76,6 +91,7 @@ def __init__(self, node, lines, change_time=None):
         if change_time is None:
             change_time = time.time()
         self.change_time = change_time
+        self.last_used = change_time
 
 
 def load_module(hashed_grammar, file_io, cache_path=None):
@@ -89,6 +105,7 @@ def load_module(hashed_grammar, file_io, cache_path=None):
     try:
         module_cache_item = parser_cache[hashed_grammar][file_io.path]
         if p_time <= module_cache_item.change_time:
+            module_cache_item.last_used = time.time()
             return module_cache_item.node
     except KeyError:
         return _load_from_file_system(
@@ -122,11 +139,27 @@ def _load_from_file_system(hashed_grammar, path, p_time, cache_path=None):
     except FileNotFoundError:
         return None
     else:
-        parser_cache.setdefault(hashed_grammar, {})[path] = module_cache_item
+        _set_cache_item(hashed_grammar, path, module_cache_item)
         LOG.debug('pickle loaded: %s', path)
         return module_cache_item.node
 
 
+def _set_cache_item(hashed_grammar, path, module_cache_item):
+    if sum(len(v) for v in parser_cache.values()) >= _CACHED_SIZE_TRIGGER:
+        # Garbage collection of old cache files.
+        # We are basically throwing everything away that hasn't been accessed
+        # in 10 minutes.
+        cutoff_time = time.time() - _CACHED_FILE_MINIMUM_SURVIVAL
+        for key, path_to_item_map in parser_cache.items():
+            parser_cache[key] = {
+                path: node_item
+                for path, node_item in path_to_item_map.items()
+                if node_item.last_used > cutoff_time
+            }
+
+    parser_cache.setdefault(hashed_grammar, {})[path] = module_cache_item
+
+
 def save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_path=None):
     path = file_io.path
     try:
@@ -136,7 +169,7 @@ def save_module(hashed_grammar, file_io, module, lines, pickling=True, cache_path=None):
         pickling = False
 
     item = _NodeCacheItem(module, lines, p_time)
-    parser_cache.setdefault(hashed_grammar, {})[path] = item
+    _set_cache_item(hashed_grammar, path, item)
     if pickling and path is not None:
        _save_to_file_system(hashed_grammar, path, item, cache_path=cache_path)
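The eviction scheme above is worth spelling out: nothing is ever evicted until the cache holds at least _CACHED_SIZE_TRIGGER (600) modules, and even then only entries that have been idle for longer than _CACHED_FILE_MINIMUM_SURVIVAL (10 minutes) are dropped. A minimal, self-contained sketch of that pattern (the names mirror the diff; everything else here is illustrative, not parso's API):

import time

SIZE_TRIGGER = 600          # eviction is only attempted at this size
MINIMUM_SURVIVAL = 60 * 10  # idle seconds before an entry may be dropped

class Item(object):
    def __init__(self, last_used):
        self.last_used = last_used

cache = {}  # grammar hash -> {path: Item}, like parser_cache

def set_item(grammar_hash, path, item):
    if sum(len(v) for v in cache.values()) >= SIZE_TRIGGER:
        cutoff = time.time() - MINIMUM_SURVIVAL
        for key, path_map in cache.items():
            # Keep only entries that were used recently enough.
            cache[key] = {p: i for p, i in path_map.items()
                          if i.last_used > cutoff}
    cache.setdefault(grammar_hash, {})[path] = item

# 600 entries that went idle half an hour ago are all swept out the next
# time something is stored; the fresh entry itself survives:
cache['old'] = {'/p%d' % i: Item(time.time() - 1800) for i in range(600)}
set_item('new', '/fresh', Item(time.time()))
assert sum(len(v) for v in cache.values()) == 1

Note that the sweep only runs inside _set_cache_item, i.e. when a new module is stored; a full cache that is never written to again is never shrunk.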
52 changes: 52 additions & 0 deletions test/test_cache.py
@@ -5,12 +5,14 @@
 from os import unlink
 
 import pytest
+import time
 
 from parso.cache import _NodeCacheItem, save_module, load_module, \
     _get_hashed_path, parser_cache, _load_from_file_system, _save_to_file_system
 from parso import load_grammar
 from parso import cache
 from parso import file_io
+from parso import parse
 
 
 @pytest.fixture()
@@ -87,3 +89,53 @@ def test_modulepickling_simulate_deleted_cache(tmpdir):
 
     cached2 = load_module(grammar._hashed, io)
     assert cached2 is None
+
+
+def test_cache_limit():
+    def cache_size():
+        return sum(len(v) for v in parser_cache.values())
+
+    try:
+        parser_cache.clear()
+        future_node_cache_item = _NodeCacheItem('bla', [], change_time=time.time() + 10e6)
+        old_node_cache_item = _NodeCacheItem('bla', [], change_time=time.time() - 10e4)
+        parser_cache['some_hash_old'] = {
+            '/path/%s' % i: old_node_cache_item for i in range(300)
+        }
+        parser_cache['some_hash_new'] = {
+            '/path/%s' % i: future_node_cache_item for i in range(300)
+        }
+        assert cache_size() == 600
+        parse('somecode', cache=True, path='/path/somepath')
+        assert cache_size() == 301
+    finally:
+        parser_cache.clear()
+
+
+class _FixedTimeFileIO(file_io.KnownContentFileIO):
+    def __init__(self, path, content, last_modified):
+        super(_FixedTimeFileIO, self).__init__(path, content)
+        self._last_modified = last_modified
+
+    def get_last_modified(self):
+        return self._last_modified
+
+
+@pytest.mark.parametrize('diff_cache', [False, True])
+@pytest.mark.parametrize('use_file_io', [False, True])
+def test_cache_last_used_update(diff_cache, use_file_io):
+    p = '/path/last-used'
+    parser_cache.clear()  # Clear, because then it's easier to find stuff.
+    parse('somecode', cache=True, path=p)
+    node_cache_item = next(iter(parser_cache.values()))[p]
+    now = time.time()
+    assert node_cache_item.last_used < now
+
+    if use_file_io:
+        f = _FixedTimeFileIO(p, 'code', node_cache_item.last_used - 10)
+        parse(file_io=f, cache=True, diff_cache=diff_cache)
+    else:
+        parse('somecode2', cache=True, path=p, diff_cache=diff_cache)
+
+    node_cache_item = next(iter(parser_cache.values()))[p]
+    assert now < node_cache_item.last_used < time.time()
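The tests above drive the cache through parso's public entry point. For reference, the same flow outside the test suite, as a short usage sketch (the path is made up, and parser_cache is an internal detail that the tests also reach into):

import parso
from parso.cache import parser_cache

module = parso.parse('x = 1\n', cache=True, path='/tmp/example.py')
# The tree is now cached per grammar hash and path, and its last_used
# timestamp is refreshed whenever the same unchanged file is parsed again.
assert any('/tmp/example.py' in paths for paths in parser_cache.values())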
