Skip to content

Commit

Permalink
add settings.CACHE_SIZE_LIMIT (#1140)
Browse files Browse the repository at this point in the history
Co-authored-by: tungsten <[email protected]>
  • Loading branch information
chebotarevmichael and tungsten authored Mar 15, 2023
1 parent 3a1d7a6 commit a11d128
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 16 deletions.
4 changes: 4 additions & 0 deletions dateparser/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ class Settings:
* `PARSERS`
* `DEFAULT_LANGUAGES`
* `LANGUAGE_DETECTION_CONFIDENCE_THRESHOLD`
* `CACHE_SIZE_LIMIT`
"""

_default = True
Expand Down Expand Up @@ -226,6 +227,9 @@ def check_settings(settings):
'type': float,
'extra_check': _check_between_0_and_1
},
'CACHE_SIZE_LIMIT': {
'type': int,
},
}

modified_settings = settings._mod_settings # check only modified settings
Expand Down
44 changes: 28 additions & 16 deletions dateparser/languages/dictionary.py
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,13 @@ def split(self, string, keep_formatting=False):

return list(filter(bool, chain.from_iterable(tokens)))

def _add_to_cache(self, value, cache):
    """Store ``value`` in ``cache`` and evict one entry if the cache is too big.

    The value is written to ``cache[registry_key][name]``, where
    ``registry_key`` comes from ``self._settings.registry_key`` and ``name``
    from ``self.info['name']``.

    :param value: object to cache.
    :param cache: two-level mapping (registry key -> name -> value) that is
        updated in place.

    The limit applies to the number of top-level registry keys, not to the
    number of values stored under them. A falsy ``CACHE_SIZE_LIMIT`` (such
    as ``0``) disables eviction.
    """
    settings = self._settings
    cache.setdefault(settings.registry_key, {})[self.info['name']] = value

    limit = settings.CACHE_SIZE_LIMIT
    if limit and len(cache) > limit:
        # Drop the first key in iteration order (the oldest insertion for a
        # regular dict). next(iter(...)) avoids copying every key into a
        # list just to read the first one.
        cache.pop(next(iter(cache)))

def _split_by_known_words(self, string, keep_formatting):
if not string:
return string
Expand Down Expand Up @@ -174,9 +181,10 @@ def _get_sorted_words_from_cache(self):
self._settings.registry_key not in self._sorted_words_cache
or self.info['name'] not in self._sorted_words_cache[self._settings.registry_key]
):
self._sorted_words_cache.setdefault(
self._settings.registry_key, {})[self.info['name']] = \
sorted([key for key in self], key=len, reverse=True)
self._add_to_cache(
cache=self._sorted_words_cache,
value=sorted([key for key in self], key=len, reverse=True),
)
return self._sorted_words_cache[self._settings.registry_key][self.info['name']]

def _get_split_regex_cache(self):
Expand All @@ -193,19 +201,21 @@ def _construct_split_regex(self):
regex = r"^(.*?)({})(.*)$".format(known_words_group)
else:
regex = r"^(.*?(?:\A|\W|_|\d))({})((?:\Z|\W|_|\d).*)$".format(known_words_group)
self._split_regex_cache.setdefault(
self._settings.registry_key, {})[self.info['name']] = \
re.compile(regex, re.UNICODE | re.IGNORECASE)
self._add_to_cache(
cache=self._split_regex_cache,
value=re.compile(regex, re.UNICODE | re.IGNORECASE),
)

def _get_sorted_relative_strings_from_cache(self):
if (
self._settings.registry_key not in self._sorted_relative_strings_cache
or self.info['name'] not in self._sorted_relative_strings_cache[self._settings.registry_key]
):
self._sorted_relative_strings_cache.setdefault(
self._settings.registry_key, {})[self.info['name']] = \
sorted([PARENTHESES_PATTERN.sub('', key) for key in
self._relative_strings], key=len, reverse=True)
self._add_to_cache(
cache=self._sorted_relative_strings_cache,
value=sorted([PARENTHESES_PATTERN.sub('', key) for key in
self._relative_strings], key=len, reverse=True),
)
return self._sorted_relative_strings_cache[self._settings.registry_key][self.info['name']]

def _get_split_relative_regex_cache(self):
Expand All @@ -222,9 +232,10 @@ def _construct_split_relative_regex(self):
regex = "({})".format(known_relative_strings_group)
else:
regex = "(?<=(?:\\A|\\W|_))({})(?=(?:\\Z|\\W|_))".format(known_relative_strings_group)
self._split_relative_regex_cache.setdefault(
self._settings.registry_key, {})[self.info['name']] = \
re.compile(regex, re.UNICODE | re.IGNORECASE)
self._add_to_cache(
cache=self._split_relative_regex_cache,
value=re.compile(regex, re.UNICODE | re.IGNORECASE),
)

def _get_match_relative_regex_cache(self):
if (
Expand All @@ -237,9 +248,10 @@ def _get_match_relative_regex_cache(self):
def _construct_match_relative_regex(self):
known_relative_strings_group = "|".join(self._get_sorted_relative_strings_from_cache())
regex = "^({})$".format(known_relative_strings_group)
self._match_relative_regex_cache.setdefault(
self._settings.registry_key, {})[self.info['name']] = \
re.compile(regex, re.UNICODE | re.IGNORECASE)
self._add_to_cache(
cache=self._match_relative_regex_cache,
value=re.compile(regex, re.UNICODE | re.IGNORECASE),
)


class NormalizedDictionary(Dictionary):
Expand Down
1 change: 1 addition & 0 deletions dateparser_data/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,4 +33,5 @@
# Other settings
'RETURN_TIME_AS_PERIOD': False,
'PARSERS': default_parsers,
'CACHE_SIZE_LIMIT': 1000,
}
3 changes: 3 additions & 0 deletions docs/settings.rst
Original file line number Diff line number Diff line change
Expand Up @@ -210,3 +210,6 @@ Dateparser in the future. For example, to ignore relative times:
>>> from dateparser_data.settings import default_parsers
>>> parsers = [parser for parser in default_parsers if parser != 'relative-time']
>>> parse('today', settings={'PARSERS': parsers})

``CACHE_SIZE_LIMIT``: limits the size of the caches that store data for already processed dates.
Defaults to ``1000``; set it to ``0`` to turn off the limit.

0 comments on commit a11d128

Please sign in to comment.