From 16683a7a32ebac94c323511288d10aa5991b2a05 Mon Sep 17 00:00:00 2001 From: Kevin Meinhardt Date: Mon, 18 Nov 2024 11:51:30 +0100 Subject: [PATCH] Remove time inefficient loop in not_blocked_items --- src/olympia/blocklist/mlbf.py | 69 +++++++++++++++--------- src/olympia/blocklist/tests/test_cron.py | 40 ++++++++++++++ 2 files changed, 85 insertions(+), 24 deletions(-) diff --git a/src/olympia/blocklist/mlbf.py b/src/olympia/blocklist/mlbf.py index d2cc578afb33..f514effc5623 100644 --- a/src/olympia/blocklist/mlbf.py +++ b/src/olympia/blocklist/mlbf.py @@ -32,6 +32,41 @@ def ordered_diff_lists( return extras, deletes, changed_count +def get_not_blocked_items(all_blocks_ids: List[int]) -> List[Tuple[str, str]]: + """ + Returns a list of tuples containing the guid, version of all not blocked + versions. We use distinct to avoid duplicates, order by ID to ensure + cache.json is always sorted consistently, and return the values as a tuple + to make it easier to mock in tests. + """ + return ( + Version.unfiltered.exclude(id__in=all_blocks_ids or ()) + .distinct() + .order_by('id') + .values_list('addon__addonguid__guid', 'version') + ) + + +def get_all_blocked_items() -> List[Tuple[str, str]]: + """ + Returns a list of tuples containing the guid, version, version_id, and + block_type of all blocked items. We use distinct to avoid duplicates, + Order by ID to ensure cache.json is always sorted consistently, + and return the values as a tuple to make it easier to mock in tests. + """ + return ( + BlockVersion.objects.filter(version__file__is_signed=True) + .distinct() + .order_by('id') + .values_list( + 'block__guid', + 'version__version', + 'version_id', + 'block_type', + ) + ) + + def generate_mlbf(stats, blocked, not_blocked): log.info('Starting to generating bloomfilter') @@ -136,25 +171,14 @@ def __init__(self, *args, **kwargs): @cached_property def _all_blocks(self): - return ( - BlockVersion.objects.filter(version__file__is_signed=True) - .distinct() - .order_by('id') - .values_list( - 'block__guid', - 'version__version', - 'version_id', - 'block_type', - named=True, - ) - ) + return get_all_blocked_items() def _format_blocks(self, block_type: BlockType) -> List[str]: return MLBF.hash_filter_inputs( [ - (version.block__guid, version.version__version) + (version[0], version[1]) for version in self._all_blocks - if version.block_type == block_type + if version[3] == block_type ] ) @@ -168,21 +192,18 @@ def soft_blocked_items(self) -> List[str]: @cached_property def not_blocked_items(self) -> List[str]: - all_blocks_ids = [version.version_id for version in self._all_blocks] + all_blocks_ids = [version[2] for version in self._all_blocks] not_blocked_items = MLBF.hash_filter_inputs( - Version.unfiltered.exclude(id__in=all_blocks_ids or ()) - .distinct() - .order_by('id') - .values_list('addon__addonguid__guid', 'version') + get_not_blocked_items(all_blocks_ids) ) + blocked_items = set(self.blocked_items + self.soft_blocked_items) # even though we exclude all the version ids in the query there's an # edge case where the version string occurs twice for an addon so we # ensure not_blocked_items contain no blocked_items or soft_blocked_items. - return [ - item - for item in not_blocked_items - if item not in self.blocked_items + self.soft_blocked_items - ] + newly_not_blocked_items, _, _ = ordered_diff_lists( + blocked_items, not_blocked_items + ) + return newly_not_blocked_items class MLBF: diff --git a/src/olympia/blocklist/tests/test_cron.py b/src/olympia/blocklist/tests/test_cron.py index 0441c64dd3a8..4fbeb769f177 100644 --- a/src/olympia/blocklist/tests/test_cron.py +++ b/src/olympia/blocklist/tests/test_cron.py @@ -379,6 +379,46 @@ def test_invalid_cache_results_in_diff(self): in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list ) + @mock.patch('olympia.blocklist.mlbf.get_not_blocked_items') + @mock.patch('olympia.blocklist.mlbf.get_all_blocked_items') + def test_large_data_set( + self, + mock_get_all_blocked_items, + mock_get_not_blocked_items, + ): + """ + Test that the cron can handle a large data set + We can safely mock the filter cascase initialize and verify methods + because they massively impact performance of the test and we don't + need to test them here. + """ + two_million_blocked = [ + ( + f'{x}@blocked', + f'{x}@version', + x, + # even numbers are blocked, odd numbers are soft blocked + BlockType.BLOCKED if x % 2 == 0 else BlockType.SOFT_BLOCKED, + ) + for x in range(0, 2_000_000) + ] + one_million_not_blocked = [ + (f'{x}@not_blocked', f'{x}@version') for x in range(2_000_000, 3_000_000) + ] + + mock_get_all_blocked_items.return_value = two_million_blocked + mock_get_not_blocked_items.return_value = one_million_not_blocked + + upload_mlbf_to_remote_settings() + + mlbf = MLBF.load_from_storage(self.current_time) + assert len(mlbf.data.blocked_items) == 1_000_000 + assert len(mlbf.data.soft_blocked_items) == 1_000_000 + assert len(mlbf.data.not_blocked_items) == 1_000_000 + + with override_switch('enable-soft-blocking', active=True): + upload_mlbf_to_remote_settings() + class TestTimeMethods(TestCase): @freeze_time('2024-10-10 12:34:56')