Skip to content

Commit

Permalink
Remove time inefficient loop in not_blocked_items
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinMind committed Nov 18, 2024
1 parent 794c227 commit 16683a7
Show file tree
Hide file tree
Showing 2 changed files with 85 additions and 24 deletions.
69 changes: 45 additions & 24 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,41 @@ def ordered_diff_lists(
return extras, deletes, changed_count


def get_not_blocked_items(all_blocks_ids: List[int]) -> List[Tuple[str, str]]:
"""
Returns a list of tuples containing the guid, version of all not blocked
versions. We use distinct to avoid duplicates, order by ID to ensure
cache.json is always sorted consistently, and return the values as a tuple
to make it easier to mock in tests.
"""
return (
Version.unfiltered.exclude(id__in=all_blocks_ids or ())
.distinct()
.order_by('id')
.values_list('addon__addonguid__guid', 'version')
)


def get_all_blocked_items() -> List[Tuple[str, str]]:
"""
Returns a list of tuples containing the guid, version, version_id, and
block_type of all blocked items. We use distinct to avoid duplicates,
Order by ID to ensure cache.json is always sorted consistently,
and return the values as a tuple to make it easier to mock in tests.
"""
return (
BlockVersion.objects.filter(version__file__is_signed=True)
.distinct()
.order_by('id')
.values_list(
'block__guid',
'version__version',
'version_id',
'block_type',
)
)


def generate_mlbf(stats, blocked, not_blocked):
log.info('Starting to generating bloomfilter')

Expand Down Expand Up @@ -136,25 +171,14 @@ def __init__(self, *args, **kwargs):

@cached_property
def _all_blocks(self):
return (
BlockVersion.objects.filter(version__file__is_signed=True)
.distinct()
.order_by('id')
.values_list(
'block__guid',
'version__version',
'version_id',
'block_type',
named=True,
)
)
return get_all_blocked_items()

def _format_blocks(self, block_type: BlockType) -> List[str]:
return MLBF.hash_filter_inputs(
[
(version.block__guid, version.version__version)
(version[0], version[1])
for version in self._all_blocks
if version.block_type == block_type
if version[3] == block_type
]
)

Expand All @@ -168,21 +192,18 @@ def soft_blocked_items(self) -> List[str]:

@cached_property
def not_blocked_items(self) -> List[str]:
all_blocks_ids = [version.version_id for version in self._all_blocks]
all_blocks_ids = [version[2] for version in self._all_blocks]
not_blocked_items = MLBF.hash_filter_inputs(
Version.unfiltered.exclude(id__in=all_blocks_ids or ())
.distinct()
.order_by('id')
.values_list('addon__addonguid__guid', 'version')
get_not_blocked_items(all_blocks_ids)
)
blocked_items = set(self.blocked_items + self.soft_blocked_items)
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items contain no blocked_items or soft_blocked_items.
return [
item
for item in not_blocked_items
if item not in self.blocked_items + self.soft_blocked_items
]
newly_not_blocked_items, _, _ = ordered_diff_lists(
blocked_items, not_blocked_items
)
return newly_not_blocked_items


class MLBF:
Expand Down
40 changes: 40 additions & 0 deletions src/olympia/blocklist/tests/test_cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,46 @@ def test_invalid_cache_results_in_diff(self):
in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list
)

@mock.patch('olympia.blocklist.mlbf.get_not_blocked_items')
@mock.patch('olympia.blocklist.mlbf.get_all_blocked_items')
def test_large_data_set(
self,
mock_get_all_blocked_items,
mock_get_not_blocked_items,
):
"""
Test that the cron can handle a large data set
We can safely mock the filter cascase initialize and verify methods
because they massively impact performance of the test and we don't
need to test them here.
"""
two_million_blocked = [
(
f'{x}@blocked',
f'{x}@version',
x,
# even numbers are blocked, odd numbers are soft blocked
BlockType.BLOCKED if x % 2 == 0 else BlockType.SOFT_BLOCKED,
)
for x in range(0, 2_000_000)
]
one_million_not_blocked = [
(f'{x}@not_blocked', f'{x}@version') for x in range(2_000_000, 3_000_000)
]

mock_get_all_blocked_items.return_value = two_million_blocked
mock_get_not_blocked_items.return_value = one_million_not_blocked

upload_mlbf_to_remote_settings()

mlbf = MLBF.load_from_storage(self.current_time)
assert len(mlbf.data.blocked_items) == 1_000_000
assert len(mlbf.data.soft_blocked_items) == 1_000_000
assert len(mlbf.data.not_blocked_items) == 1_000_000

with override_switch('enable-soft-blocking', active=True):
upload_mlbf_to_remote_settings()


class TestTimeMethods(TestCase):
@freeze_time('2024-10-10 12:34:56')
Expand Down

0 comments on commit 16683a7

Please sign in to comment.