From 22871154664e87833bee2d3558415fb816555dc7 Mon Sep 17 00:00:00 2001 From: Jiradet Ounjai Date: Thu, 30 Nov 2023 21:57:23 +0700 Subject: [PATCH] Add GreenBench implementation (#1912) This PR ports the implementation of https://github.com/Rigorous-Software-Engineering/greenbench into fuzzbench. Essentially, we introduce a new experiment option that lets users opt in to the micro-benchmark fuzzing described in the GreenBench paper. --- common/experiment_utils.py | 16 +++ common/random_corpus_fuzzing_utils.py | 101 ++++++++++++++++++ database/models.py | 1 + experiment/dispatcher.py | 11 +- .../runner-startup-script-template.sh | 2 + experiment/run_experiment.py | 3 + experiment/runner.py | 16 ++- experiment/scheduler.py | 12 ++- experiment/test_data/experiment-config.yaml | 1 + .../test_data/local-experiment-config.yaml | 1 + experiment/test_scheduler.py | 2 + 11 files changed, 160 insertions(+), 6 deletions(-) create mode 100644 common/random_corpus_fuzzing_utils.py diff --git a/common/experiment_utils.py b/common/experiment_utils.py index 604d0218f..3911751ac 100644 --- a/common/experiment_utils.py +++ b/common/experiment_utils.py @@ -97,6 +97,17 @@ def get_custom_seed_corpora_filestore_path(): 'custom_seed_corpora') +def get_oss_fuzz_corpora_unarchived_path(): + """Returns path containing the unarchived OSS-Fuzz corpora.""" + return posixpath.join(get_experiment_filestore_path(), + 'oss_fuzz_unarchived') + + +def get_random_corpora_filestore_path(): + """Returns path containing seed corpora for the target fuzzing experiment.""" # pylint: disable=line-too-long + return posixpath.join(get_experiment_filestore_path(), 'random_corpora') + + def get_dispatcher_instance_name(experiment: str) -> str: """Returns a dispatcher instance name for an experiment.""" return f'd-{experiment}' @@ -138,6 +149,11 @@ def is_local_experiment(): return bool(environment.get('LOCAL_EXPERIMENT')) +def is_micro_experiment(): + """Returns True if running a micro experiment.""" + return
bool(environment.get('MICRO_EXPERIMENT')) + + def get_trial_dir(fuzzer, benchmark, trial_id): """Returns the unique directory for |fuzzer|, |benchmark|, and |trial_id|.""" diff --git a/common/random_corpus_fuzzing_utils.py b/common/random_corpus_fuzzing_utils.py new file mode 100644 index 000000000..1bca7561e --- /dev/null +++ b/common/random_corpus_fuzzing_utils.py @@ -0,0 +1,101 @@ +# Copyright 2023 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Utility functions for micro-experiment run.""" + +import random +import os +import tempfile +import multiprocessing +import zipfile +from typing import List + +from common import experiment_utils +from common import filesystem +from common import logs + +MAX_SOURCE_CORPUS_FILES = 1 +CORPUS_ELEMENT_BYTES_LIMIT = 1 * 1024 * 1024 + + +def initialize_random_corpus_fuzzing(benchmarks: List[str], num_trials: int): + """Prepare corpus for micro experiment.""" + pool_args = () + with multiprocessing.Pool(*pool_args) as pool: + pool.starmap(prepare_benchmark_random_corpus, + [(benchmark, num_trials) for benchmark in benchmarks]) + logs.info('Done preparing corpus for micro experiment') + + +# pylint: disable=too-many-locals +def prepare_benchmark_random_corpus(benchmark: str, num_trials: int): + """Prepare corpus for given benchmark.""" + # Temporary location to park corpus files before they get picked randomly.
+ benchmark_unarchived_corpora = os.path.join( + experiment_utils.get_oss_fuzz_corpora_unarchived_path(), benchmark) + filesystem.create_directory(benchmark_unarchived_corpora) + + # Unzip the OSS-Fuzz corpus. + corpus_archive_filename = f'{benchmark}.zip' + oss_fuzz_corpus_archive_path = os.path.join( + experiment_utils.get_oss_fuzz_corpora_filestore_path(), + corpus_archive_filename) + with zipfile.ZipFile(oss_fuzz_corpus_archive_path) as zip_file: + idx = 0 + for seed_corpus_file in zip_file.infolist(): + if seed_corpus_file.filename.endswith('/'): + # Ignore directories. + continue + # Skip files larger than CORPUS_ELEMENT_BYTES_LIMIT. + if seed_corpus_file.file_size > CORPUS_ELEMENT_BYTES_LIMIT: + continue + output_filename = f'{idx:016d}' + output_file_path = os.path.join(benchmark_unarchived_corpora, + output_filename) + zip_file.extract(seed_corpus_file, output_file_path) + idx += 1 + + # Path used to store and feed the seed corpus to the benchmark runner; + # all trials in a trial group get the same seed input(s). + benchmark_random_corpora = os.path.join( + experiment_utils.get_random_corpora_filestore_path(), benchmark) + filesystem.create_directory(benchmark_random_corpora) + + with tempfile.TemporaryDirectory() as tmp_dir: + all_corpus_files = [] + for root, _, files in os.walk(benchmark_unarchived_corpora): + for filename in files: + file_path = os.path.join(root, filename) + all_corpus_files.append(file_path) + + all_corpus_files.sort() + trial_group_num = 0 + # All trials in the same group will start with the same + # set of randomly selected seed files.
+ while trial_group_num < num_trials: + trial_group_subdir = f'trial-group-{trial_group_num}' + custom_corpus_trial_dir = os.path.join(benchmark_random_corpora, + trial_group_subdir) + src_dir = os.path.join(tmp_dir, 'source') + filesystem.recreate_directory(src_dir) + + source_files = random.sample(all_corpus_files, + MAX_SOURCE_CORPUS_FILES) + for file in source_files: + filesystem.copy(file, src_dir) + + # Copy only the src directory. + filesystem.copytree(src_dir, custom_corpus_trial_dir) + trial_group_num += 1 + + return [] diff --git a/database/models.py b/database/models.py index 7cf902397..02bfff7a9 100644 --- a/database/models.py +++ b/database/models.py @@ -54,6 +54,7 @@ class Trial(Base): # Columns used for preemptible experiments. preemptible = Column(Boolean, default=False, nullable=False) preempted = Column(Boolean, default=False, nullable=False) + trial_group_num = Column(Integer, nullable=True) # Every trial has snapshots which is basically the saved state of that trial # at a given time. 
The snapshots field here and the trial field on Snapshot, diff --git a/experiment/dispatcher.py b/experiment/dispatcher.py index 796c796b8..9f442d755 100755 --- a/experiment/dispatcher.py +++ b/experiment/dispatcher.py @@ -24,6 +24,7 @@ import time from typing import List +from common import random_corpus_fuzzing_utils from common import experiment_path as exp_path from common import experiment_utils from common import logs @@ -89,7 +90,7 @@ def _initialize_trials_in_db(trials: List[models.Trial]): db_utils.bulk_save(trials) -class Experiment: +class Experiment: # pylint: disable=too-many-instance-attributes """Class representing an experiment.""" def __init__(self, experiment_config_filepath: str): @@ -101,6 +102,7 @@ def __init__(self, experiment_config_filepath: str): self.experiment_name = self.config['experiment'] self.git_hash = self.config['git_hash'] self.preemptible = self.config.get('preemptible_runners') + self.micro_experiment = self.config.get('micro_experiment') def build_images_for_trials(fuzzers: List[str], benchmarks: List[str], @@ -123,7 +125,8 @@ def build_images_for_trials(fuzzers: List[str], benchmarks: List[str], models.Trial(fuzzer=fuzzer, experiment=experiment_name, benchmark=benchmark, - preemptible=preemptible) for _ in range(num_trials) + preemptible=preemptible, + trial_group_num=trial) for trial in range(num_trials) ] trials.extend(fuzzer_benchmark_trials) return trials @@ -150,6 +153,10 @@ def dispatcher_main(): experiment.preemptible) _initialize_trials_in_db(trials) + if experiment.micro_experiment: + random_corpus_fuzzing_utils.initialize_random_corpus_fuzzing( + experiment.benchmarks, experiment.num_trials) + create_work_subdirs(['experiment-folders', 'measurement-folders']) # Start measurer and scheduler in seperate threads/processes. 
diff --git a/experiment/resources/runner-startup-script-template.sh b/experiment/resources/runner-startup-script-template.sh index 79f84c22c..5ef1e40bf 100644 --- a/experiment/resources/runner-startup-script-template.sh +++ b/experiment/resources/runner-startup-script-template.sh @@ -42,6 +42,8 @@ docker run \ -e BENCHMARK={{benchmark}} \ -e EXPERIMENT={{experiment}} \ -e TRIAL_ID={{trial_id}} \ +-e TRIAL_GROUP_NUM={{trial_group_num}} \ +-e MICRO_EXPERIMENT={{micro_experiment}} \ -e MAX_TOTAL_TIME={{max_total_time}} \ -e SNAPSHOT_PERIOD={{snapshot_period}} \ -e NO_SEEDS={{no_seeds}} \ diff --git a/experiment/run_experiment.py b/experiment/run_experiment.py index 4771dfebf..5f61ad6a8 100644 --- a/experiment/run_experiment.py +++ b/experiment/run_experiment.py @@ -74,6 +74,7 @@ def _set_default_config_values(config: Dict[str, Union[int, str, bool]], config['snapshot_period'] = config.get( 'snapshot_period', experiment_utils.DEFAULT_SNAPSHOT_SECONDS) config['private'] = config.get('private', False) + config['micro_experiment'] = config.get('micro_experiment', False) def _validate_config_parameters( @@ -187,6 +188,8 @@ def read_and_validate_experiment_config(config_filename: str) -> Dict: Requirement(False, int, False, ''), 'runner_memory': Requirement(False, str, False, ''), + 'micro_experiment': + Requirement(False, bool, False, ''), } all_params_valid = _validate_config_parameters(config, config_requirements) diff --git a/experiment/runner.py b/experiment/runner.py index a4efc5b6f..b955ff665 100644 --- a/experiment/runner.py +++ b/experiment/runner.py @@ -101,6 +101,18 @@ def get_clusterfuzz_seed_corpus_path(fuzz_target_path): return seed_corpus_path if os.path.exists(seed_corpus_path) else None +def _unpack_random_corpus(corpus_directory): + shutil.rmtree(corpus_directory) + + benchmark = environment.get('BENCHMARK') + trial_group_num = environment.get('TRIAL_GROUP_NUM', 0) + random_corpora_dir = experiment_utils.get_random_corpora_filestore_path() + 
random_corpora_sub_dir = f'trial-group-{int(trial_group_num)}' + random_corpus_dir = posixpath.join(random_corpora_dir, benchmark, + random_corpora_sub_dir) + filestore_utils.cp(random_corpus_dir, corpus_directory, recursive=True) + + def _copy_custom_seed_corpus(corpus_directory): """Copy custom seed corpus provided by user""" shutil.rmtree(corpus_directory) @@ -257,7 +269,9 @@ def set_up_corpus_directories(self): FUZZ_TARGET_DIR, fuzz_target_name) input_corpus = environment.get('SEED_CORPUS_DIR') os.makedirs(input_corpus, exist_ok=True) - if not environment.get('CUSTOM_SEED_CORPUS_DIR'): + if environment.get('MICRO_EXPERIMENT'): + _unpack_random_corpus(input_corpus) + elif not environment.get('CUSTOM_SEED_CORPUS_DIR'): _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus) else: _copy_custom_seed_corpus(input_corpus) diff --git a/experiment/scheduler.py b/experiment/scheduler.py index 0d9da0b22..b189eabb6 100644 --- a/experiment/scheduler.py +++ b/experiment/scheduler.py @@ -689,7 +689,7 @@ def start_trials(trials, experiment_config: dict, pool, core_allocation=None): return started_trials -class TrialProxy: +class TrialProxy: # pylint: disable=too-many-instance-attributes """A proxy object for a model.Trial. 
TrialProxy's allow these fields to be set and retreived without making any database calls.""" @@ -701,6 +701,7 @@ def __init__(self, trial): self.time_ended = trial.time_ended self.preemptible = trial.preemptible self.cpuset = None + self.trial_group_num = trial.trial_group_num def _initialize_logs(experiment): @@ -729,7 +730,7 @@ def _start_trial(trial: TrialProxy, experiment_config: dict, cpuset=None): logger.info('Start trial %d.', trial.id) started = create_trial_instance(trial.fuzzer, trial.benchmark, trial.id, experiment_config, trial.preemptible, - cpuset) + cpuset, trial.trial_group_num) if started: trial.time_started = datetime_now() trial.cpuset = cpuset @@ -743,6 +744,7 @@ def render_startup_script_template( # pylint: disable=too-many-arguments fuzzer: str, benchmark: str, trial_id: int, + trial_group_num: int, experiment_config: dict, cpuset=None): """Render the startup script using the template and the parameters @@ -760,6 +762,8 @@ def render_startup_script_template( # pylint: disable=too-many-arguments 'experiment': experiment, 'fuzzer': fuzzer, 'trial_id': trial_id, + 'trial_group_num': trial_group_num, + 'micro_experiment': experiment_config['micro_experiment'], 'max_total_time': experiment_config['max_total_time'], 'snapshot_period': experiment_config['snapshot_period'], 'experiment_filestore': experiment_config['experiment_filestore'], @@ -790,13 +794,15 @@ def create_trial_instance( # pylint: disable=too-many-arguments trial_id: int, experiment_config: dict, preemptible: bool, - cpuset=None) -> bool: + cpuset=None, + trial_group_num: int = 0) -> bool: """Create or start a trial instance for a specific trial_id,fuzzer,benchmark.""" instance_name = experiment_utils.get_trial_instance_name( experiment_config['experiment'], trial_id) startup_script = render_startup_script_template(instance_name, fuzzer, benchmark, trial_id, + trial_group_num, experiment_config, cpuset) startup_script_path = f'/tmp/{instance_name}-start-docker.sh' with 
open(startup_script_path, 'w', encoding='utf-8') as file_handle: diff --git a/experiment/test_data/experiment-config.yaml b/experiment/test_data/experiment-config.yaml index 4cecd0e6e..deabcee9a 100644 --- a/experiment/test_data/experiment-config.yaml +++ b/experiment/test_data/experiment-config.yaml @@ -40,3 +40,4 @@ measurers_cpus: null runner_num_cpu_cores: 1 runner_machine_type: 'n1-standard-1' private: false +micro_experiment: false \ No newline at end of file diff --git a/experiment/test_data/local-experiment-config.yaml b/experiment/test_data/local-experiment-config.yaml index adf30ea11..8f57cc739 100644 --- a/experiment/test_data/local-experiment-config.yaml +++ b/experiment/test_data/local-experiment-config.yaml @@ -22,3 +22,4 @@ report_filestore: /tmp/web-reports local_experiment: true benchmarks: "benchmark-1,benchmark-2" git_hash: "git-hash" +micro_experiment: false diff --git a/experiment/test_scheduler.py b/experiment/test_scheduler.py index 7f6f3b5f8..1a5e85e8b 100644 --- a/experiment/test_scheduler.py +++ b/experiment/test_scheduler.py @@ -114,6 +114,8 @@ def test_create_trial_instance(benchmark, expected_image, expected_target, -e BENCHMARK={benchmark} \\ -e EXPERIMENT=test-experiment \\ -e TRIAL_ID=9 \\ +-e TRIAL_GROUP_NUM=0 \\ +-e MICRO_EXPERIMENT=False \\ -e MAX_TOTAL_TIME=86400 \\ -e SNAPSHOT_PERIOD=900 \\ -e NO_SEEDS=False \\