Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add GreenBench implementation #1912

Merged
merged 18 commits into from
Nov 30, 2023
Merged
16 changes: 16 additions & 0 deletions common/experiment_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,17 @@ def get_custom_seed_corpora_filestore_path():
'custom_seed_corpora')


def get_oss_fuzz_corpora_unarchived_path():
    """Returns path of the directory holding the unarchived OSS-Fuzz corpora.

    This is the 'oss_fuzz_unarchived' directory under the experiment
    filestore path.
    """
    return posixpath.join(get_experiment_filestore_path(),
                          'oss_fuzz_unarchived')


def get_random_corpora_filestore_path():
    """Returns the 'random_corpora' path under the experiment filestore."""
    filestore_path = get_experiment_filestore_path()
    return posixpath.join(filestore_path, 'random_corpora')


def get_dispatcher_instance_name(experiment: str) -> str:
    """Builds the dispatcher instance name for |experiment|.

    The name is the experiment name prefixed with 'd-'.
    """
    instance_name = f'd-{experiment}'
    return instance_name
Expand Down Expand Up @@ -138,6 +149,11 @@ def is_local_experiment():
return bool(environment.get('LOCAL_EXPERIMENT'))


def is_micro_experiment():
    """Returns True if the MICRO_EXPERIMENT environment value is truthy."""
    micro_experiment = environment.get('MICRO_EXPERIMENT')
    return bool(micro_experiment)


def get_trial_dir(fuzzer, benchmark, trial_id):
"""Returns the unique directory for |fuzzer|, |benchmark|, and
|trial_id|."""
Expand Down
101 changes: 101 additions & 0 deletions common/random_corpus_fuzzing_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Utility functions for micro-experiment run."""

import random
import os
import tempfile
import multiprocessing
import zipfile
from typing import List

from common import experiment_utils
from common import filesystem
from common import logs

# Number of corpus files randomly picked as seeds for each trial group.
MAX_SOURCE_CORPUS_FILES = 1
# Archive members larger than this many bytes (1 MiB) are not unpacked.
CORPUS_ELEMENT_BYTES_LIMIT = 1 * 1024 * 1024


def initialize_random_corpus_fuzzing(benchmarks: List[str], num_trials: int):
    """Prepares the random seed corpora of every benchmark in |benchmarks|.

    Runs prepare_benchmark_random_corpus(benchmark, num_trials) for each
    benchmark on a multiprocessing pool.
    """
    starmap_args = [(benchmark, num_trials) for benchmark in benchmarks]
    with multiprocessing.Pool() as pool:
        pool.starmap(prepare_benchmark_random_corpus, starmap_args)
        logs.info('Done preparing corpus for micro experiment')


# pylint: disable=too-many-locals
def prepare_benchmark_random_corpus(benchmark: str, num_trials: int):
    """Prepares the random seed corpora of |benchmark| for every trial group.

    Unpacks the benchmark's OSS-Fuzz corpus archive ('<benchmark>.zip'), then,
    for each of the |num_trials| trial groups, copies up to
    MAX_SOURCE_CORPUS_FILES randomly chosen corpus files into that group's
    'trial-group-<N>' directory under the random corpora path.

    Directories and archive members larger than CORPUS_ELEMENT_BYTES_LIMIT
    are not unpacked. If fewer than MAX_SOURCE_CORPUS_FILES files are
    available, every trial group gets as many files as exist (possibly none).

    Returns an empty list.
    """
    # Temporary location to park corpus files before they get picked randomly.
    benchmark_unarchived_corpora = os.path.join(
        experiment_utils.get_oss_fuzz_corpora_unarchived_path(), benchmark)
    filesystem.create_directory(benchmark_unarchived_corpora)

    # Unzip the OSS-Fuzz corpus.
    oss_fuzz_corpus_archive_path = os.path.join(
        experiment_utils.get_oss_fuzz_corpora_filestore_path(),
        f'{benchmark}.zip')
    with zipfile.ZipFile(oss_fuzz_corpus_archive_path) as zip_file:
        idx = 0
        for seed_corpus_file in zip_file.infolist():
            if seed_corpus_file.filename.endswith('/'):
                # Ignore directories.
                continue
            if seed_corpus_file.file_size > CORPUS_ELEMENT_BYTES_LIMIT:
                # Skip files that are too large to be used as seeds.
                continue
            # ZipFile.extract() treats its second argument as a directory, so
            # each member lands in its own numbered directory. The recursive
            # walk below collects the files from those directories.
            output_file_path = os.path.join(benchmark_unarchived_corpora,
                                            f'{idx:016d}')
            zip_file.extract(seed_corpus_file, output_file_path)
            idx += 1

    # Path used to store and feed the seed corpus to the benchmark runner.
    # Every trial in the same trial group gets the same seed input(s).
    benchmark_random_corpora = os.path.join(
        experiment_utils.get_random_corpora_filestore_path(), benchmark)
    filesystem.create_directory(benchmark_random_corpora)

    # Sorted so the population order does not depend on directory walk order.
    all_corpus_files = sorted(
        os.path.join(root, filename)
        for root, _, files in os.walk(benchmark_unarchived_corpora)
        for filename in files)

    # random.sample() raises ValueError when asked for more elements than the
    # population holds, so never request more files than were unpacked.
    sample_size = min(MAX_SOURCE_CORPUS_FILES, len(all_corpus_files))
    if sample_size < MAX_SOURCE_CORPUS_FILES:
        logs.info(f'Only {len(all_corpus_files)} corpus file(s) available for '
                  f'benchmark {benchmark}; using {sample_size} per trial '
                  'group.')

    with tempfile.TemporaryDirectory() as tmp_dir:
        src_dir = os.path.join(tmp_dir, 'source')
        # All trials in the same group will start with the same set of
        # randomly selected seed files.
        for trial_group_num in range(num_trials):
            custom_corpus_trial_dir = os.path.join(
                benchmark_random_corpora, f'trial-group-{trial_group_num}')
            filesystem.recreate_directory(src_dir)

            for file_path in random.sample(all_corpus_files, sample_size):
                filesystem.copy(file_path, src_dir)

            # Copy only the src directory.
            filesystem.copytree(src_dir, custom_corpus_trial_dir)

    return []
1 change: 1 addition & 0 deletions database/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class Trial(Base):
# Columns used for preemptible experiments.
preemptible = Column(Boolean, default=False, nullable=False)
preempted = Column(Boolean, default=False, nullable=False)
trial_group_num = Column(Integer, nullable=True)

# Every trial has snapshots which is basically the saved state of that trial
# at a given time. The snapshots field here and the trial field on Snapshot,
Expand Down
11 changes: 9 additions & 2 deletions experiment/dispatcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import time
from typing import List

from common import random_corpus_fuzzing_utils
from common import experiment_path as exp_path
from common import experiment_utils
from common import logs
Expand Down Expand Up @@ -89,7 +90,7 @@ def _initialize_trials_in_db(trials: List[models.Trial]):
db_utils.bulk_save(trials)


class Experiment:
class Experiment: # pylint: disable=too-many-instance-attributes
"""Class representing an experiment."""

def __init__(self, experiment_config_filepath: str):
Expand All @@ -101,6 +102,7 @@ def __init__(self, experiment_config_filepath: str):
self.experiment_name = self.config['experiment']
self.git_hash = self.config['git_hash']
self.preemptible = self.config.get('preemptible_runners')
self.micro_experiment = self.config.get('micro_experiment')


def build_images_for_trials(fuzzers: List[str], benchmarks: List[str],
Expand All @@ -123,7 +125,8 @@ def build_images_for_trials(fuzzers: List[str], benchmarks: List[str],
models.Trial(fuzzer=fuzzer,
experiment=experiment_name,
benchmark=benchmark,
preemptible=preemptible) for _ in range(num_trials)
preemptible=preemptible,
trial_group_num=trial) for trial in range(num_trials)
]
trials.extend(fuzzer_benchmark_trials)
return trials
Expand All @@ -150,6 +153,10 @@ def dispatcher_main():
experiment.preemptible)
_initialize_trials_in_db(trials)

if experiment.micro_experiment:
random_corpus_fuzzing_utils.initialize_random_corpus_fuzzing(
experiment.benchmarks, experiment.num_trials)

create_work_subdirs(['experiment-folders', 'measurement-folders'])

# Start measurer and scheduler in separate threads/processes.
Expand Down
2 changes: 2 additions & 0 deletions experiment/resources/runner-startup-script-template.sh
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ docker run \
-e BENCHMARK={{benchmark}} \
-e EXPERIMENT={{experiment}} \
-e TRIAL_ID={{trial_id}} \
-e TRIAL_GROUP_NUM={{trial_group_num}} \
-e MICRO_EXPERIMENT={{micro_experiment}} \
-e MAX_TOTAL_TIME={{max_total_time}} \
-e SNAPSHOT_PERIOD={{snapshot_period}} \
-e NO_SEEDS={{no_seeds}} \
Expand Down
3 changes: 3 additions & 0 deletions experiment/run_experiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,7 @@ def _set_default_config_values(config: Dict[str, Union[int, str, bool]],
config['snapshot_period'] = config.get(
'snapshot_period', experiment_utils.DEFAULT_SNAPSHOT_SECONDS)
config['private'] = config.get('private', False)
config['micro_experiment'] = config.get('micro_experiment', False)


def _validate_config_parameters(
Expand Down Expand Up @@ -187,6 +188,8 @@ def read_and_validate_experiment_config(config_filename: str) -> Dict:
Requirement(False, int, False, ''),
'runner_memory':
Requirement(False, str, False, ''),
'micro_experiment':
Requirement(False, bool, False, ''),
}

all_params_valid = _validate_config_parameters(config, config_requirements)
Expand Down
16 changes: 15 additions & 1 deletion experiment/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,18 @@ def get_clusterfuzz_seed_corpus_path(fuzz_target_path):
return seed_corpus_path if os.path.exists(seed_corpus_path) else None


def _unpack_random_corpus(corpus_directory):
    """Removes |corpus_directory| and copies the trial group's corpus there.

    The source is the 'trial-group-<TRIAL_GROUP_NUM>' directory of the
    benchmark named by the BENCHMARK environment value, under the random
    corpora filestore path. TRIAL_GROUP_NUM defaults to 0 when unset.
    """
    shutil.rmtree(corpus_directory)

    benchmark = environment.get('BENCHMARK')
    group_num = environment.get('TRIAL_GROUP_NUM', 0)
    source_dir = posixpath.join(
        experiment_utils.get_random_corpora_filestore_path(), benchmark,
        f'trial-group-{int(group_num)}')
    filestore_utils.cp(source_dir, corpus_directory, recursive=True)


def _copy_custom_seed_corpus(corpus_directory):
"""Copy custom seed corpus provided by user"""
shutil.rmtree(corpus_directory)
Expand Down Expand Up @@ -257,7 +269,9 @@ def set_up_corpus_directories(self):
FUZZ_TARGET_DIR, fuzz_target_name)
input_corpus = environment.get('SEED_CORPUS_DIR')
os.makedirs(input_corpus, exist_ok=True)
if not environment.get('CUSTOM_SEED_CORPUS_DIR'):
if environment.get('MICRO_EXPERIMENT'):
_unpack_random_corpus(input_corpus)
elif not environment.get('CUSTOM_SEED_CORPUS_DIR'):
_unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
else:
_copy_custom_seed_corpus(input_corpus)
Expand Down
12 changes: 9 additions & 3 deletions experiment/scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -689,7 +689,7 @@ def start_trials(trials, experiment_config: dict, pool, core_allocation=None):
return started_trials


class TrialProxy:
class TrialProxy: # pylint: disable=too-many-instance-attributes
"""A proxy object for a model.Trial. TrialProxy's allow these fields to be
set and retrieved without making any database calls."""

Expand All @@ -701,6 +701,7 @@ def __init__(self, trial):
self.time_ended = trial.time_ended
self.preemptible = trial.preemptible
self.cpuset = None
self.trial_group_num = trial.trial_group_num
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this has been plumbed through properly but I'm not 100% sure (no reason to think it's wrong). Sorry this setup isn't great.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @jonathanmetzman, the trial_group_num field is used as an identifier for a group of trials; trial grouping is used to make sure that we provide the same set of seeds to certain trials (those within the same group).

For example, if we set trials to 2 in the experiment's config.yaml, we have the following trials in total (for fuzzers afl and libfuzzer and benchmark libjpeg):

  • trial_id: 1, benchmark: libjpeg, fuzzer: afl
  • trial_id: 2, benchmark: libjpeg, fuzzer: afl
  • trial_id: 3, benchmark: libjpeg, fuzzer: libfuzzer
  • trial_id: 4, benchmark: libjpeg, fuzzer: libfuzzer

So fuzzbench will generate the following groups (synthetically) and assign the same set of initial seeds to trials within the same group:

  • trial_id: 1, benchmark: libjpeg, fuzzer: afl --> Group A
  • trial_id: 2, benchmark: libjpeg, fuzzer: afl --> Group B
  • trial_id: 3, benchmark: libjpeg, fuzzer: libfuzzer --> Group A
  • trial_id: 4, benchmark: libjpeg, fuzzer: libfuzzer --> Group B

Please let me know if this makes sense. Also, is there a way to add/inject this trial_group_num field only when we are running in MICRO_EXPERIMENT mode?



def _initialize_logs(experiment):
Expand Down Expand Up @@ -729,7 +730,7 @@ def _start_trial(trial: TrialProxy, experiment_config: dict, cpuset=None):
logger.info('Start trial %d.', trial.id)
started = create_trial_instance(trial.fuzzer, trial.benchmark, trial.id,
experiment_config, trial.preemptible,
cpuset)
cpuset, trial.trial_group_num)
if started:
trial.time_started = datetime_now()
trial.cpuset = cpuset
Expand All @@ -743,6 +744,7 @@ def render_startup_script_template( # pylint: disable=too-many-arguments
fuzzer: str,
benchmark: str,
trial_id: int,
trial_group_num: int,
experiment_config: dict,
cpuset=None):
"""Render the startup script using the template and the parameters
Expand All @@ -760,6 +762,8 @@ def render_startup_script_template( # pylint: disable=too-many-arguments
'experiment': experiment,
'fuzzer': fuzzer,
'trial_id': trial_id,
'trial_group_num': trial_group_num,
'micro_experiment': experiment_config['micro_experiment'],
'max_total_time': experiment_config['max_total_time'],
'snapshot_period': experiment_config['snapshot_period'],
'experiment_filestore': experiment_config['experiment_filestore'],
Expand Down Expand Up @@ -790,13 +794,15 @@ def create_trial_instance( # pylint: disable=too-many-arguments
trial_id: int,
experiment_config: dict,
preemptible: bool,
cpuset=None) -> bool:
cpuset=None,
trial_group_num: int = 0) -> bool:
"""Create or start a trial instance for a specific
trial_id,fuzzer,benchmark."""
instance_name = experiment_utils.get_trial_instance_name(
experiment_config['experiment'], trial_id)
startup_script = render_startup_script_template(instance_name, fuzzer,
benchmark, trial_id,
trial_group_num,
experiment_config, cpuset)
startup_script_path = f'/tmp/{instance_name}-start-docker.sh'
with open(startup_script_path, 'w', encoding='utf-8') as file_handle:
Expand Down
1 change: 1 addition & 0 deletions experiment/test_data/experiment-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,4 @@ measurers_cpus: null
runner_num_cpu_cores: 1
runner_machine_type: 'n1-standard-1'
private: false
micro_experiment: false
1 change: 1 addition & 0 deletions experiment/test_data/local-experiment-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,4 @@ report_filestore: /tmp/web-reports
local_experiment: true
benchmarks: "benchmark-1,benchmark-2"
git_hash: "git-hash"
micro_experiment: false
2 changes: 2 additions & 0 deletions experiment/test_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,8 @@ def test_create_trial_instance(benchmark, expected_image, expected_target,
-e BENCHMARK={benchmark} \\
-e EXPERIMENT=test-experiment \\
-e TRIAL_ID=9 \\
-e TRIAL_GROUP_NUM=0 \\
-e MICRO_EXPERIMENT=False \\
-e MAX_TOTAL_TIME=86400 \\
-e SNAPSHOT_PERIOD=900 \\
-e NO_SEEDS=False \\
Expand Down
Loading