From 22871154664e87833bee2d3558415fb816555dc7 Mon Sep 17 00:00:00 2001
From: Jiradet Ounjai <jiradet.jd@gmail.com>
Date: Thu, 30 Nov 2023 21:57:23 +0700
Subject: [PATCH] Add GreenBench implementation (#1912)

This PR ports the implementation of
https://github.com/Rigorous-Software-Engineering/greenbench into
fuzzbench. Essentially, we introduce a new experiment option that allows
users to opt in to the micro-benchmark fuzzing described in the GreenBench
paper.
---
 common/experiment_utils.py                    |  16 +++
 common/random_corpus_fuzzing_utils.py         | 101 ++++++++++++++++++
 database/models.py                            |   1 +
 experiment/dispatcher.py                      |  11 +-
 .../runner-startup-script-template.sh         |   2 +
 experiment/run_experiment.py                  |   3 +
 experiment/runner.py                          |  16 ++-
 experiment/scheduler.py                       |  12 ++-
 experiment/test_data/experiment-config.yaml   |   1 +
 .../test_data/local-experiment-config.yaml    |   1 +
 experiment/test_scheduler.py                  |   2 +
 11 files changed, 160 insertions(+), 6 deletions(-)
 create mode 100644 common/random_corpus_fuzzing_utils.py

diff --git a/common/experiment_utils.py b/common/experiment_utils.py
index 604d0218f..3911751ac 100644
--- a/common/experiment_utils.py
+++ b/common/experiment_utils.py
@@ -97,6 +97,17 @@ def get_custom_seed_corpora_filestore_path():
                           'custom_seed_corpora')
 
 
+def get_oss_fuzz_corpora_unarchived_path():
+    """Returns path containing the unarchived OSS-Fuzz corpora."""
+    return posixpath.join(get_experiment_filestore_path(),
+                          'oss_fuzz_unarchived')
+
+
+def get_random_corpora_filestore_path():
+    """Returns path containing the randomly selected seed corpora."""
+    return posixpath.join(get_experiment_filestore_path(), 'random_corpora')
+
+
 def get_dispatcher_instance_name(experiment: str) -> str:
     """Returns a dispatcher instance name for an experiment."""
     return f'd-{experiment}'
@@ -138,6 +149,11 @@ def is_local_experiment():
     return bool(environment.get('LOCAL_EXPERIMENT'))
 
 
+def is_micro_experiment():
+    """Returns True if MICRO_EXPERIMENT is truthy in the environment."""
+    return bool(environment.get('MICRO_EXPERIMENT'))
+
+
 def get_trial_dir(fuzzer, benchmark, trial_id):
     """Returns the unique directory for |fuzzer|, |benchmark|, and
     |trial_id|."""
diff --git a/common/random_corpus_fuzzing_utils.py b/common/random_corpus_fuzzing_utils.py
new file mode 100644
index 000000000..1bca7561e
--- /dev/null
+++ b/common/random_corpus_fuzzing_utils.py
@@ -0,0 +1,101 @@
+# Copyright 2023 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utility functions for micro-experiment run."""
+
+import random
+import os
+import tempfile
+import multiprocessing
+import zipfile
+from typing import List
+
+from common import experiment_utils
+from common import filesystem
+from common import logs
+
+MAX_SOURCE_CORPUS_FILES = 1
+CORPUS_ELEMENT_BYTES_LIMIT = 1 * 1024 * 1024
+
+
+def initialize_random_corpus_fuzzing(benchmarks: List[str], num_trials: int):
+    """Prepares the random seed corpora of every benchmark in |benchmarks|."""
+    # One task per benchmark, spread over the pool's worker processes.
+    tasks = [(benchmark, num_trials) for benchmark in benchmarks]
+    with multiprocessing.Pool() as pool:
+        pool.starmap(prepare_benchmark_random_corpus, tasks)
+        logs.info('Done preparing corpus for micro experiment')
+
+
+# pylint: disable=too-many-locals
+def prepare_benchmark_random_corpus(benchmark: str, num_trials: int):
+    """Prepares per-trial-group seed corpora for |benchmark|.
+
+    Unpacks the benchmark's OSS-Fuzz corpus archive, skipping entries larger
+    than CORPUS_ELEMENT_BYTES_LIMIT, then for each of the |num_trials| trial
+    groups copies up to MAX_SOURCE_CORPUS_FILES randomly chosen files into
+    that group's directory under the random corpora path. Returns an empty
+    list.
+    """
+    # Temporary location to park corpus files before they get picked randomly.
+    benchmark_unarchived_corpora = os.path.join(
+        experiment_utils.get_oss_fuzz_corpora_unarchived_path(), benchmark)
+    filesystem.create_directory(benchmark_unarchived_corpora)
+
+    # Unzip oss fuzz corpus.
+    oss_fuzz_corpus_archive_path = os.path.join(
+        experiment_utils.get_oss_fuzz_corpora_filestore_path(),
+        f'{benchmark}.zip')
+    with zipfile.ZipFile(oss_fuzz_corpus_archive_path) as zip_file:
+        # Skip directory entries and files over the per-element size limit.
+        seed_corpus_files = [
+            info for info in zip_file.infolist()
+            if not info.filename.endswith('/') and
+            info.file_size <= CORPUS_ELEMENT_BYTES_LIMIT
+        ]
+        for idx, seed_corpus_file in enumerate(seed_corpus_files):
+            # Each entry is extracted beneath its own numbered directory, so
+            # entries with the same name cannot overwrite each other.
+            output_dir = os.path.join(benchmark_unarchived_corpora,
+                                      f'{idx:016d}')
+            zip_file.extract(seed_corpus_file, output_dir)
+
+    # Path used to store and feed seed corpus for benchmark runner
+    # each trial group will have the same seed input(s).
+    benchmark_random_corpora = os.path.join(
+        experiment_utils.get_random_corpora_filestore_path(), benchmark)
+    filesystem.create_directory(benchmark_random_corpora)
+
+    # Sorted so the candidate order does not depend on os.walk's ordering.
+    all_corpus_files = sorted(
+        os.path.join(root, filename)
+        for root, _, files in os.walk(benchmark_unarchived_corpora)
+        for filename in files)
+    # MAX_SOURCE_CORPUS_FILES is an upper bound: random.sample raises
+    # ValueError if asked for more files than the corpus contains.
+    num_source_files = min(MAX_SOURCE_CORPUS_FILES, len(all_corpus_files))
+
+    with tempfile.TemporaryDirectory() as tmp_dir:
+        src_dir = os.path.join(tmp_dir, 'source')
+        # All trials in the same group will start with the same
+        # set of randomly selected seed files.
+        for trial_group_num in range(num_trials):
+            custom_corpus_trial_dir = os.path.join(
+                benchmark_random_corpora, f'trial-group-{trial_group_num}')
+            filesystem.recreate_directory(src_dir)
+            for file in random.sample(all_corpus_files, num_source_files):
+                filesystem.copy(file, src_dir)
+            # Copy only the src directory.
+            filesystem.copytree(src_dir, custom_corpus_trial_dir)
+
+    return []
diff --git a/database/models.py b/database/models.py
index 7cf902397..02bfff7a9 100644
--- a/database/models.py
+++ b/database/models.py
@@ -54,6 +54,7 @@ class Trial(Base):
     # Columns used for preemptible experiments.
     preemptible = Column(Boolean, default=False, nullable=False)
     preempted = Column(Boolean, default=False, nullable=False)
+    trial_group_num = Column(Integer, nullable=True)
 
     # Every trial has snapshots which is basically the saved state of that trial
     # at a given time. The snapshots field here and the trial field on Snapshot,
diff --git a/experiment/dispatcher.py b/experiment/dispatcher.py
index 796c796b8..9f442d755 100755
--- a/experiment/dispatcher.py
+++ b/experiment/dispatcher.py
@@ -24,6 +24,7 @@
 import time
 from typing import List
 
+from common import random_corpus_fuzzing_utils
 from common import experiment_path as exp_path
 from common import experiment_utils
 from common import logs
@@ -89,7 +90,7 @@ def _initialize_trials_in_db(trials: List[models.Trial]):
     db_utils.bulk_save(trials)
 
 
-class Experiment:
+class Experiment:  # pylint: disable=too-many-instance-attributes
     """Class representing an experiment."""
 
     def __init__(self, experiment_config_filepath: str):
@@ -101,6 +102,7 @@ def __init__(self, experiment_config_filepath: str):
         self.experiment_name = self.config['experiment']
         self.git_hash = self.config['git_hash']
         self.preemptible = self.config.get('preemptible_runners')
+        self.micro_experiment = self.config.get('micro_experiment')
 
 
 def build_images_for_trials(fuzzers: List[str], benchmarks: List[str],
@@ -123,7 +125,8 @@ def build_images_for_trials(fuzzers: List[str], benchmarks: List[str],
             models.Trial(fuzzer=fuzzer,
                          experiment=experiment_name,
                          benchmark=benchmark,
-                         preemptible=preemptible) for _ in range(num_trials)
+                         preemptible=preemptible,
+                         trial_group_num=trial) for trial in range(num_trials)
         ]
         trials.extend(fuzzer_benchmark_trials)
     return trials
@@ -150,6 +153,10 @@ def dispatcher_main():
                                      experiment.preemptible)
     _initialize_trials_in_db(trials)
 
+    if experiment.micro_experiment:
+        random_corpus_fuzzing_utils.initialize_random_corpus_fuzzing(
+            experiment.benchmarks, experiment.num_trials)
+
     create_work_subdirs(['experiment-folders', 'measurement-folders'])
 
     # Start measurer and scheduler in seperate threads/processes.
diff --git a/experiment/resources/runner-startup-script-template.sh b/experiment/resources/runner-startup-script-template.sh
index 79f84c22c..5ef1e40bf 100644
--- a/experiment/resources/runner-startup-script-template.sh
+++ b/experiment/resources/runner-startup-script-template.sh
@@ -42,6 +42,8 @@ docker run \
 -e BENCHMARK={{benchmark}} \
 -e EXPERIMENT={{experiment}} \
 -e TRIAL_ID={{trial_id}} \
+-e TRIAL_GROUP_NUM={{trial_group_num}} \
+-e MICRO_EXPERIMENT={{micro_experiment}} \
 -e MAX_TOTAL_TIME={{max_total_time}} \
 -e SNAPSHOT_PERIOD={{snapshot_period}} \
 -e NO_SEEDS={{no_seeds}} \
diff --git a/experiment/run_experiment.py b/experiment/run_experiment.py
index 4771dfebf..5f61ad6a8 100644
--- a/experiment/run_experiment.py
+++ b/experiment/run_experiment.py
@@ -74,6 +74,7 @@ def _set_default_config_values(config: Dict[str, Union[int, str, bool]],
     config['snapshot_period'] = config.get(
         'snapshot_period', experiment_utils.DEFAULT_SNAPSHOT_SECONDS)
     config['private'] = config.get('private', False)
+    config['micro_experiment'] = config.get('micro_experiment', False)
 
 
 def _validate_config_parameters(
@@ -187,6 +188,8 @@ def read_and_validate_experiment_config(config_filename: str) -> Dict:
             Requirement(False, int, False, ''),
         'runner_memory':
             Requirement(False, str, False, ''),
+        'micro_experiment':
+            Requirement(False, bool, False, ''),
     }
 
     all_params_valid = _validate_config_parameters(config, config_requirements)
diff --git a/experiment/runner.py b/experiment/runner.py
index a4efc5b6f..b955ff665 100644
--- a/experiment/runner.py
+++ b/experiment/runner.py
@@ -101,6 +101,18 @@ def get_clusterfuzz_seed_corpus_path(fuzz_target_path):
     return seed_corpus_path if os.path.exists(seed_corpus_path) else None
 
 
+def _unpack_random_corpus(corpus_directory):
+    """Replace |corpus_directory| with this trial group's random corpus."""
+    shutil.rmtree(corpus_directory)
+
+    benchmark = environment.get('BENCHMARK')
+    trial_group_num = environment.get('TRIAL_GROUP_NUM', 0)
+    random_corpus_dir = posixpath.join(
+        experiment_utils.get_random_corpora_filestore_path(), benchmark,
+        f'trial-group-{int(trial_group_num)}')
+    filestore_utils.cp(random_corpus_dir, corpus_directory, recursive=True)
+
+
 def _copy_custom_seed_corpus(corpus_directory):
     """Copy custom seed corpus provided by user"""
     shutil.rmtree(corpus_directory)
@@ -257,7 +269,9 @@ def set_up_corpus_directories(self):
             FUZZ_TARGET_DIR, fuzz_target_name)
         input_corpus = environment.get('SEED_CORPUS_DIR')
         os.makedirs(input_corpus, exist_ok=True)
-        if not environment.get('CUSTOM_SEED_CORPUS_DIR'):
+        if environment.get('MICRO_EXPERIMENT'):
+            _unpack_random_corpus(input_corpus)
+        elif not environment.get('CUSTOM_SEED_CORPUS_DIR'):
             _unpack_clusterfuzz_seed_corpus(target_binary, input_corpus)
         else:
             _copy_custom_seed_corpus(input_corpus)
diff --git a/experiment/scheduler.py b/experiment/scheduler.py
index 0d9da0b22..b189eabb6 100644
--- a/experiment/scheduler.py
+++ b/experiment/scheduler.py
@@ -689,7 +689,7 @@ def start_trials(trials, experiment_config: dict, pool, core_allocation=None):
     return started_trials
 
 
-class TrialProxy:
+class TrialProxy:  # pylint: disable=too-many-instance-attributes
     """A proxy object for a model.Trial. TrialProxy's allow these fields to be
     set and retreived without making any database calls."""
 
@@ -701,6 +701,7 @@ def __init__(self, trial):
         self.time_ended = trial.time_ended
         self.preemptible = trial.preemptible
         self.cpuset = None
+        self.trial_group_num = trial.trial_group_num
 
 
 def _initialize_logs(experiment):
@@ -729,7 +730,7 @@ def _start_trial(trial: TrialProxy, experiment_config: dict, cpuset=None):
     logger.info('Start trial %d.', trial.id)
     started = create_trial_instance(trial.fuzzer, trial.benchmark, trial.id,
                                     experiment_config, trial.preemptible,
-                                    cpuset)
+                                    cpuset, trial.trial_group_num)
     if started:
         trial.time_started = datetime_now()
         trial.cpuset = cpuset
@@ -743,6 +744,7 @@ def render_startup_script_template(  # pylint: disable=too-many-arguments
         fuzzer: str,
         benchmark: str,
         trial_id: int,
+        trial_group_num: int,
         experiment_config: dict,
         cpuset=None):
     """Render the startup script using the template and the parameters
@@ -760,6 +762,8 @@ def render_startup_script_template(  # pylint: disable=too-many-arguments
         'experiment': experiment,
         'fuzzer': fuzzer,
         'trial_id': trial_id,
+        'trial_group_num': trial_group_num,
+        'micro_experiment': experiment_config['micro_experiment'],
         'max_total_time': experiment_config['max_total_time'],
         'snapshot_period': experiment_config['snapshot_period'],
         'experiment_filestore': experiment_config['experiment_filestore'],
@@ -790,13 +794,15 @@ def create_trial_instance(  # pylint: disable=too-many-arguments
         trial_id: int,
         experiment_config: dict,
         preemptible: bool,
-        cpuset=None) -> bool:
+        cpuset=None,
+        trial_group_num: int = 0) -> bool:
     """Create or start a trial instance for a specific
     trial_id,fuzzer,benchmark."""
     instance_name = experiment_utils.get_trial_instance_name(
         experiment_config['experiment'], trial_id)
     startup_script = render_startup_script_template(instance_name, fuzzer,
                                                     benchmark, trial_id,
+                                                    trial_group_num,
                                                     experiment_config, cpuset)
     startup_script_path = f'/tmp/{instance_name}-start-docker.sh'
     with open(startup_script_path, 'w', encoding='utf-8') as file_handle:
diff --git a/experiment/test_data/experiment-config.yaml b/experiment/test_data/experiment-config.yaml
index 4cecd0e6e..deabcee9a 100644
--- a/experiment/test_data/experiment-config.yaml
+++ b/experiment/test_data/experiment-config.yaml
@@ -40,3 +40,4 @@ measurers_cpus: null
 runner_num_cpu_cores: 1
 runner_machine_type: 'n1-standard-1'
 private: false
+micro_experiment: false
\ No newline at end of file
diff --git a/experiment/test_data/local-experiment-config.yaml b/experiment/test_data/local-experiment-config.yaml
index adf30ea11..8f57cc739 100644
--- a/experiment/test_data/local-experiment-config.yaml
+++ b/experiment/test_data/local-experiment-config.yaml
@@ -22,3 +22,4 @@ report_filestore: /tmp/web-reports
 local_experiment: true
 benchmarks: "benchmark-1,benchmark-2"
 git_hash: "git-hash"
+micro_experiment: false
diff --git a/experiment/test_scheduler.py b/experiment/test_scheduler.py
index 7f6f3b5f8..1a5e85e8b 100644
--- a/experiment/test_scheduler.py
+++ b/experiment/test_scheduler.py
@@ -114,6 +114,8 @@ def test_create_trial_instance(benchmark, expected_image, expected_target,
 -e BENCHMARK={benchmark} \\
 -e EXPERIMENT=test-experiment \\
 -e TRIAL_ID=9 \\
+-e TRIAL_GROUP_NUM=0 \\
+-e MICRO_EXPERIMENT=False \\
 -e MAX_TOTAL_TIME=86400 \\
 -e SNAPSHOT_PERIOD=900 \\
 -e NO_SEEDS=False \\