-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_experiments.py
142 lines (110 loc) · 4.58 KB
/
run_experiments.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
#!/usr/bin/env python3
"""Run evaluation pipelines."""
import os
import subprocess
from typing import Optional
def run(args: list[str], check: bool = True) -> subprocess.CompletedProcess:
    """Run the given command and check that it succeeds.

    Args:
        args: Command and arguments as a list (executed without a shell).
        check: If True (default), raise subprocess.CalledProcessError on a
               nonzero exit status.

    Returns:
        The CompletedProcess from subprocess.run, so callers can inspect
        the return code if they want (previously it was discarded).
    """
    return subprocess.run(args, check=check)
def print_done(result_path: str):
    """Report that the experiment whose results live at result_path finished."""
    print('✅ ' + result_path)
def kill_dafny():
    """Forcefully terminate every running Dafny process.

    execute.py seems to leave some of these processes still running
    even after search.py finishes, and they don't let CUDA memory be
    freed up. This is a workaround, though we might want to fix execute.py
    """
    # check=False: killall exits nonzero when no dafny process exists,
    # which is fine here — there is simply nothing to clean up.
    run(['killall', '-9', 'dafny'], check=False)
def run_base_model_experiment(
    n_eval_programs: int,
    base_model: str
):
    """Run an experiment with a base model (no fine-tuning).

    The run is skipped entirely when the result file already exists,
    making the pipeline resumable.

    Args:
        n_eval_programs (int): Number of programs to evaluate on.
        base_model (str): Model string (e.g. 'org/model-name'); the part
                          after the last '/' names the result file.
    """
    model_name = base_model.split('/')[-1]
    # Fixed: os.path.join was being called with a single, already-joined
    # f-string argument — a no-op. Build the path directly instead.
    result_path = f'results/base-{model_name}.json'
    if not os.path.exists(result_path):
        run(['python', 'search.py',
             '--num-programs', str(n_eval_programs),
             '--output', result_path,
             '--model', base_model])
        # NOTE(review): indentation was lost in extraction; kill_dafny is
        # placed inside the branch since cleanup is only needed after
        # search.py actually ran — confirm against the original layout.
        kill_dafny()
    print_done(result_path)
def run_dafnybench_finetuning_experiment(
    n_eval_programs: int,
    base_model: str,
    finetuning_fraction: float,
    include_graph: Optional[str] = None,
):
    """Run an experiment with fine-tuning on DafnyBench + (optionally) synthetic data.

    Resumable: skips fine-tuning when the fine-tuned model already exists,
    and skips the whole run when the result file already exists.

    Args:
        n_eval_programs (int): Number of programs to hold out for the test set.
        base_model (str): Model string.
        finetuning_fraction (float): How much of the training set to use (between 0 and 1).
        include_graph (Optional[str]): If not None, then should be a path to a JSON
                                       representing an edit graph, from which we'll extract
                                       fine-tuning examples.
    """
    DAFNYBENCH_SIZE = 1326  # TODO: get this from the dataset.
    # Partition DafnyBench: the first n_skip programs (eval hold-out plus the
    # unused tail of the training set) are skipped during example extraction.
    available_training_set = DAFNYBENCH_SIZE - n_eval_programs
    used_training_set = int(finetuning_fraction * available_training_set)
    skipped_training_set = available_training_set - used_training_set
    n_skip = skipped_training_set + n_eval_programs
    model_name = base_model.split('/')[-1]
    ft_percent = int(100 * finetuning_fraction)
    suffix = '+graph' if include_graph else ''
    # Fixed: os.path.join was being called with a single, already-joined
    # f-string argument — a no-op. Build the path directly instead.
    result_path = f'results/finetuned-{model_name}-db{ft_percent}{suffix}.json'
    training_set_path = f'data/finetuning_examples_{finetuning_fraction}.json'
    model_path = f'models/finetuned_{model_name}_db{ft_percent}{suffix}'
    if not os.path.exists(result_path):
        if not os.path.exists(model_path):
            # 1- Collect training set
            run(['python', 'training.py',
                 '--extract-direct',
                 '--skip', str(n_skip),
                 '--output', training_set_path,
                 ])
            training_set = [training_set_path]
            # 2- (Optional) if include_graph is provided, also include
            # examples extracted from the graph in the training set.
            if include_graph:
                graph_examples = os.path.splitext(include_graph)[0] + '-examples.json'
                run(['python', 'training.py',
                     '--extract-direct-from-graph',
                     '--graph', include_graph,
                     '--output', graph_examples,
                     ])
                training_set.append(graph_examples)
            # 3- Fine-tune
            run(['python', 'training.py',
                 '--finetune',
                 '--model', base_model,
                 '--training-set', *training_set,
                 '--output', model_path,
                 ])
        # 4- Evaluate
        run(['python', 'search.py',
             '--num-programs', str(n_eval_programs),
             '--output', result_path,
             '--model', model_path,
             ])
        # NOTE(review): indentation was lost in extraction; kill_dafny is
        # placed inside the outer branch (cleanup after search.py) — confirm
        # against the original layout.
        kill_dafny()
    print_done(result_path)
def main():
    """Drive every experiment: base models first, then all fine-tuning combos."""
    N_EVAL_PROGRAMS = 326
    # Make huggingface tokenizers behave well with multiprocessing.
    os.environ['TOKENIZERS_PARALLELISM'] = 'false'
    BASE_MODELS = [
        'meta-llama/Meta-Llama-3.1-8B',
        'meta-llama/CodeLlama-7b-hf',
    ]
    for model in BASE_MODELS:
        run_base_model_experiment(N_EVAL_PROGRAMS, model)
    TRAINING_SET_FRACTIONS = [.25, .5, 1.0]
    for graph_path in [None, 'edit_graph.json']:
        for model in BASE_MODELS:
            for fraction in TRAINING_SET_FRACTIONS:
                run_dafnybench_finetuning_experiment(
                    N_EVAL_PROGRAMS, model, fraction, include_graph=graph_path)


if __name__ == '__main__':
    main()