workflow-run-replay-verify.yaml
name: "*run replay-verify reusable workflow"
on:
# This allows the workflow to be triggered from another workflow
workflow_call:
inputs:
GIT_SHA:
required: true
type: string
description: The git SHA1 to test.
# replay-verify config
BUCKET:
required: true
type: string
        description: The bucket to use for the backup.
SUB_DIR:
required: true
type: string
        description: The subdirectory to use for the backup.
HISTORY_START:
required: true
type: string
        description: The history start version to use for the backup.
TXNS_TO_SKIP:
required: false
type: string
description: The list of transaction versions to skip. If not specified, it will use the default list.
RANGES_TO_SKIP:
required: false
type: string
        description: The optional list of transaction ranges to skip.
BACKUP_CONFIG_TEMPLATE_PATH:
description: "The path to the backup config template to use."
type: string
required: true
# GHA job config
RUNS_ON:
description: "The runner to use for the job."
type: string
required: true
default: "high-perf-docker-with-local-ssd"
TIMEOUT_MINUTES:
description: "Github job timeout in minutes"
type: number
required: true
default: 180
MAX_VERSIONS_PER_RANGE:
description: "The maximum number of versions to process in a single job."
type: number
required: true
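    # Example caller (illustrative only; the bucket, config path and sizes below are
    # hypothetical placeholders, not values taken from any real caller workflow):
    #
    #   jobs:
    #     replay-verify:
    #       uses: ./.github/workflows/workflow-run-replay-verify.yaml
    #       secrets: inherit
    #       with:
    #         GIT_SHA: ${{ github.sha }}
    #         BUCKET: example-backup-bucket
    #         SUB_DIR: e1
    #         HISTORY_START: "0"
    #         BACKUP_CONFIG_TEMPLATE_PATH: path/to/backup-config.yaml
    #         RUNS_ON: high-perf-docker-with-local-ssd
    #         TIMEOUT_MINUTES: 180
    #         MAX_VERSIONS_PER_RANGE: 2000000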
# This allows the workflow to be triggered manually from the Github UI or CLI
  # NOTE: because the "number" type is not supported for dispatch inputs, TIMEOUT_MINUTES is omitted here and the job falls back to its 180-minute default
workflow_dispatch:
inputs:
GIT_SHA:
required: true
type: string
description: The git SHA1 to test.
# replay-verify config
BUCKET:
required: true
type: string
        description: The bucket to use for the backup.
SUB_DIR:
required: true
type: string
        description: The subdirectory to use for the backup.
HISTORY_START:
required: true
type: string
        description: The history start version to use for the backup.
TXNS_TO_SKIP:
required: false
type: string
description: The list of transaction versions to skip. If not specified, it will use the default list.
RANGES_TO_SKIP:
required: false
type: string
        description: The optional list of transaction ranges to skip.
BACKUP_CONFIG_TEMPLATE_PATH:
description: "The path to the backup config template to use."
type: string
required: true
# GHA job config
RUNS_ON:
description: "The runner to use for the job."
type: string
required: true
default: "high-perf-docker-with-local-ssd"
MAX_VERSIONS_PER_RANGE:
description: "The maximum number of versions to process in a single job."
type: number
required: true
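    # Manual trigger example via the GitHub CLI (all values are hypothetical placeholders):
    #
    #   gh workflow run workflow-run-replay-verify.yaml \
    #     -f GIT_SHA=<sha> \
    #     -f BUCKET=example-backup-bucket \
    #     -f SUB_DIR=e1 \
    #     -f HISTORY_START=0 \
    #     -f BACKUP_CONFIG_TEMPLATE_PATH=path/to/backup-config.yaml \
    #     -f RUNS_ON=high-perf-docker-with-local-ssd \
    #     -f MAX_VERSIONS_PER_RANGE=2000000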
jobs:
prepare:
runs-on: ${{ inputs.RUNS_ON }}
outputs:
job_ids: ${{ steps.gen-jobs.outputs.job_ids }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.GIT_SHA }}
- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache@v4
with:
# copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
# which cleans up the target directory in its post action
path: |
aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
with:
GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}
- name: Build and strip aptos-debugger binary if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
shell: bash
run: |
cargo build --release -p aptos-debugger
strip -s target/release/aptos-debugger
cp target/release/aptos-debugger .
- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"
      - name: Get timestamp to use in cache key
id: get-timestamp
run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT
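      # The timestamp suffix makes each run's cache key unique, so actions/cache
      # always saves an updated metadata cache at the end of the job, while the
      # restore-keys prefix match below restores the most recent previous one.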
- name: Load cached backup storage metadata cache dir (and save back afterwards)
uses: actions/cache@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
- name: Generate job ranges
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
run: |
./aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
--start-version ${{ inputs.HISTORY_START }} \
--ranges-to-skip "${{ inputs.RANGES_TO_SKIP }}" \
--max-versions-per-range ${{ inputs.MAX_VERSIONS_PER_RANGE }} \
\
--max-ranges-per-job 16 \
            --output-json-file jobs.json
jq -c 'length as $N | [range(0; $N)]' jobs.json > job_ids.json
cat job_ids.json
jq . jobs.json
echo "job_ids=$(cat job_ids.json)" >> $GITHUB_OUTPUT
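          # For illustration only (assumed shape, not actual tool output): if jobs.json is
          #   [["0 1000000 1999999 epoch 10", "1 2000000 2999999 epoch 11"], [...]]
          # then job_ids.json is [0,1], and each matrix job below replays the ranges
          # listed under its own index in parallel.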
- name: Cache backup storage config and job definition
uses: actions/cache/save@v4
with:
path: |
${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
jobs.json
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
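  # Each element of job_ids from the prepare job becomes one matrix shard below.
  # The shards rebuild nothing: they restore the aptos-debugger binary, the
  # metadata cache, the backup config and jobs.json from the caches saved above.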
replay-verify:
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 180 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
matrix:
job_id: ${{ fromJson(needs.prepare.outputs.job_ids) }}
steps:
- name: Load cached aptos-debugger binary
uses: actions/cache/restore@v4
with:
path: |
aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
fail-on-cache-miss: true
- name: Load cached backup storage metadata cache dir
uses: actions/cache/restore@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
fail-on-cache-miss: true
- name: Load cached backup storage config and job definitions
uses: actions/cache/restore@v4
with:
path: |
${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
jobs.json
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
fail-on-cache-miss: true
- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"
- name: Run replay-verify in parallel
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
shell: bash
run: |
set -o nounset -o errexit -o pipefail
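          # Overview: replay() below retries one version range with linear backoff;
          # the main body launches one background pipeline per range listed under
          # this shard's index in jobs.json, prefixes its output with the partition
          # index, then waits for all of them and fails if any partition failed.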
replay() {
idx=$1
id=$2
begin=$3
end=$4
desc=$5
echo ---------
echo Job start. $id: $desc
echo ---------
MC=metadata_cache_$idx
cp -r metadata_cache $MC
DB=db_$idx
for try in {0..6}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
res=0
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir $MC \
--command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
--start-version $begin \
--end-version $end \
\
--lazy-quit \
--enable-storage-sharding \
--target-db-dir $DB \
--concurrent-downloads 8 \
--replay-concurrency-level 4 \
|| res=$?
if [[ $res == 0 || $res == 2 ]]
then
return $res
fi
done
return 1
}
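          # Exit-code convention (as used above): 0 and 2 are returned to the caller
          # without retrying (2 presumably signals a verification failure reported by
          # aptos-debugger itself); any other code is treated as transient and the
          # range is retried, up to 7 attempts in total, before giving up with 1.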
pids=()
idx=0
while read id begin end desc; do
replay $idx $id $begin $end "$desc" 2>&1 | sed "s/^/[partition $idx]: /" &
pids[$idx]=$!
idx=$((idx+1))
          done < <(jq -r '.[${{ matrix.job_id }}][]' jobs.json)
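          # Wait for each background pipeline; if any returns non-zero, the step
          # exits with that status so the whole shard is marked failed.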
res=0
for idx in `seq 0 $((idx-1))`
do
range_res=0
wait ${pids[$idx]} || range_res=$?
echo partition $idx returned $range_res
if [[ $range_res != 0 ]]
then
res=$range_res
fi
done
echo All partitions done, returning $res
exit $res