Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: new Tracy Time preset and more efficient univariate extension #8789

Merged
merged 23 commits into from
Sep 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
753cf18
Start; still no luck on mainframe
codygunton Sep 24, 2024
db93e9b
Add local benchmarking script for convenience
codygunton Sep 25, 2024
4602ba6
Update default preset
codygunton Sep 25, 2024
2736c74
Base on clang16-dbg
codygunton Sep 25, 2024
654f95c
Revert "Base on clang16-dbg"
codygunton Sep 25, 2024
9bd40ce
Add deps
codygunton Sep 25, 2024
c0471d0
Add and rename scripts
codygunton Sep 25, 2024
61299e9
mainframe-side changes for remote build local execution
codygunton Sep 26, 2024
1393754
local-side changes for remote build local execution
codygunton Sep 26, 2024
0c2b10f
CIVC bench with two folds
codygunton Sep 26, 2024
08fc540
Prototype improvements to extend_to
codygunton Sep 26, 2024
a451f97
Implement and use self_extend_from
codygunton Sep 26, 2024
bf69253
revert change to extend_to
codygunton Sep 26, 2024
096b49b
Does std::move help?
codygunton Sep 26, 2024
12d6557
WIP using macros well
codygunton Sep 26, 2024
be4756f
delete two tracy benchmark scripts
lucasxia01 Sep 26, 2024
6e69ec8
test out TRACY_MEMORY variable to see if it works (it does)
lucasxia01 Sep 26, 2024
bd7324a
use TRACY_MEMORY flag so that zones don't affect other tracy builds
lucasxia01 Sep 26, 2024
21c7599
remove tracy-default preset
lucasxia01 Sep 27, 2024
2ed1fc5
Merge remote-tracking branch 'origin/master' into cg/civc-profile
lucasxia01 Sep 27, 2024
ad0326d
add ZoneScoped to BB_OP_COUNT_TIME
lucasxia01 Sep 27, 2024
5bf2a58
add #ifdef TRACY_MEMORY to everywhere
lucasxia01 Sep 28, 2024
b6d9531
Merge branch 'master' into cg/civc-profile
lucasxia01 Sep 30, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions barretenberg/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ else()
SET(TRACY_LIBS)
endif()

# When TRACY_PROFILE_MEMORY is set, pass -DTRACY_MEMORY to the compiler.
# C++ sources guard their memory-profiling zones with `#ifdef TRACY_MEMORY`.
if(TRACY_PROFILE_MEMORY)
add_compile_options(-DTRACY_MEMORY)
endif()

# When TRACY_PROFILE_TIME is set, pass -DTRACY_TIME to the compiler.
# common/op_count.hpp checks TRACY_TIME to decide whether the BB_OP_COUNT_TIME*
# macros expand to tracy zones or to no-ops.
if(TRACY_PROFILE_TIME)
add_compile_options(-DTRACY_TIME)
endif()


if(ENABLE_ASAN)
add_compile_options(-fsanitize=address)
add_link_options(-fsanitize=address)
Expand Down
33 changes: 28 additions & 5 deletions barretenberg/cpp/CMakePresets.json
Original file line number Diff line number Diff line change
Expand Up @@ -111,13 +111,31 @@
}
},
{
"name": "tracy",
"name": "tracy-memory",
"displayName": "Release build with tracy, optimized for memory tracking",
"description": "Release build with tracy, optimized for memory tracking",
"inherits": "clang16",
"binaryDir": "build-tracy",
"binaryDir": "build-tracy-memory",
"cacheVariables": {
"ENABLE_TRACY": "ON"
"ENABLE_TRACY": "ON",
"TRACY_PROFILE_MEMORY": "ON"
}
},
{
"name": "tracy-time",
"displayName": "Build for tracy time profiling",
"description": "Build for tracy time profiling",
"binaryDir": "build-tracy-time",
"inherits": "clang16",
"environment": {
"CMAKE_BUILD_TYPE": "RelWithDebInfo",
"CFLAGS": "-g -fno-omit-frame-pointer",
"CXXFLAGS": "-g -fno-omit-frame-pointer",
"LDFLAGS": "-g -fno-omit-frame-pointer -rdynamic"
},
"cacheVariables": {
"ENABLE_TRACY": "ON",
"TRACY_PROFILE_TIME": "ON"
}
},
{
Expand Down Expand Up @@ -472,9 +490,14 @@
"configurePreset": "clang16-dbg"
},
{
"name": "tracy",
"name": "tracy-memory",
"inherits": "default",
"configurePreset": "tracy-memory"
},
{
"name": "tracy-time",
"inherits": "default",
"configurePreset": "tracy"
"configurePreset": "tracy-time"
},
{
"name": "clang16-pic",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@

# NOTE: intended to be run from one's external computer, connecting to Aztec mainframe
# IF ON YOUR LOCAL COMPUTER USE NORMAL INTERACTIVE TRACY WORKFLOW
# The benchmark is built on the remote box, the binary is copied to this machine and run here under a
# headless tracy capture; the resulting trace file is then opened in the tracy profiler.
# This is thus only really useful internally at Aztec, sorry external folks. It can be easily tweaked
# however for any SSH setup, especially an ubuntu one.
# on local machine run:
# export USER=...
# export PRESET=...tracy-memory for memory or tracy-gates for circuit gates...
# ssh $USER-box "cat ~/aztec-packages/barretenberg/cpp/scripts/benchmark_tracy.sh" | bash /dev/stdin $USER
#
# Positional arguments (all optional):
#   $1  user name, used to derive the box name ($USER-box) and the remote checkout path
#   $2  benchmark target to build and run (default: protogalaxy_bench)
#   $3  command line to run from inside the local build directory
set -eux
USER=${1:-$USER}
BOX=$USER-box
BENCHMARK=${2:-protogalaxy_bench}
COMMAND=${3:-./bin/$BENCHMARK --benchmark_filter=fold_k/17}

# Can also set PRESET=tracy-gates env variable
PRESET=${PRESET:-tracy-time}

# The benchmark binary is copied into, and later executed from, this single directory,
# so the script behaves the same no matter which directory it is launched from.
LOCAL_BUILD_DIR=~/aztec-packages/barretenberg/cpp/build-$PRESET
TRACY_COMMIT=075395620a504c0cdcaf9bab3d196db16a043de7 # release 0.11.0

wait # TODO(AD) hack - not sure why needed

# Fetch tracy once and build the profiler GUI at the pinned release.
if [ ! -d ~/tracy ]; then
  git clone https://github.com/wolfpld/tracy ~/tracy
fi
cd ~/tracy
git checkout "$TRACY_COMMIT"
cmake -B profiler/build -S profiler -DCMAKE_BUILD_TYPE=Release
cmake --build profiler/build --parallel

# Build the benchmark on the remote box.
# - Runs in the foreground so a failed remote build aborts this script (set -e) instead of
#   silently continuing with a stale or missing binary.
# - stdin comes from /dev/null so ssh cannot consume the remainder of this script when the
#   script itself is being piped into `bash /dev/stdin`.
ssh "$BOX" "
  set -eux ;
  cd ~/aztec-packages/barretenberg/cpp/ ;
  cmake --preset $PRESET && cmake --build --preset $PRESET --target $BENCHMARK ;
" < /dev/null

# Copy the freshly built binary to the local build directory.
mkdir -p "$LOCAL_BUILD_DIR/bin"
scp "$BOX:/mnt/user-data/$USER/aztec-packages/barretenberg/cpp/build-$PRESET/bin/$BENCHMARK" "$LOCAL_BUILD_DIR/bin/."

# Build the headless capture tool (same checkout as the profiler above).
cd ~/tracy/capture
mkdir -p build && cd build && cmake .. && make -j

# Start the capture in the background, then run the benchmark so it can connect to it.
./tracy-capture -a 127.0.0.1 -f -o "../trace-$BENCHMARK" &
CAPTURE_PID=$!
sleep 0.1
cd "$LOCAL_BUILD_DIR"
# Intentionally unquoted: COMMAND holds a program plus its arguments and must be word-split.
$COMMAND

# Let the capture finish writing the trace before opening it.
wait "$CAPTURE_PID"

~/tracy/profiler/build/tracy-profiler ~/tracy/capture/"trace-$BENCHMARK"
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ BENCHMARK_DEFINE_F(ClientIVCBench, Full)(benchmark::State& state)
}
}

#define ARGS Arg(ClientIVCBench::NUM_ITERATIONS_MEDIUM_COMPLEXITY)
#define ARGS Arg(ClientIVCBench::NUM_ITERATIONS_MEDIUM_COMPLEXITY)->Arg(2)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's the point of this 2? doesn't seem to be used.


BENCHMARK_REGISTER_F(ClientIVCBench, Full)->Unit(benchmark::kMillisecond)->ARGS;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ void fold_k(State& state) noexcept
}
}

BENCHMARK(vector_of_evaluations)->DenseRange(15, 21)->Unit(kMillisecond);
BENCHMARK(vector_of_evaluations)->DenseRange(15, 21)->Unit(kMillisecond)->Iterations(1);
BENCHMARK(compute_row_evaluations)->DenseRange(15, 21)->Unit(kMillisecond);
// We stick to just k=1 for compile-time reasons.
BENCHMARK(fold_k)->/* vary the circuit size */ DenseRange(14, 20)->Unit(kMillisecond);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,50 @@ using bb::Univariate;

namespace bb::benchmark {

void extend_2_to_6(State& state) noexcept
void extend_2_to_11(State& state) noexcept
{
auto univariate = Univariate<FF, 2>::get_random();
for (auto _ : state) {
DoNotOptimize(univariate.extend_to<6>());
DoNotOptimize(univariate.extend_to<11>());
}
}
BENCHMARK(extend_2_to_6);

// Hand-rolled baseline: extends 2 evaluations to 11 in a plain std::array by repeatedly
// adding the difference of the first two entries.
// 93.9ns goes down to 62.7ns
// Theoretical min: 1 sub, 9 additions at about 3.8ns each, 38ns
void fake_extend_2_to_11(State& state) noexcept
{
std::array<FF, 11> univariate;
std::generate(univariate.begin(), univariate.end(), [&]() { return FF::random_element(); });

// Overwrites arr[2..10] so that consecutive entries differ by arr[1] - arr[0]; arr[0] and arr[1] are only read.
const auto extend_to_11 = [](auto& arr) {
FF tmp = arr[1];
const FF delta = tmp - arr[0];
for (size_t idx = 2; idx < 10; idx++) {
arr[idx] = (tmp += delta); // fused ~> 62.9ns; non-fused ~>69.5ns
}
// Ninth addition: the last point still needs +delta, but the running value need not be written back.
arr[10] = tmp + delta;
return arr;
};

for (auto _ : state) {
DoNotOptimize(extend_to_11(univariate));
}
}

// Benchmarks Univariate<FF, 11>::self_extend_from<2>(), called repeatedly on the same object.
// 93.9ns goes down to 62.7ns
// Theoretical min: 1 sub, 9 additions at about 3.8ns each, 38ns
void self_extend_2_to_11(State& state) noexcept
{
auto univariate = Univariate<FF, 11>::get_random();

for (auto _ : state) {
univariate.self_extend_from<2>();
// Nothing else reads the result; mark it as used so the measured work cannot be discarded,
// matching the other benchmarks in this file.
DoNotOptimize(univariate);
}
}

// Register the three 2 -> 11 extension benchmarks defined above.
BENCHMARK(extend_2_to_11);
BENCHMARK(fake_extend_2_to_11);
BENCHMARK(self_extend_2_to_11);

} // namespace bb::benchmark

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ Prover get_prover(void (*test_circuit_function)(typename Prover::Flavor::Circuit
Composer composer;
return composer.create_prover(builder);
} else {
#ifdef TRACY_MEMORY
ZoneScopedN("creating prover");
#endif
return Prover(builder);
}
};
Expand Down
12 changes: 10 additions & 2 deletions barretenberg/cpp/src/barretenberg/common/op_count.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
#pragma once

#include <memory>
#include <tracy/Tracy.hpp>
#ifndef BB_USE_OP_COUNT
// require a semicolon to appease formatters
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
Expand All @@ -11,12 +12,19 @@
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_CYCLES_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_CYCLES() (void)0
#ifndef TRACY_TIME
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) (void)0
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() (void)0
#else
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME_NAME(name) ZoneScopedN(name)
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define BB_OP_COUNT_TIME() BB_OP_COUNT_TIME_NAME(__func__)
#endif
#else
/**
* Provides an abstraction that counts operations based on function names.
* For efficiency, we spread out counts across threads.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,9 @@ namespace bb {

template <class Flavor> void ExecutionTrace_<Flavor>::populate_public_inputs_block(Builder& builder)
{
#ifdef TRACY_MEMORY
ZoneScopedN("populate_public_inputs_block");
#endif
// Update the public inputs block
for (const auto& idx : builder.public_inputs) {
for (size_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) {
Expand All @@ -27,7 +29,10 @@ template <class Flavor> void ExecutionTrace_<Flavor>::populate_public_inputs_blo
template <class Flavor>
void ExecutionTrace_<Flavor>::populate(Builder& builder, typename Flavor::ProvingKey& proving_key, bool is_structured)
{

#ifdef TRACY_MEMORY
ZoneScopedN("trace populate");
#endif
// Share wire polynomials, selector polynomials between proving key and builder and copy cycles from raw circuit
// data
auto trace_data = construct_trace_data(builder, proving_key, is_structured);
Expand All @@ -36,18 +41,27 @@ void ExecutionTrace_<Flavor>::populate(Builder& builder, typename Flavor::Provin
proving_key.pub_inputs_offset = trace_data.pub_inputs_offset;
}
if constexpr (IsUltraPlonkOrHonk<Flavor>) {

#ifdef TRACY_MEMORY
ZoneScopedN("add_memory_records_to_proving_key");
#endif
add_memory_records_to_proving_key(trace_data, builder, proving_key);
}

if constexpr (IsGoblinFlavor<Flavor>) {

#ifdef TRACY_MEMORY
ZoneScopedN("add_ecc_op_wires_to_proving_key");
#endif
add_ecc_op_wires_to_proving_key(builder, proving_key);
}

// Compute the permutation argument polynomials (sigma/id) and add them to proving key
{

#ifdef TRACY_MEMORY
ZoneScopedN("compute_permutation_argument_polynomials");
#endif
compute_permutation_argument_polynomials<Flavor>(builder, &proving_key, trace_data.copy_cycles);
}
}
Expand All @@ -73,7 +87,10 @@ template <class Flavor>
typename ExecutionTrace_<Flavor>::TraceData ExecutionTrace_<Flavor>::construct_trace_data(
Builder& builder, typename Flavor::ProvingKey& proving_key, bool is_structured)
{

#ifdef TRACY_MEMORY
ZoneScopedN("construct_trace_data");
#endif

if constexpr (IsPlonkFlavor<Flavor>) {
// Complete the public inputs execution trace block from builder.public_inputs
Expand All @@ -91,7 +108,10 @@ typename ExecutionTrace_<Flavor>::TraceData ExecutionTrace_<Flavor>::construct_t
// Update wire polynomials and copy cycles
// NB: The order of row/column loops is arbitrary but needs to be row/column to match old copy_cycle code
{

#ifdef TRACY_MEMORY
ZoneScopedN("populating wires and copy_cycles");
#endif
for (uint32_t block_row_idx = 0; block_row_idx < block_size; ++block_row_idx) {
for (uint32_t wire_idx = 0; wire_idx < NUM_WIRES; ++wire_idx) {
uint32_t var_idx = block.wires[wire_idx][block_row_idx]; // an index into the variables array
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,10 @@ template <class Flavor> class ExecutionTrace_ {

TraceData(Builder& builder, ProvingKey& proving_key)
{

#ifdef TRACY_MEMORY
ZoneScopedN("TraceData constructor");
#endif
if constexpr (IsHonkFlavor<Flavor>) {
// Initialize and share the wire and selector polynomials
for (auto [wire, other_wire] : zip_view(wires, proving_key.polynomials.get_wires())) {
Expand All @@ -45,7 +48,10 @@ template <class Flavor> class ExecutionTrace_ {
proving_key.polynomial_store.put(wire_tag, wires[idx].share());
}
{

#ifdef TRACY_MEMORY
ZoneScopedN("selector initialization");
#endif
for (size_t idx = 0; idx < Builder::Arithmetization::NUM_SELECTORS; ++idx) {
selectors[idx] = Polynomial(proving_key.circuit_size);
std::string selector_tag = builder.selector_names[idx] + "_lagrange";
Expand All @@ -54,7 +60,10 @@ template <class Flavor> class ExecutionTrace_ {
}
}
{

#ifdef TRACY_MEMORY
ZoneScopedN("copy cycle initialization");
#endif
copy_cycles.resize(builder.variables.size());
}
}
Expand Down
3 changes: 3 additions & 0 deletions barretenberg/cpp/src/barretenberg/flavor/flavor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,10 @@ template <typename FF, typename CommitmentKey_> class ProvingKey_ {
std::shared_ptr<CommitmentKey_> commitment_key = nullptr)
{
if (commitment_key == nullptr) {

#ifdef TRACY_MEMORY
ZoneScopedN("init commitment key");
#endif
this->commitment_key = std::make_shared<CommitmentKey_>(circuit_size);
} else {
// Don't create another commitment key if we already have one
Expand Down
Loading
Loading