-
Notifications
You must be signed in to change notification settings - Fork 268
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: Use structured polys to reduce prover memory #8587
Changes from 27 commits
ff435f3
b184022
9942d6e
aa6db5c
1362cb4
da06f55
125d47c
e906410
f01b646
e1e949b
9f5b8bb
b045e61
3cb43fb
c689569
f95c018
313c491
f22f7ef
fe91327
8277038
d7e7530
7e75ce6
111a037
f1b5578
3770913
1080eb6
f8a6c00
56d36e7
3c8d5d4
746a58b
b4c30ea
a472716
e770973
1773af7
648c014
765b074
7c193e6
0b03bca
4162a8c
006da13
7d44083
3a190d8
de71b76
33d65cb
8254e6a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -55,12 +55,61 @@ template <class Flavor> | |
typename ExecutionTrace_<Flavor>::TraceData ExecutionTrace_<Flavor>::construct_trace_data( | ||
Builder& builder, typename Flavor::ProvingKey& proving_key, bool is_structured) | ||
{ | ||
ZoneScopedN("construct_trace_data"); | ||
TraceData trace_data{ builder, proving_key }; | ||
|
||
// Complete the public inputs execution trace block from builder.public_inputs | ||
populate_public_inputs_block(builder); | ||
|
||
ZoneScopedN("construct_trace_data"); | ||
// Allocate the wires and selectors polynomials | ||
if constexpr (IsHonkFlavor<Flavor>) { | ||
for (auto& wire : proving_key.polynomials.get_wires()) { | ||
wire = Polynomial::shiftable(proving_key.circuit_size); | ||
} | ||
// Define selectors over the block they are isolated to | ||
uint32_t offset = Flavor::has_zero_row ? 1 : 0; | ||
lucasxia01 marked this conversation as resolved.
Show resolved
Hide resolved
|
||
if constexpr (IsGoblinFlavor<Flavor>) { | ||
offset += builder.blocks.ecc_op.get_fixed_size(is_structured); | ||
} | ||
offset += builder.blocks.pub_inputs.get_fixed_size(is_structured); | ||
|
||
proving_key.polynomials.q_arith = | ||
Polynomial(proving_key.circuit_size - offset, proving_key.circuit_size, offset); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is this because of the aux relation using q_arith? I don't see why There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. oh yeah this was me just checking that things worked. Forgot to set it to what it should be, which is the end of the aux block. |
||
offset += builder.blocks.arithmetic.get_fixed_size(is_structured); | ||
|
||
proving_key.polynomials.q_delta_range = | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This may not be an important optimization but my original vision was that the data in the block would simply become (or be moved into) the polynomial mem block. Not sure to what extent this is supported with the current poly implementation There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yeah, I share that same vision. Was planning to try to do it in a followup PR |
||
Polynomial(builder.blocks.delta_range.size(), proving_key.circuit_size, offset); | ||
offset += builder.blocks.delta_range.get_fixed_size(is_structured); | ||
|
||
proving_key.polynomials.q_elliptic = | ||
Polynomial(builder.blocks.elliptic.size(), proving_key.circuit_size, offset); | ||
offset += builder.blocks.elliptic.get_fixed_size(is_structured); | ||
|
||
proving_key.polynomials.q_aux = Polynomial(builder.blocks.aux.size(), proving_key.circuit_size, offset); | ||
offset += builder.blocks.aux.get_fixed_size(is_structured); | ||
|
||
proving_key.polynomials.q_lookup = Polynomial(builder.blocks.lookup.size(), proving_key.circuit_size, offset); | ||
offset += builder.blocks.lookup.get_fixed_size(is_structured); | ||
|
||
if constexpr (HasDataBus<Flavor>) { | ||
proving_key.polynomials.q_busread = | ||
Polynomial(builder.blocks.busread.size(), proving_key.circuit_size, offset); | ||
offset += builder.blocks.busread.get_fixed_size(is_structured); | ||
} | ||
|
||
proving_key.polynomials.q_poseidon2_external = | ||
Polynomial(builder.blocks.poseidon2_external.size(), proving_key.circuit_size, offset); | ||
|
||
offset += builder.blocks.poseidon2_external.get_fixed_size(is_structured); | ||
proving_key.polynomials.q_poseidon2_internal = | ||
Polynomial(builder.blocks.poseidon2_internal.size(), proving_key.circuit_size, offset); | ||
offset += builder.blocks.poseidon2_internal.get_fixed_size(is_structured); | ||
// Set the other selector polynomials to full size | ||
for (auto& selector : proving_key.polynomials.get_non_gate_selectors()) { | ||
selector = Polynomial(proving_key.circuit_size); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. these along with the wires, sigmas, ids, and z_perm could be restricted to the actual circuit size and not the dyadic circuit_size. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is true but due to the nature of the structured trace I dont think this would ever be significant. Most we could save is the unused portion of the last block There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or potentially more if we don't even use some blocks (which means getting rid of the ensure_nonzero stuff) |
||
} | ||
} | ||
|
||
TraceData trace_data{ builder, proving_key }; | ||
|
||
uint32_t offset = Flavor::has_zero_row ? 1 : 0; // Offset at which to place each block in the trace polynomials | ||
// For each block in the trace, populate wire polys, copy cycles and selector polys | ||
|
||
|
@@ -134,12 +183,20 @@ void ExecutionTrace_<Flavor>::add_ecc_op_wires_to_proving_key(Builder& builder, | |
typename Flavor::ProvingKey& proving_key) | ||
requires IsGoblinFlavor<Flavor> | ||
{ | ||
// Copy the ecc op data from the conventional wires into the op wires over the range of ecc op gates | ||
auto& ecc_op_selector = proving_key.polynomials.lagrange_ecc_op; | ||
const size_t op_wire_offset = Flavor::has_zero_row ? 1 : 0; | ||
// Allocate the ecc op wires and selector | ||
const size_t num_ecc_ops = builder.blocks.ecc_op.size(); | ||
if constexpr (IsHonkFlavor<Flavor>) { | ||
for (auto& wire : proving_key.polynomials.get_ecc_op_wires()) { | ||
wire = Polynomial(num_ecc_ops, proving_key.circuit_size, op_wire_offset); | ||
} | ||
ecc_op_selector = Polynomial(num_ecc_ops, proving_key.circuit_size, op_wire_offset); | ||
} | ||
// Copy the ecc op data from the conventional wires into the op wires over the range of ecc op gates | ||
for (auto [ecc_op_wire, wire] : | ||
zip_view(proving_key.polynomials.get_ecc_op_wires(), proving_key.polynomials.get_wires())) { | ||
for (size_t i = 0; i < builder.blocks.ecc_op.size(); ++i) { | ||
for (size_t i = 0; i < num_ecc_ops; ++i) { | ||
size_t idx = i + op_wire_offset; | ||
ecc_op_wire.at(idx) = wire[idx]; | ||
ecc_op_selector.at(idx) = 1; // construct selector as the indicator on the ecc op block | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,8 +68,6 @@ template <typename FF, size_t NUM_WIRES, size_t NUM_SELECTORS> class ExecutionTr | |
bool has_ram_rom = false; // does the block contain RAM/ROM gates | ||
bool is_pub_inputs = false; // is this the public inputs block | ||
|
||
uint32_t fixed_size = 0; // Fixed size for use in structured trace | ||
|
||
bool operator==(const ExecutionTraceBlock& other) const = default; | ||
|
||
size_t size() const { return std::get<0>(this->wires).size(); } | ||
|
@@ -104,6 +102,8 @@ template <typename FF, size_t NUM_WIRES, size_t NUM_SELECTORS> class ExecutionTr | |
} | ||
} | ||
#endif | ||
private: | ||
uint32_t fixed_size = 0; // Fixed size for use in structured trace | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. made this private to avoid accidentally getting it. Need to call get_fixed_size() instead because it's 0 in the unstructured case. |
||
}; | ||
|
||
} // namespace bb |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -384,6 +384,13 @@ void compute_permutation_argument_polynomials(const typename Flavor::CircuitBuil | |
compute_monomial_and_coset_fft_polynomials_from_lagrange<Flavor::NUM_WIRES>("id", key); | ||
} | ||
} else if constexpr (IsUltraFlavor<Flavor>) { // any UltraHonk flavor | ||
// Allocate sigma and id polynomials | ||
for (auto& sigma : key->polynomials.get_sigmas()) { | ||
sigma = typename Flavor::Polynomial(key->circuit_size); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. wasn't sure if I could restrict any of these There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. could restrict to the non-dyadic circuit size potentially? Unsure because computing them iterates over the EvaluationDomain... There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not much we can do here I dont think |
||
} | ||
for (auto& id : key->polynomials.get_ids()) { | ||
id = typename Flavor::Polynomial(key->circuit_size); | ||
} | ||
// Compute Honk-style sigma and ID polynomials from the corresponding mappings | ||
{ | ||
ZoneScopedN("compute_honk_style_permutation_lagrange_polynomials_from_mapping"); | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,6 +47,7 @@ SharedShiftedVirtualZeroesArray<Fr> _clone(const SharedShiftedVirtualZeroesArray | |
template <typename Fr> | ||
void Polynomial<Fr>::allocate_backing_memory(size_t size, size_t virtual_size, size_t start_index) | ||
{ | ||
ASSERT(start_index + size <= virtual_size); | ||
coefficients_ = SharedShiftedVirtualZeroesArray<Fr>{ | ||
start_index, /* start index, used for shifted polynomials and offset 'islands' of non-zeroes */ | ||
size + start_index, /* end index, actual memory used is (end - start) */ | ||
|
@@ -158,8 +159,9 @@ template <typename Fr> bool Polynomial<Fr>::operator==(Polynomial const& rhs) co | |
template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator+=(PolynomialSpan<const Fr> other) | ||
{ | ||
// Compute the subset that is defined in both polynomials | ||
ASSERT(start_index() <= other.start_index); | ||
ASSERT(end_index() >= other.end_index()); | ||
if (other.start_index < start_index() || other.end_index() > end_index()) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Did this situation arise? I'm wondering if its best to silently expand polys like this when it happens.. I'm assuming resizing the memory like this can pretty undesirable. Might be better to alert the user so that the memory can be allocated correctly in the first place. Maybe I'm missing some context tho There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yeah, this situation arises in the unstructured case because we end up folding polynomials with different active regions. You're right that this could be dangerous, but not sure how we could alert the user unless we throw an error... I think it would be unfortunate if we had to set the active regions of all polys to be the whole circuit size in the unstructured case. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Note this can cause bugs because it breaks .share() functionality |
||
*this = expand(std::min(start_index(), other.start_index), std::max(end_index(), other.end_index())); | ||
} | ||
size_t num_threads = calculate_num_threads(other.size()); | ||
size_t range_per_thread = other.size() / num_threads; | ||
size_t leftovers = other.size() - (range_per_thread * num_threads); | ||
|
@@ -287,14 +289,15 @@ Fr Polynomial<Fr>::compute_barycentric_evaluation(const Fr& z, const EvaluationD | |
template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator-=(PolynomialSpan<const Fr> other) | ||
{ | ||
// Compute the subset that is defined in both polynomials | ||
ASSERT(start_index() <= other.start_index); | ||
ASSERT(end_index() >= other.end_index()); | ||
size_t num_threads = calculate_num_threads(other.size()); | ||
size_t range_per_thread = other.size() / num_threads; | ||
size_t leftovers = other.size() - (range_per_thread * num_threads); | ||
if (other.start_index < start_index() || other.end_index() > end_index()) { | ||
*this = expand(std::min(start_index(), other.start_index), std::max(end_index(), other.end_index())); | ||
} | ||
const size_t num_threads = calculate_num_threads(other.size()); | ||
const size_t range_per_thread = other.size() / num_threads; | ||
const size_t leftovers = other.size() - (range_per_thread * num_threads); | ||
parallel_for(num_threads, [&](size_t j) { | ||
size_t offset = j * range_per_thread + other.start_index; | ||
size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread; | ||
const size_t offset = j * range_per_thread + other.start_index; | ||
const size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread; | ||
for (size_t i = offset; i < end; ++i) { | ||
at(i) -= other[i]; | ||
} | ||
|
@@ -304,12 +307,12 @@ template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator-=(PolynomialSpan | |
|
||
template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator*=(const Fr scaling_factor) | ||
{ | ||
size_t num_threads = calculate_num_threads(size()); | ||
size_t range_per_thread = size() / num_threads; | ||
size_t leftovers = size() - (range_per_thread * num_threads); | ||
const size_t num_threads = calculate_num_threads(size()); | ||
const size_t range_per_thread = size() / num_threads; | ||
const size_t leftovers = size() - (range_per_thread * num_threads); | ||
parallel_for(num_threads, [&](size_t j) { | ||
size_t offset = j * range_per_thread; | ||
size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread; | ||
const size_t offset = j * range_per_thread; | ||
const size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread; | ||
for (size_t i = offset; i < end; ++i) { | ||
data()[i] *= scaling_factor; | ||
} | ||
|
@@ -318,24 +321,38 @@ template <typename Fr> Polynomial<Fr>& Polynomial<Fr>::operator*=(const Fr scali | |
return *this; | ||
} | ||
|
||
template <typename Fr> Polynomial<Fr> Polynomial<Fr>::full() const | ||
template <typename Fr> | ||
Polynomial<Fr> Polynomial<Fr>::expand(const size_t new_start_index, const size_t new_end_index) const | ||
{ | ||
ASSERT(new_end_index <= virtual_size()); | ||
ASSERT(new_start_index <= start_index()); | ||
ASSERT(new_end_index >= end_index()); | ||
if (new_start_index == start_index() && new_end_index == end_index()) { | ||
return *this; | ||
} | ||
Polynomial result = *this; | ||
// Make 0..virtual_size usable | ||
result.coefficients_ = _clone(coefficients_, virtual_size() - end_index(), start_index()); | ||
// Make new_start_index..new_end_index usable | ||
result.coefficients_ = _clone(coefficients_, new_end_index - end_index(), start_index() - new_start_index); | ||
return result; | ||
} | ||
|
||
template <typename Fr> Polynomial<Fr> Polynomial<Fr>::full() const | ||
{ | ||
// Make 0..virtual_size usable | ||
return expand(0, virtual_size()); | ||
} | ||
template <typename Fr> void Polynomial<Fr>::add_scaled(PolynomialSpan<const Fr> other, Fr scaling_factor) & | ||
{ | ||
// Compute the subset that is defined in both polynomials | ||
ASSERT(start_index() <= other.start_index); | ||
ASSERT(end_index() >= other.end_index()); | ||
size_t num_threads = calculate_num_threads(other.size()); | ||
size_t range_per_thread = other.size() / num_threads; | ||
size_t leftovers = other.size() - (range_per_thread * num_threads); | ||
if (other.start_index < start_index() || other.end_index() > end_index()) { | ||
*this = expand(std::min(start_index(), other.start_index), std::max(end_index(), other.end_index())); | ||
} | ||
const size_t num_threads = calculate_num_threads(other.size()); | ||
const size_t range_per_thread = other.size() / num_threads; | ||
const size_t leftovers = other.size() - (range_per_thread * num_threads); | ||
parallel_for(num_threads, [&](size_t j) { | ||
size_t offset = j * range_per_thread + other.start_index; | ||
size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread; | ||
const size_t offset = j * range_per_thread + other.start_index; | ||
const size_t end = (j == num_threads - 1) ? offset + range_per_thread + leftovers : offset + range_per_thread; | ||
for (size_t i = offset; i < end; ++i) { | ||
at(i) += scaling_factor * other[i]; | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
switched the order so that we populate the public_inputs_block first