Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add UniformBinning to schemav2 #186

Merged
merged 7 commits into from
Jul 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 21 additions & 3 deletions include/correction.h
Original file line number Diff line number Diff line change
Expand Up @@ -153,26 +153,44 @@ class HashPRNG {
// common internal for Binning and MultiBinning
enum class _FlowBehavior {value, clamp, error};

using _NonUniformBins = std::vector<double>;

struct _UniformBins {
std::size_t n; // number of bins
double low; // lower edge of first bin
double high; // upper edge of last bin
};

class Binning {
public:
Binning(const JSONObject& json, const Correction& context);
const Content& child(const std::vector<Variable::Type>& values) const;

private:
std::vector<std::tuple<double, Content>> bins_;
std::variant<_UniformBins, _NonUniformBins> bins_; // bin edges
// bin contents: contents_[i] is the value corresponding to bins_[i+1].
// the default value is at contents_[0]
std::vector<Content> contents_;
size_t variableIdx_;
_FlowBehavior flow_;
};

struct _MultiBinningAxis {
size_t variableIdx;
size_t stride;
std::variant<_UniformBins, _NonUniformBins> bins;
};

class MultiBinning {
public:
MultiBinning(const JSONObject& json, const Correction& context);
size_t ndimensions() const { return axes_.size(); };
const Content& child(const std::vector<Variable::Type>& values) const;

private:
// variableIdx, stride, edges
std::vector<std::tuple<size_t, size_t, std::vector<double>>> axes_;
size_t nbins(size_t dimension) const;

std::vector<_MultiBinningAxis> axes_;
std::vector<Content> content_;
_FlowBehavior flow_;
};
Expand Down
210 changes: 154 additions & 56 deletions src/correction.cc
Original file line number Diff line number Diff line change
Expand Up @@ -319,17 +319,40 @@ double HashPRNG::evaluate(const std::vector<Variable::Type>& values) const {

Binning::Binning(const JSONObject& json, const Correction& context)
{
std::vector<double> edges;
for (const auto& item : json.getRequired<rapidjson::Value::ConstArray>("edges")) {
if ( ! item.IsDouble() ) { throw std::runtime_error("Invalid edges array type"); }
double val = item.GetDouble();
if ( edges.size() > 0 && edges.back() >= val ) { throw std::runtime_error("binning edges are not monotone increasing"); }
edges.push_back(val);
}
const auto& content = json.getRequired<rapidjson::Value::ConstArray>("content");
if ( edges.size() != content.Size() + 1 ) {
throw std::runtime_error("Inconsistency in Binning: number of content nodes does not match binning");

// set bins_
const auto &edgesObj = json.getRequiredValue("edges");
if ( edgesObj.IsArray() ) { // non-uniform binning
std::vector<double> edges;
rapidjson::Value::ConstArray edgesArr = edgesObj.GetArray();
for (const auto& edge : edgesArr) {
if ( ! edge.IsDouble() ) { throw std::runtime_error("Invalid edges array type"); }
double val = edge.GetDouble();
if ( edges.size() > 0 && edges.back() >= val ) { throw std::runtime_error("binning edges are not monotone increasing"); }
edges.push_back(val);
}
if ( edges.size() != content.Size() + 1 ) {
throw std::runtime_error("Inconsistency in Binning: number of content nodes does not match binning");
}
_NonUniformBins bins{std::move(edges)};
bins_ = std::move(bins);
} else if ( edgesObj.IsObject() ) { // UniformBinning
const JSONObject uniformBins{edgesObj.GetObject()};
const auto n = uniformBins.getRequired<uint64_t>("n");
nsmith- marked this conversation as resolved.
Show resolved Hide resolved
if ( n == 0 ) {
throw std::runtime_error("Error when processing Binning with UniformBinning: number of bins is zero");
}
if ( n != content.Size() ) {
throw std::runtime_error("Inconsistency in Binning: number of content nodes does not match binning");
}
const auto low = uniformBins.getRequired<double>("low");
const auto high = uniformBins.getRequired<double>("high");
bins_ = _UniformBins{n, low, high};
} else {
throw std::runtime_error ("Error when processing Binning: edges are neither an array nor a UniformBinning object");
}

variableIdx_ = context.input_index(json.getRequired<std::string_view>("input"));
Content default_value{0.};
const auto& flowbehavior = json.getRequiredValue("flow");
Expand All @@ -343,19 +366,40 @@ Binning::Binning(const JSONObject& json, const Correction& context)
flow_ = _FlowBehavior::value;
default_value = resolve_content(flowbehavior, context);
}
bins_.reserve(edges.size());
// first bin is never accessed for content in range (corresponds to std::upper_bound underflow)
// use it to store default value
bins_.push_back({*edges.begin(), std::move(default_value)});
for (size_t i=0; i < content.Size(); ++i) {
bins_.push_back({edges[i + 1], resolve_content(content[i], context)});
}

// set bin contents
contents_.push_back(std::move(default_value));
for (size_t i=0; i < content.Size(); ++i)
contents_.push_back(resolve_content(content[i], context));
}

const Content& Binning::child(const std::vector<Variable::Type>& values) const {
double value = std::get<double>(values[variableIdx_]);
auto it = std::upper_bound(std::begin(bins_), std::end(bins_), value, [](const double& a, const auto& b) { return a < std::get<0>(b); });
if ( it == std::begin(bins_) ) {

if ( auto *bins = std::get_if<_UniformBins>(&bins_) ) { // uniform binning
std::size_t binIdx = bins->n * ((value - bins->low) / (bins->high - bins->low));
if (value < bins->low || value >= bins->high) {
switch (flow_) {
case _FlowBehavior::value:
return contents_[0u]; // the default value
case _FlowBehavior::clamp:
binIdx = value < bins->low ? 0 : bins->n - 1; // assuming we always have at least 1 bin
break;
case _FlowBehavior::error:
const std::string belowOrAbove = value < bins->low ? "below" : "above";
const auto msg = "Index " + belowOrAbove + " bounds in Binning for input argument " + std::to_string(variableIdx_) + " value: " + std::to_string(value);
throw std::runtime_error(std::move(msg));
}
}

return contents_[binIdx + 1u]; // skipping the default value at index 0
}

// otherwise we have non-uniform binning
const auto bins = std::get<_NonUniformBins>(bins_);

auto it = std::upper_bound(std::begin(bins), std::end(bins), value);
if ( it == std::begin(bins) ) {
if ( flow_ == _FlowBehavior::value ) {
// default value already at std::begin
}
Expand All @@ -366,9 +410,9 @@ const Content& Binning::child(const std::vector<Variable::Type>& values) const {
it++;
}
}
else if ( it == std::end(bins_) ) {
else if ( it == std::end(bins) ) {
if ( flow_ == _FlowBehavior::value ) {
it = std::begin(bins_);
it = std::begin(bins);
}
else if ( flow_ == _FlowBehavior::error ) {
throw std::runtime_error("Index above bounds in Binning for input argument " + std::to_string(variableIdx_) + " value: " + std::to_string(value));
Expand All @@ -377,35 +421,53 @@ const Content& Binning::child(const std::vector<Variable::Type>& values) const {
it--;
}
}
return std::get<1>(*it);

return contents_[std::distance(std::begin(bins), it)];
}

MultiBinning::MultiBinning(const JSONObject& json, const Correction& context)
{
const auto& edges = json.getRequired<rapidjson::Value::ConstArray>("edges");
const auto& inputs = json.getRequired<rapidjson::Value::ConstArray>("inputs");

const auto& edges = json.getRequired<rapidjson::Value::ConstArray>("edges");
axes_.reserve(edges.Size());
size_t idx {0};
for (const auto& dimension : edges) {
if ( ! dimension.IsArray() ) { throw std::runtime_error("Invalid MultiBinning edges array"); }
std::vector<double> dim_edges;
dim_edges.reserve(dimension.GetArray().Size());
for (const auto& item : dimension.GetArray()) {
double val = item.GetDouble();
if ( dim_edges.size() > 0 && dim_edges.back() >= val ) { throw std::runtime_error("binning edges are not monotone increasing"); }
dim_edges.push_back(val);
}
const auto& input = inputs[idx];
if ( ! input.IsString() ) { throw std::runtime_error("invalid multibinning input type"); }
axes_.push_back({context.input_index(input.GetString()), 0, std::move(dim_edges)});
idx++;
if ( dimension.IsArray() ) { // non-uniform binning
std::vector<double> dim_edges;
dim_edges.reserve(dimension.GetArray().Size());
for (const auto& item : dimension.GetArray()) {
double val = item.GetDouble();
if ( dim_edges.size() > 0 && dim_edges.back() >= val ) { throw std::runtime_error("binning edges are not monotone increasing"); }
dim_edges.push_back(val);
}
if ( ! input.IsString() ) { throw std::runtime_error("invalid multibinning input type"); }
axes_.push_back({context.input_index(input.GetString()), 0, _NonUniformBins(std::move(dim_edges))});
} else if ( dimension.IsObject() ) { // UniformBinning
const JSONObject uniformBins{dimension.GetObject()};
const auto n = uniformBins.getRequired<uint64_t>("n");
if ( n == 0 ) {
auto msg = "Error when processing MultiBinning: number of bins for dimension " + std::to_string(idx) + " is zero";
throw std::runtime_error(std::move(msg));
}
const auto low = uniformBins.getRequired<double>("low");
const auto high = uniformBins.getRequired<double>("high");
axes_.push_back({context.input_index(input.GetString()), 0, _UniformBins{n, low, high}});
} else {
auto msg = "Error when processing MultiBinning: edges for dimension " + std::to_string(idx) + " are neither an array nor a UniformBinning object";
throw std::runtime_error (std::move(msg));
}
++idx;
}

const auto& content = json.getRequired<rapidjson::Value::ConstArray>("content");
size_t stride {1};
--idx; // now corresponds to the last dimension
for (auto it=axes_.rbegin(); it != axes_.rend(); ++it) {
std::get<1>(*it) = stride;
stride *= std::get<2>(*it).size() - 1;
it->stride = stride;
stride *= nbins(idx);
--idx;
}
content_.reserve(content.Size() + 1); // + 1 for default value
for (const auto& item : content) {
Expand All @@ -429,39 +491,75 @@ MultiBinning::MultiBinning(const JSONObject& json, const Correction& context)
}
}

// TODO factor out logic in common with Binning::child.
// One notable difference is that MultiBinning stores the default value at the end of content_ instead of the beginning.
const Content& MultiBinning::child(const std::vector<Variable::Type>& values) const {
size_t idx {0};
for (const auto& [variableIdx, stride, edges] : axes_) {
size_t localidx {0};

for (const auto& [variableIdx, stride, edgesVariant] : axes_) {
double value = std::get<double>(values[variableIdx]);
auto it = std::upper_bound(std::begin(edges), std::end(edges), value);
if ( it == std::begin(edges) ) {
if ( flow_ == _FlowBehavior::value ) {
return *content_.rbegin();
}
else if ( flow_ == _FlowBehavior::error ) {
throw std::runtime_error("Index below bounds in MultiBinning for input argument " + std::to_string(variableIdx) + " val: " + std::to_string(value));
}
else { // clamp
it++;
}
}
else if ( it == std::end(edges) ) {
if ( flow_ == _FlowBehavior::value ) {
return *content_.rbegin();

if ( auto *bins = std::get_if<_UniformBins>(&edgesVariant) ) { // uniform bins
std::size_t binIdx = bins->n * ((value - bins->low) / (bins->high - bins->low));
if (value < bins->low || value >= bins->high) {
switch (flow_) {
case _FlowBehavior::value:
return content_.back(); // the default value
case _FlowBehavior::clamp:
binIdx = value < bins->low ? 0 : bins->n - 1; // assuming we always have at least 1 bin
break;
case _FlowBehavior::error:
const std::string belowOrAbove = value < bins->low ? "below" : "above";
const auto msg = "Index " + belowOrAbove + " bounds in MultiBinning for input argument " + std::to_string(variableIdx) + " value: " + std::to_string(value);
throw std::runtime_error(std::move(msg));
}
}
else if ( flow_ == _FlowBehavior::error ) {
throw std::runtime_error("Index above bounds in MultiBinning input argument" + std::to_string(variableIdx) + " val: " + std::to_string(value));
localidx = binIdx;
} else { // non-uniform bins
const auto edges = std::get<_NonUniformBins>(edgesVariant);
auto it = std::upper_bound(std::begin(edges), std::end(edges), value);
if ( it == std::begin(edges) ) {
if ( flow_ == _FlowBehavior::value ) {
return *content_.rbegin();
}
else if ( flow_ == _FlowBehavior::error ) {
throw std::runtime_error("Index below bounds in MultiBinning for input argument " + std::to_string(variableIdx) + " val: " + std::to_string(value));
}
else { // clamp
it++;
}
}
else { // clamp
it--;
else if ( it == std::end(edges) ) {
if ( flow_ == _FlowBehavior::value ) {
return content_.back();
}
else if ( flow_ == _FlowBehavior::error ) {
throw std::runtime_error("Index above bounds in MultiBinning input argument" + std::to_string(variableIdx) + " val: " + std::to_string(value));
}
else { // clamp
it--;
}
}
localidx = std::distance(std::begin(edges), it) - 1;
}
size_t localidx = std::distance(std::begin(edges), it) - 1;

idx += localidx * stride;
}

return content_.at(idx);
}

size_t MultiBinning::nbins(size_t dimension) const
{
if ( const auto *bins = std::get_if<_UniformBins>(&axes_[dimension].bins) )
return bins->n; // using uniform bins

// otherwise we must have non-uniform bins
const auto &bins = std::get<_NonUniformBins>(axes_[dimension].bins);
return bins.size() - 1;
}

Category::Category(const JSONObject& json, const Correction& context)
{
variableIdx_ = context.input_index(json.getRequired<std::string_view>("input"));
Expand Down
Loading