Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

policy: Add stats #995

Merged
merged 1 commit into from
Nov 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cilium/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ envoy_cc_library(
"@envoy//source/common/local_info:local_info_lib",
"@envoy//source/common/network:address_lib",
"@envoy//source/common/router:config_utility_lib",
"@envoy//source/common/stats:timespan_lib",
"@envoy//source/common/tls:context_config_lib",
"@envoy//source/server:transport_socket_config_lib",
"@envoy_api//envoy/type/matcher/v3:pkg_cc_proto",
Expand Down
6 changes: 6 additions & 0 deletions cilium/api/bpf_metadata.proto
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ option go_package = "github.com/cilium/proxy/go/cilium/api;cilium";

package cilium;

import "google/protobuf/duration.proto";

message BpfMetadata {
// File system root for bpf. Defaults to "/sys/fs/bpf" if left empty.
string bpf_root = 1;
Expand Down Expand Up @@ -44,4 +46,8 @@ message BpfMetadata {
// proxy_id is passed to access log messages and allows relating access log messages to
// listeners.
uint32 proxy_id = 8;

// policy_update_warning_limit is the time in milliseconds after which a warning is logged if
// network policy update took longer
google.protobuf.Duration policy_update_warning_limit = 9;
}
19 changes: 14 additions & 5 deletions cilium/bpf_metadata.cc
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,13 @@ createHostMap(Server::Configuration::ListenerFactoryContext& context) {
}

std::shared_ptr<const Cilium::NetworkPolicyMap>
createPolicyMap(Server::Configuration::FactoryContext& context, Cilium::CtMapSharedPtr& ct) {
createPolicyMap(Server::Configuration::FactoryContext& context, Cilium::CtMapSharedPtr& ct,
std::chrono::milliseconds policy_update_warning_limit_ms) {
return context.serverFactoryContext().singletonManager().getTyped<const Cilium::NetworkPolicyMap>(
SINGLETON_MANAGER_REGISTERED_NAME(cilium_network_policy), [&context, &ct] {
auto map = std::make_shared<Cilium::NetworkPolicyMap>(context, ct);
SINGLETON_MANAGER_REGISTERED_NAME(cilium_network_policy),
[&context, &ct, &policy_update_warning_limit_ms] {
auto map =
std::make_shared<Cilium::NetworkPolicyMap>(context, ct, policy_update_warning_limit_ms);
map->startSubscription();
return map;
});
Expand All @@ -113,7 +116,13 @@ Config::Config(const ::cilium::BpfMetadata& config,
Network::Utility::parseInternetAddressNoThrow(config.ipv4_source_address())),
ipv6_source_address_(
Network::Utility::parseInternetAddressNoThrow(config.ipv6_source_address())),
enforce_policy_on_l7lb_(config.enforce_policy_on_l7lb()) {
enforce_policy_on_l7lb_(config.enforce_policy_on_l7lb()),
policy_update_warning_limit_ms_(std::chrono::milliseconds(100)) {
const uint64_t limit = DurationUtil::durationToMilliseconds(config.policy_update_warning_limit());
if (limit > 0) {
policy_update_warning_limit_ms_ = std::chrono::milliseconds(limit);
}

if (is_l7lb_ && is_ingress_) {
throw EnvoyException("cilium.bpf_metadata: is_l7lb may not be set with is_ingress");
}
Expand Down Expand Up @@ -155,7 +164,7 @@ Config::Config(const ::cilium::BpfMetadata& config,
// Get the shared policy provider, or create it if not already created.
// Note that the API config source is assumed to be the same for all filter
// instances!
npmap_ = createPolicyMap(context, ct_maps_);
npmap_ = createPolicyMap(context, ct_maps_, policy_update_warning_limit_ms_);
}
}

Expand Down
1 change: 1 addition & 0 deletions cilium/bpf_metadata.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ class Config : public Cilium::PolicyResolver,
Cilium::CtMapSharedPtr ct_maps_{};
Cilium::IPCacheSharedPtr ipcache_{};
std::shared_ptr<const Cilium::PolicyHostMap> hosts_{};
std::chrono::milliseconds policy_update_warning_limit_ms_;

private:
uint32_t resolveSourceIdentity(const PolicyInstanceConstSharedPtr policy,
Expand Down
11 changes: 7 additions & 4 deletions cilium/l7policy.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ namespace Cilium {
* All Cilium L7 filter stats. @see stats_macros.h
*/
// clang-format off
#define ALL_CILIUM_STATS(COUNTER) \
COUNTER(access_denied) \
#define ALL_CILIUM_STATS(COUNTER) \
mhofstetter marked this conversation as resolved.
Show resolved Hide resolved
COUNTER(access_denied)
// clang-format on

/**
Expand All @@ -37,7 +37,8 @@ class Config : public Logger::Loggable<Logger::Id::filter> {
public:
Config(const std::string& access_log_path, const std::string& denied_403_body,
TimeSource& time_source, Stats::Scope& scope, bool is_upstream);
Config(const ::cilium::L7Policy& config, TimeSource& time_source, Stats::Scope& scope, bool is_upstream);
Config(const ::cilium::L7Policy& config, TimeSource& time_source, Stats::Scope& scope,
bool is_upstream);

void Log(AccessLog::Entry&, ::cilium::EntryType);

Expand All @@ -55,7 +56,9 @@ typedef std::shared_ptr<Config> ConfigSharedPtr;
// Each request gets their own instance of this filter, and
// they can run parallel from multiple worker threads, all accessing
// the shared configuration.
class AccessFilter : public Http::StreamFilter, Logger::Loggable<Logger::Id::filter>, public Http::UpstreamCallbacks {
class AccessFilter : public Http::StreamFilter,
Logger::Loggable<Logger::Id::filter>,
public Http::UpstreamCallbacks {
public:
AccessFilter(ConfigSharedPtr& config) : config_(config) {}

Expand Down
43 changes: 26 additions & 17 deletions cilium/network_policy.cc
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "source/common/init/watcher_impl.h"
#include "source/common/network/utility.h"
#include "source/common/protobuf/protobuf.h"
#include "source/common/stats/timespan_impl.h"

#include "absl/container/btree_set.h"
#include "absl/container/flat_hash_set.h"
Expand Down Expand Up @@ -1080,6 +1081,9 @@ NetworkPolicyMap::NetworkPolicyMap(Server::Configuration::FactoryContext& contex
local_ip_str_(context_.localInfo().address()->ip()->addressAsString()),
name_(fmt::format("cilium.policymap.{}.{}.", local_ip_str_, ++instance_id_)),
scope_(context_.serverScope().createScope(name_)),
stats_{ALL_CILIUM_POLICY_STATS(POOL_COUNTER_PREFIX(*scope_, "policy."),
jrajahalme marked this conversation as resolved.
Show resolved Hide resolved
POOL_HISTOGRAM_PREFIX(*scope_, "policy."))},
policy_update_warning_limit_ms_(100),
init_target_(fmt::format("Cilium Network Policy subscription start"),
[this]() { subscription_->start({}); }),
transport_factory_context_(
Expand All @@ -1106,9 +1110,11 @@ NetworkPolicyMap::NetworkPolicyMap(Server::Configuration::FactoryContext& contex

// This is used in production
NetworkPolicyMap::NetworkPolicyMap(Server::Configuration::FactoryContext& context,
Cilium::CtMapSharedPtr& ct)
Cilium::CtMapSharedPtr& ct,
std::chrono::milliseconds policy_update_warning_limit_ms)
: NetworkPolicyMap(context) {
ctmap_ = ct;
policy_update_warning_limit_ms_ = policy_update_warning_limit_ms;
}

// Both subscribe() call and subscription_->start() use
Expand Down Expand Up @@ -1185,6 +1191,9 @@ NetworkPolicyMap::onConfigUpdate(const std::vector<Envoy::Config::DecodedResourc
const std::string& version_info) {
ENVOY_LOG(debug, "NetworkPolicyMap::onConfigUpdate({}), {} resources, version: {}", name_,
resources.size(), version_info);
update_latency_ms_ = std::make_unique<Stats::HistogramCompletableTimespanImpl>(
stats_.update_latency_ms_, context_.timeSource());
stats_.updates_.inc();

absl::flat_hash_set<std::string> keeps;
absl::flat_hash_set<std::string> ct_maps_to_keep;
Expand Down Expand Up @@ -1231,7 +1240,8 @@ NetworkPolicyMap::onConfigUpdate(const std::vector<Envoy::Config::DecodedResourc
to_be_added->emplace_back(std::make_shared<PolicyInstanceImpl>(*this, new_hash, config));
}
} catch (const EnvoyException& e) {
ENVOY_LOG(debug, "NetworkPolicy update for version {} failed: {}", version_info, e.what());
ENVOY_LOG(warn, "NetworkPolicy update for version {} failed: {}", version_info, e.what());
stats_.updates_rejected_.inc();

// Allow main (Listener) init to continue after exceptions
init_target_.ready();
Expand Down Expand Up @@ -1313,9 +1323,6 @@ NetworkPolicyMap::onConfigUpdate(const std::vector<Envoy::Config::DecodedResourc
}
});

auto time_source = &context_.timeSource();
auto start_time = time_source->monotonicTime();

// Execute changes on all threads.
tls_map_.runOnAllThreads(
[to_be_added, to_be_deleted, version_info](OptRef<ThreadLocalPolicyMap> npmap) {
Expand All @@ -1333,18 +1340,7 @@ NetworkPolicyMap::onConfigUpdate(const std::vector<Envoy::Config::DecodedResourc
npmap->Update(*to_be_added, *to_be_deleted, version_info);
},
// All threads have executed updates, delete old cts and mark the local init target ready.
[shared_this, to_be_added, to_be_deleted, version_info, cts_to_be_closed, time_source,
start_time]() {
auto workers_done_time = time_source->monotonicTime();
auto duration =
std::chrono::duration_cast<std::chrono::milliseconds>(workers_done_time - start_time)
.count();
if (duration > 100) {
ENVOY_LOG(warn,
"Cilium L7 NetworkPolicyMap::onConfigUpdate(): Worker threads took longer "
"than 100ms to update network policy for version {} ({}ms)",
version_info, duration);
}
[shared_this, to_be_added, to_be_deleted, version_info, cts_to_be_closed]() {
// Update on the main thread last, so that deletes happen on the same thread as allocs
auto npmap = shared_this->tls_map_.get();
npmap->Update(*to_be_added, *to_be_deleted, version_info);
Expand All @@ -1353,6 +1349,19 @@ NetworkPolicyMap::onConfigUpdate(const std::vector<Envoy::Config::DecodedResourc
shared_this->ctmap_->closeMaps(cts_to_be_closed);
}
shared_this->version_init_target_->ready();

shared_this->update_latency_ms_->complete();
auto duration = shared_this->update_latency_ms_->elapsed();
shared_this->update_latency_ms_.reset();

if (duration > shared_this->policy_update_warning_limit_ms_) {
shared_this->stats_.updates_timed_out_.inc();
ENVOY_LOG(warn,
"Cilium L7 NetworkPolicyMap::onConfigUpdate(): Worker threads took longer "
"than {}ms to update network policy for version {} ({}ms)",
shared_this->policy_update_warning_limit_ms_.count(), version_info,
duration.count());
}
});
// Initialize SDS secrets, the watcher callback above will be called after all threads have
// updated policies and secrets have been fetched, or timeout (15 seconds) hits.
Expand Down
26 changes: 25 additions & 1 deletion cilium/network_policy.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
#include "envoy/http/header_map.h"
#include "envoy/server/filter_config.h"
#include "envoy/singleton/instance.h"
#include "envoy/stats/stats_macros.h"
#include "envoy/stats/timespan.h"
#include "envoy/thread_local/thread_local.h"

#include "source/common/common/logger.h"
Expand Down Expand Up @@ -163,13 +165,32 @@ class NetworkPolicyDecoder : public Envoy::Config::OpaqueResourceDecoder {
ProtobufMessage::ValidationVisitor& validation_visitor_;
};

/**
* All Cilium L7 filter stats. @see stats_macros.h
*/
// clang-format off
#define ALL_CILIUM_POLICY_STATS(COUNTER, HISTOGRAM) \
COUNTER(updates) \
COUNTER(updates_rejected) \
COUNTER(updates_timed_out) \
HISTOGRAM(update_latency_ms, Milliseconds)
// clang-format on

/**
* Struct definition for all policy stats. @see stats_macros.h
*/
struct PolicyStats {
ALL_CILIUM_POLICY_STATS(GENERATE_COUNTER_STRUCT, GENERATE_HISTOGRAM_STRUCT)
};

class NetworkPolicyMap : public Singleton::Instance,
public Envoy::Config::SubscriptionCallbacks,
public std::enable_shared_from_this<NetworkPolicyMap>,
public Logger::Loggable<Logger::Id::config> {
public:
NetworkPolicyMap(Server::Configuration::FactoryContext& context);
NetworkPolicyMap(Server::Configuration::FactoryContext& context, Cilium::CtMapSharedPtr& ct);
NetworkPolicyMap(Server::Configuration::FactoryContext& context, Cilium::CtMapSharedPtr& ct,
std::chrono::milliseconds policy_update_warning_limit_ms);
~NetworkPolicyMap() {
ENVOY_LOG(debug, "Cilium L7 NetworkPolicyMap({}): NetworkPolicyMap is deleted NOW!", name_);
}
Expand Down Expand Up @@ -232,6 +253,9 @@ class NetworkPolicyMap : public Singleton::Instance,
const std::string local_ip_str_;
std::string name_;
Stats::ScopeSharedPtr scope_;
PolicyStats stats_;
Stats::TimespanPtr update_latency_ms_;
std::chrono::milliseconds policy_update_warning_limit_ms_;

// init target which starts gRPC subscription
Init::TargetImpl init_target_;
Expand Down
Loading