diff --git a/docs/root/configuration/observability/statistics.rst b/docs/root/configuration/observability/statistics.rst index 0042051e0acb..376263f42bb4 100644 --- a/docs/root/configuration/observability/statistics.rst +++ b/docs/root/configuration/observability/statistics.rst @@ -16,7 +16,7 @@ Server related statistics are rooted at *server.* with following statistics: uptime, Gauge, Current server uptime in seconds concurrency, Gauge, Number of worker threads - memory_allocated, Gauge, Current amount of allocated memory in bytes. Total of both new and old Envoy processes on hot restart. + memory_allocated, Gauge, Current amount of allocated memory in bytes. Total of both new and old Envoy processes on hot restart. memory_heap_size, Gauge, Current reserved heap size in bytes. New Envoy process heap size on hot restart. live, Gauge, "1 if the server is not currently draining, 0 otherwise" state, Gauge, Current :ref:`State ` of the Server. @@ -30,6 +30,8 @@ Server related statistics are rooted at *server.* with following statistics: static_unknown_fields, Counter, Number of messages in static configuration with unknown fields dynamic_unknown_fields, Counter, Number of messages in dynamic configuration with unknown fields +.. _filesystem_stats: + File system ----------- @@ -40,7 +42,8 @@ Statistics related to file system are emitted in the *filesystem.* namespace. :widths: 1, 1, 2 write_buffered, Counter, Total number of times file data is moved to Envoy's internal flush buffer - write_completed, Counter, Total number of times a file was written + write_completed, Counter, Total number of times a file was successfully written + write_failed, Counter, Total number of times an error occurred during a file write operation flushed_by_timer, Counter, Total number of times internal flush buffers are written to a file due to flush timeout reopen_failed, Counter, Total number of times a file was failed to be opened write_total_buffered, Gauge, Current total size of internal flush buffer in bytes diff --git a/docs/root/intro/version_history.rst b/docs/root/intro/version_history.rst index 63fb953c060a..5152e89fdc65 100644 --- a/docs/root/intro/version_history.rst +++ b/docs/root/intro/version_history.rst @@ -5,6 +5,7 @@ Version history ================ * access log: added :ref:`buffering ` and :ref:`periodical flushing ` support to gRPC access logger. Defaults to 16KB buffer and flushing every 1 second. * access log: gRPC Access Log Service (ALS) support added for :ref:`TCP access logs `. +* access log: reintroduce :ref:`filesystem ` stats and added the `write_failed` counter to track failed log writes * admin: added ability to configure listener :ref:`socket options `. * admin: added config dump support for Secret Discovery Service :ref:`SecretConfigDump `. * api: added ::ref:`set_node_on_first_message_only ` option to omit the node identifier from the subsequent discovery requests on the same stream. diff --git a/source/common/access_log/access_log_manager_impl.cc b/source/common/access_log/access_log_manager_impl.cc index cf5a98daaf47..fd576b9b00e1 100644 --- a/source/common/access_log/access_log_manager_impl.cc +++ b/source/common/access_log/access_log_manager_impl.cc @@ -100,8 +100,12 @@ void AccessLogFileImpl::doWrite(Buffer::Instance& buffer) { for (const Buffer::RawSlice& slice : slices) { absl::string_view data(static_cast(slice.mem_), slice.len_); const Api::IoCallSizeResult result = file_->write(data); - ASSERT(result.rc_ == static_cast(slice.len_)); - stats_.write_completed_.inc(); + if (result.ok() && result.rc_ == static_cast(slice.len_)) { + stats_.write_completed_.inc(); + } else { + // Probably disk full. + stats_.write_failed_.inc(); + } } } diff --git a/source/common/access_log/access_log_manager_impl.h b/source/common/access_log/access_log_manager_impl.h index 03efbf2d030b..64c9c2594b49 100644 --- a/source/common/access_log/access_log_manager_impl.h +++ b/source/common/access_log/access_log_manager_impl.h @@ -20,6 +20,7 @@ namespace Envoy { COUNTER(reopen_failed) \ COUNTER(write_buffered) \ COUNTER(write_completed) \ + COUNTER(write_failed) \ GAUGE(write_total_buffered, Accumulate) struct AccessLogFileStats { @@ -34,9 +35,9 @@ class AccessLogManagerImpl : public AccessLogManager { Event::Dispatcher& dispatcher, Thread::BasicLockable& lock, Stats::Store& stats_store) : file_flush_interval_msec_(file_flush_interval_msec), api_(api), dispatcher_(dispatcher), - lock_(lock), file_stats_{ACCESS_LOG_FILE_STATS( - POOL_COUNTER_PREFIX(stats_store, "access_log_file."), - POOL_GAUGE_PREFIX(stats_store, "access_log_file."))} {} + lock_(lock), file_stats_{ + ACCESS_LOG_FILE_STATS(POOL_COUNTER_PREFIX(stats_store, "filesystem."), + POOL_GAUGE_PREFIX(stats_store, "filesystem."))} {} // AccessLog::AccessLogManager void reopen() override; diff --git a/test/common/access_log/access_log_manager_impl_test.cc b/test/common/access_log/access_log_manager_impl_test.cc index d8d9446d856f..3b28ee6526a2 100644 --- a/test/common/access_log/access_log_manager_impl_test.cc +++ b/test/common/access_log/access_log_manager_impl_test.cc @@ -8,6 +8,8 @@ #include "test/mocks/api/mocks.h" #include "test/mocks/event/mocks.h" #include "test/mocks/filesystem/mocks.h" +#include "test/test_common/test_time.h" +#include "test/test_common/utility.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -37,6 +39,14 @@ class AccessLogManagerImplTest : public testing::Test { EXPECT_CALL(api_, threadFactory()).WillRepeatedly(ReturnRef(thread_factory_)); } + void waitForCounterEq(const std::string& name, uint64_t value) { + TestUtility::waitForCounterEq(store_, name, value, time_system_); + } + + void waitForGaugeEq(const std::string& name, uint64_t value) { + TestUtility::waitForGaugeEq(store_, name, value, time_system_); + } + NiceMock api_; NiceMock file_system_; NiceMock* file_; @@ -46,6 +56,7 @@ class AccessLogManagerImplTest : public testing::Test { NiceMock dispatcher_; Thread::MutexBasicLockable lock_; AccessLogManagerImpl access_log_manager_; + Event::TestRealTimeSystem time_system_; }; TEST_F(AccessLogManagerImplTest, BadFile) { @@ -74,9 +85,18 @@ TEST_F(AccessLogManagerImplTest, flushToLogFilePeriodically) { EXPECT_CALL(*file_, open_(_)).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); AccessLogFileSharedPtr log_file = access_log_manager_.createAccessLog("foo"); + EXPECT_EQ(0UL, store_.counter("filesystem.write_failed").value()); + EXPECT_EQ(0UL, store_.counter("filesystem.write_completed").value()); + EXPECT_EQ(0UL, store_.counter("filesystem.flushed_by_timer").value()); + EXPECT_EQ(0UL, store_.counter("filesystem.write_buffered").value()); + EXPECT_CALL(*timer, enableTimer(timeout_40ms_, _)); EXPECT_CALL(*file_, write_(_)) - .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { + .WillOnce(Invoke([&](absl::string_view data) -> Api::IoCallSizeResult { + EXPECT_EQ( + 4UL, + store_.gauge("filesystem.write_total_buffered", Stats::Gauge::ImportMode::Accumulate) + .value()); EXPECT_EQ(0, data.compare("test")); return Filesystem::resultSuccess(static_cast(data.length())); })); @@ -90,14 +110,25 @@ TEST_F(AccessLogManagerImplTest, flushToLogFilePeriodically) { } } + waitForCounterEq("filesystem.write_completed", 1); + EXPECT_EQ(1UL, store_.counter("filesystem.write_buffered").value()); + EXPECT_EQ(0UL, store_.counter("filesystem.flushed_by_timer").value()); + waitForGaugeEq("filesystem.write_total_buffered", 0); + EXPECT_CALL(*file_, write_(_)) - .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { + .WillOnce(Invoke([&](absl::string_view data) -> Api::IoCallSizeResult { + EXPECT_EQ( + 5UL, + store_.gauge("filesystem.write_total_buffered", Stats::Gauge::ImportMode::Accumulate) + .value()); EXPECT_EQ(0, data.compare("test2")); return Filesystem::resultSuccess(static_cast(data.length())); })); - // make sure timer is re-enabled on callback call log_file->write("test2"); + EXPECT_EQ(2UL, store_.counter("filesystem.write_buffered").value()); + + // make sure timer is re-enabled on callback call EXPECT_CALL(*timer, enableTimer(timeout_40ms_, _)); timer->invokeCallback(); @@ -107,6 +138,13 @@ TEST_F(AccessLogManagerImplTest, flushToLogFilePeriodically) { file_->write_event_.wait(file_->write_mutex_); } } + + waitForCounterEq("filesystem.write_completed", 2); + EXPECT_EQ(0UL, store_.counter("filesystem.write_failed").value()); + EXPECT_EQ(1UL, store_.counter("filesystem.flushed_by_timer").value()); + EXPECT_EQ(2UL, store_.counter("filesystem.write_buffered").value()); + waitForGaugeEq("filesystem.write_total_buffered", 0); + EXPECT_CALL(*file_, close_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); } @@ -116,21 +154,24 @@ TEST_F(AccessLogManagerImplTest, flushToLogFileOnDemand) { EXPECT_CALL(*file_, open_(_)).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); AccessLogFileSharedPtr log_file = access_log_manager_.createAccessLog("foo"); + EXPECT_EQ(0UL, store_.counter("filesystem.flushed_by_timer").value()); + EXPECT_CALL(*timer, enableTimer(timeout_40ms_, _)); - // The first write to a given file will start the flush thread, which can flush - // immediately (race on whether it will or not). So do a write and flush to - // get that state out of the way, then test that small writes don't trigger a flush. + // The first write to a given file will start the flush thread. Because AccessManagerImpl::write + // holds the write_lock_ when the thread is started, the thread will flush on its first loop, once + // it obtains the write_lock_. Perform a write to get all that out of the way. EXPECT_CALL(*file_, write_(_)) .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { return Filesystem::resultSuccess(static_cast(data.length())); })); log_file->write("prime-it"); - log_file->flush(); uint32_t expected_writes = 1; { Thread::LockGuard lock(file_->write_mutex_); - EXPECT_EQ(expected_writes, file_->num_writes_); + while (file_->num_writes_ != expected_writes) { + file_->write_event_.wait(file_->write_mutex_); + } } EXPECT_CALL(*file_, write_(_)) @@ -150,9 +191,14 @@ TEST_F(AccessLogManagerImplTest, flushToLogFileOnDemand) { expected_writes++; { Thread::LockGuard lock(file_->write_mutex_); - EXPECT_EQ(expected_writes, file_->num_writes_); + while (file_->num_writes_ != expected_writes) { + file_->write_event_.wait(file_->write_mutex_); + } } + waitForCounterEq("filesystem.write_completed", 2); + EXPECT_EQ(0UL, store_.counter("filesystem.flushed_by_timer").value()); + EXPECT_CALL(*file_, write_(_)) .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { EXPECT_EQ(0, data.compare("test2")); @@ -174,6 +220,36 @@ TEST_F(AccessLogManagerImplTest, flushToLogFileOnDemand) { EXPECT_CALL(*file_, close_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); } +TEST_F(AccessLogManagerImplTest, flushCountsIOErrors) { + NiceMock* timer = new NiceMock(&dispatcher_); + + EXPECT_CALL(*file_, open_(_)).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); + AccessLogFileSharedPtr log_file = access_log_manager_.createAccessLog("foo"); + + EXPECT_EQ(0UL, store_.counter("filesystem.write_failed").value()); + + EXPECT_CALL(*timer, enableTimer(timeout_40ms_, _)); + EXPECT_CALL(*file_, write_(_)) + .WillOnce(Invoke([](absl::string_view data) -> Api::IoCallSizeResult { + EXPECT_EQ(0, data.compare("test")); + return Filesystem::resultFailure(2UL, ENOSPC); + })); + + log_file->write("test"); + + { + Thread::LockGuard lock(file_->write_mutex_); + while (file_->num_writes_ != 1) { + file_->write_event_.wait(file_->write_mutex_); + } + } + + waitForCounterEq("filesystem.write_failed", 1); + EXPECT_EQ(0UL, store_.counter("filesystem.write_completed").value()); + + EXPECT_CALL(*file_, close_()).WillOnce(Return(ByMove(Filesystem::resultSuccess(true)))); +} + TEST_F(AccessLogManagerImplTest, reopenFile) { NiceMock* timer = new NiceMock(&dispatcher_); @@ -274,6 +350,8 @@ TEST_F(AccessLogManagerImplTest, reopenThrows) { // write call should not cause any exceptions log_file->write("random data"); timer->invokeCallback(); + + waitForCounterEq("filesystem.reopen_failed", 1); } TEST_F(AccessLogManagerImplTest, bigDataChunkShouldBeFlushedWithoutTimer) { diff --git a/test/exe/main_common_test.cc b/test/exe/main_common_test.cc index 4dadcb31875f..7af5aee32c6a 100644 --- a/test/exe/main_common_test.cc +++ b/test/exe/main_common_test.cc @@ -240,7 +240,7 @@ class AdminRequestTest : public MainCommonTest { TEST_P(AdminRequestTest, AdminRequestGetStatsAndQuit) { startEnvoy(); started_.WaitForNotification(); - EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("access_log_file.reopen_failed")); + EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("filesystem.reopen_failed")); adminRequest("/quitquitquit", "POST"); EXPECT_TRUE(waitForEnvoyToExit()); } @@ -253,7 +253,7 @@ TEST_P(AdminRequestTest, AdminRequestGetStatsAndKill) { // TODO(htuch): Remove when https://github.com/libevent/libevent/issues/779 is // fixed, started_ will then become our real synchronization point. waitForEnvoyRun(); - EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("access_log_file.reopen_failed")); + EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("filesystem.reopen_failed")); kill(getpid(), SIGTERM); EXPECT_TRUE(waitForEnvoyToExit()); } @@ -266,7 +266,7 @@ TEST_P(AdminRequestTest, AdminRequestGetStatsAndCtrlC) { // TODO(htuch): Remove when https://github.com/libevent/libevent/issues/779 is // fixed, started_ will then become our real synchronization point. waitForEnvoyRun(); - EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("access_log_file.reopen_failed")); + EXPECT_THAT(adminRequest("/stats", "GET"), HasSubstr("filesystem.reopen_failed")); kill(getpid(), SIGINT); EXPECT_TRUE(waitForEnvoyToExit()); } @@ -335,7 +335,7 @@ TEST_P(AdminRequestTest, AdminRequestBeforeRun) { EXPECT_TRUE(admin_handler_was_called); // This just checks that some stat output was reported. We could pick any stat. - EXPECT_THAT(out, HasSubstr("access_log_file.reopen_failed")); + EXPECT_THAT(out, HasSubstr("filesystem.reopen_failed")); } // Class to track whether an object has been destroyed, which it does by bumping an atomic.