Skip to content

Commit

Permalink
apacheGH-41329: [C++][Gandiva] Fix gandiva cache size env var (apache…
Browse files Browse the repository at this point in the history
…#41330)

### Rationale for this change

Gandiva cache size validity checks are not robust enough (the negativity test is broken), and they are not currently tested.

### What changes are included in this PR?

1. Fix checking gandiva cache size env var.
2. Make cache size static so it only gets evaluated once.
3. Add test cases.
4. Enrich the description in the document about this env var.

### Are these changes tested?

UT included.

### Are there any user-facing changes?

None.

* GitHub Issue: apache#41329

Lead-authored-by: Ruoxi Sun <[email protected]>
Co-authored-by: Rossi Sun <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
  • Loading branch information
2 people authored and vibhatha committed May 25, 2024
1 parent 088d7b0 commit 51173ec
Show file tree
Hide file tree
Showing 4 changed files with 116 additions and 18 deletions.
47 changes: 31 additions & 16 deletions cpp/src/gandiva/cache.cc
Original file line number Diff line number Diff line change
Expand Up @@ -20,26 +20,41 @@
#include "arrow/result.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"
#include "arrow/util/value_parsing.h"

namespace gandiva {

static const size_t DEFAULT_CACHE_SIZE = 5000;

int GetCapacity() {
size_t capacity = DEFAULT_CACHE_SIZE;
auto maybe_env_cache_size = ::arrow::internal::GetEnvVar("GANDIVA_CACHE_SIZE");
if (maybe_env_cache_size.ok()) {
const auto env_cache_size = *std::move(maybe_env_cache_size);
if (!env_cache_size.empty()) {
capacity = std::atol(env_cache_size.c_str());
if (capacity <= 0) {
ARROW_LOG(WARNING) << "Invalid cache size provided in GANDIVA_CACHE_SIZE. "
<< "Using default cache size: " << DEFAULT_CACHE_SIZE;
capacity = DEFAULT_CACHE_SIZE;
}
}
constexpr auto kCacheCapacityEnvVar = "GANDIVA_CACHE_SIZE";
constexpr auto kDefaultCacheSize = 5000;

namespace internal {
int GetCacheCapacityFromEnvVar() {
auto maybe_env_value = ::arrow::internal::GetEnvVar(kCacheCapacityEnvVar);
if (!maybe_env_value.ok()) {
return kDefaultCacheSize;
}
const auto env_value = *std::move(maybe_env_value);
if (env_value.empty()) {
return kDefaultCacheSize;
}
int capacity = 0;
bool ok = ::arrow::internal::ParseValue<::arrow::Int32Type>(
env_value.c_str(), env_value.size(), &capacity);
if (!ok || capacity <= 0) {
ARROW_LOG(WARNING) << "Invalid cache size provided in " << kCacheCapacityEnvVar
<< ". Using default cache size: " << kDefaultCacheSize;
return kDefaultCacheSize;
}
return static_cast<int>(capacity);
return capacity;
}
} // namespace internal

// Deprecated in 17.0.0. Use GetCacheCapacity instead.
int GetCapacity() { return GetCacheCapacity(); }

int GetCacheCapacity() {
static const int capacity = internal::GetCacheCapacityFromEnvVar();
return capacity;
}

void LogCacheSize(size_t capacity) {
Expand Down
15 changes: 14 additions & 1 deletion cpp/src/gandiva/cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,27 @@
#include <cstdlib>
#include <mutex>

#include "arrow/util/macros.h"
#include "gandiva/lru_cache.h"
#include "gandiva/visibility.h"

namespace gandiva {

namespace internal {
// Only called once by GetCacheCapacity().
// Do the actual work of getting the cache capacity from env var.
// Also makes the testing easier.
GANDIVA_EXPORT
int GetCacheCapacityFromEnvVar();
} // namespace internal

ARROW_DEPRECATED("Deprecated in 17.0.0. Use GetCacheCapacity instead.")
GANDIVA_EXPORT
int GetCapacity();

GANDIVA_EXPORT
int GetCacheCapacity();

GANDIVA_EXPORT
void LogCacheSize(size_t capacity);

Expand All @@ -36,7 +49,7 @@ class Cache {
public:
explicit Cache(size_t capacity) : cache_(capacity) { LogCacheSize(capacity); }

Cache() : Cache(GetCapacity()) {}
Cache() : Cache(GetCacheCapacity()) {}

ValueType GetObjectCode(const KeyType& cache_key) {
std::optional<ValueType> result;
Expand Down
68 changes: 67 additions & 1 deletion cpp/src/gandiva/cache_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,14 @@
// under the License.

#include "gandiva/cache.h"
#include "arrow/testing/gtest_util.h"
#include "arrow/util/io_util.h"
#include "arrow/util/logging.h"

#include <gtest/gtest.h>

namespace gandiva {

class TestCacheKey {
public:
explicit TestCacheKey(int value) : value_(value) {}
Expand All @@ -38,5 +42,67 @@ TEST(TestCache, TestGetPut) {
ASSERT_EQ(cache.GetObjectCode(TestCacheKey(2)), "world");
}

TEST(TestCache, TestGetCacheCapacity) { ASSERT_EQ(GetCapacity(), 5000); }
namespace {
constexpr auto cache_capacity_env_var = "GANDIVA_CACHE_SIZE";
constexpr auto default_cache_capacity = 5000;
} // namespace

TEST(TestCache, TestGetCacheCapacityDefault) {
ASSERT_EQ(GetCacheCapacity(), default_cache_capacity);
}

TEST(TestCache, TestGetCacheCapacityEnvVar) {
using ::arrow::EnvVarGuard;

// Empty.
{
EnvVarGuard guard(cache_capacity_env_var, "");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Non-number.
{
EnvVarGuard guard(cache_capacity_env_var, "invalid");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Number with invalid suffix.
{
EnvVarGuard guard(cache_capacity_env_var, "42MB");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Valid positive number.
{
EnvVarGuard guard(cache_capacity_env_var, "42");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), 42);
}

// Int max.
{
auto str = std::to_string(std::numeric_limits<int>::max());
EnvVarGuard guard(cache_capacity_env_var, str.c_str());
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), std::numeric_limits<int>::max());
}

// Zero.
{
EnvVarGuard guard(cache_capacity_env_var, "0");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Negative number.
{
EnvVarGuard guard(cache_capacity_env_var, "-1");
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}

// Over int max.
{
auto str = std::to_string(static_cast<int64_t>(std::numeric_limits<int>::max()) + 1);
EnvVarGuard guard(cache_capacity_env_var, str.c_str());
ASSERT_EQ(internal::GetCacheCapacityFromEnvVar(), default_cache_capacity);
}
}

} // namespace gandiva
4 changes: 4 additions & 0 deletions docs/source/cpp/env_vars.rst
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,10 @@ that changing their value later will have an effect.
The number of entries to keep in the Gandiva JIT compilation cache.
The cache is in-memory and does not persist across processes.

The default cache size is 5000. The value of this environment variable
should be a positive integer and should not exceed the maximum value
of int32. Otherwise the default value is used.

.. envvar:: HADOOP_HOME

The path to the Hadoop installation.
Expand Down

0 comments on commit 51173ec

Please sign in to comment.