From a5a191269eaac96877b81601230f1ee52bc386cb Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 14 Sep 2021 10:10:14 -0700 Subject: [PATCH 1/2] Revert "Revert "Route core worker ERROR/FATAL logs to driver logs (#18577)" (#18602)" This reverts commit 7f8cdce67d6494fa32fa9b6f014f4dc94b93ae73. --- python/ray/tests/test_output.py | 19 ++++++++++++++++ src/ray/common/ray_config_def.h | 3 +++ src/ray/core_worker/core_worker.cc | 8 +++---- src/ray/util/logging.cc | 35 +++++++++++++++++++----------- 4 files changed, 48 insertions(+), 17 deletions(-) diff --git a/python/ray/tests/test_output.py b/python/ray/tests/test_output.py index d813e85cbb65..f1159b5cc3cb 100644 --- a/python/ray/tests/test_output.py +++ b/python/ray/tests/test_output.py @@ -32,6 +32,25 @@ def foo(out_str, err_str): assert err_str.split("\n")[-2].endswith("def") +@pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") +def test_core_worker_error_message(): + script = """ +import ray +import sys + +ray.init(local_mode=True) + +# In local mode this generates an ERROR level log. +ray._private.utils.push_error_to_driver( + ray.worker.global_worker, "type", "Hello there") + """ + + proc = run_string_as_driver_nonblocking(script) + err_str = proc.stderr.read().decode("ascii") + + assert "Hello there" in err_str, err_str + + @pytest.mark.skipif(sys.platform == "win32", reason="Failing on Windows.") def test_disable_driver_logs_breakpoint(): script = """ diff --git a/src/ray/common/ray_config_def.h b/src/ray/common/ray_config_def.h index 47848f9a9bd5..3369739a838d 100644 --- a/src/ray/common/ray_config_def.h +++ b/src/ray/common/ray_config_def.h @@ -98,6 +98,9 @@ RAY_CONFIG(bool, preallocate_plasma_memory, false) /// then spread via weighted (by critical resource usage). RAY_CONFIG(bool, scheduler_hybrid_scheduling, true) +/// The fraction of resource utilization on a node after which the scheduler starts +/// to prefer spreading tasks to other nodes. This balances between locality and +/// even balancing of load. Low values (min 0.0) encourage more load spreading. RAY_CONFIG(float, scheduler_spread_threshold, getenv("RAY_SCHEDULER_SPREAD_THRESHOLD") != nullptr ? std::stof(getenv("RAY_SCHEDULER_SPREAD_THRESHOLD")) diff --git a/src/ray/core_worker/core_worker.cc b/src/ray/core_worker/core_worker.cc index 2e00b9d04245..f95db47fbad1 100644 --- a/src/ray/core_worker/core_worker.cc +++ b/src/ray/core_worker/core_worker.cc @@ -570,13 +570,13 @@ CoreWorker::CoreWorker(const CoreWorkerOptions &options, const WorkerID &worker_ // Retry after a delay to emulate the existing Raylet reconstruction // behaviour. TODO(ekl) backoff exponentially. uint32_t delay = RayConfig::instance().task_retry_delay_ms(); - RAY_LOG(ERROR) << "Will resubmit task after a " << delay - << "ms delay: " << spec.DebugString(); + RAY_LOG(INFO) << "Will resubmit task after a " << delay + << "ms delay: " << spec.DebugString(); absl::MutexLock lock(&mutex_); to_resubmit_.push_back(std::make_pair(current_time_ms() + delay, spec)); } else { - RAY_LOG(ERROR) << "Resubmitting task that produced lost plasma object: " - << spec.DebugString(); + RAY_LOG(INFO) << "Resubmitting task that produced lost plasma object: " + << spec.DebugString(); if (spec.IsActorTask()) { auto actor_handle = actor_manager_->GetActorHandle(spec.ActorId()); actor_handle->SetResubmittedActorTaskSpec(spec, spec.ActorDummyObject()); diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc index 3e0bf9c11547..00eb96b88aad 100644 --- a/src/ray/util/logging.cc +++ b/src/ray/util/logging.cc @@ -196,6 +196,10 @@ void RayLog::StartRayLog(const std::string &app_name, RayLogLevel severity_thres app_name_ = app_name; log_dir_ = log_dir; + // All the logging sinks to add. + std::vector sinks; + auto level = static_cast(severity_threshold_); + if (!log_dir_.empty()) { // Enable log file if log_dir_ is not empty. std::string dir_ends_with_slash = log_dir_; @@ -243,26 +247,31 @@ void RayLog::StartRayLog(const std::string &app_name, RayLogLevel severity_thres // logger. spdlog::drop(RayLog::GetLoggerName()); } - file_logger = spdlog::rotating_logger_mt( - RayLog::GetLoggerName(), + auto file_sink = std::make_shared( dir_ends_with_slash + app_name_without_path + "_" + std::to_string(pid) + ".log", log_rotation_max_size_, log_rotation_file_num_); - spdlog::set_default_logger(file_logger); + sinks.push_back(file_sink); } else { auto console_sink = std::make_shared(); console_sink->set_pattern(log_format_pattern_); - auto level = static_cast(severity_threshold_); console_sink->set_level(level); - - auto err_sink = std::make_shared(); - err_sink->set_pattern(log_format_pattern_); - err_sink->set_level(spdlog::level::err); - - auto logger = std::shared_ptr( - new spdlog::logger(RayLog::GetLoggerName(), {console_sink, err_sink})); - logger->set_level(level); - spdlog::set_default_logger(logger); + sinks.push_back(console_sink); } + + // In all cases, log errors to the console log so they are in driver logs. + // https://github.com/ray-project/ray/issues/12893 + auto err_sink = std::make_shared(); + err_sink->set_pattern(log_format_pattern_); + err_sink->set_level(spdlog::level::err); + sinks.push_back(err_sink); + + // Set the combined logger. + auto logger = std::make_shared(RayLog::GetLoggerName(), sinks.begin(), + sinks.end()); + logger->set_level(level); + spdlog::set_pattern(log_format_pattern_); + spdlog::set_level(static_cast(severity_threshold_)); + spdlog::set_default_logger(logger); } void RayLog::UninstallSignalAction() { From b594bf5af898c9ea14cd76dbe6e672b80adc6b76 Mon Sep 17 00:00:00 2001 From: Eric Liang Date: Tue, 14 Sep 2021 11:08:04 -0700 Subject: [PATCH 2/2] set pattern --- src/ray/util/logging.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ray/util/logging.cc b/src/ray/util/logging.cc index 00eb96b88aad..7713551ae4f2 100644 --- a/src/ray/util/logging.cc +++ b/src/ray/util/logging.cc @@ -269,8 +269,9 @@ void RayLog::StartRayLog(const std::string &app_name, RayLogLevel severity_thres auto logger = std::make_shared(RayLog::GetLoggerName(), sinks.begin(), sinks.end()); logger->set_level(level); - spdlog::set_pattern(log_format_pattern_); + logger->set_pattern(log_format_pattern_); spdlog::set_level(static_cast(severity_threshold_)); + spdlog::set_pattern(log_format_pattern_); spdlog::set_default_logger(logger); }