From 3b1b5f8029e7505d10f3a80e50b22537f9561cfe Mon Sep 17 00:00:00 2001 From: Krzysztof Lecki Date: Thu, 11 Apr 2024 12:51:23 +0200 Subject: [PATCH] Extend context and name propagation in errors (#5396) Add error context related to a particular operator during graph construction and pipeline build: * error propagation in Pipeline::Build() is adjusted * Python calls to backend: Pipeline::AddOperator now augment errors with context. * OpSpec building errors propagate the proper operator name. Replace most of the SchemaName() occurrences that were used to indicate the operator name with the fully formatted operator name in the correct API. Make sure that the naming information is added as soon as possible, so it can be accessed in partially constructed schema with the correct values present. Note: This PR mostly adds the context like: ``` Error in operator `nvidia.dali.fn.operator_name`, which was used in the pipeline definition with the following traceback: encountered: ``` to places where it was not previously used, but we are processing a single operator. Error messages are adjusted to show the user-facing input/output/argument name (in uniform way) rather than the internal one. Otherwise the checks and messages are preserved. The types of error messages are not adjusted. 
Signed-off-by: Krzysztof Lecki --- dali/operators/generic/reshape.cc | 11 +- dali/operators/generic/reshape.h | 4 - dali/operators/reader/reader_op.h | 5 +- dali/pipeline/graph/graph_descr.cc | 38 ++-- .../operator/builtin/external_source.h | 6 +- dali/pipeline/operator/error_reporting.cc | 5 +- dali/pipeline/operator/error_reporting.h | 5 +- dali/pipeline/operator/name_utils.cc | 18 +- dali/pipeline/operator/name_utils.h | 29 +++ dali/pipeline/operator/op_spec.cc | 61 ++++++- dali/pipeline/operator/op_spec.h | 45 +---- dali/pipeline/operator/operator.h | 14 +- dali/pipeline/pipeline.cc | 167 +++++++++--------- dali/pipeline/pipeline.h | 11 +- dali/pipeline/pipeline_test.cc | 2 +- dali/python/nvidia/dali/ops/__init__.py | 6 + .../conditionals/test_logical_expressions.py | 4 +- .../test_pipeline_conditionals.py | 4 +- .../operator_1/test_operator_exception.py | 26 +++ dali/test/python/operator_1/test_slice.py | 4 +- dali/test/python/test_dali_cpu_only.py | 27 ++- 21 files changed, 295 insertions(+), 197 deletions(-) diff --git a/dali/operators/generic/reshape.cc b/dali/operators/generic/reshape.cc index 66778daf215..9d9b4e4b541 100644 --- a/dali/operators/generic/reshape.cc +++ b/dali/operators/generic/reshape.cc @@ -1,4 +1,4 @@ -// Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -21,6 +21,7 @@ #include "dali/core/tensor_shape_print.h" #include "dali/operators/generic/reshape.h" #include "dali/pipeline/data/views.h" +#include "dali/pipeline/operator/name_utils.h" namespace dali { @@ -135,11 +136,11 @@ Reshape::Reshape(const OpSpec &spec) : Base(spec) { && !has_src_dims_arg) { bool can_have_dtype = spec.GetSchema().HasArgument("dtype"); if (can_have_dtype) { - DALI_ENFORCE(output_type_arg_ != DALI_NO_TYPE, make_string(OpName(), - " is no-op: arguments specify neither new shape, layout nor type.")); + DALI_ENFORCE(output_type_arg_ != DALI_NO_TYPE, make_string("`", GetOpDisplayName(spec, true), + "` is no-op: arguments specify neither new shape, layout nor type.")); } else { - DALI_FAIL(make_string(OpName(), - " is no-op: arguments specify neither new shape nor layout.")); + DALI_FAIL(make_string("`", GetOpDisplayName(spec, true), + "` is no-op: arguments specify neither new shape nor layout.")); } } use_layout_ = has_layout_arg; diff --git a/dali/operators/generic/reshape.h b/dali/operators/generic/reshape.h index 6c5459e8fc9..6a9e696be92 100644 --- a/dali/operators/generic/reshape.h +++ b/dali/operators/generic/reshape.h @@ -63,10 +63,6 @@ class Reshape : public StatelessOperator { TensorLayout layout_; private: - inline const std::string &OpName() const { - return this->spec_.SchemaName(); - } - TensorListShape<> input_shape_; TensorShape<> uniform_shape_; std::vector rel_uniform_shape_; diff --git a/dali/operators/reader/reader_op.h b/dali/operators/reader/reader_op.h index dc8011e6986..a63134ab6cd 100644 --- a/dali/operators/reader/reader_op.h +++ b/dali/operators/reader/reader_op.h @@ -29,6 +29,7 @@ #include "dali/operators/reader/loader/loader.h" #include "dali/operators/reader/parser/parser.h" #include "dali/pipeline/operator/checkpointing/snapshot_serializer.h" +#include "dali/pipeline/operator/name_utils.h" #include "dali/pipeline/operator/operator.h" namespace dali { @@ -104,7 +105,7 @@ class DataReader : public Operator { void 
SaveState(OpCheckpoint &cpt, AccessOrder order) override { if constexpr (!supports_checkpointing) { - DALI_FAIL("The reader ", spec_.SchemaName(), " does not support checkpointing."); + DALI_FAIL("The reader `", GetOpDisplayName(spec_, true), "` does not support checkpointing."); } else { DALI_ENFORCE(checkpointing_, "Cannot save the checkpoint, because " @@ -116,7 +117,7 @@ class DataReader : public Operator { void RestoreState(const OpCheckpoint &cpt) override { if constexpr (!supports_checkpointing) { - DALI_FAIL("The reader ", spec_.SchemaName(), " does not support checkpointing."); + DALI_FAIL("The reader `", GetOpDisplayName(spec_, true), "` does not support checkpointing."); } else { DALI_ENFORCE(checkpointing_, "Cannot restore the checkpoint, because " diff --git a/dali/pipeline/graph/graph_descr.cc b/dali/pipeline/graph/graph_descr.cc index 0196590308b..aba22f37507 100644 --- a/dali/pipeline/graph/graph_descr.cc +++ b/dali/pipeline/graph/graph_descr.cc @@ -13,12 +13,14 @@ // limitations under the License. 
#include +#include #include #include "dali/core/error_handling.h" #include "dali/pipeline/graph/op_graph.h" #include "dali/pipeline/operator/error_reporting.h" +#include "dali/pipeline/operator/name_utils.h" #include "dali/pipeline/operator/op_schema.h" #include "dali/pipeline/operator/builtin/make_contiguous.h" @@ -55,19 +57,20 @@ void CheckOpConstraints(const OpSpec &spec) { const int additional_outputs = schema.CalculateAdditionalOutputs(spec); DALI_ENFORCE(schema.SupportsInPlace(spec) || !spec.GetArgument("inplace"), - "Op '" + spec.SchemaName() + "' does not support in-place execution."); - DALI_ENFORCE(spec.NumRegularInput() <= schema.MaxNumInput(), - "Operator '" + spec.SchemaName() + - "' supports a maximum of " + std::to_string(schema.MaxNumInput()) + " inputs, " - "but was passed " + std::to_string(spec.NumRegularInput()) + "."); - DALI_ENFORCE(spec.NumRegularInput() >= schema.MinNumInput(), - "Operator '" + spec.SchemaName() + - "' supports a minimum of " + std::to_string(schema.MinNumInput()) + " inputs, " - "but was passed " + std::to_string(spec.NumRegularInput()) + "."); + make_string("Operator `", GetOpDisplayName(spec, true), + "` does not support in-place execution.")); + DALI_ENFORCE( + spec.NumRegularInput() <= schema.MaxNumInput(), + make_string("Operator `", GetOpDisplayName(spec, true), "` supports a maximum of ", + schema.MaxNumInput(), " inputs, but was passed ", spec.NumRegularInput(), ".")); + DALI_ENFORCE( + spec.NumRegularInput() >= schema.MinNumInput(), + make_string("Operator `", GetOpDisplayName(spec, true), "` supports a minimum of ", + schema.MinNumInput(), " inputs, but was passed ", spec.NumRegularInput(), ".")); DALI_ENFORCE(spec.NumOutput() == schema.CalculateOutputs(spec) + additional_outputs, - "Operator '" + spec.SchemaName() + "' supports " - + std::to_string(schema.CalculateOutputs(spec) + additional_outputs) - + " outputs, but was passed " + std::to_string(spec.NumOutput()) + "."); + make_string("Operator `", 
GetOpDisplayName(spec, true), "` supports ", + schema.CalculateOutputs(spec) + additional_outputs, + " outputs, but was passed ", spec.NumOutput(), ".")); } OpType ParseOpType(const std::string &device) { @@ -207,17 +210,14 @@ void OpGraph::InstantiateOperators() { for (auto op_type : order) { for (auto op_id : op_partitions_[static_cast(op_type)]) { - std::exception_ptr eptr; try { op_nodes_[op_id].InstantiateOperator(); } catch (...) { - eptr = std::current_exception(); + PropagateError({std::current_exception(), + "Critical error when building pipeline:\n" + + GetErrorContextMessage(op_nodes_[op_id].spec), + "\nCurrent pipeline object is no longer valid."}); } - - PropagateError({eptr, - "Critical error when building pipeline:\n" + - GetErrorContextMessage(op_nodes_[op_id].spec), - "\nCurrent pipeline object is no longer valid."}); } } } diff --git a/dali/pipeline/operator/builtin/external_source.h b/dali/pipeline/operator/builtin/external_source.h index 1cdacfd7242..fc347876897 100644 --- a/dali/pipeline/operator/builtin/external_source.h +++ b/dali/pipeline/operator/builtin/external_source.h @@ -1,4 +1,4 @@ -// Copyright (c) 2017-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// Copyright (c) 2017-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
@@ -64,10 +64,6 @@ class ExternalSource : public InputOperator { virtual ~ExternalSource() = default; - inline string name() const override { - return "ExternalSource (" + output_name_ + ")"; - } - const TensorLayout& in_layout() const override { return layout_; } diff --git a/dali/pipeline/operator/error_reporting.cc b/dali/pipeline/operator/error_reporting.cc index 6f4e1d1a67c..351797c737c 100644 --- a/dali/pipeline/operator/error_reporting.cc +++ b/dali/pipeline/operator/error_reporting.cc @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -96,7 +97,7 @@ void PropagateError(ErrorInfo error) { } } -std::string GetErrorContextMessage(const OpSpec &spec) { +std::string GetErrorContextMessage(const OpSpec &spec, std::string_view message_name) { auto device = spec.GetArgument("device"); auto op_name = GetOpDisplayName(spec, true); std::transform(device.begin(), device.end(), device.begin(), ::toupper); @@ -109,7 +110,7 @@ std::string GetErrorContextMessage(const OpSpec &spec) { formatted_origin_stack + "\n") : " "; // we need space before "encountered" - return make_string("Error in ", device, " operator `", op_name, "`", + return make_string(message_name, " in ", device, " operator `", op_name, "`", optional_stack_mention, "encountered:\n\n"); } diff --git a/dali/pipeline/operator/error_reporting.h b/dali/pipeline/operator/error_reporting.h index d84bdf0f6f9..182b83773f2 100644 --- a/dali/pipeline/operator/error_reporting.h +++ b/dali/pipeline/operator/error_reporting.h @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -183,8 +184,10 @@ class DaliStopIteration : public DaliError { * * the origin stack trace of the operator within pipeline definition. * * It can be prepended to the original error message. 
+ * @param message_name Will be used as the prefix of the error message, for example: + * "Error in operator " or "Warning in operator " */ -std::string GetErrorContextMessage(const OpSpec &spec); +std::string GetErrorContextMessage(const OpSpec &spec, std::string_view message_name = "Error"); } // namespace dali diff --git a/dali/pipeline/operator/name_utils.cc b/dali/pipeline/operator/name_utils.cc index 7460478cbc2..ed5326e6f9e 100644 --- a/dali/pipeline/operator/name_utils.cc +++ b/dali/pipeline/operator/name_utils.cc @@ -23,7 +23,7 @@ namespace dali { std::string GetOpModule(const OpSpec &spec) { - return spec.GetArgument("_module");; + return spec.GetArgument("_module"); } std::string GetOpDisplayName(const OpSpec &spec, bool include_module_path) { @@ -39,4 +39,20 @@ std::string GetOpDisplayName(const OpSpec &spec, bool include_module_path) { } } +std::string FormatInput(const OpSpec &spec, int input_idx, bool capitalize) { + if (spec.GetSchema().HasInputDox()) { + return make_string(capitalize ? "I" : "i", "nput `", input_idx, "` ('__", + spec.GetSchema().GetInputName(input_idx), "')"); + } + return make_string(capitalize ? "I" : "i", "nput `", input_idx, "`"); +} + +std::string FormatOutput(const OpSpec &spec, int output_idx, bool capitalize) { + return make_string(capitalize ? "O" : "o", "utput `", output_idx, "`"); +} + +std::string FormatArgument(const OpSpec &spec, const std::string &argument, bool capitalize) { + return make_string(capitalize ? 
"A" : "a", "rgument '", argument, "'"); +} + } // namespace dali diff --git a/dali/pipeline/operator/name_utils.h b/dali/pipeline/operator/name_utils.h index bcfcbfa54e5..48759be4e0d 100644 --- a/dali/pipeline/operator/name_utils.h +++ b/dali/pipeline/operator/name_utils.h @@ -42,6 +42,35 @@ DLL_PUBLIC std::string GetOpModule(const OpSpec &spec); */ DLL_PUBLIC std::string GetOpDisplayName(const OpSpec &spec, bool include_module_path = false); +/** + * @brief Uniformly format the display of the operator input index, optionally including the name + * if provided in schema doc. + * + * @param input_idx Index of the input + * @param capitalize should be true if the output should start with capital letter (used at the + * start of the sentence) + */ +DLL_PUBLIC std::string FormatInput(const OpSpec &spec, int input_idx, bool capitalize = false); + +/** + * @brief Uniformly format the display of the operator output index. + * + * @param input_idx Index of the output + * @param capitalize should be true if the output should start with capital letter (used at the + * start of the sentence) + */ +DLL_PUBLIC std::string FormatOutput(const OpSpec &spec, int output_idx, bool capitalize = false); + +/** + * @brief Uniformly format the display of the operator argument name + * + * @param argument string representing the name of the argument (without additional quotes) + * @param capitalize should be true if the output should start with capital letter (used at the + * start of the sentence) + */ +DLL_PUBLIC std::string FormatArgument(const OpSpec &spec, const std::string &argument, + bool capitalize = false); + } // namespace dali #endif // DALI_PIPELINE_OPERATOR_NAME_UTILS_H_ diff --git a/dali/pipeline/operator/op_spec.cc b/dali/pipeline/operator/op_spec.cc index 7a1ec1ccf3a..33c7ee7ec34 100644 --- a/dali/pipeline/operator/op_spec.cc +++ b/dali/pipeline/operator/op_spec.cc @@ -13,8 +13,8 @@ // limitations under the License. 
#include "dali/pipeline/operator/op_spec.h" - #include "dali/pipeline/data/types.h" +#include "dali/pipeline/operator/name_utils.h" namespace dali { @@ -26,7 +26,7 @@ OpSpec& OpSpec::AddInput(const string &name, const string &device, bool regular_ // We rely on the fact that regular inputs are first in inputs_ vector DALI_ENFORCE(NumArgumentInput() == 0, "All regular inputs (particularly, `" + name + "`) need to be added to the op `" + - this->SchemaName() + "` before argument inputs."); + GetOpDisplayName(*this, true) + "` before argument inputs."); } inputs_.push_back({name, device}); @@ -50,12 +50,14 @@ OpSpec& OpSpec::AddOutput(const string &name, const string &device) { OpSpec& OpSpec::AddArgumentInput(const string &arg_name, const string &inp_name) { DALI_ENFORCE(!this->HasArgument(arg_name), make_string( - "Argument ", arg_name, " is already specified.")); + "Argument '", arg_name, "' is already specified.")); const OpSchema& schema = GetSchema(); - DALI_ENFORCE(schema.HasArgument(arg_name), make_string( - "Argument '", arg_name, "' is not part of the op schema '", schema.name(), "'")); - DALI_ENFORCE(schema.IsTensorArgument(arg_name), make_string( - "Argument `", arg_name, "` in operator `", schema.name(), "` is not a a tensor argument.")); + DALI_ENFORCE(schema.HasArgument(arg_name), + make_string("Argument '", arg_name, "' is not supported by operator `", + GetOpDisplayName(*this, true), "`.")); + DALI_ENFORCE(schema.IsTensorArgument(arg_name), + make_string("Argument '", arg_name, "' in operator `", GetOpDisplayName(*this, true), + "` is not an argument input.")); int idx = inputs_.size(); argument_inputs_.push_back({ arg_name, idx }); argument_input_idxs_[arg_name] = idx; @@ -63,4 +65,49 @@ OpSpec& OpSpec::AddArgumentInput(const string &arg_name, const string &inp_name) return *this; } +OpSpec& OpSpec::SetInitializedArg(const string& arg_name, std::shared_ptr arg) { + if (schema_ && schema_->IsDeprecatedArg(arg_name)) { + const auto& deprecation_meta = 
schema_->DeprecatedArgMeta(arg_name); + // Argument was removed, and we can discard it + if (deprecation_meta.removed) { + return *this; + } + if (!deprecation_meta.renamed_to.empty()) { + const auto& new_arg_name = deprecation_meta.renamed_to; + DALI_ENFORCE(argument_idxs_.find(new_arg_name) == argument_idxs_.end(), + make_string("Operator `", GetOpDisplayName(*this, true), "` got an unexpected '", + arg_name, "' deprecated argument when '", new_arg_name, + "' was already provided.")); + + set_through_deprecated_arguments_[new_arg_name] = arg_name; + // Adjust the arg so it carries the proper name for serialization + if (arg->has_name()) { + arg->set_name(new_arg_name); + } + auto [it, inserted] = argument_idxs_.insert({new_arg_name, arguments_.size()}); + if (inserted) + arguments_.push_back(std::move(arg)); + else + arguments_[it->second] = std::move(arg); + return *this; + } + } + EnforceNoAliasWithDeprecated(arg_name); + auto [it, inserted] = argument_idxs_.insert({arg_name, arguments_.size()}); + if (inserted) + arguments_.push_back(std::move(arg)); + else + arguments_[it->second] = std::move(arg); + return *this; +} + +void OpSpec::EnforceNoAliasWithDeprecated(const string& arg_name) { + auto set_through = set_through_deprecated_arguments_.find(arg_name); + DALI_ENFORCE(set_through == set_through_deprecated_arguments_.end(), + make_string("Operator `", GetOpDisplayName(*this, true), "` got an unexpected '", + set_through->second, "' deprecated argument when '", arg_name, + "' was already provided.")); +} + + } // namespace dali diff --git a/dali/pipeline/operator/op_spec.h b/dali/pipeline/operator/op_spec.h index 2c524316f67..dda4a2690c6 100644 --- a/dali/pipeline/operator/op_spec.h +++ b/dali/pipeline/operator/op_spec.h @@ -146,53 +146,12 @@ class DLL_PUBLIC OpSpec { * * @remarks Deprecated arguments are renamed (or dropped, if no longer used). 
*/ - DLL_PUBLIC inline OpSpec& SetInitializedArg(const string& arg_name, - std::shared_ptr arg) { - if (schema_ && schema_->IsDeprecatedArg(arg_name)) { - const auto& deprecation_meta = schema_->DeprecatedArgMeta(arg_name); - // Argument was removed, and we can discard it - if (deprecation_meta.removed) { - return *this; - } - if (!deprecation_meta.renamed_to.empty()) { - const auto& new_arg_name = deprecation_meta.renamed_to; - DALI_ENFORCE( - argument_idxs_.find(new_arg_name) == argument_idxs_.end(), - make_string("Operator ", SchemaName(), " got an unexpected '", arg_name, - "' deprecated argument when '", new_arg_name, "' was already provided.")); - - set_through_deprecated_arguments_[new_arg_name] = arg_name; - // Adjust the arg so it carries the proper name for serialization - if (arg->has_name()) { - arg->set_name(new_arg_name); - } - auto [it, inserted] = argument_idxs_.insert({ new_arg_name, arguments_.size() }); - if (inserted) - arguments_.push_back(std::move(arg)); - else - arguments_[it->second] = std::move(arg); - return *this; - } - } - EnforceNoAliasWithDeprecated(arg_name); - auto [it, inserted] = argument_idxs_.insert({ arg_name, arguments_.size() }); - if (inserted) - arguments_.push_back(std::move(arg)); - else - arguments_[it->second] = std::move(arg); - return *this; - } + DLL_PUBLIC OpSpec& SetInitializedArg(const string& arg_name, std::shared_ptr arg); /** * @brief Check if the `arg_name` was already set through a deprecated argument */ - DLL_PUBLIC inline void EnforceNoAliasWithDeprecated(const string& arg_name) { - auto set_through = set_through_deprecated_arguments_.find(arg_name); - DALI_ENFORCE( - set_through == set_through_deprecated_arguments_.end(), - make_string("Operator ", SchemaName(), " got an unexpected '", set_through->second, - "' deprecated argument when '", arg_name, "' was already provided.")); - } + DLL_PUBLIC void EnforceNoAliasWithDeprecated(const string& arg_name); // Forward to string implementation template diff 
--git a/dali/pipeline/operator/operator.h b/dali/pipeline/operator/operator.h index c68a5875c34..1fa897e70ab 100644 --- a/dali/pipeline/operator/operator.h +++ b/dali/pipeline/operator/operator.h @@ -30,6 +30,7 @@ #include "dali/core/tensor_shape_print.h" #include "dali/pipeline/data/backend.h" #include "dali/pipeline/operator/common.h" +#include "dali/pipeline/operator/name_utils.h" #include "dali/pipeline/operator/op_schema.h" #include "dali/pipeline/operator/op_spec.h" #include "dali/pipeline/operator/operator_factory.h" @@ -97,15 +98,6 @@ class DLL_PUBLIC OperatorBase { */ DLL_PUBLIC virtual void Run(Workspace &ws) = 0; - /** - * @brief returns the name of the operator. By default returns - * the name of the op as specified by the OpSpec it was constructed - * from. - */ - DLL_PUBLIC virtual string name() const { - return spec_.SchemaName(); - } - /** * @brief For reader Ops, returns the metadata of the reader and dataset, * See ReaderMeta strucutre for the data returned @@ -211,8 +203,8 @@ class DLL_PUBLIC OperatorBase { } [[noreturn]] void CheckpointingUnsupportedError() const { - DALI_FAIL( - make_string("Checkpointing is not implemented for this operator: ", spec_.SchemaName())); + DALI_FAIL(make_string("Checkpointing is not implemented for this operator: `", + GetOpDisplayName(spec_, true), "`.")); } // TODO(mszolucha): remove these two to allow i2i variable batch size, when all ops are ready diff --git a/dali/pipeline/pipeline.cc b/dali/pipeline/pipeline.cc index ff142ae8d4a..2637c4ccff9 100644 --- a/dali/pipeline/pipeline.cc +++ b/dali/pipeline/pipeline.cc @@ -22,13 +22,14 @@ #include #include +#include "dali/core/device_guard.h" +#include "dali/core/mm/default_resources.h" +#include "dali/pipeline/dali.pb.h" #include "dali/pipeline/executor/executor_factory.h" #include "dali/pipeline/operator/argument.h" #include "dali/pipeline/operator/common.h" #include "dali/pipeline/operator/error_reporting.h" -#include "dali/core/device_guard.h" -#include 
"dali/core/mm/default_resources.h" -#include "dali/pipeline/dali.pb.h" +#include "dali/pipeline/operator/name_utils.h" namespace dali { @@ -245,8 +246,31 @@ int Pipeline::AddOperator(const OpSpec &spec) { int Pipeline::AddOperator(const OpSpec &const_spec, const std::string& inst_name, int logical_id) { - DALI_ENFORCE(!built_, "Alterations to the pipeline after " - "\"Build()\" has been called are not allowed"); + DALI_ENFORCE(!built_, + "Alterations to the pipeline after \"Build()\" has been called are not allowed"); + + // Validate op device + auto device = const_spec.GetArgument("device"); + // Doesn't make sense to state the wrong device in the error message, so we use the operator name + // directly instead of the error context message.. + DALI_ENFORCE(device == "cpu" || device == "gpu" || device == "mixed" || device == "support", + make_string("Invalid device argument \"", device, "\" for operator `", + GetOpDisplayName(const_spec, true), + "`. Valid options are \"cpu\", \"gpu\" or \"mixed\"")); + + int result = -1; + try { + result = AddOperatorImpl(const_spec, inst_name, logical_id); + } catch (...) 
{ + PropagateError({std::current_exception(), GetErrorContextMessage(const_spec), ""}); + } + return result; +} + +int Pipeline::AddOperatorImpl(const OpSpec &const_spec, const std::string &inst_name, + int logical_id) { + assert(!built_ && "Already checked by AddOperator()"); + DALI_ENFORCE(0 <= logical_id, "Logical id of the node must be positive, got " + std::to_string(logical_id) + "."); @@ -256,22 +280,20 @@ int Pipeline::AddOperator(const OpSpec &const_spec, const std::string& inst_name // we modify spec so copy it OpSpec spec = const_spec; - auto operator_name = spec.SchemaName(); + auto operator_name = GetOpDisplayName(spec, true); // Take a copy of the passed OpSpec for serialization purposes before any modification this->op_specs_for_serialization_.push_back({inst_name, spec, logical_id}); - // Validate op device - string device = spec.GetArgument("device"); - DALI_ENFORCE(device == "cpu" || device == "gpu" || device == "mixed" || device == "support", - "Invalid device argument \"" + device + - "\". Valid options are \"cpu\", \"gpu\" or \"mixed\""); + auto device = const_spec.GetArgument("device"); DALI_ENFORCE(device != "gpu" || device_id_ != CPU_ONLY_DEVICE_ID, - make_string("Cannot add a GPU operator ", operator_name, ", device_id should not be" - " equal CPU_ONLY_DEVICE_ID.")); + "Cannot add a GPU operator. 
Pipeline 'device_id' should not be equal to " + "`CPU_ONLY_DEVICE_ID`."); if (device == "support") { - DALI_WARN("\"support\" device is deprecated; use \"cpu\" or leave blank instead"); + auto warning_context = GetErrorContextMessage(spec, "Warning"); + DALI_WARN(warning_context, + " \"support\" device is deprecated; use \"cpu\" or leave blank instead."); device = "cpu"; spec.SetArg("device", "cpu"); } @@ -288,8 +310,9 @@ int Pipeline::AddOperator(const OpSpec &const_spec, const std::string& inst_name string input_device = spec.InputDevice(i); auto it = edge_names_.find(input_name); - DALI_ENFORCE(it != edge_names_.end(), "Input '" + input_name + - "' to op '" + spec.SchemaName() + "' is not known to the pipeline."); + DALI_ENFORCE(it != edge_names_.end(), + make_string("Data node \"", input_name, "\" requested as ", FormatInput(spec, i), + " to the operator is not known to the pipeline.")); // Table of possible scenarios: // Op location / requested input type / data location @@ -305,21 +328,24 @@ int Pipeline::AddOperator(const OpSpec &const_spec, const std::string& inst_name // mixed / cpu / gpu -> error, data does not exist on cpu // mixed / gpu / cpu -> error, mixed op not allowed to have gpu inputs // mixed / gpu / gpu -> both of above errors - string error_str = "(op: '" + spec.SchemaName() + "', input: '" + - input_name + "')"; - if (device == "cpu" || device == "mixed") { - DALI_ENFORCE(input_device == "cpu", "cpu/mixed ops can only take cpu " - "inputs. CPU op cannot follow GPU op. " + error_str); - DALI_ENFORCE(it->second.has_cpu, "cpu input requested by op exists " - "only on gpu. CPU op cannot follow GPU op. " + error_str); + DALI_ENFORCE(input_device == "cpu", + make_string("Error while specifying ", FormatInput(spec, i), + ". CPU/Mixed ops can only take CPU inputs. CPU operator cannot " + "follow GPU operator. ")); + DALI_ENFORCE(it->second.has_cpu, + make_string("Error while specifying ", FormatInput(spec, i), + ". 
CPU input requested by operator exists only on GPU. CPU " + "operator cannot follow GPU operator.")); DALI_ENFORCE(device_id_ != CPU_ONLY_DEVICE_ID || device == "cpu", - make_string("Cannot add a mixed operator ", operator_name, - " with a GPU output, device_id should not be CPU_ONLY_DEVICE_ID.")); + "Cannot add a Mixed operator with a GPU output, 'device_id' " + "should not be `CPU_ONLY_DEVICE_ID`."); } else if (input_device == "cpu") { // device == gpu - DALI_ENFORCE(it->second.has_cpu, "cpu input requested by op exists " - "only on gpu. CPU op cannot follow GPU op. " + error_str); + DALI_ENFORCE(it->second.has_cpu, + make_string("Error while specifying ", FormatInput(spec, i), + ". CPU input requested by operator exists only on GPU. CPU " + "operator cannot follow GPU operator.")); SetupCPUInput(it, i, &spec); } else { SetupGPUInput(it); @@ -331,16 +357,16 @@ int Pipeline::AddOperator(const OpSpec &const_spec, const std::string& inst_name std::string input_name = spec.InputName(input_idx); auto it = edge_names_.find(input_name); - DALI_ENFORCE(it != edge_names_.end(), "Input '" + input_name + - "' to op '" + spec.SchemaName() + "' is not known to the pipeline."); - - string error_str = "(op: '" + spec.SchemaName() + "', input: '" + - input_name + "')"; + DALI_ENFORCE( + it != edge_names_.end(), + make_string("Data node \"", input_name, "\" requested as ", FormatArgument(spec, arg_name), + " to operator is not known to the pipeline.")); if (!it->second.has_cpu) { - DALI_FAIL(make_string( - "Named arguments inputs to operators must be CPU data nodes. However, a GPU ", - "data node was provided (op: '", spec.SchemaName(), "', input: '", input_name, "')")); + assert(it->second.has_gpu); + DALI_FAIL(make_string("Error while specifying ", FormatArgument(spec, arg_name), + ". Named argument inputs to operators must be CPU data nodes. 
" + "However, a GPU data node was provided.")); } if (device == "gpu" && separated_execution_) @@ -351,13 +377,12 @@ int Pipeline::AddOperator(const OpSpec &const_spec, const std::string& inst_name for (int i = 0; i < spec.NumOutput(); ++i) { string output_name = spec.OutputName(i); string output_device = spec.OutputDevice(i); - string error_str = "(op: '" + spec.SchemaName() + "', output: '" + - output_name + "')"; auto it = edge_names_.find(output_name); - DALI_ENFORCE(it == edge_names_.end(), "Output name '" + - output_name + "' conflicts with existing intermediate " - "result name. " + error_str); + DALI_ENFORCE( + it == edge_names_.end(), + make_string("Error while specifying ", FormatOutput(spec, i), ". Output name \"", + output_name, "\" conflicts with an existing intermediate result name.")); // Validate output data conforms to graph constraints // Note: DALI CPU -> GPU flow is enforced, when the operators are added via the Python layer @@ -365,8 +390,9 @@ int Pipeline::AddOperator(const OpSpec &const_spec, const std::string& inst_name // TODO(klecki): Are there any GPU ops that actually return CPU outputs? bool mark_explicitly_contiguous = false; if (device == "cpu") { - DALI_ENFORCE(output_device == "cpu", "Only CPU operators can produce CPU outputs." + - error_str); + DALI_ENFORCE(output_device == "cpu", + make_string("Error while specifying ", FormatOutput(spec, i), + ". Only CPU operators can produce CPU outputs.")); } else if (device == "gpu") { if (output_device == "cpu") { mark_explicitly_contiguous = true; @@ -384,7 +410,8 @@ int Pipeline::AddOperator(const OpSpec &const_spec, const std::string& inst_name } DALI_ENFORCE(edge_names_.insert({output_name, meta}).second, - "Output name insertion failure."); + make_string("Error while specifying ", FormatOutput(spec, i), "node name: \"", + output_name, "\". 
Output name insertion failure.")); } // store updated spec @@ -402,13 +429,13 @@ void Pipeline::AddToOpSpecs(const std::string &inst_name, const OpSpec &spec, in const auto &group_name = op_specs_[logical_group.front()].spec.SchemaName(); DALI_ENFORCE( group_name == spec.SchemaName(), - "Different Operator types cannot be groupped with the same logical id. Tried to group " + - spec.SchemaName() + " using logical_id=" + std::to_string(logical_id) + + "Different Operator types cannot be grouped with the same logical id. Tried to group `" + + GetOpDisplayName(spec, true) + "` using logical_id=" + std::to_string(logical_id) + " which is already assigned to " + group_name + "."); const OpSchema &schema = SchemaRegistry::GetSchema(spec.SchemaName()); DALI_ENFORCE(schema.AllowsInstanceGrouping(), - "Operator " + spec.SchemaName() + - " does not support synced random execution required " + "Operator `" + GetOpDisplayName(spec, true) + + "` does not support synced random execution required " "for multiple input sets processing."); } op_specs_.push_back({inst_name, spec, logical_id}); @@ -479,35 +506,17 @@ void Pipeline::Build(std::vector output_descs) { executor_->Init(); // Creating the graph + for (auto& name_op_spec : op_specs_) { string& inst_name = name_op_spec.instance_name; OpSpec op_spec = name_op_spec.spec; PrepareOpSpec(&op_spec, name_op_spec.logical_id); try { graph_.AddOp(op_spec, inst_name); - } catch (std::exception &e) { - int same_op_count = 0; - for (const auto& elem : op_specs_) { - if (elem.spec.SchemaName() == op_spec.SchemaName()) { - same_op_count++; - } - if (same_op_count > 1) { - break; - } - } - if (same_op_count > 1) { - throw std::runtime_error(make_string( - "Critical error when building pipeline:\nError when adding operator: ", - op_spec.SchemaName(), ", instance name: \"", inst_name, "\", encountered:\n", e.what(), - "\nCurrent pipeline object is no longer valid.")); - } else { - throw std::runtime_error( - make_string("Critical error when 
building pipeline:\nError when adding operator: ", - op_spec.SchemaName(), "\" encountered:\n", e.what(), - "\nCurrent pipeline object is no longer valid.")); - } } catch (...) { - throw std::runtime_error("Unknown critical error when building pipeline."); + PropagateError({std::current_exception(), + "Critical error when building pipeline:\n" + GetErrorContextMessage(op_spec), + "\nCurrent pipeline object is no longer valid."}); } } @@ -534,8 +543,8 @@ void Pipeline::Build(std::vector output_descs) { DALI_ENFORCE(device_id_ != CPU_ONLY_DEVICE_ID, make_string( "Cannot move the data node ", name, " to the GPU " - "in a CPU-only pipeline. The `device_id` parameter " - "is set to `CPU_ONLY_DEVICE_ID`. Set `device_id` " + "in a CPU-only pipeline. The 'device_id' parameter " + "is set to `CPU_ONLY_DEVICE_ID`. Set 'device_id' " "to a valid GPU identifier to enable GPU features " "in the pipeline.")); if (!it->second.has_gpu) { @@ -623,39 +632,33 @@ bool Pipeline::ValidateOutputs(const Workspace &ws) const { void Pipeline::Outputs(Workspace *ws) { DALI_ENFORCE(built_, "\"Build()\" must be called prior to executing the pipeline."); - std::exception_ptr eptr; try { executor_->Outputs(ws); } catch (...) { - eptr = std::current_exception(); + ProcessException(std::current_exception()); } - ProcessException(eptr); ValidateOutputs(*ws); } void Pipeline::ShareOutputs(Workspace *ws) { DALI_ENFORCE(built_, "\"Build()\" must be called prior to executing the pipeline."); - std::exception_ptr eptr; try { executor_->ShareOutputs(ws); } catch (...) { - eptr = std::current_exception(); + ProcessException(std::current_exception()); } - ProcessException(eptr); ValidateOutputs(*ws); } void Pipeline::ReleaseOutputs() { DALI_ENFORCE(built_, "\"Build()\" must be called prior to executing the pipeline."); - std::exception_ptr eptr; try { executor_->ReleaseOutputs(); } catch (...) 
{ - eptr = std::current_exception(); + ProcessException(std::current_exception()); } - ProcessException(eptr); } void Pipeline::SetupCPUInput(std::map::iterator it, int input_idx, OpSpec *spec) { @@ -764,8 +767,8 @@ string Pipeline::SerializeToProtobuf() const { const auto& p = this->op_specs_for_serialization_[i]; const OpSpec& spec = p.spec; - DALI_ENFORCE(spec.GetSchema().IsSerializable(), "Could not serialize the operator: " - + spec.SchemaName()); + DALI_ENFORCE(spec.GetSchema().IsSerializable(), "Could not serialize the operator: `" + + GetOpDisplayName(spec, true) + "`."); dali::SerializeToProtobuf(op_def, p.instance_name, spec, p.logical_id); } diff --git a/dali/pipeline/pipeline.h b/dali/pipeline/pipeline.h index ea2802e7ed0..0c2caad4287 100644 --- a/dali/pipeline/pipeline.h +++ b/dali/pipeline/pipeline.h @@ -596,6 +596,14 @@ class DLL_PUBLIC Pipeline { return base_ptr_offset; } + /** + * @brief See Pipeline::AddOperator for details. + * + * Does the internal processing allowing the errors to be processed once. + * Assumes that Build() has not been called. + */ + DLL_PUBLIC int AddOperatorImpl(const OpSpec &spec, const std::string& inst_name, int logical_id); + void SetupCPUInput(std::map::iterator it, int input_idx, OpSpec *spec); void SetupGPUInput(std::map::iterator it); @@ -683,7 +691,8 @@ class DLL_PUBLIC Pipeline { void DiscoverInputOperators(); /** - * @brief Process exception that was thrown when executing DALI. + * @brief Process exception that was thrown when executing DALI. Executor already provided context + * for operator if possible. 
*/ void ProcessException(std::exception_ptr eptr); diff --git a/dali/pipeline/pipeline_test.cc b/dali/pipeline/pipeline_test.cc index 2a6872e3ba3..bb5e4661a97 100644 --- a/dali/pipeline/pipeline_test.cc +++ b/dali/pipeline/pipeline_test.cc @@ -147,7 +147,7 @@ class PipelineTest : public DALITest { ASSERT_EQ(graph.NumOp(OpType::MIXED), 1); ASSERT_EQ(graph.NumOp(OpType::GPU), 2); - ASSERT_EQ(graph.Node(OpType::MIXED, 0).op->name(), "MakeContiguous"); + ASSERT_EQ(graph.Node(OpType::MIXED, 0).op->GetSpec().GetSchema().name(), "MakeContiguous"); // Validate the source op auto &node = graph.Node(0); diff --git a/dali/python/nvidia/dali/ops/__init__.py b/dali/python/nvidia/dali/ops/__init__.py index f1c8b0e5fc9..cd87bb87f97 100644 --- a/dali/python/nvidia/dali/ops/__init__.py +++ b/dali/python/nvidia/dali/ops/__init__.py @@ -573,7 +573,13 @@ def __init__(self, *, device="cpu", **kwargs): self._init_args.update({"_module": Operator.__module__.replace(".hidden", "")}) if "_display_name" not in self._init_args: self._init_args.update({"_display_name": type(self).__name__}) + # Make sure that the internal name arguments are added first in case backend + # needs them to report errors. + name_internal_keys = ["_display_name", "_module"] + name_args = {key: self._init_args.pop(key) for key in name_internal_keys} + _process_arguments(self._schema, self._spec, name_args, type(self).__name__) _process_arguments(self._schema, self._spec, self._init_args, type(self).__name__) + self._init_args.update(name_args) @property def spec(self): diff --git a/dali/test/python/conditionals/test_logical_expressions.py b/dali/test/python/conditionals/test_logical_expressions.py index 0538bbae084..5660163a440 100644 --- a/dali/test/python/conditionals/test_logical_expressions.py +++ b/dali/test/python/conditionals/test_logical_expressions.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2023-2024, NVIDIA CORPORATION & AFFILIATES. 
All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -242,7 +242,7 @@ def gpu_input(): "Logical expression `.*` is restricted to scalar \\(0-d tensors\\)" " inputs of `bool` type, that are placed on CPU." " Got a GPU input .*in logical expression.*|" - "Named arguments inputs to operators must be CPU data nodes." + "Named argument inputs to operators must be CPU data nodes." " However, a GPU data node was provided" ), ): diff --git a/dali/test/python/conditionals/test_pipeline_conditionals.py b/dali/test/python/conditionals/test_pipeline_conditionals.py index 934498e89e4..bda184fb491 100644 --- a/dali/test/python/conditionals/test_pipeline_conditionals.py +++ b/dali/test/python/conditionals/test_pipeline_conditionals.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -983,7 +983,7 @@ def gpu_condition(): with assert_raises( RuntimeError, glob=( - "Named arguments inputs to operators must be CPU data nodes." + "Named argument inputs to operators must be CPU data nodes." 
" However, a GPU data node was provided" ), ): diff --git a/dali/test/python/operator_1/test_operator_exception.py b/dali/test/python/operator_1/test_operator_exception.py index b7a3b99f72e..4d55c92c01a 100644 --- a/dali/test/python/operator_1/test_operator_exception.py +++ b/dali/test/python/operator_1/test_operator_exception.py @@ -265,3 +265,29 @@ def non_scalar_condition(): pipe = non_scalar_condition() pipe.build() pipe.run() + + +def test_operator_exception_arg_input_placement(): + @pipeline_def(batch_size=1, num_threads=1, device_id=0) + def pipe(): + batch_gpu = fn.copy(1.0).gpu() + 0 + return fn.random.coin_flip(probability=batch_gpu) + + with assert_raises( + RuntimeError, + glob=( + """Error in CPU operator `nvidia.dali.fn.random.coin_flip`, +which was used in the pipeline definition with the following traceback: + + File "*/test_operator_exception.py", line *, in pipe + return fn.random.coin_flip(probability=batch_gpu) + +encountered: + +Error while specifying argument 'probability'. Named argument inputs to operators must be CPU data nodes.* +""" # noqa(E501) + ), + ): + p = pipe() + p.build() + p.run() diff --git a/dali/test/python/operator_1/test_slice.py b/dali/test/python/operator_1/test_slice.py index e6885db86b4..c486f6c8fc0 100644 --- a/dali/test/python/operator_1/test_slice.py +++ b/dali/test/python/operator_1/test_slice.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# Copyright (c) 2019-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -1191,7 +1191,7 @@ def make_pipe(): return sliced with assert_raises( - RuntimeError, glob="Named arguments inputs to operators must be CPU data nodes" + RuntimeError, glob="Named argument inputs to operators must be CPU data nodes" ): p = make_pipe() p.build() diff --git a/dali/test/python/test_dali_cpu_only.py b/dali/test/python/test_dali_cpu_only.py index 0b36dbbeac9..82900ec272d 100644 --- a/dali/test/python/test_dali_cpu_only.py +++ b/dali/test/python/test_dali_cpu_only.py @@ -94,8 +94,8 @@ def get_data(): RuntimeError, pipe.build, glob="Cannot move the data node __ExternalSource_* to the GPU in a CPU-only pipeline. " - "The `device_id` parameter is set to `CPU_ONLY_DEVICE_ID`. " - "Set `device_id` to a valid GPU identifier to enable GPU features in the pipeline.", + "The 'device_id' parameter is set to `CPU_ONLY_DEVICE_ID`. " + "Set 'device_id' to a valid GPU identifier to enable GPU features in the pipeline.", ) @@ -113,7 +113,11 @@ def get_data(): assert_raises( RuntimeError, pipe.build, - glob="Cannot add a GPU operator Rotate, device_id should not be equal CPU_ONLY_DEVICE_ID.", + glob=( + "Error in GPU operator `nvidia.dali.fn.rotate`*" + "Cannot add a GPU operator. " + "Pipeline 'device_id' should not be equal to `CPU_ONLY_DEVICE_ID`." + ), ) @@ -133,7 +137,11 @@ def get_data(): def test_gpu_op_bad_device(): device_ids = [None, 0] error_msgs = [ - "Cannot add a GPU operator ExternalSource, device_id should not be equal CPU_ONLY_DEVICE_ID.", # noqa: E501 + ( + "Error in GPU operator `nvidia.dali.fn.external_source`*" + "Cannot add a GPU operator." + " Pipeline 'device_id' should not be equal to `CPU_ONLY_DEVICE_ID`." 
+ ), "You are trying to create a GPU DALI pipeline, while CUDA is not available.*", ] @@ -152,7 +160,11 @@ def check_mixed_op_bad_device(device_id, error_msg): def test_mixed_op_bad_device(): device_ids = [None, 0] error_msgs = [ - "Cannot add a mixed operator decoders__Image with a GPU output, device_id should not be CPU_ONLY_DEVICE_ID.", # noqa: E501 + ( + "Error in MIXED operator `nvidia.dali.fn.decoders.image`*" + "Cannot add a Mixed operator with a GPU output," + " 'device_id' should not be `CPU_ONLY_DEVICE_ID`" + ), "You are trying to create a GPU DALI pipeline, while CUDA is not available.*", ] @@ -1150,8 +1162,9 @@ def test_arithm_ops_cpu_gpu(): RuntimeError, pipe.build, glob=( - "Cannot add a GPU operator ArithmeticGenericOp," - " device_id should not be equal CPU_ONLY_DEVICE_ID." + "Error in GPU operator `nvidia.dali.math.*`*" + "Cannot add a GPU operator." + " Pipeline 'device_id' should not be equal to `CPU_ONLY_DEVICE_ID`." ), )