Skip to content

Commit

Permalink
Enable mlir reduction emitter by default.
Browse files Browse the repository at this point in the history
This can be disabled with the flag --xla_gpu_mlir_emitter_level, setting it to
any value < 4.
Change some tests to still use the old emitters. We have separate IR tests for
the new emitters, and keeping the old tests running with the old emitters ensures
we still have coverage for the old emitters, in case we need to roll back.
One notable change with enabling emitter level 4 is that the heuristic to avoid
code duplication due to cache invalidation is disabled. This was always a
workaround, and the new emitters fixed the problem. This is the most common
reason why the tests behave differently between the old and the new emitters.

FUTURE_COPYBARA_INTEGRATE_REVIEW=openxla/xla#15444 from eaplatanios:u/eaplatanios/cpp-17-fixes 73f3cd7e0135ec05c97595f795ec318fb635bd32
PiperOrigin-RevId: 653901032
  • Loading branch information
akuegel authored and tensorflower-gardener committed Jul 31, 2024
1 parent 5843411 commit 27df0fd
Show file tree
Hide file tree
Showing 29 changed files with 165 additions and 91 deletions.
24 changes: 12 additions & 12 deletions third_party/xla/xla/backends/profiler/gpu/cupti_buffer_events.cc
Original file line number Diff line number Diff line change
Expand Up @@ -186,18 +186,18 @@ void AddGraphTraceActivityEvent(CuptiEventCollectorDelegate &collector,
AnnotationMap::AnnotationInfo info = collector.annotation_map.LookUp(
graph_trace->deviceId, graph_trace->correlationId);
collector.receive(CuptiTracerEvent{
.type = CuptiTracerEventType::CudaGraph,
.source = CuptiTracerEventSource::Activity,
.name = absl::StrCat("CudaGraphExec:", graph_trace->graphId),
.annotation = info.annotation,
.nvtx_range = info.nvtx_range,
.start_time_ns = graph_trace->start,
.end_time_ns = graph_trace->end,
.device_id = graph_trace->deviceId,
.correlation_id = graph_trace->correlationId,
.context_id = graph_trace->contextId,
.stream_id = graph_trace->streamId,
.graph_id = graph_trace->graphId,
/* .type = */ CuptiTracerEventType::CudaGraph,
/* .source = */ CuptiTracerEventSource::Activity,
/* .name = */ absl::StrCat("CudaGraphExec:", graph_trace->graphId),
/* .annotation = */ info.annotation,
/* .nvtx_range = */ info.nvtx_range,
/* .start_time_ns = */ graph_trace->start,
/* .end_time_ns = */ graph_trace->end,
/* .device_id = */ graph_trace->deviceId,
/* .correlation_id = */ graph_trace->correlationId,
/* .context_id = */ graph_trace->contextId,
/* .stream_id = */ graph_trace->streamId,
/* .graph_id = */ graph_trace->graphId,
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ struct MemcpyDetails {
int8_t dst_mem_kind;

// ID of the hardware channel on which this operation ran.
uint32_t channel_id = -1;
uint32_t channel_id = static_cast<uint32_t>(-1);
// CUpti_ChannelType of the channel above.
int8_t channel_type = 0; // CUPTI_CHANNEL_TYPE_INVALID
};
Expand Down
2 changes: 1 addition & 1 deletion third_party/xla/xla/debug_options_flags.cc
Original file line number Diff line number Diff line change
Expand Up @@ -246,7 +246,7 @@ DebugOptions DefaultDebugOptionsIgnoringFlags() {
opts.set_xla_gpu_nccl_p2p_max_nchannels(0);

#if GOOGLE_CUDA
opts.set_xla_gpu_mlir_emitter_level(3);
opts.set_xla_gpu_mlir_emitter_level(4);
#else
opts.set_xla_gpu_mlir_emitter_level(0);
#endif
Expand Down
2 changes: 2 additions & 0 deletions third_party/xla/xla/hlo/evaluator/hlo_evaluator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,9 @@ std::optional<DynamicOrStaticInteger> EvaluateWhileLoopParamInitValue(

namespace internal {

#if !defined(_MSC_VER)
constexpr absl::string_view kEvalErrorDetailUrl = "EvalErrorDetailUrl";
#endif

std::optional<EvalErrorDetail> ParseEvalErrorDetail(const absl::Status& error) {
auto error_detail = error.GetPayload(kEvalErrorDetailUrl);
Expand Down
4 changes: 4 additions & 0 deletions third_party/xla/xla/hlo/evaluator/hlo_evaluator.h
Original file line number Diff line number Diff line change
Expand Up @@ -530,7 +530,11 @@ enum class EvalErrorDetail : uint32_t {
kDynamicValueDependence = 0,
};

#if defined(_MSC_VER)
extern const absl::string_view kEvalErrorDetailUrl = "EvalErrorDetailUrl";
#else
extern const absl::string_view kEvalErrorDetailUrl;
#endif

std::optional<EvalErrorDetail> ParseEvalErrorDetail(const absl::Status& error);

Expand Down
2 changes: 1 addition & 1 deletion third_party/xla/xla/pjrt/c/pjrt_c_api_wrapper_impl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -2129,7 +2129,7 @@ PJRT_Error* PJRT_Layouts_MemoryLayout_Serialize(
PJRT_Layouts_MemoryLayout_Serialize_Args_STRUCT_SIZE, args->struct_size));

PJRT_Layouts_SerializedLayout* s_layout = new PJRT_Layouts_SerializedLayout{
.serialized = args->layout->layout->Serialize()};
/* .serialized = */ args->layout->layout->Serialize()};
args->serialized_layout = s_layout;
args->serialized_bytes = s_layout->serialized.data();
args->serialized_bytes_size = s_layout->serialized.size();
Expand Down
14 changes: 8 additions & 6 deletions third_party/xla/xla/pjrt/gpu/se_gpu_pjrt_compiler.cc
Original file line number Diff line number Diff line change
Expand Up @@ -199,13 +199,15 @@ StreamExecutorGpuCompiler::Compile(CompileOptions options,
#endif
}

STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
PjRtRegisterCompiler(
#if TENSORFLOW_USE_ROCM
RocmName(),
STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
PjRtRegisterCompiler(RocmName(),
std::make_unique<StreamExecutorGpuCompiler>());
});
#else
CudaName(),
#endif
std::make_unique<StreamExecutorGpuCompiler>());
STREAM_EXECUTOR_REGISTER_MODULE_INITIALIZER(pjrt_register_se_gpu_compiler, {
PjRtRegisterCompiler(CudaName(),
std::make_unique<StreamExecutorGpuCompiler>());
});
#endif
} // namespace xla
8 changes: 4 additions & 4 deletions third_party/xla/xla/service/cpu/runtime/conv_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void EigenConv2DImpl(
Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation,
Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation,
Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count,
std::optional<std::function<void()>> done_callback = std::nullopt) {
std::optional<std::function<void()>> done_callback) {
const Eigen::TensorMap<Eigen::Tensor<const ScalarType, 4, Eigen::RowMajor>,
Eigen::Aligned>
input(lhs, input_batch, input_x, input_y, input_channels);
Expand Down Expand Up @@ -129,7 +129,7 @@ void EigenConv3DImpl(
Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation,
Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation,
Eigen::Index feature_group_count,
std::optional<std::function<void()>> done_callback = std::nullopt) {
std::optional<std::function<void()>> done_callback) {
using ConstTType =
Eigen::TensorMap<Eigen::Tensor<const ScalarType, 5, Eigen::RowMajor>,
Eigen::Aligned>;
Expand Down Expand Up @@ -223,7 +223,7 @@ void EigenConv3DImpl(
Eigen::Index padding_y_after, Eigen::Index lhs_x_dilation, \
Eigen::Index lhs_y_dilation, Eigen::Index rhs_x_dilation, \
Eigen::Index rhs_y_dilation, Eigen::Index feature_group_count, \
std::optional<std::function<void()>> done_callback = std::nullopt)
std::optional<std::function<void()>> done_callback)

CONV2D_EXTERN_TEMPLATE(Eigen::DefaultDevice, Eigen::half);
CONV2D_EXTERN_TEMPLATE(Eigen::DefaultDevice, float);
Expand All @@ -249,7 +249,7 @@ CONV2D_EXTERN_TEMPLATE(Eigen::ThreadPoolDevice, float);
Eigen::Index lhs_z_dilation, Eigen::Index rhs_x_dilation, \
Eigen::Index rhs_y_dilation, Eigen::Index rhs_z_dilation, \
Eigen::Index feature_group_count, \
std::optional<std::function<void()>> done_callback = std::nullopt)
std::optional<std::function<void()>> done_callback)

CONV3D_EXTERN_TEMPLATE(Eigen::DefaultDevice, Eigen::half);
CONV3D_EXTERN_TEMPLATE(Eigen::DefaultDevice, float);
Expand Down
6 changes: 4 additions & 2 deletions third_party/xla/xla/service/cpu/runtime_conv2d.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_conv2d.h"

#include <optional>

#define EIGEN_USE_THREADS

#include "absl/base/dynamic_annotations.h"
Expand All @@ -41,7 +43,7 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF32(
kernel_channels, kernel_filters, output_rows, output_cols, row_stride,
col_stride, padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
Expand All @@ -63,5 +65,5 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv2DF16(
kernel_channels, kernel_filters, output_rows, output_cols, row_stride,
col_stride, padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}
6 changes: 4 additions & 2 deletions third_party/xla/xla/service/cpu/runtime_conv3d.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_conv3d.h"

#include <optional>

#define EIGEN_USE_THREADS

#include "absl/base/dynamic_annotations.h"
Expand Down Expand Up @@ -44,7 +46,7 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF32(
y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
Expand All @@ -69,5 +71,5 @@ ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void __xla_cpu_runtime_EigenConv3DF16(
y_stride, z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_single_threaded_conv2d.h"

#include <optional>

#include "absl/base/dynamic_annotations.h"
#include "xla/service/cpu/runtime/conv_impl.h"

Expand All @@ -35,7 +37,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF16(
kernel_filters, output_rows, output_cols, row_stride, col_stride,
padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void
Expand All @@ -55,5 +57,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv2DF32(
kernel_filters, output_rows, output_cols, row_stride, col_stride,
padding_top, padding_bottom, padding_left, padding_right,
lhs_row_dilation, lhs_col_dilation, rhs_row_dilation, rhs_col_dilation,
feature_group_count);
feature_group_count, std::nullopt);
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ limitations under the License.

#include "xla/service/cpu/runtime_single_threaded_conv3d.h"

#include <optional>

#include "absl/base/dynamic_annotations.h"
#include "xla/service/cpu/runtime/conv_impl.h"

Expand All @@ -38,7 +40,7 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF32(
z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}

ABSL_ATTRIBUTE_NO_SANITIZE_MEMORY void
Expand All @@ -61,5 +63,5 @@ __xla_cpu_runtime_EigenSingleThreadedConv3DF16(
z_stride, padding_x_before, padding_x_after, padding_y_before,
padding_y_after, padding_z_before, padding_z_after, lhs_x_dilation,
lhs_y_dilation, lhs_z_dilation, rhs_x_dilation, rhs_y_dilation,
rhs_z_dilation, feature_group_count);
rhs_z_dilation, feature_group_count, std::nullopt);
}
52 changes: 28 additions & 24 deletions third_party/xla/xla/service/gpu/fusion_merger_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -135,42 +135,42 @@ f32add {
}
comp0 {
p = (f32[100000000], f32[100000000], f32[100000000], f32[100000000]) parameter(0)
gte0 = f32[100000000] get-tuple-element(p), index=0
gte1 = f32[100000000] get-tuple-element(p), index=1
add.9 = f32[100000000] add(gte0, gte1)
gte2 = f32[100000000] get-tuple-element(p), index=2
add.10 = f32[100000000] add(add.9, gte2)
gte3 = f32[100000000] get-tuple-element(p), index=3
add.11 = f32[100000000] add(add.10, gte3)
p1 = (f32[100000000], f32[100000000], f32[100000000], f32[100000000]) parameter(1)
gte4 = f32[100000000] get-tuple-element(p1), index=0
gte5 = f32[100000000] get-tuple-element(p1), index=1
add.12 = f32[100000000] add(gte4, gte5)
gte6 = f32[100000000] get-tuple-element(p1), index=2
add.13 = f32[100000000] add(add.12, gte6)
gte7 = f32[100000000] get-tuple-element(p1), index=3
add.14 = f32[100000000] add(add.13, gte7)
ROOT r = f32[100000000] add(add.14, add.11)
p = (f32[2048], f32[2048], f32[2048], f32[2048]) parameter(0)
gte0 = f32[2048] get-tuple-element(p), index=0
gte1 = f32[2048] get-tuple-element(p), index=1
add.9 = f32[2048] add(gte0, gte1)
gte2 = f32[2048] get-tuple-element(p), index=2
add.10 = f32[2048] add(add.9, gte2)
gte3 = f32[2048] get-tuple-element(p), index=3
add.11 = f32[2048] add(add.10, gte3)
p1 = (f32[2048], f32[2048], f32[2048], f32[2048]) parameter(1)
gte4 = f32[2048] get-tuple-element(p1), index=0
gte5 = f32[2048] get-tuple-element(p1), index=1
add.12 = f32[2048] add(gte4, gte5)
gte6 = f32[2048] get-tuple-element(p1), index=2
add.13 = f32[2048] add(add.12, gte6)
gte7 = f32[2048] get-tuple-element(p1), index=3
add.14 = f32[2048] add(add.13, gte7)
ROOT r = f32[2048] add(add.14, add.11)
}
comp1 {
p = f32[100000000] parameter(0)
p = f32[2048] parameter(0)
c0 = f32[] constant(0)
ROOT r = f32[] reduce(p, c0), dimensions={0}, to_apply=f32add
}
comp2 {
p = f32[100000000] parameter(0)
p = f32[2048] parameter(0)
c0 = f32[] constant(0)
r = f32[] reduce(p, c0), dimensions={0}, to_apply=f32add
ROOT n = f32[] negate(r)
}
ENTRY m.Computation2 {
p0 = (f32[100000000], f32[100000000], f32[100000000], f32[100000000]) parameter(0)
p1 = (f32[100000000], f32[100000000], f32[100000000], f32[100000000]) parameter(1)
fusion.0 = f32[100000000] fusion(p0, p1), kind=kLoop, calls=comp0
p0 = (f32[2048], f32[2048], f32[2048], f32[2048]) parameter(0)
p1 = (f32[2048], f32[2048], f32[2048], f32[2048]) parameter(1)
fusion.0 = f32[2048] fusion(p0, p1), kind=kLoop, calls=comp0
fusion.1 = f32[] fusion(fusion.0), kind=kLoop, calls=comp1
fusion.2 = f32[] fusion(fusion.0), kind=kLoop, calls=comp2
ROOT tuple = (f32[], f32[]) tuple(fusion.1, fusion.2)
Expand Down Expand Up @@ -362,14 +362,14 @@ TEST_F(FusionMergerTest, WillMergeReduceNotTooUnfriendlyLayouts) {
f2_computation {
f2_p0 = f32[16,16,256]{2,1,0} parameter(0)
f2_zero = f32[] constant(0)
ROOT f2_root = f32[] reduce(f2_p0, f2_zero), dimensions={0,1,2},
ROOT f2_root = f32[16,16] reduce(f2_p0, f2_zero), dimensions={2},
to_apply=add_computation
}
ENTRY entry {
p0 = f32[16,16,256]{0,1,2} parameter(0)
f1 = f32[16,16,256]{2,1,0} fusion(p0), kind=kLoop, calls=f1_computation
ROOT f2 = f32[] fusion(f1), kind=kInput, calls=f2_computation
ROOT f2 = f32[16,16] fusion(f1), kind=kInput, calls=f2_computation
})")
.value();
EXPECT_TRUE(fusion_merger_.Run(module.get()).value());
Expand Down Expand Up @@ -685,6 +685,8 @@ ENTRY entry {
}
)")
.value();
auto& debug_options = module->mutable_config().mutable_debug_options();
debug_options.set_xla_gpu_mlir_emitter_level(3);
EXPECT_TRUE(fusion_merger_.Run(module.get()).value());
}

Expand Down Expand Up @@ -995,6 +997,8 @@ ENTRY e {
}
)")
.value();
auto& debug_options = module->mutable_config().mutable_debug_options();
debug_options.set_xla_gpu_mlir_emitter_level(3);
EXPECT_FALSE(fusion_merger_.Run(module.get()).value());
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -300,12 +300,12 @@ PartitionedComputation::PartitionedComputation(
absl::StrJoin(roots, "_", [](std::string* out, const auto* root) {
absl::StrAppend(out, root->name());
})));
subgraphs_.push_back(
Subgraph{.name = std::move(name),
.instructions = {instructions.begin(), instructions.end()},
.roots = std::move(roots),
.index_ranges = std::move(ranges),
.root_indexing = std::move(root_indexing)});
subgraphs_.push_back(Subgraph{
/* .name = */ std::move(name),
/* .instructions = */ {instructions.begin(), instructions.end()},
/* .roots = */ std::move(roots),
/* .index_ranges = */ std::move(ranges),
/* .root_indexing = */ std::move(root_indexing)});
}

for (const auto& subgraph : subgraphs_) {
Expand Down
Loading

0 comments on commit 27df0fd

Please sign in to comment.