Skip to content

Commit

Permalink
Merge branch 'main' into vectorized_serial_reduction
Browse files — browse the repository at this point in the history
  • Loading branch information
jacobhinkle authored Jan 8, 2024
2 parents 0ef6204 + a67bb99 commit 50186ab
Show file tree
Hide file tree
Showing 62 changed files with 2,246 additions and 423 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ list(APPEND NVFUSER_SRCS
${NVFUSER_SRCS_DIR}/scheduler/mma_utils.cpp
${NVFUSER_SRCS_DIR}/optimization/add_axioms.cpp
${NVFUSER_SRCS_DIR}/optimization/consecutive_cast.cpp
${NVFUSER_SRCS_DIR}/optimization/optimize_layout.cpp
${NVFUSER_SRCS_DIR}/optimization/mark_aliases_prepare.cpp
${NVFUSER_SRCS_DIR}/optimization/pre_segmenter.cpp
${NVFUSER_SRCS_DIR}/optimization/remove_empty.cpp
${NVFUSER_SRCS_DIR}/val_graph.cpp
Expand Down
31 changes: 21 additions & 10 deletions benchmark/matmul.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
#include <scheduler/all_schedulers.h>
#include <scheduler/matmul.h>
#include <scheduler/matmul_heuristic.h>
#include <utils.h>

#include <benchmark/benchmark.h>

Expand Down Expand Up @@ -121,14 +122,12 @@ static void SingleMatmulBase(
benchmark::State& benchmark_state,
MmaLayout layout,
MatmulParams params) {
std::vector<int64_t> input_mnk{
benchmark_state.range(0),
benchmark_state.range(1),
benchmark_state.range(2)};
int64_t m = benchmark_state.range(0);
int64_t n = benchmark_state.range(1);
int64_t k = benchmark_state.range(2);

// Tensor inputs
auto inputs =
matmulAtInput(input_mnk.at(0), input_mnk.at(1), input_mnk.at(2), layout);
auto inputs = matmulAtInput(m, n, k, layout);
auto expected_output = atMatmul(
inputs.first.to(at::kDouble), inputs.second.to(at::kDouble), layout);

Expand All @@ -155,8 +154,16 @@ static void SingleMatmulBase(
// Disable magic zero
CompileParams cparams;
cparams.enable_magic_zero = false;
// Always use 32b indexing mode for now.
cparams.index_type = PrimDataType::Int32;
KernelIndexTypeCompute index_type_helper;
index_type_helper.addDim(m, k); // A
index_type_helper.addDim(n, k); // B
index_type_helper.addDim(m, n); // D
cparams.index_type = index_type_helper.getType();
if (cparams.index_type == DataType::Int) {
// Notify as this can have a slight perf impact, but is necessary for large
// inputs
debug() << "Using int64_t as index type" << std::endl;
}

// Compile kernel
auto launch_constraints = LaunchParams();
Expand All @@ -172,7 +179,7 @@ static void SingleMatmulBase(

// Warm up run
auto outputs = fe.runFusion(aten_inputs);
checkMatch(expected_output, outputs.at(0).to(at::kDouble), input_mnk.at(2));
checkMatch(expected_output, outputs.at(0).to(at::kDouble), k);

runBenchmarkIterations(benchmark_state, &fe, aten_inputs);

Expand Down Expand Up @@ -671,7 +678,11 @@ static void MatmulShapeWarpStageAutoSplitK(benchmark::internal::Benchmark* b) {

ForAllLayouts(EagerModeBenchmark);
ForAllLayouts(NvfuserMatmulBenchmark);
ForAllLayouts(AutoSplitKBenchmark);
// Disable split-K benchmarks due to slow compilation.
// See https://github.com/NVIDIA/Fuser/issues/1389.
// These benchmarks should be enabled again after merging
// https://github.com/NVIDIA/Fuser/pull/1510
// ForAllLayouts(AutoSplitKBenchmark);
ForAllLayouts(AutoPartitionedKBenchmark);

// Note: SplitK Reduction benchmarks are parametrized only by M, N. The splitk
Expand Down
Loading

0 comments on commit 50186ab

Please sign in to comment.