
Add support for some empty fusion (#1981)
zasdfgbnm authored Sep 14, 2022
1 parent eabe8d8 commit 634820c
Showing 4 changed files with 133 additions and 7 deletions.
5 changes: 0 additions & 5 deletions torch/csrc/jit/codegen/cuda/executor_utils.cpp
@@ -825,11 +825,6 @@ void bindInputForExprEvaluation(

       const auto value =
           root_domain[dim]->hasExpandedExtent() ? 1 : tensor_arg_size;
-      if (value == 0 && cg_tensor->uses().empty()) {
-        // If there are no uses, ignore the size-0 dimension.
-        continue;
-      }
-      TORCH_INTERNAL_ASSERT(value != 0, "Cannot handle size-0 dimensions");
       bool should_bind = true;
       if (check_consistency) {
         const auto prev_value = expr_eval.evaluate(extent);
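With the guard and assert above removed, size-0 extents now bind like any other value. A minimal standalone sketch of the resulting binding behavior (hypothetical names, not the real nvFuser API):

#include <cassert>
#include <cstdint>
#include <map>
#include <string>

using ExtentMap = std::map<std::string, int64_t>;

// Bind a concrete extent value, checking consistency against any earlier
// binding. There is no longer a `value != 0` assert, so empty dimensions
// are accepted.
void bindExtent(ExtentMap& eval, const std::string& extent, int64_t value) {
  auto it = eval.find(extent);
  if (it != eval.end()) {
    assert(it->second == value && "inconsistent extent binding");
    return;
  }
  eval[extent] = value;
}

int main() {
  ExtentMap eval;
  bindExtent(eval, "T0.size[0]", 0); // size-0 dimension now binds cleanly
  bindExtent(eval, "T0.size[1]", 8);
  return 0;
}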
6 changes: 6 additions & 0 deletions torch/csrc/jit/codegen/cuda/kernel_cache.cpp
@@ -663,6 +663,12 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
     const auto iter = output_holder.find(output);
     if (iter != output_holder.end()) {
       fusion_outputs.push_back(iter->second);
+    } else if (output->isFusionInput()) {
+      const auto iter = tensor_map.find(output);
+      TORCH_INTERNAL_ASSERT(
+          iter != tensor_map.end(), "Cannot find output as aliased input");
+      auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
+      fusion_outputs.push_back(arg->getTensor());
     } else {
       bool empty_type_check = output->getDataType().has_value() &&
           output->getDataType().value() == DataType::Float;
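The new else-if branch above handles fusions whose outputs alias fusion inputs: the tensor is fetched from the input map rather than from the kernel's output holder. A self-contained sketch of the same lookup pattern (simplified types; the real code goes through TensorArgAbstract):

#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

struct Tensor {
  std::string name;
};

// Resolve each requested output: prefer a computed result, then fall back
// to the input map when the output is a forwarded fusion input.
std::vector<Tensor> resolveOutputs(
    const std::vector<std::string>& outputs,
    const std::unordered_map<std::string, Tensor>& computed,
    const std::unordered_map<std::string, Tensor>& inputs) {
  std::vector<Tensor> result;
  for (const auto& out : outputs) {
    auto it = computed.find(out);
    if (it != computed.end()) {
      result.push_back(it->second);
      continue;
    }
    auto in_it = inputs.find(out);
    assert(in_it != inputs.end() && "cannot find output as aliased input");
    result.push_back(in_it->second);
  }
  return result;
}

int main() {
  std::unordered_map<std::string, Tensor> computed;
  std::unordered_map<std::string, Tensor> inputs{{"t0", Tensor{"t0"}}};
  auto outs = resolveOutputs({"t0"}, computed, inputs);
  assert(outs.size() == 1 && outs[0].name == "t0");
  return 0;
}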
20 changes: 18 additions & 2 deletions torch/csrc/jit/codegen/cuda/scheduler/registry.cpp
@@ -866,8 +866,22 @@ class NoOpScheduler : public SchedulerEntry {
   //! Check if the no-op heuristics apply in given fusion
   static bool canScheduleCompileTime(Fusion* fusion) {
     // Check that there are no non-trivial reduction ops.
-    if (!ir_utils::getReductionOps(fusion, true /* ignore_trivial */).empty()) {
-      return false;
+    for (auto reduction :
+         ir_utils::getReductionOps(fusion, true /* ignore_trivial */)) {
+      for (auto input :
+           ir_utils::filterByType<TensorView>(reduction->inputs())) {
+        auto root_dom = input->getRootDomain();
+        auto all_nonzero =
+            std::none_of(root_dom.begin(), root_dom.end(), [](IterDomain* id) {
+              return id->extent()->isZeroInt();
+            });
+        if (all_nonzero) {
+          scheduler_debug_utils::canScheduleRejectReason(
+              ScheduleHeuristic::NoOp,
+              "reduction of non-zero elements is not supported");
+          return false;
+        }
+      }
     }
 
     // Check that all outputs are either broadcast or ignored reduction.
@@ -893,6 +907,8 @@ class NoOpScheduler : public SchedulerEntry {
             [](IterDomain* id) { return id->extent()->isZeroInt(); })) {
           // We have found an out_tv with a dimension that the NoOp scheduler
           // cannot handle, so reject this fusion.
+          scheduler_debug_utils::canScheduleRejectReason(
+              ScheduleHeuristic::NoOp, "output has a concrete dimension");
           return false;
         }
       }
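Taken together, the two checks above mean the NoOp scheduler now accepts a reduction only when its input is empty, and rejects any output with a concrete non-broadcast dimension. A sketch of the reduction-side predicate under that reading (plain int64_t extents stand in for IterDomain extents):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// A reduction can be scheduled as a no-op only if at least one root extent
// of its input is zero, i.e. the input holds no elements. This mirrors the
// std::none_of check above, which rejects when all extents are nonzero.
bool reductionIsNoOp(const std::vector<int64_t>& root_extents) {
  return std::any_of(
      root_extents.begin(), root_extents.end(), [](int64_t e) {
        return e == 0;
      });
}

int main() {
  assert(reductionIsNoOp({0, 1, 128})); // empty input: no-op is fine
  assert(!reductionIsNoOp({2, 3}));     // real reduction: must be rejected
  return 0;
}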
109 changes: 109 additions & 0 deletions torch/csrc/jit/codegen/cuda/test/test_gpu.cpp
@@ -25824,6 +25824,115 @@ TEST_F(NVFuserTest, FusionNullScheduler_CUDA) {
}
}

// Simple test case exercising the null scheduler path with an empty input:
// the size-0 leading dimension makes the tensor's numel zero, so the huge
// trailing extent is never materialized.
TEST_F(NVFuserTest, FusionNullScheduler2_CUDA) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = makeConcreteTensor({0, 1, 9223372036854775807L});
fusion->addInput(tv0);

auto tv1 = sum(tv0, {0, 1, 2});

fusion->addOutput(tv1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({0, 1, 9223372036854775807L}, options);

std::vector<IValue> aten_inputs({t0});

FusionExecutorCache executor_cache(std::move(fusion));
auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);

auto t1 = t0.sum({0, 1, 2});

testValidate(
executor_cache.fusion(), cg_outputs, {t0}, {t1}, __LINE__, __FILE__);

auto groups =
executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();

// Check that all groups on the resulting runtime are null.
for (auto group : groups) {
TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
}
}

// Simple test case exercising the null scheduler path with zero-dim
// (scalar) tensors.
TEST_F(NVFuserTest, FusionNullScheduler3_CUDA) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = TensorViewBuilder().ndims(0).build();
auto tv1 = TensorViewBuilder().ndims(0).build();
fusion->addInput(tv0);
fusion->addInput(tv1);
auto tv2 = add(tv0, tv1);
fusion->addOutput(tv2);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({}, options);
at::Tensor t1 = at::randn({}, options);

std::vector<IValue> aten_inputs({t0, t1});

FusionExecutorCache executor_cache(std::move(fusion));
auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);

testValidate(
executor_cache.fusion(),
cg_outputs,
{t0, t1},
{t0 + t1},
__LINE__,
__FILE__);

auto groups =
executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();

// Check that all groups on the resulting runtime are null.
for (auto group : groups) {
TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
}
}

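// Fusion whose outputs are all forwarded inputs, exercising the
// aliased-input output path added in kernel_cache.cpp above.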
TEST_F(NVFuserTest, FusionEmpty_CUDA) {
auto fusion = std::make_unique<Fusion>();
FusionGuard fg(fusion.get());

auto tv0 = makeConcreteTensor({10, 10, 10});
auto tv1 = makeConcreteTensor({10, 10, 10});
fusion->addInput(tv0);
fusion->addInput(tv1);
fusion->addOutput(tv0);
fusion->addOutput(tv1);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({10, 10, 10}, options);
at::Tensor t1 = at::randn({10, 10, 10}, options);

std::vector<IValue> aten_inputs({t0, t1});

FusionExecutorCache executor_cache(std::move(fusion));
auto cg_outputs = executor_cache.runFusionWithInputs(aten_inputs);

testValidate(
executor_cache.fusion(),
cg_outputs,
{t0, t1},
{t0, t1},
__LINE__,
__FILE__);

auto groups =
executor_cache.getMostRecentKernelRuntime()->fusionSegments()->groups();

// Check that all groups on the resulting runtime are null.
for (auto group : groups) {
TORCH_INTERNAL_ASSERT(group->heuristic() == ScheduleHeuristic::NoOp);
}
}

TEST_F(NVFuserTest, FusionMappingRelation_CUDA) {
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
