-
Notifications
You must be signed in to change notification settings - Fork 52
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vectorization Factor patch for computeInfoC2P with Broadcast in mapped IterDomain #1625
Changes from 49 commits
3aced73
f66fd74
3577582
cbb3409
0aed1e2
3723352
b20537e
95d5f04
eb057e8
d7fa997
d091c9d
890be18
a971273
d57ecc1
fdb0a42
34bd074
a97a9a8
298f56f
280509b
48bde18
3dcf287
9cd6610
28389b4
99d0411
79724d6
abb44fc
a23ab05
139578f
7d2fb11
5c391fd
9050cd5
e2da518
56e17c6
9d6f425
fb3a4d8
1253946
543c08e
6a00b22
502b33d
c46d855
8b0beb8
a21873a
c95686e
c2e4b96
04f6765
800ff1a
0cb45ab
a7396cc
eb3a618
eb45314
221573a
65bef69
34772d4
d607d91
e542bb2
e5b8af4
c502fcd
0b4ad9e
5df9c74
c8ae949
77c24db
b633619
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -201,4 +201,172 @@ TEST_F(PointwiseTest, VectorizeAllocationDomain) { | |
testValidate(fusion, cg_outputs, {t0}, __LINE__, __FILE__); | ||
} | ||
|
||
// All inputs & outputs share the same allocation domain permutation from root | ||
// domain, but intermediate tv2 isn't specified a stride order. There's also a | ||
// broadcast IterDomain on tv1, which is tricky for vectorization analysis to | ||
// figure out which axes should be excluded from the computation of | ||
// vectorization factor. | ||
TEST_F(PointwiseTest, VectorizeAllocationDomainIssue1567) { | ||
auto fusion_ptr = std::make_unique<Fusion>(); | ||
auto fusion = fusion_ptr.get(); | ||
FusionGuard fg(fusion); | ||
|
||
TensorView* tv0 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({true, true, true}) | ||
.strideOrder({2, 0, 1}) | ||
.build(); | ||
TensorView* tv1 = TensorViewBuilder() | ||
.ndims(3) | ||
.shape({1, -1, 1}) | ||
.contiguity({std::nullopt, std::nullopt, true}) | ||
.strideOrder({2, 0, 1}) | ||
.build(); | ||
fusion->addInput(tv0); | ||
fusion->addInput(tv1); | ||
auto tv2 = add(tv0, tv1); | ||
auto tv3 = add(tv2, IrBuilder::create<Val>(1.0, DataType::Float)); | ||
tv3->setAllocationDomain({tv3->axis(0), tv3->axis(2), tv3->axis(1)}, true); | ||
fusion->addOutput(tv3); | ||
|
||
FusionExecutorCache fec(std::move(fusion_ptr)); | ||
fec.profile(true); | ||
|
||
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); | ||
at::Tensor t0 = at::empty_strided({1024, 128, 25}, {128*25, 1, 128}, options); | ||
at::Tensor t1 = at::empty_strided({1, 128, 1}, {128, 1, 128}, options); | ||
auto cg_outputs = fec.runFusionWithInputs({t0, t1}); | ||
EXPECT_EQ(getVecSizeForPointwise(fec), 4); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should also check that the input cache of tv0 and tv1 is vectorized? Because it is possible that, if not done correctly, only the output tv is vectorized, and inputs are just unrolled. |
||
testValidate(fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); | ||
} | ||
|
||
TEST_F(PointwiseTest, VectorizationFactorAnalysisCase0) { | ||
auto fusion_ptr = std::make_unique<Fusion>(); | ||
auto fusion = fusion_ptr.get(); | ||
FusionGuard fg(fusion); | ||
|
||
TensorView* tv0 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({true, true, std::nullopt}) | ||
.shape({-1, -1, 1}) | ||
.build(); | ||
TensorView* tv1 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({true, true, true}) | ||
.build(); | ||
fusion->addInput(tv0); | ||
fusion->addInput(tv1); | ||
auto tv2 = add(tv0, tv1); | ||
fusion->addOutput(tv2); | ||
|
||
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); | ||
at::Tensor t0 = at::randn({1024, 2, 1}, options); | ||
at::Tensor t1 = at::randn({1024, 2, 512}, options); | ||
std::vector<c10::IValue> aten_inputs = {t0, t1}; | ||
|
||
// NOTE: force pointwise scheduler here just for testing purpose | ||
auto params = getPointwiseHeuristics(fusion, aten_inputs); | ||
auto lparams = schedulePointwise(fusion, aten_inputs); | ||
FusionExecutor fe; | ||
fe.compileFusion(fusion, aten_inputs, lparams); | ||
auto cg_outputs = fe.runFusion(aten_inputs, lparams); | ||
|
||
EXPECT_EQ(params->vectorize, true); | ||
EXPECT_EQ(params->unroll_factor, 4); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should check that, in the scheduled fusion, tv0's input cache is not vectorized? |
||
|
||
testValidate(fusion, cg_outputs, aten_inputs, __LINE__, __FILE__); | ||
} | ||
|
||
TEST_F(PointwiseTest, VectorizationFactorAnalysisCase1) { | ||
auto fusion_ptr = std::make_unique<Fusion>(); | ||
auto fusion = fusion_ptr.get(); | ||
FusionGuard fg(fusion); | ||
|
||
TensorView* tv0 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({true, std::nullopt, true}) | ||
.shape({-1, 1, -1}) | ||
.build(); | ||
TensorView* tv1 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({true, true, true}) | ||
.build(); | ||
fusion->addInput(tv0); | ||
fusion->addInput(tv1); | ||
auto tv2 = add(tv0, tv1); | ||
fusion->addOutput(tv2); | ||
|
||
FusionExecutorCache fec(std::move(fusion_ptr)); | ||
fec.profile(true); | ||
|
||
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); | ||
at::Tensor t0 = at::randn({1024, 1, 2}, options); | ||
at::Tensor t1 = at::randn({1024, 512, 2}, options); | ||
auto cg_outputs = fec.runFusionWithInputs({t0, t1}); | ||
EXPECT_EQ(getVecSizeForPointwise(fec), 2); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should also check that the input cache of tv0 is vectorized? |
||
testValidate(fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); | ||
} | ||
|
||
TEST_F(PointwiseTest, VectorizationFactorAnalysisCase2) { | ||
auto fusion_ptr = std::make_unique<Fusion>(); | ||
auto fusion = fusion_ptr.get(); | ||
FusionGuard fg(fusion); | ||
|
||
TensorView* tv0 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({true, std::nullopt, true}) | ||
.shape({-1, 1, -1}) | ||
.build(); | ||
TensorView* tv1 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({true, true, true}) | ||
.build(); | ||
fusion->addInput(tv0); | ||
fusion->addInput(tv1); | ||
auto tv2 = add(tv0, tv1); | ||
auto tv3 = transpose(tv2, 0, 1); | ||
fusion->addOutput(tv3); | ||
|
||
FusionExecutorCache fec(std::move(fusion_ptr)); | ||
fec.profile(true); | ||
|
||
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); | ||
at::Tensor t0 = at::randn({1024, 1, 2}, options); | ||
at::Tensor t1 = at::randn({1024, 512, 2}, options); | ||
auto cg_outputs = fec.runFusionWithInputs({t0, t1}); | ||
EXPECT_EQ(getVecSizeForPointwise(fec), 4); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should also check that the input cache of tv0 is vectorized? |
||
testValidate(fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); | ||
} | ||
|
||
TEST_F(PointwiseTest, VectorizationFactorAnalysisCase3) { | ||
auto fusion_ptr = std::make_unique<Fusion>(); | ||
auto fusion = fusion_ptr.get(); | ||
FusionGuard fg(fusion); | ||
|
||
TensorView* tv0 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({std::nullopt, true, true}) | ||
.shape({1, -1, -1}) | ||
.build(); | ||
TensorView* tv1 = TensorViewBuilder() | ||
.ndims(3) | ||
.contiguity({true, true, true}) | ||
.build(); | ||
fusion->addInput(tv0); | ||
fusion->addInput(tv1); | ||
auto tv2 = add(tv0, tv1); | ||
auto tv3 = transpose(tv2, 0, 1); | ||
fusion->addOutput(tv3); | ||
|
||
FusionExecutorCache fec(std::move(fusion_ptr)); | ||
fec.profile(true); | ||
|
||
auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0); | ||
at::Tensor t0 = at::randn({1, 1024, 2}, options); | ||
at::Tensor t1 = at::randn({512, 1024, 2}, options); | ||
auto cg_outputs = fec.runFusionWithInputs({t0, t1}); | ||
EXPECT_EQ(getVecSizeForPointwise(fec), 2); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think we should also check that the input cache of tv0 is vectorized? |
||
testValidate(fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__); | ||
} | ||
|
||
} // namespace nvfuser |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it be better to iterate from
from_ids.size()
to 0?

There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
🤕 Good catch! I don't know why I was just mindlessly copying the old behavior.