Vectorization Factor patch for computeInfoC2P with Broadcast in mapped IterDomain #1625

Merged
merged 62 commits into from
Feb 1, 2024
Changes from 49 commits
Commits (62)
3aced73
quick hack to see the effect
jjsjann123 Jan 11, 2024
f66fd74
WIP to enable rfactor reorder
jjsjann123 Jan 11, 2024
3577582
fixing build
jjsjann123 Jan 11, 2024
cbb3409
fixing build
jjsjann123 Jan 11, 2024
0aed1e2
debug print
jjsjann123 Jan 12, 2024
3723352
fixing print out
jjsjann123 Jan 12, 2024
b20537e
Merge remote-tracking branch 'origin/main' into issue_1567
jjsjann123 Jan 12, 2024
95d5f04
fixing scheduling to use reordered root/rfactor
jjsjann123 Jan 12, 2024
eb057e8
quick fix
jjsjann123 Jan 12, 2024
d7fa997
Merge remote-tracking branch 'origin/main' into issue_1567
jjsjann123 Jan 12, 2024
d091c9d
clangtidy/clangformat
jjsjann123 Jan 12, 2024
890be18
more comments
jjsjann123 Jan 12, 2024
a971273
removing old code
jjsjann123 Jan 12, 2024
d57ecc1
test added
jjsjann123 Jan 12, 2024
fdb0a42
fix
jjsjann123 Jan 12, 2024
34bd074
adding test case
jjsjann123 Jan 13, 2024
a97a9a8
fixing test
jjsjann123 Jan 13, 2024
298f56f
test added
jjsjann123 Jan 13, 2024
280509b
refactoring broadcast checks in vectorization analysis
jjsjann123 Jan 15, 2024
48bde18
fixing tests
jjsjann123 Jan 15, 2024
3dcf287
Apply suggestions from code review
jjsjann123 Jan 16, 2024
9cd6610
Merge remote-tracking branch 'origin/main' into issue_1567_part2
jjsjann123 Jan 16, 2024
28389b4
modifying tests to reflect intermdiate without alloc_domain
jjsjann123 Jan 16, 2024
99d0411
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Jan 16, 2024
79724d6
Xiang's suggestion to remove reorder_map from heuristics, as it's a c…
jjsjann123 Jan 16, 2024
abb44fc
renaming reference
jjsjann123 Jan 16, 2024
a23ab05
remove profiling flag
jjsjann123 Jan 16, 2024
139578f
:evert "remove profiling flag"
jjsjann123 Jan 16, 2024
7d2fb11
Revert "refactoring broadcast checks in vectorization analysis"
jjsjann123 Jan 16, 2024
5c391fd
refactoring the skip
jjsjann123 Jan 16, 2024
9050cd5
fixing logic
jjsjann123 Jan 17, 2024
e2da518
Merge remote-tracking branch 'origin/issue_1567' into HEAD
jjsjann123 Jan 17, 2024
56e17c6
CLANGFORMAT
jjsjann123 Jan 17, 2024
9d6f425
using data_cache as suggested
jjsjann123 Jan 17, 2024
fb3a4d8
clangformat
jjsjann123 Jan 17, 2024
1253946
err
jjsjann123 Jan 17, 2024
543c08e
fixing build maybe?!
jjsjann123 Jan 17, 2024
6a00b22
CLANGFORMAT
jjsjann123 Jan 17, 2024
502b33d
Merge remote-tracking branch 'origin/issue_1567' into HEAD
jjsjann123 Jan 17, 2024
c46d855
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Jan 17, 2024
8b0beb8
more comment and renaming tests
jjsjann123 Jan 17, 2024
a21873a
Update csrc/scheduler/vectorize_helper.cpp
jjsjann123 Jan 18, 2024
c95686e
fix bug found by Naoya
jjsjann123 Jan 18, 2024
c2e4b96
clangformat
jjsjann123 Jan 18, 2024
04f6765
Merge remote-tracking branch 'origin/main' into HEAD
jjsjann123 Jan 26, 2024
800ff1a
refactor the fix
jjsjann123 Jan 26, 2024
0cb45ab
fixing build, signed/unsigned int compare
jjsjann123 Jan 26, 2024
a7396cc
adding Xiang's test suggestions
jjsjann123 Jan 26, 2024
eb3a618
force scheduling for test case0
jjsjann123 Jan 26, 2024
eb45314
Update csrc/scheduler/vectorize_helper.cpp
jjsjann123 Jan 26, 2024
221573a
fixing tests
jjsjann123 Jan 26, 2024
65bef69
Merge remote-tracking branch 'origin/issue_1567_part2' into HEAD
jjsjann123 Jan 26, 2024
34772d4
code cleanup
jjsjann123 Jan 26, 2024
d607d91
fixing logic
jjsjann123 Jan 26, 2024
e542bb2
tabs
jjsjann123 Jan 26, 2024
e5b8af4
tabs
jjsjann123 Jan 26, 2024
c502fcd
renaming tests
jjsjann123 Jan 26, 2024
0b4ad9e
something is funny with my vim setup here
jjsjann123 Jan 26, 2024
5df9c74
Merge branch 'main' into issue_1567_part2
jjsjann123 Jan 30, 2024
c8ae949
update tests
jjsjann123 Jan 30, 2024
77c24db
updating tests
jjsjann123 Jan 30, 2024
b633619
Merge branch 'main' into issue_1567_part2
jjsjann123 Feb 1, 2024
51 changes: 27 additions & 24 deletions csrc/scheduler/vectorize_helper.cpp
@@ -436,6 +436,9 @@ ContiguousInnerDimensionsMapper::computeInfoC2P(
std::shared_ptr<MaxInfoSpanningTree::Information> from_info) {
auto from_ids = std::dynamic_pointer_cast<const MappedDomain>(from_info)
->mapped_root_ids_;
// When we propagate, we should check the resolved broadcast in the order of
// mapped from_ids.
//
// If we have a case where we have a concretized broadcast that's being
// tracked in a consumer but not concretized in the producer we should break
// off the dimensions connected to the left of that dimension. So if we have:
@@ -445,29 +448,32 @@ ContiguousInnerDimensionsMapper::computeInfoC2P(
// T3[i0, i1, i2] = T1 + T2
// and we're propagating from T3 with {i0, i1, i2}
// When we go from T3 to T0, we don't have any mechanism to understand that i0
// and i2 are not contiguous in the original domain of T3. It's not ideal with
// transpose, but when this happens we'll clear all dimensions mapped left of
// the concretized broadcast.
// So if we have:
// T0[i1, i2]
// T1[b0, i1, i2] = broadcast(T0)
// T2[i1, b0, i2] = transpose(T1)
// T3[i1, i0, i2]
// T4[i1, i0, i2] = T2 + T3
// T5[i0, i1, i2] = transpose(T4)
// Then i1 and i2 are contiguous in both T0 and T5, but due to the realization
// of the broadcast on T4 we will have removed i1 from the mapped set.
// and i2 are not contiguous in the original domain of T3.
//
// Another example: if the last broadcast dimension resolved in the
// consumer's root domain is mapped for vectorization, the merge order of
// the vectorization axes matters.
//
// T0[i0, i1]
// T1[i0, i1, b2] = broadcast(T0)
// T2[i0, i1, i3]
// T3[i0, i1, i2] = T1 + T2
//
// If the mapped ids are {i0, i2, i1}, when propagating from T3 to T1, the
// resolved broadcast iterdomain is `i2`/`b2`, which would give clear_pos=1.
// So we'll skip all from_ids with index < clear_pos. see issue:
// https://github.com/NVIDIA/Fuser/issues/1567#issuecomment-1894605385
PairwiseRootDomainMap root_map(to, from);
auto c2p_map = root_map.mapConsumerToProducer();

// Id's in consumer to clear from the mapped set due to broadcast
// concretization.
std::unordered_set<IterDomain*> consumer_ids_to_clear;
int clear_pos = -1;
if (to->hasBroadcast()) {
// Find the last broadcast dimension resolved in consumers root domain
int clear_pos = -1;
for (auto i : c10::irange(from->getRootDomain().size())) {
auto c_id = from->getRootDomain()[i];
// Find the last broadcast dimension resolved in consumers through from_ids
for (auto i : c10::irange(from_ids.size())) {
Collaborator:
Would it be better to iterate from from_ids.size() to 0?

Collaborator Author:
🤕 Good catch! I don't know why I was just mindlessly copying the old behavior.
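
For reference, a minimal sketch of the reverse scan suggested above (not the merged code; the resolved-broadcast condition is an assumption taken from the file comment, i.e. the consumer id is concretized while its mapped producer id is still a broadcast):

// Hypothetical reverse-scan variant: walk from_ids from innermost to
// outermost and stop at the first resolved broadcast, so the remaining
// outer positions never need to be visited.
int clear_pos = -1;
for (int64_t i = (int64_t)from_ids.size() - 1; i >= 0; --i) {
  auto c_id = from_ids[i];
  auto c_it = c2p_map.find(c_id);
  if (c_it == c2p_map.end()) {
    continue;
  }
  // Assumed resolved-broadcast check: consumer concretized, producer broadcast.
  if (!c_id->isBroadcast() && c_it->second->isBroadcast()) {
    clear_pos = (int)i;
    break;
  }
}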

auto c_id = from_ids[i];
auto c_it = c2p_map.find(c_id);
if (c_it == c2p_map.end()) {
continue;
@@ -477,17 +483,14 @@ ContiguousInnerDimensionsMapper::computeInfoC2P(
clear_pos = (int)i;
}
}
// Clear everything to the left of the inner most resolved broadcast
// dimension, including the broadcasted domain.
if (clear_pos >= 0) {
consumer_ids_to_clear.insert(
from->getRootDomain().begin(),
from->getRootDomain().begin() + clear_pos + 1);
}
}

std::vector<IterDomain*> producer_rfactor_ids;
for (auto from_id : from_ids) {
for (int64_t i : c10::irange(from_ids.size())) {
if (i < clear_pos) {
continue;
}
auto from_id = from_ids[i];
auto c2p_it = c2p_map.find(from_id);
if (c2p_it != c2p_map.end() &&
consumer_ids_to_clear.find(c2p_it->first) ==
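
Read together, the patched propagation path in this hunk amounts to roughly the following. This is a sketch assembled from the visible diff lines, not a verbatim copy of the file: the resolved-broadcast condition elided above is filled in from the file comment, and the final collection step is simplified.

PairwiseRootDomainMap root_map(to, from);
auto c2p_map = root_map.mapConsumerToProducer();

// Innermost position in from_ids whose producer counterpart is still a
// broadcast, i.e. a broadcast concretized in the consumer only.
int clear_pos = -1;
if (to->hasBroadcast()) {
  for (auto i : c10::irange(from_ids.size())) {
    auto c_id = from_ids[i];
    auto c_it = c2p_map.find(c_id);
    if (c_it == c2p_map.end()) {
      continue;
    }
    // Assumed condition, per the comment above.
    if (!c_id->isBroadcast() && c_it->second->isBroadcast()) {
      clear_pos = (int)i;
    }
  }
}

// Everything to the left of that position cannot contribute to the
// vectorized extent, so it is skipped when collecting producer ids.
std::vector<IterDomain*> producer_rfactor_ids;
for (int64_t i : c10::irange((int64_t)from_ids.size())) {
  if (i < clear_pos) {
    continue;
  }
  auto c2p_it = c2p_map.find(from_ids[i]);
  if (c2p_it == c2p_map.end()) {
    continue;
  }
  // Simplified: the real code projects this mapped producer root id onto the
  // producer's rfactor domain before recording it.
  producer_rfactor_ids.push_back(c2p_it->second);
}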
168 changes: 168 additions & 0 deletions test/test_pointwise.cpp
@@ -201,4 +201,172 @@ TEST_F(PointwiseTest, VectorizeAllocationDomain) {
testValidate(fusion, cg_outputs, {t0}, __LINE__, __FILE__);
}

// All inputs & outputs share the same allocation domain permutation from the
// root domain, but the intermediate tv2 doesn't have a stride order
// specified. There's also a broadcast IterDomain on tv1, which makes it
// tricky for vectorization analysis to figure out which axes should be
// excluded from the computation of the vectorization factor.
TEST_F(PointwiseTest, VectorizeAllocationDomainIssue1567) {
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

TensorView* tv0 = TensorViewBuilder()
.ndims(3)
.contiguity({true, true, true})
.strideOrder({2, 0, 1})
.build();
TensorView* tv1 = TensorViewBuilder()
.ndims(3)
.shape({1, -1, 1})
.contiguity({std::nullopt, std::nullopt, true})
.strideOrder({2, 0, 1})
.build();
fusion->addInput(tv0);
fusion->addInput(tv1);
auto tv2 = add(tv0, tv1);
auto tv3 = add(tv2, IrBuilder::create<Val>(1.0, DataType::Float));
tv3->setAllocationDomain({tv3->axis(0), tv3->axis(2), tv3->axis(1)}, true);
fusion->addOutput(tv3);

FusionExecutorCache fec(std::move(fusion_ptr));
fec.profile(true);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::empty_strided({1024, 128, 25}, {128*25, 1, 128}, options);
at::Tensor t1 = at::empty_strided({1, 128, 1}, {128, 1, 128}, options);
auto cg_outputs = fec.runFusionWithInputs({t0, t1});
EXPECT_EQ(getVecSizeForPointwise(fec), 4);
Collaborator:
I think we should also check that the input cache of tv0 and tv1 is vectorized? Because it is possible that, if not done correctly, only the output tv is vectorized, and inputs are just unrolled.
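
One way to express that check (a sketch, not code from this PR; it assumes the test file's existing includes plus <algorithm>, and that helpers such as TensorView::getLeafDomain() and ir_utils::consumerTvsOf() are available in this version of the codebase):

// Hypothetical helper: true if any leaf IterDomain of `tv` is parallelized
// with ParallelType::Vectorize.
auto isVectorized = [](TensorView* tv) {
  const auto& leaf = tv->getLeafDomain();
  return std::any_of(leaf.begin(), leaf.end(), [](IterDomain* id) {
    return id->getParallelType() == ParallelType::Vectorize;
  });
};
// In a scheduled fusion, the input caches are the TensorViews produced by
// cacheAfter(), i.e. the direct consumers of tv0/tv1 (assumption):
//   TensorView* tv0_cache = ir_utils::consumerTvsOf(tv0).at(0);
//   TensorView* tv1_cache = ir_utils::consumerTvsOf(tv1).at(0);
//   EXPECT_TRUE(isVectorized(tv0_cache));
//   EXPECT_TRUE(isVectorized(tv1_cache));

For the FusionExecutorCache-based tests in this file, the check would have to run against the scheduled copy of the fusion held by the most recent kernel runtime rather than the original fusion object; the same pattern applies to the similar review comments on the cases below.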

testValidate(fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
}

TEST_F(PointwiseTest, VectorizationFactorAnalysisCase0) {
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

TensorView* tv0 = TensorViewBuilder()
.ndims(3)
.contiguity({true, true, std::nullopt})
.shape({-1, -1, 1})
.build();
TensorView* tv1 = TensorViewBuilder()
.ndims(3)
.contiguity({true, true, true})
.build();
fusion->addInput(tv0);
fusion->addInput(tv1);
auto tv2 = add(tv0, tv1);
fusion->addOutput(tv2);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({1024, 2, 1}, options);
at::Tensor t1 = at::randn({1024, 2, 512}, options);
std::vector<c10::IValue> aten_inputs = {t0, t1};

// NOTE: force pointwise scheduler here just for testing purpose
auto params = getPointwiseHeuristics(fusion, aten_inputs);
auto lparams = schedulePointwise(fusion, aten_inputs);
FusionExecutor fe;
fe.compileFusion(fusion, aten_inputs, lparams);
auto cg_outputs = fe.runFusion(aten_inputs, lparams);

EXPECT_EQ(params->vectorize, true);
EXPECT_EQ(params->unroll_factor, 4);
Collaborator:
I think we should check that, in the scheduled fusion, tv0's input cache is not vectorized?
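
For this case, where schedulePointwise() has scheduled `fusion` in place, the negative check could reuse the isVectorized helper sketched above (again an assumption that ir_utils::consumerTvsOf() resolves the input cache):

// Hypothetical: tv0 has a broadcast innermost dimension, so its cache should
// not be vectorized even though the output is.
EXPECT_FALSE(isVectorized(ir_utils::consumerTvsOf(tv0).at(0)));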


testValidate(fusion, cg_outputs, aten_inputs, __LINE__, __FILE__);
}

TEST_F(PointwiseTest, VectorizationFactorAnalysisCase1) {
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

TensorView* tv0 = TensorViewBuilder()
.ndims(3)
.contiguity({true, std::nullopt, true})
.shape({-1, 1, -1})
.build();
TensorView* tv1 = TensorViewBuilder()
.ndims(3)
.contiguity({true, true, true})
.build();
fusion->addInput(tv0);
fusion->addInput(tv1);
auto tv2 = add(tv0, tv1);
fusion->addOutput(tv2);

FusionExecutorCache fec(std::move(fusion_ptr));
fec.profile(true);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({1024, 1, 2}, options);
at::Tensor t1 = at::randn({1024, 512, 2}, options);
auto cg_outputs = fec.runFusionWithInputs({t0, t1});
EXPECT_EQ(getVecSizeForPointwise(fec), 2);
Collaborator:
I think we should also check that the input cache of tv0 is vectorized?

testValidate(fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
}

TEST_F(PointwiseTest, VectorizationFactorAnalysisCase2) {
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

TensorView* tv0 = TensorViewBuilder()
.ndims(3)
.contiguity({true, std::nullopt, true})
.shape({-1, 1, -1})
.build();
TensorView* tv1 = TensorViewBuilder()
.ndims(3)
.contiguity({true, true, true})
.build();
fusion->addInput(tv0);
fusion->addInput(tv1);
auto tv2 = add(tv0, tv1);
auto tv3 = transpose(tv2, 0, 1);
fusion->addOutput(tv3);

FusionExecutorCache fec(std::move(fusion_ptr));
fec.profile(true);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({1024, 1, 2}, options);
at::Tensor t1 = at::randn({1024, 512, 2}, options);
auto cg_outputs = fec.runFusionWithInputs({t0, t1});
EXPECT_EQ(getVecSizeForPointwise(fec), 4);
Collaborator:
I think we should also check that the input cache of tv0 is vectorized?

testValidate(fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
}

TEST_F(PointwiseTest, VectorizationFactorAnalysisCase3) {
auto fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

TensorView* tv0 = TensorViewBuilder()
.ndims(3)
.contiguity({std::nullopt, true, true})
.shape({1, -1, -1})
.build();
TensorView* tv1 = TensorViewBuilder()
.ndims(3)
.contiguity({true, true, true})
.build();
fusion->addInput(tv0);
fusion->addInput(tv1);
auto tv2 = add(tv0, tv1);
auto tv3 = transpose(tv2, 0, 1);
fusion->addOutput(tv3);

FusionExecutorCache fec(std::move(fusion_ptr));
fec.profile(true);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({1, 1024, 2}, options);
at::Tensor t1 = at::randn({512, 1024, 2}, options);
auto cg_outputs = fec.runFusionWithInputs({t0, t1});
EXPECT_EQ(getVecSizeForPointwise(fec), 2);
Collaborator:
I think we should also check that the input cache of tv0 is vectorized?

testValidate(fusion, cg_outputs, {t0, t1}, __LINE__, __FILE__);
}

} // namespace nvfuser