Question concerning assignment 4, with respect to Section 4
#32
Ja1Zhou started this conversation in Show and tell
-
Hi, my output is similar to the reference result (`res`), but I can't vectorize my code either. Here it is:
```python
def conv2d_1(Input, filter):
    return tvm.topi.nn.conv2d(Input, filter, 1, 0, 1)

def add(Input, bias):
    return tvm.topi.add(Input, bias)

def relu_2(Input):
    return tvm.topi.nn.relu(Input)

def maxPool_3(Input):
    # return tvm.topi.nn.pool2d(Input, (2, 2), (0, 0), 1, (0, 0, 0, 0), 'max', False, 'NCHW', False)
    return tvm.topi.nn.pool2d(data=Input, kernel=[2, 2], dilation=(1, 1),
                              stride=[2, 2], padding=[0, 0, 0, 0], pool_type='max')

def flatten_4(Input):
    return tvm.topi.nn.flatten(Input)

def linear_5(Input, weight, bias):
    lv5_0 = tvm.topi.nn.dense(Input, weight)
    return tvm.topi.add(lv5_0, bias)

def relu_6(Input):
    return tvm.topi.nn.relu(Input)

def linear_7(Input, weight, bias):
    lv7_0 = tvm.topi.nn.dense(Input, weight)
    return tvm.topi.add(lv7_0, bias)

def softMax_8(Input):
    return tvm.topi.nn.softmax(Input, axis=-1)
```
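A quick note on the reshapes in the builder below: `topi.nn.dense` takes the weight as `(out_features, in_features)` and computes `Input @ weight.T`, and `topi.add` broadcasts numpy-style, which is why the biases are reshaped to `(1, out_features)` and the conv bias to `(1, 32, 1, 1)`. A minimal numpy sketch of the broadcasting (the concrete shapes are assumptions based on this model):

```python
import numpy as np

conv_out = np.zeros((4, 32, 26, 26), dtype="float32")       # conv2d output, NCHW
conv_bias = np.zeros(32, dtype="float32").reshape(1, 32, 1, 1)
assert (conv_out + conv_bias).shape == (4, 32, 26, 26)      # broadcasts over N, H, W

dense_out = np.zeros((4, 100), dtype="float32")             # linear0 output
dense_bias = np.zeros(100, dtype="float32").reshape(1, 100)
assert (dense_out + dense_bias).shape == (4, 100)           # broadcasts over the batch
```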
```python
def create_model_via_emit_te_4():
    bb = relax.BlockBuilder()
    x = relax.Var("x", input_shape, relax.DynTensorType(batch_size, "float32"))
    conv2d_weight = relax.const(weight_map["conv2d_weight"], "float32")
    conv2d_bias = relax.const(weight_map["conv2d_bias"].reshape(1, 32, 1, 1), "float32")
    linear0_weight = relax.const(weight_map["linear0_weight"], "float32")
    linear0_bias = relax.const(weight_map["linear0_bias"].reshape(1, 100), "float32")
    linear1_weight = relax.const(weight_map["linear1_weight"], "float32")
    linear1_bias = relax.const(weight_map["linear1_bias"].reshape(1, 10), "float32")
    with bb.function("main", [x]):
        with bb.dataflow():
            lv1_0 = bb.emit_te(conv2d_1, x, conv2d_weight)
            lv1 = bb.emit_te(add, lv1_0, conv2d_bias)
            lv2 = bb.emit_te(relu_2, lv1)
            lv3 = bb.emit_te(maxPool_3, lv2)
            lv4 = bb.emit_te(flatten_4, lv3)
            lv5 = bb.emit_te(linear_5, lv4, linear0_weight, linear0_bias)
            lv6 = bb.emit_te(relu_6, lv5)
            lv7 = bb.emit_te(linear_7, lv6, linear1_weight, linear1_bias)
            lv8 = bb.emit_te(softMax_8, lv7)
            gv = bb.emit_output(lv8)
        bb.emit_func_output(gv)
    return bb.get()
```
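Before scheduling, I check numerical correctness with a sketch like the following (hedged: it assumes the course's mlc-ai nightly build, where `relax.vm.build` and `relax.VirtualMachine` work as in the notebooks, and that the input image `img` and the reference output `res` from the assignment are in scope):

```python
import numpy as np

# Build the unscheduled module, run it on the Relax VM, compare with the reference.
mod = create_model_via_emit_te_4()
ex = relax.vm.build(mod, target="llvm")
vm = relax.VirtualMachine(ex, tvm.cpu())
out = vm["main"](tvm.nd.array(img))
np.testing.assert_allclose(out.numpy(), res, rtol=1e-4, atol=1e-4)
```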
```python
mod = create_model_via_emit_te_4()
sch = tvm.tir.Schedule(mod)

# Step 1. Get blocks
# block = sch.get_block(name="your_block_name", func_name="your_function_name")
# block = sch.get_block("root", "conv2d_1")

# Step 2. Inline the padding block (if it exists)
pad_temp = sch.get_block("pad_temp", "conv2d_1")
sch.compute_inline(pad_temp)

# Step 3. Get loops
conv = sch.get_block("conv2d_nchw", "conv2d_1")

# Step 4. Organize the loops
i0, i1, i2, i3, i4, i5, i6 = sch.get_loops(conv)
i0_0, i0_1 = sch.split(i0, factors=[2, 2])
i1_0, i1_1 = sch.split(i1, factors=[None, 4])
i2_0, i2_1 = sch.split(i2, factors=[None, 2])
i3_0, i3_1 = sch.split(i3, factors=[None, 2])
sch.reorder(i0_0, i1_0, i2_0, i3_0, i4, i5, i6, i0_1, i1_1, i2_1, i3_1)
i0_0, i1_0, i2_0, i3_0, i4, i5, i6, i0_1, i1_1, i2_1, i3_1 = sch.get_loops(conv)
sch.fuse(i0_0, i1_0, i2_0, i3_0)
i0_0_i1_0_i2_0_i3_0_fused, i4, i5, i6, i0_1, i1_1, i2_1, i3_1 = sch.get_loops(conv)
sch.parallel(i0_0_i1_0_i2_0_i3_0_fused)
# i0_i2_i3_fused, i4, i5, i6, i1 = sch.get_loops(conv)
# sch.parallel(i0_i2_i3_fused)

# Step 5. Decompose the reduction
sch.decompose_reduction(conv, i4)

# Step 6. fuse + vectorize / fuse + parallel / fuse + unroll
conv_init = sch.get_block("conv2d_nchw_init", "conv2d_1")
i0_0_i1_0_i2_0_i3_0_fused, i0_1_init, i1_1_init, i2_1_init, i3_1_init = sch.get_loops(conv_init)
sch.fuse(i0_1_init, i1_1_init)
sch.fuse(i2_1_init, i3_1_init)
i0_0_i1_0_i2_0_i3_0_fused, i0_1_init_i1_1_init_fused, i2_1_init_i3_1_init_fused = sch.get_loops(conv_init)
sch.unroll(i0_1_init_i1_1_init_fused)
sch.vectorize(i2_1_init_i3_1_init_fused)

conv_update = sch.get_block("conv2d_nchw_update", "conv2d_1")
i0_0_i1_0_i2_0_i3_0_fused, i4, i5, i6, i0_1, i1_1, i2_1, i3_1 = sch.get_loops(conv_update)
sch.fuse(i0_1, i1_1)
sch.fuse(i2_1, i3_1)
i0_0_i1_0_i2_0_i3_0_fused, i4, i5, i6, i0_1_i1_1_fused, i2_1_i3_1_fused = sch.get_loops(conv_update)
sch.unroll(i0_1_i1_1_fused)
# sch.vectorize(i2_1_i3_1_fused)  # vectorizing this fused loop fails, so I fall back to unroll
sch.unroll(i2_1_i3_1_fused)

IPython.display.HTML(code2html(sch.mod.script()))
```

The resulting TIR for `conv2d_1`:
```python
@tir.prim_func
def conv2d_1(rxplaceholder: tir.Buffer[(4, 1, 28, 28), "float32"],
             rxplaceholder_1: tir.Buffer[(32, 1, 3, 3), "float32"],
             conv2d_nchw: tir.Buffer[(4, 32, 26, 26), "float32"]) -> None:
    # function attr dict
    tir.func_attr({"global_symbol": "conv2d_1", "tir.noalias": True})
    # body
    # with tir.block("root")
    for i0_0_i1_0_i2_0_i3_0_fused in tir.parallel(2704):
        for i0_1_init_i1_1_init_fused in tir.unroll(8):
            for i2_1_init_i3_1_init_fused in tir.vectorized(4):
                with tir.block("conv2d_nchw_init"):
                    nn = tir.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_init_i1_1_init_fused // 4)
                    ff = tir.axis.spatial(32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_init_i1_1_init_fused % 4)
                    yy = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_init_i3_1_init_fused // 2)
                    xx = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_init_i3_1_init_fused % 2)
                    tir.reads()
                    tir.writes(conv2d_nchw[nn, ff, yy, xx])
                    conv2d_nchw[nn, ff, yy, xx] = tir.float32(0)
        for i4, i5, i6 in tir.grid(1, 3, 3):
            for i0_1_i1_1_fused in tir.unroll(8):
                for i2_1_i3_1_fused in tir.unroll(4):
                    with tir.block("conv2d_nchw_update"):
                        nn = tir.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_i1_1_fused // 4)
                        ff = tir.axis.spatial(32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_i1_1_fused % 4)
                        yy = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_i3_1_fused // 2)
                        xx = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_i3_1_fused % 2)
                        rc, ry, rx = tir.axis.remap("RRR", [i4, i5, i6])
                        tir.reads(conv2d_nchw[nn, ff, yy, xx], rxplaceholder[nn, rc, yy + ry, xx + rx], rxplaceholder_1[ff, rc, ry, rx])
                        tir.writes(conv2d_nchw[nn, ff, yy, xx])
                        conv2d_nchw[nn, ff, yy, xx] = conv2d_nchw[nn, ff, yy, xx] + rxplaceholder[nn, rc, yy + ry, xx + rx] * rxplaceholder_1[ff, rc, ry, rx]
```
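As a side note on the schedule above: `sch.split` and `sch.fuse` already return the new loop handles (a list of `LoopRV`s and a single fused `LoopRV`, respectively), so the repeated `sch.get_loops` calls are not strictly needed. Step 4 could be written more compactly as:

```python
# Equivalent, more compact form of Step 4: use the LoopRVs returned by split/fuse.
i0, i1, i2, i3, i4, i5, i6 = sch.get_loops(conv)
i0_0, i0_1 = sch.split(i0, factors=[2, 2])
i1_0, i1_1 = sch.split(i1, factors=[None, 4])
i2_0, i2_1 = sch.split(i2, factors=[None, 2])
i3_0, i3_1 = sch.split(i3, factors=[None, 2])
sch.reorder(i0_0, i1_0, i2_0, i3_0, i4, i5, i6, i0_1, i1_1, i2_1, i3_1)
outer = sch.fuse(i0_0, i1_0, i2_0, i3_0)  # fuse returns the fused loop directly
sch.parallel(outer)
```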
-
Hi, thanks again for helping out. I had some difficulties while working through this section. One question relates to the following description in the markdown file:

I checked the documentation of `topi.nn.conv2d` here and found that the bias has to be added as a separate operation, so I keep getting a `T_add` block that I cannot reduce. Moreover, I was unable to `parallelize` or `vectorize` the fused loops in the conv2d block. I tried removing the bias addition that follows for debugging, but that didn't work either. The best I could get was to `unroll` the loop. I would be very grateful if you could answer my question! Here is my toy code for trying out the transformations:

Outputs:
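One variant I have been sketching for the `T_add` problem (hedged, not verified against the grader; `conv2d_bias` and the block/loop names below are my assumptions): the bias add cannot be folded into the reduction itself, since the bias would then be accumulated once per reduction step, but if the conv and the add are emitted in a single `emit_te` call, the `T_add` block lands in the same PrimFunc and can be moved under the conv's outer loop with `reverse_compute_at` and scheduled there:

```python
# Hedged sketch: put conv + bias add into one PrimFunc, then co-schedule them.
def conv2d_bias(Input, weight, bias):
    conv = tvm.topi.nn.conv2d(Input, weight, 1, 0, 1)
    return tvm.topi.add(conv, bias)  # produces the T_add block in the same PrimFunc

# ... after bb.emit_te(conv2d_bias, x, conv2d_weight, conv2d_bias_const) and mod = bb.get():
sch = tvm.tir.Schedule(mod)
conv = sch.get_block("conv2d_nchw", "conv2d_bias")
add = sch.get_block("T_add", "conv2d_bias")
outer = sch.get_loops(conv)[0]
sch.reverse_compute_at(add, outer)  # compute the bias add right after each outer iteration
```

On the `vectorize` failure: I don't have the exact error in front of me, but one pattern I noticed is that fused loops whose block bindings involve `//` and `%` (as in the update block above) are harder to vectorize than a plain innermost spatial loop with unit-stride access, so vectorizing the unsplit `i3_1` directly might be worth trying.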