Question concerning assignment 4, with respect to Section 4
#32
Ja1Zhou started this conversation in Show and tell
-
Hi, my output is similar to the reference result (`res`), but I can't vectorize my code either. Here it is:
```python
def conv2d_1(Input, filter):
    return tvm.topi.nn.conv2d(Input, filter, 1, 0, 1)

def add(Input, bias):
    return tvm.topi.add(Input, bias)

def relu_2(Input):
    return tvm.topi.nn.relu(Input)

def maxPool_3(Input):
    # return tvm.topi.nn.pool2d(Input, (2, 2), (0, 0), 1, (0, 0, 0, 0), 'max', False, 'NCHW', False)
    return tvm.topi.nn.pool2d(data=Input, kernel=[2, 2], dilation=(1, 1),
                              stride=[2, 2], padding=[0, 0, 0, 0], pool_type='max')

def flatten_4(Input):
    return tvm.topi.nn.flatten(Input)

def linear_5(Input, weight, bias):
    lv5_0 = tvm.topi.nn.dense(Input, weight)
    return tvm.topi.add(lv5_0, bias)

def relu_6(Input):
    return tvm.topi.nn.relu(Input)

def linear_7(Input, weight, bias):
    lv7_0 = tvm.topi.nn.dense(Input, weight)
    return tvm.topi.add(lv7_0, bias)

def softMax_8(Input):
    return tvm.topi.nn.softmax(Input, axis=-1)
```
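A quick note on the reshapes in the builder below: `topi.nn.dense` takes the weight as `(out_features, in_features)` and computes `Input @ weight.T`, and `topi.add` broadcasts numpy-style, which is why the biases are reshaped to `(1, out_features)` and the conv bias to `(1, 32, 1, 1)`. A minimal numpy sketch of the broadcasting (the concrete shapes are assumptions based on this model):

```python
import numpy as np

conv_out = np.zeros((4, 32, 26, 26), dtype="float32")       # conv2d output, NCHW
conv_bias = np.zeros(32, dtype="float32").reshape(1, 32, 1, 1)
assert (conv_out + conv_bias).shape == (4, 32, 26, 26)      # broadcasts over N, H, W

dense_out = np.zeros((4, 100), dtype="float32")             # linear0 output
dense_bias = np.zeros(100, dtype="float32").reshape(1, 100)
assert (dense_out + dense_bias).shape == (4, 100)           # broadcasts over the batch
```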
```python
def create_model_via_emit_te_4():
    bb = relax.BlockBuilder()
    x = relax.Var("x", input_shape, relax.DynTensorType(batch_size, "float32"))
    conv2d_weight = relax.const(weight_map["conv2d_weight"], "float32")
    conv2d_bias = relax.const(weight_map["conv2d_bias"].reshape(1, 32, 1, 1), "float32")
    linear0_weight = relax.const(weight_map["linear0_weight"], "float32")
    linear0_bias = relax.const(weight_map["linear0_bias"].reshape(1, 100), "float32")
    linear1_weight = relax.const(weight_map["linear1_weight"], "float32")
    linear1_bias = relax.const(weight_map["linear1_bias"].reshape(1, 10), "float32")
    with bb.function("main", [x]):
        with bb.dataflow():
            lv1_0 = bb.emit_te(conv2d_1, x, conv2d_weight)
            lv1 = bb.emit_te(add, lv1_0, conv2d_bias)
            lv2 = bb.emit_te(relu_2, lv1)
            lv3 = bb.emit_te(maxPool_3, lv2)
            lv4 = bb.emit_te(flatten_4, lv3)
            lv5 = bb.emit_te(linear_5, lv4, linear0_weight, linear0_bias)
            lv6 = bb.emit_te(relu_6, lv5)
            lv7 = bb.emit_te(linear_7, lv6, linear1_weight, linear1_bias)
            lv8 = bb.emit_te(softMax_8, lv7)
            gv = bb.emit_output(lv8)
        bb.emit_func_output(gv)
    return bb.get()
```
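Before scheduling, I check numerical correctness with a sketch like the following (hedged: it assumes the course's mlc-ai nightly build, where `relax.vm.build` and `relax.VirtualMachine` work as in the notebooks, and that the input image `img` and the reference output `res` from the assignment are in scope):

```python
import numpy as np

# Build the unscheduled module, run it on the Relax VM, compare with the reference.
mod = create_model_via_emit_te_4()
ex = relax.vm.build(mod, target="llvm")
vm = relax.VirtualMachine(ex, tvm.cpu())
out = vm["main"](tvm.nd.array(img))
np.testing.assert_allclose(out.numpy(), res, rtol=1e-4, atol=1e-4)
```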
```python
mod = create_model_via_emit_te_4()
sch = tvm.tir.Schedule(mod)

# Step 1. Get blocks
# block = sch.get_block(name="your_block_name", func_name="your_function_name")
# block = sch.get_block("root", "conv2d_1")

# Step 2. Inline the padding block (if it exists)
pad_temp = sch.get_block("pad_temp", "conv2d_1")
sch.compute_inline(pad_temp)

# Step 3. Get loops
conv = sch.get_block("conv2d_nchw", "conv2d_1")

# Step 4. Organize the loops
i0, i1, i2, i3, i4, i5, i6 = sch.get_loops(conv)
i0_0, i0_1 = sch.split(i0, factors=[2, 2])
i1_0, i1_1 = sch.split(i1, factors=[None, 4])
i2_0, i2_1 = sch.split(i2, factors=[None, 2])
i3_0, i3_1 = sch.split(i3, factors=[None, 2])
sch.reorder(i0_0, i1_0, i2_0, i3_0, i4, i5, i6, i0_1, i1_1, i2_1, i3_1)
i0_0, i1_0, i2_0, i3_0, i4, i5, i6, i0_1, i1_1, i2_1, i3_1 = sch.get_loops(conv)
sch.fuse(i0_0, i1_0, i2_0, i3_0)
i0_0_i1_0_i2_0_i3_0_fused, i4, i5, i6, i0_1, i1_1, i2_1, i3_1 = sch.get_loops(conv)
sch.parallel(i0_0_i1_0_i2_0_i3_0_fused)
# i0_i2_i3_fused, i4, i5, i6, i1 = sch.get_loops(conv)
# sch.parallel(i0_i2_i3_fused)

# Step 5. Decompose the reduction
sch.decompose_reduction(conv, i4)

# Step 6. fuse + vectorize / fuse + parallel / fuse + unroll
conv_init = sch.get_block("conv2d_nchw_init", "conv2d_1")
i0_0_i1_0_i2_0_i3_0_fused, i0_1_init, i1_1_init, i2_1_init, i3_1_init = sch.get_loops(conv_init)
sch.fuse(i0_1_init, i1_1_init)
sch.fuse(i2_1_init, i3_1_init)
i0_0_i1_0_i2_0_i3_0_fused, i0_1_init_i1_1_init_fused, i2_1_init_i3_1_init_fused = sch.get_loops(conv_init)
sch.unroll(i0_1_init_i1_1_init_fused)
sch.vectorize(i2_1_init_i3_1_init_fused)

conv_update = sch.get_block("conv2d_nchw_update", "conv2d_1")
i0_0_i1_0_i2_0_i3_0_fused, i4, i5, i6, i0_1, i1_1, i2_1, i3_1 = sch.get_loops(conv_update)
sch.fuse(i0_1, i1_1)
sch.fuse(i2_1, i3_1)
i0_0_i1_0_i2_0_i3_0_fused, i4, i5, i6, i0_1_i1_1_fused, i2_1_i3_1_fused = sch.get_loops(conv_update)
sch.unroll(i0_1_i1_1_fused)
# sch.vectorize(i2_1_i3_1_fused)  # vectorizing this fused loop fails, so I fall back to unroll
sch.unroll(i2_1_i3_1_fused)

IPython.display.HTML(code2html(sch.mod.script()))
```

The resulting TIR for `conv2d_1`:
```python
@tir.prim_func
def conv2d_1(rxplaceholder: tir.Buffer[(4, 1, 28, 28), "float32"],
             rxplaceholder_1: tir.Buffer[(32, 1, 3, 3), "float32"],
             conv2d_nchw: tir.Buffer[(4, 32, 26, 26), "float32"]) -> None:
    # function attr dict
    tir.func_attr({"global_symbol": "conv2d_1", "tir.noalias": True})
    # body
    # with tir.block("root")
    for i0_0_i1_0_i2_0_i3_0_fused in tir.parallel(2704):
        for i0_1_init_i1_1_init_fused in tir.unroll(8):
            for i2_1_init_i3_1_init_fused in tir.vectorized(4):
                with tir.block("conv2d_nchw_init"):
                    nn = tir.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_init_i1_1_init_fused // 4)
                    ff = tir.axis.spatial(32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_init_i1_1_init_fused % 4)
                    yy = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_init_i3_1_init_fused // 2)
                    xx = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_init_i3_1_init_fused % 2)
                    tir.reads()
                    tir.writes(conv2d_nchw[nn, ff, yy, xx])
                    conv2d_nchw[nn, ff, yy, xx] = tir.float32(0)
        for i4, i5, i6 in tir.grid(1, 3, 3):
            for i0_1_i1_1_fused in tir.unroll(8):
                for i2_1_i3_1_fused in tir.unroll(4):
                    with tir.block("conv2d_nchw_update"):
                        nn = tir.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_i1_1_fused // 4)
                        ff = tir.axis.spatial(32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_i1_1_fused % 4)
                        yy = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_i3_1_fused // 2)
                        xx = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_i3_1_fused % 2)
                        rc, ry, rx = tir.axis.remap("RRR", [i4, i5, i6])
                        tir.reads(conv2d_nchw[nn, ff, yy, xx], rxplaceholder[nn, rc, yy + ry, xx + rx], rxplaceholder_1[ff, rc, ry, rx])
                        tir.writes(conv2d_nchw[nn, ff, yy, xx])
                        conv2d_nchw[nn, ff, yy, xx] = conv2d_nchw[nn, ff, yy, xx] + rxplaceholder[nn, rc, yy + ry, xx + rx] * rxplaceholder_1[ff, rc, ry, rx]
```
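As a side note on the schedule above: `sch.split` and `sch.fuse` already return the new loop handles (a list of `LoopRV`s and a single fused `LoopRV`, respectively), so the repeated `sch.get_loops` calls are not strictly needed. Step 4 could be written more compactly as:

```python
# Equivalent, more compact form of Step 4: use the LoopRVs returned by split/fuse.
i0, i1, i2, i3, i4, i5, i6 = sch.get_loops(conv)
i0_0, i0_1 = sch.split(i0, factors=[2, 2])
i1_0, i1_1 = sch.split(i1, factors=[None, 4])
i2_0, i2_1 = sch.split(i2, factors=[None, 2])
i3_0, i3_1 = sch.split(i3, factors=[None, 2])
sch.reorder(i0_0, i1_0, i2_0, i3_0, i4, i5, i6, i0_1, i1_1, i2_1, i3_1)
outer = sch.fuse(i0_0, i1_0, i2_0, i3_0)  # fuse returns the fused loop directly
sch.parallel(outer)
```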
-
Hi, thanks again for helping out. I had some difficulties while working through this section. One question relates to the following description in the markdown file:

I checked the documentation of `topi.nn.conv2d` here and found that the bias has to be added as a separate operation, so I keep getting a `T_add` block that I cannot reduce. Moreover, I was unable to `parallelize` or `vectorize` the fused loops in the conv2d block. I tried removing the bias addition that follows for debugging, but that didn't work either. The best I could get was to `unroll` the loop. I would be very grateful if you could answer my question! Here is my toy code for trying out the transformations:

Outputs:
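One variant I have been sketching for the `T_add` problem (hedged, not verified against the grader; `conv2d_bias` and the block/loop names below are my assumptions): the bias add cannot be folded into the reduction itself, since the bias would then be accumulated once per reduction step, but if the conv and the add are emitted in a single `emit_te` call, the `T_add` block lands in the same PrimFunc and can be moved under the conv's outer loop with `reverse_compute_at` and scheduled there:

```python
# Hedged sketch: put conv + bias add into one PrimFunc, then co-schedule them.
def conv2d_bias(Input, weight, bias):
    conv = tvm.topi.nn.conv2d(Input, weight, 1, 0, 1)
    return tvm.topi.add(conv, bias)  # produces the T_add block in the same PrimFunc

# ... after bb.emit_te(conv2d_bias, x, conv2d_weight, conv2d_bias_const) and mod = bb.get():
sch = tvm.tir.Schedule(mod)
conv = sch.get_block("conv2d_nchw", "conv2d_bias")
add = sch.get_block("T_add", "conv2d_bias")
outer = sch.get_loops(conv)[0]
sch.reverse_compute_at(add, outer)  # compute the bias add right after each outer iteration
```

On the `vectorize` failure: I don't have the exact error in front of me, but one pattern I noticed is that fused loops whose block bindings involve `//` and `%` (as in the update block above) are harder to vectorize than a plain innermost spatial loop with unit-stride access, so vectorizing the unsplit `i3_1` directly might be worth trying.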