Question concerning assignment 4, with respect to Section 4
#32
Ja1Zhou started this conversation in Show and tell
-
Hi, my output is similar to the reference output, but I can't vectorize. My code:
```python
def conv2d_1(Input, filter):
    # strides=1, padding=0, dilation=1
    return tvm.topi.nn.conv2d(Input, filter, 1, 0, 1)

def add(Input, bias):
    return tvm.topi.add(Input, bias)

def relu_2(Input):
    return tvm.topi.nn.relu(Input)

def maxPool_3(Input):
    # 2x2 max pooling, stride 2, no padding
    return tvm.topi.nn.pool2d(data=Input, kernel=[2, 2], dilation=(1, 1),
                              stride=[2, 2], padding=[0, 0, 0, 0], pool_type='max')

def flatten_4(Input):
    return tvm.topi.nn.flatten(Input)

def linear_5(Input, weight, bias):
    lv5_0 = tvm.topi.nn.dense(Input, weight)
    return tvm.topi.add(lv5_0, bias)

def relu_6(Input):
    return tvm.topi.nn.relu(Input)

def linear_7(Input, weight, bias):
    lv7_0 = tvm.topi.nn.dense(Input, weight)
    return tvm.topi.add(lv7_0, bias)

def softMax_8(Input):
    return tvm.topi.nn.softmax(Input, axis=-1)
```
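For reference, each of these wrappers can be lowered on its own with `te.create_prim_func` to inspect the TIR it produces; a minimal sketch, assuming the shapes used in this assignment:

```python
import tvm
from tvm import te

A = te.placeholder((4, 1, 28, 28), name="A", dtype="float32")
W = te.placeholder((32, 1, 3, 3), name="W", dtype="float32")
out = conv2d_1(A, W)  # topi ops consume and produce te.Tensor
# Shows the pad_temp and conv2d_nchw blocks that the schedule below targets
print(te.create_prim_func([A, W, out]).script())
```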
```python
def create_model_via_emit_te_4():
    bb = relax.BlockBuilder()
    # note: DynTensorType's first argument is the number of dimensions
    # (4 here, which happens to equal batch_size), not the batch size itself
    x = relax.Var("x", input_shape, relax.DynTensorType(4, "float32"))
    conv2d_weight = relax.const(weight_map["conv2d_weight"], "float32")
    conv2d_bias = relax.const(weight_map["conv2d_bias"].reshape(1, 32, 1, 1), "float32")
    linear0_weight = relax.const(weight_map["linear0_weight"], "float32")
    linear0_bias = relax.const(weight_map["linear0_bias"].reshape(1, 100), "float32")
    linear1_weight = relax.const(weight_map["linear1_weight"], "float32")
    linear1_bias = relax.const(weight_map["linear1_bias"].reshape(1, 10), "float32")
    with bb.function("main", [x]):
        with bb.dataflow():
            lv1_0 = bb.emit_te(conv2d_1, x, conv2d_weight)
            lv1 = bb.emit_te(add, lv1_0, conv2d_bias)
            lv2 = bb.emit_te(relu_2, lv1)
            lv3 = bb.emit_te(maxPool_3, lv2)
            lv4 = bb.emit_te(flatten_4, lv3)
            lv5 = bb.emit_te(linear_5, lv4, linear0_weight, linear0_bias)
            lv6 = bb.emit_te(relu_6, lv5)
            lv7 = bb.emit_te(linear_7, lv6, linear1_weight, linear1_bias)
            lv8 = bb.emit_te(softMax_8, lv7)
            gv = bb.emit_output(lv8)
        bb.emit_func_output(gv)
    return bb.get()
```
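Before scheduling, the unscheduled module can be built and run to sanity-check the numbers; a sketch assuming the course's relax VM API, with `img` standing in for a float32 batch of shape (4, 1, 28, 28):

```python
ex = relax.vm.build(create_model_via_emit_te_4(), target="llvm")
vm = relax.VirtualMachine(ex, tvm.cpu())
out = vm["main"](tvm.nd.array(img))   # img: (4, 1, 28, 28) float32
print(out.numpy().argmax(axis=1))     # predicted class per image
```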
The schedule:

```python
mod = create_model_via_emit_te_4()
sch = tvm.tir.Schedule(mod)

# Step 1. Get blocks
# block = sch.get_block(name="your_block_name", func_name="your_function_name")

# Step 2. Inline the padding block (if it exists)
pad_temp = sch.get_block("pad_temp", "conv2d_1")
sch.compute_inline(pad_temp)

# Step 3. Get loops
conv = sch.get_block("conv2d_nchw", "conv2d_1")
i0, i1, i2, i3, i4, i5, i6 = sch.get_loops(conv)

# Step 4. Organize the loops: tile every spatial axis, move the inner tiles
# innermost, then fuse and parallelize the outer tiles
i0_0, i0_1 = sch.split(i0, factors=[2, 2])
i1_0, i1_1 = sch.split(i1, factors=[None, 4])
i2_0, i2_1 = sch.split(i2, factors=[None, 2])
i3_0, i3_1 = sch.split(i3, factors=[None, 2])
sch.reorder(i0_0, i1_0, i2_0, i3_0, i4, i5, i6, i0_1, i1_1, i2_1, i3_1)
outer = sch.fuse(i0_0, i1_0, i2_0, i3_0)
sch.parallel(outer)

# Step 5. Decompose reduction: split the block into an init part and an
# update part so the init can be vectorized on its own
sch.decompose_reduction(conv, i4)

# Step 6. fuse + vectorize / fuse + parallel / fuse + unroll
conv_init = sch.get_block("conv2d_nchw_init", "conv2d_1")
_, i0_1_init, i1_1_init, i2_1_init, i3_1_init = sch.get_loops(conv_init)
sch.unroll(sch.fuse(i0_1_init, i1_1_init))
sch.vectorize(sch.fuse(i2_1_init, i3_1_init))

conv_update = sch.get_block("conv2d_nchw_update", "conv2d_1")
_, i4, i5, i6, i0_1, i1_1, i2_1, i3_1 = sch.get_loops(conv_update)
sch.unroll(sch.fuse(i0_1, i1_1))
fused_inner = sch.fuse(i2_1, i3_1)
# sch.vectorize(fused_inner)  # vectorize fails here, so fall back to unroll
sch.unroll(fused_inner)

IPython.display.HTML(code2html(sch.mod.script()))
```
Output:

```python
@tir.prim_func
def conv2d_1(rxplaceholder: tir.Buffer[(4, 1, 28, 28), "float32"], rxplaceholder_1: tir.Buffer[(32, 1, 3, 3), "float32"], conv2d_nchw: tir.Buffer[(4, 32, 26, 26), "float32"]) -> None:
    # function attr dict
    tir.func_attr({"global_symbol": "conv2d_1", "tir.noalias": True})
    # body
    # with tir.block("root")
    for i0_0_i1_0_i2_0_i3_0_fused in tir.parallel(2704):
        for i0_1_init_i1_1_init_fused in tir.unroll(8):
            for i2_1_init_i3_1_init_fused in tir.vectorized(4):
                with tir.block("conv2d_nchw_init"):
                    nn = tir.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_init_i1_1_init_fused // 4)
                    ff = tir.axis.spatial(32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_init_i1_1_init_fused % 4)
                    yy = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_init_i3_1_init_fused // 2)
                    xx = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_init_i3_1_init_fused % 2)
                    tir.reads()
                    tir.writes(conv2d_nchw[nn, ff, yy, xx])
                    conv2d_nchw[nn, ff, yy, xx] = tir.float32(0)
        for i4, i5, i6 in tir.grid(1, 3, 3):
            for i0_1_i1_1_fused in tir.unroll(8):
                for i2_1_i3_1_fused in tir.unroll(4):
                    with tir.block("conv2d_nchw_update"):
                        nn = tir.axis.spatial(4, i0_0_i1_0_i2_0_i3_0_fused // 1352 * 2 + i0_1_i1_1_fused // 4)
                        ff = tir.axis.spatial(32, i0_0_i1_0_i2_0_i3_0_fused % 1352 // 169 * 4 + i0_1_i1_1_fused % 4)
                        yy = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 169 // 13 * 2 + i2_1_i3_1_fused // 2)
                        xx = tir.axis.spatial(26, i0_0_i1_0_i2_0_i3_0_fused % 13 * 2 + i2_1_i3_1_fused % 2)
                        rc, ry, rx = tir.axis.remap("RRR", [i4, i5, i6])
                        tir.reads(conv2d_nchw[nn, ff, yy, xx], rxplaceholder[nn, rc, yy + ry, xx + rx], rxplaceholder_1[ff, rc, ry, rx])
                        tir.writes(conv2d_nchw[nn, ff, yy, xx])
                        conv2d_nchw[nn, ff, yy, xx] = conv2d_nchw[nn, ff, yy, xx] + rxplaceholder[nn, rc, yy + ry, xx + rx] * rxplaceholder_1[ff, rc, ry, rx]
```
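For reference, the same split / reorder / fuse / parallel / decompose_reduction sequence can be reproduced on a toy workload first; a minimal sketch on a small matmul (arbitrary shapes, not from the assignment):

```python
import tvm
from tvm import te

# Toy workload with one reduction axis, mirroring the conv2d schedule above
A = te.placeholder((64, 64), name="A", dtype="float32")
B = te.placeholder((64, 64), name="B", dtype="float32")
k = te.reduce_axis((0, 64), name="k")
C = te.compute((64, 64), lambda i, j: te.sum(A[i, k] * B[k, j], axis=k), name="C")

sch = tvm.tir.Schedule(te.create_prim_func([A, B, C]))
blk = sch.get_block("C")
i, j, kr = sch.get_loops(blk)
j0, j1 = sch.split(j, factors=[None, 8])
sch.reorder(i, j0, kr, j1)    # spatial tiles outside, reduction in the middle
sch.parallel(sch.fuse(i, j0))
# decompose_reduction separates C_init (no reduction axes) from C_update,
# so the init's inner spatial loop becomes legal to vectorize
init = sch.decompose_reduction(blk, kr)
sch.vectorize(sch.get_loops(init)[-1])
sch.unroll(sch.get_loops(sch.get_block("C_update"))[-1])  # unroll, as above
print(sch.mod.script())
```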
-
Hi, thanks again for helping out. I had some difficulties when I was working through this section. One question connects with the following description in the markdown file: I checked the documentation of `topi.nn.conv2d` here, and found that the bias has to be added as a separate operation. Because of that I keep getting a `T_add` block that I cannot reduce (a minimal illustration is below).
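For example, a minimal sketch (assumed shapes, not my original code) of how `topi.add` lowers to its own broadcast block tagged `T_add`:

```python
import tvm
from tvm import te, topi

A = te.placeholder((4, 32, 26, 26), name="A", dtype="float32")
B = te.placeholder((1, 32, 1, 1), name="B", dtype="float32")
C = topi.add(A, B)  # broadcast add -> a standalone compute named "T_add"
print(te.create_prim_func([A, B, C]).script())
```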
Moreover, I was unable to `parallelize` or `vectorize` the fused loops in the conv2d block. I tried removing the bias addition that follows for debugging, but that didn't work either. The best I could get was to `unroll` the loop. I would be very grateful if you could answer my question! Here is my toy code for trying out the transformations:

Outputs: