Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions tests/python/contrib/test_hexagon/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
values from testing parameters """

import tvm
from .infrastructure import get_packed_filter_layout
from .infrastructure import get_packed_filter_shape


@tvm.testing.fixture
Expand All @@ -34,4 +34,4 @@ def shape_oihw(out_channel, in_channel, kernel):

@tvm.testing.fixture
def shape_oihw8i32o4i(out_channel, in_channel, kernel):
return get_packed_filter_layout(out_channel, in_channel, kernel, kernel)
return get_packed_filter_shape(out_channel, in_channel, kernel, kernel)
118 changes: 85 additions & 33 deletions tests/python/contrib/test_hexagon/infrastructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,36 +18,43 @@
""" Hexagon testing infrastructure """

import tvm
from tvm import te
import numpy


def ceildiv(o, d):
return tvm.tir.floordiv(o + d - 1, d)
Comment thread
adstraw marked this conversation as resolved.


def get_packed_activation_layout(shape_nhwc, block_shape, packed_C=True):
def get_block_shape():
Comment thread
adstraw marked this conversation as resolved.
return 8, 8, 32


def get_filter_block_shape():
return 8, 32, 4
Comment thread
adstraw marked this conversation as resolved.


def get_packed_shape(shape_nhwc):
Comment thread
adstraw marked this conversation as resolved.
Outdated
Comment thread
adstraw marked this conversation as resolved.
Outdated
assert len(shape_nhwc) == 4
shape = [shape_nhwc[0]]
block_shape = get_block_shape()
off_h, off_w, off_c = block_shape
shape.append(ceildiv(shape_nhwc[1], off_h))
shape.append(ceildiv(shape_nhwc[2], off_w))
if packed_C:
shape.append(ceildiv(shape_nhwc[3], off_c))
shape.extend(block_shape)
else:
shape.extend([off_h, off_w, shape_nhwc[3]])
shape.append(ceildiv(shape_nhwc[3], off_c))
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In this code we handle ceildiv as-is, but in the get_packed_filter_shape function (below) we cast its result to a Python int type.

Is there a good reason for doing it two different ways?

Specifically: casting to int works when the runtime return type of ceildiv is tvm.tir.expr.IntImm. But I'm wondering if we're okay with this test making that assumption. I.e., are we testing everything we intend to test when we assume that the exact shapes are known at this point in time?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Indeed, casting to int is a means of converting from tvm.tir.expr.IntImm to int. I am not sure why it works differently for get_packed_shape (no cast) vs. get_packed_filter_shape (requires cast). Also not sure about your question RE knowing exact shapes. More investigation required.

NOTE: I did change get_packed_filter_shape to take a shape instead of a list of dimensions to match the API of get_packed_shape.

shape.extend(block_shape)
return shape


def get_block_shape():
return 8, 8, 32


def get_filter_block_shape():
return 8, 32, 4
def get_logical_shape(shape_nhwc8h8w32c):
shape = [shape_nhwc8h8w32c[0]]
shape.append(shape_nhwc8h8w32c[1] * shape_nhwc8h8w32c[4])
shape.append(shape_nhwc8h8w32c[2] * shape_nhwc8h8w32c[5])
shape.append(shape_nhwc8h8w32c[3] * shape_nhwc8h8w32c[6])
return shape


def get_packed_filter_layout(out_channel, in_channel, kernel_h, kernel_w):
def get_packed_filter_shape(out_channel, in_channel, kernel_h, kernel_w):
filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape()
filter_Ci = filter_Cio * filter_Cii
return (
Expand Down Expand Up @@ -93,26 +100,10 @@ def get_conv2d_nhwc_shape(shape_nhwc, kernel_size, strides, padding, dilation, o
)


def verify_conv2d(output, ref_output, dtype):
# nhwc8h8w32c
if len(output.shape) == 7:
# nhwc8h8w32c -> nhwc
output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape(
output.shape[0],
output.shape[1] * output.shape[4],
output.shape[2] * output.shape[5],
output.shape[3] * output.shape[6],
)

# nhwhwc
else:
# nhwhwc -> nhwc
output = output.transpose(0, 1, 3, 2, 4, 5).reshape(
output.shape[0],
output.shape[1] * output.shape[3],
output.shape[2] * output.shape[4],
output.shape[5],
)
def conv2d_verify(output, ref_output, dtype):
# nhwc8h8w32c -> nhwc
logical_output_shape = get_logical_shape(output.shape)
output = output.transpose(0, 1, 4, 2, 5, 3, 6).reshape(logical_output_shape)

# slice output to match ref_output shape
# e.g. 8x8 spatial 3x3 filter = 6x6 ref output
Expand All @@ -129,3 +120,64 @@ def verify_conv2d(output, ref_output, dtype):
elif dtype == "float32":
tol = {"rtol": 1e-4, "atol": 2e-4}
tvm.testing.assert_allclose(output, ref_output, **tol)


def conv2d_compute(X, filt, pad, stride, dilation):
block_shape = get_block_shape()
block_H, block_W, block_C = block_shape
filter_Cio, filter_Ki, filter_Cii = get_filter_block_shape()
filter_Ci = filter_Cio * filter_Cii

shape_filter = filt.shape
kernel_size = tuple(shape_filter[2:4])
out_channels = shape_filter[0] * shape_filter[5]

logical_input_shape = get_logical_shape(X.shape)
logical_output_shape = get_conv2d_nhwc_shape(
logical_input_shape,
kernel_size,
stride,
pad,
dilation,
out_channels,
)

output_shape = get_packed_shape(logical_output_shape)
n, ho, wo, ko, hi, wi, ki = output_shape
rh = te.reduce_axis((0, kernel_size[0]), name="rh")
rw = te.reduce_axis((0, kernel_size[1]), name="rw")
rc = te.reduce_axis((0, logical_input_shape[3]), name="rc")

def compute(n, ho, wo, ko, hi, wi, ki):
h = ho * block_H + hi
h_contig = h * stride[0] + rh
h_block_id = h_contig // block_H
h_block_offset = h_contig % block_H

w = wo * block_W + wi
w_contig = w * stride[1] + rw
w_block_id = w_contig // block_W
w_block_offset = w_contig % block_W

c_block_id = rc // block_C
c_block_offset = rc % block_C

rco = rc // filter_Ci
rcio = (rc % filter_Ci) // filter_Cii
rcii = rc % filter_Cii

return te.sum(
X[
n,
h_block_id,
w_block_id,
c_block_id,
h_block_offset,
w_block_offset,
c_block_offset,
]
* filt[ko, rco, rh, rw, rcio, ki, rcii],
axis=[rh, rw, rc],
)

return output_shape, compute
74 changes: 36 additions & 38 deletions tests/python/contrib/test_hexagon/test_conv2d_blocked.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,23 @@ This is a baseline 1x1 conv2d schedule for Hexagon.

## Command

pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-1-1-0-float32-1-1-1-64-64-128-llvm]"
pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[1-64-64-0-1-1-128-1-1-float32-llvm]"

## Parameters

| Parameter | Value |
| --------- | ----------- |
| Batch | 1 |
| Filter | 1x1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Output Ch | 128 |
| Stride | 1 |
| Padding | 0 |
| Layout | NHWC8h8w32c |
| Parameter | Value |
| --------- | ----- |
| Batch | 1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Padding | 0 |
| Stride | 1 |
| Filter | 1x1 |
| Output Ch | 128 |

## Assumptions

* Pattern matching for microkernels is not senstive to cache reads and writes between the outer height (ho) and outer width (wo) loops.
* n/a

## To Do

Expand Down Expand Up @@ -174,26 +173,25 @@ The key changes in TIR versus the above are...

## Command

pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-1-1-0-float32-2-2-1-64-64-128-llvm]"
pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[1-64-64-0-1-1-128-2-2-float32-llvm]"

## Parameters

| Parameter | Value |
| --------- | ----------- |
| Batch | 1 |
| Filter | 1x1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Output Ch | 128 |
| Stride | 1 |
| Padding | 0 |
| Layout | NHWC8h8w32c |
| k_split | 2 |
| h_split | 2 |
| Parameter | Value |
| --------- | ----- |
| Batch | 1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Padding | 0 |
| Stride | 1 |
| Filter | 1x1 |
| Output Ch | 128 |
| k_split | 2 |
| h_split | 2 |

## Assumptions

* n/a - With the loop splits on `ko` and `ho` the compute schedule is now over `ko.inner` `ho.inner` `wo` etc. This should fit the pattern matching for microkernels.
* n/a

## To Do

Expand Down Expand Up @@ -350,21 +348,21 @@ The `if` statement above indicates NOT to prefetch the vertically adjacent slice

## Command

pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[conv2d_nhwc8h8w32c-3-1-0-float32-2-2-1-64-64-128-llvm]"
pytest -sv "tests/python/contrib/test_hexagon/test_conv2d_blocked.py::TestConv2dPackedFilter::test_conv2d[1-64-64-0-1-3-128-2-2-float32-llvm]"

## Parameters

| Parameter | Value |
| --------- | ----------- |
| Batch | 1 |
| Filter | 3x3 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Output Ch | 128 |
| Stride | 1 |
| Padding | 0 |
| Layout | NHWC8h8w32c |
| h_split | 2 |
| Parameter | Value |
| --------- | ----- |
| Batch | 1 |
| Spatial | 64x64 |
| Input Ch | 64 |
| Padding | 0 |
| Stride | 1 |
| Filter | 1x1 |
| Output Ch | 128 |
| k_split | 2 |
| h_split | 2 |

## Assumptions

Expand Down
Loading