Skip to content

Separable filtering #3

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 68 additions & 54 deletions source/mir/cv/imgproc/imfilter.d
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,12 @@ int separable_imfilter
float* outputbuf,
size_t rows,
size_t cols,
size_t hsize,
size_t vsize,
size_t ksize,
int border_handling = BORDER_REPLICATE
)
{
return separable_imfilter_impl
(inputbuf, hmask, vmask, outputbuf, rows, cols, hsize, vsize, border_handling);
(inputbuf, hmask, vmask, outputbuf, rows, cols, ksize, border_handling);
}

package(mir.cv):
Expand Down Expand Up @@ -95,8 +94,7 @@ int separable_imfilter_impl(T)
T* outputbuf,
size_t rows,
size_t cols,
size_t hsize,
size_t vsize,
size_t ksize,
int border_handling = BORDER_REPLICATE
)
{
Expand All @@ -110,45 +108,63 @@ int separable_imfilter_impl(T)
auto input = inputbuf.sliced(rows, cols);
auto temp = tempbuf.sliced(rows, cols);
auto output = outputbuf.sliced(rows, cols);
auto hm = hmask.sliced(hsize);
auto vm = vmask.sliced(vsize);
auto hm = hmask.sliced(ksize);
auto vm = vmask.sliced(ksize);

if (avx) // add version (compile time flag if avx should be supported)
{
inner_filtering_impl!(AVX!T)(input, temp, hm, vm, output,
&apply_horizontal_kernel_simd!(AVX!T), &apply_vertical_kernel_simd!(AVX!T));
inner_filtering_impl!(AVX!T, apply_horizontal_kernel_simd!(AVX!T), apply_vertical_kernel_simd!(AVX!T))
(input, temp, hm, vm, output);
}
else version (sse)
else if (sse)
{
inner_filtering_impl!(SSE!T)(input, temp, hm, vm, output,
&apply_horizontal_kernel_simd!(SSE!T), &apply_vertical_kernel_simd!(SSE!T));
inner_filtering_impl!(SSE!T, apply_horizontal_kernel_simd!(SSE!T), apply_vertical_kernel_simd!(SSE!T))
(input, temp, hm, vm, output);
}
else
{
inner_filtering_impl!(Non_SIMD!T)(input, temp, hm, vm, output,
get_horizontal_kernel_for_mask!T(hsize), get_vertical_kernel_for_mask!T(vsize));
switch (ksize) {
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@9il do you know of a nicer way to write this? (to avoid the switch)

case 3:
inner_filtering_impl!(Non_SIMD!T, apply_horizontal_kernel_3!T, apply_vertical_kernel_3!T)
(input, temp, hm, vm, output);
break;
case 5:
inner_filtering_impl!(Non_SIMD!T, apply_horizontal_kernel_5!T, apply_vertical_kernel_5!T)
(input, temp, hm, vm, output);
break;
case 7:
inner_filtering_impl!(Non_SIMD!T, apply_horizontal_kernel_7!T, apply_vertical_kernel_7!T)
(input, temp, hm, vm, output);
break;
default:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why 3, 5, 7?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We would also need 2, 4, 6, 8 :-)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please use static foreach in switch and compile time arguments

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

BTW, 2, 3, 4, 5, 6, 7, 8 filters can be optimised with simds too

inner_filtering_impl!(Non_SIMD!T, apply_horizontal_kernel!T, apply_vertical_kernel!T)
(input, temp, hm, vm, output);
}
}

if (border_handling != BORDER_REPLICATE)
{
return ERROR_INVALID_BORDER_METHOD;
}

borders_replicate_impl(output, hsize, vsize);
borders_replicate_impl(output, ksize);

return 0;
}

pragma(inline, true)
void inner_filtering_impl(alias InstructionSet)
void inner_filtering_impl
(
alias InstructionSet,
alias hkernel,
alias vkernel
)
(
Slice!(Contiguous, [2], InstructionSet.Scalar*) input,
Slice!(Contiguous, [2], InstructionSet.Scalar*) temp,
Slice!(Contiguous, [1], InstructionSet.Scalar*) hmask,
Slice!(Contiguous, [1], InstructionSet.Scalar*) vmask,
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why mask? Is it kernel? mask refers more to boolean value

Slice!(Contiguous, [2], InstructionSet.Scalar*) output,
Horizontal_kernel_func!InstructionSet hkernel,
Vertical_kernel_func!InstructionSet vkernel,
Slice!(Contiguous, [2], InstructionSet.Scalar*) output
)
in
{
Expand All @@ -162,39 +178,38 @@ body
immutable velems = InstructionSet.elementCount;
immutable tbytes = T.sizeof;
immutable vbytes = V.sizeof;
immutable hsize = hmask.length;
immutable vsize = vmask.length;
immutable ksize = hmask.length;
immutable rows = input.length!0;
immutable cols = input.length!1;

static if (Is_SIMD!InstructionSet)
{
// It this is SIMD instructions set, allocate vectors for kernels

auto hk = cast(V[])alignedAllocate(hsize * vbytes, 16); // horizontal simd kernel
auto vk = cast(V[])alignedAllocate(vsize * vbytes, 16); // vertical simd kernel
auto hk = cast(V[])alignedAllocate(ksize * vbytes, 16); // horizontal simd kernel
auto vk = cast(V[])alignedAllocate(ksize * vbytes, 16); // vertical simd kernel
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

V.sizeof.max(16)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can multiple vector by scalar. No need to allocate memory and fill it with vectors.


scope (exit)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

scope(...) is try-catch analog. A betterC library should not have it, thought

{
deallocate(hk);
deallocate(vk);
}

foreach (i; 0 .. hsize)
foreach (i; 0 .. ksize)
{
hk[i].array[] = hmask[i];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

.array may generate bad code. hk[i] = hmask[i]; should work in LDC and recent DMD.

}

foreach (i; 0 .. vsize)
foreach (i; 0 .. ksize)
{
vk[i].array[] = vmask[i];
}
}
else
{
// ... otherwise just use input buffers.
auto hk = hmask._iterator[0 .. hsize];
auto vk = vmask._iterator[0 .. vsize];
auto hk = hmask._iterator[0 .. ksize];
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

new syntax: auto hk = hmask.field;

auto vk = vmask._iterator[0 .. ksize];
}

// Get pointers to where vectors are loaded (has to be public)
Expand All @@ -208,80 +223,79 @@ body
auto _out = output.universal.map!toPtr;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

iota has an example to work with pointers. Use it instead.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Didn't see that before- nice!


// Stride buffers to match vector indexing.
auto a = _in[0 .. $ - vsize, 0 .. $ - hsize].strided!1(velems);
auto t = _tmp[0 .. $ - vsize, 0 .. $ - hsize].strided!1(velems);
auto b = _out[0 .. $ - vsize, 0 .. $ - hsize].strided!1(velems);
auto a = _in[0 .. $ - ksize, 0 .. $ - ksize].strided!1(velems);
auto t = _tmp[0 .. $ - ksize, 0 .. $ - ksize].strided!1(velems);
auto b = _out[0 .. $ - ksize, 0 .. $ - ksize].strided!1(velems);

// L1 cache block tiling (under 8192 bytes)
immutable rtiling = 32;
immutable ctiling = max(size_t(1), 256 / (tbytes * hsize));
immutable ctiling = max(size_t(1), 256 / (tbytes * ksize));
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should be described


// Process blocks
zip!true(a, t, b)
.blocks(rtiling, ctiling)
//.parallel
.each!((b) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

please foreach loops instead of each. THe are more readable. each is good when the function is already defined.

b.each!((w) { hkernel(w.a, hk.ptr, w.b, hsize); }); // apply horizontal kernel
b.each!((w) { vkernel(w.b, vk.ptr, w.c, vsize, cols); }); // apply vertical kernel
b.each!((w) { hkernel(w.a, hk.ptr, w.b, ksize); }); // apply horizontal kernel
b.each!((w) { vkernel(w.b, vk.ptr, w.c, ksize, cols); }); // apply vertical kernel
});

// Fill-in block horizontal borders
zip!true(t, b)[rtiling - vsize .. $ - vsize, 0 .. $]
.windows(vsize, b.length!1)
zip!true(t, b)[rtiling - ksize .. $ - ksize, 0 .. $]
.windows(ksize, b.length!1)
.strided!0(rtiling)
.each!((b) {
b.each!((w) { vkernel(w.a, vk.ptr, w.b, vsize, cols); });
b.each!((w) { vkernel(w.a, vk.ptr, w.b, ksize, cols); });
});

// perform scalar processing for the remaining pixels (from vector block selection)
immutable rrb = a.length!0 - (a.length!0 % rtiling) - vsize;
immutable crb = (a.length!1 - (a.length!1 % ctiling)) * velems - hsize;
immutable rrb = a.length!0 - (a.length!0 % rtiling) - ksize;
immutable crb = (a.length!1 - (a.length!1 % ctiling)) * velems - ksize;
immutable rowstride = input.shape[1];

// get scalar kernels
auto hskernel = get_horizontal_kernel_for_mask!T(hsize);
auto vskernel = get_vertical_kernel_for_mask!T(vsize);
auto hskernel = get_horizontal_kernel_for_mask!T(ksize);
auto vskernel = get_vertical_kernel_for_mask!T(ksize);

// bottom rows
if (rrb < rows - vsize)
if (rrb < rows - ksize)
{
zip!true(_in, _tmp)[rrb .. $, 0 .. $ - hsize].each!((w) { hskernel(w.a, hmask._iterator, w.b, hsize); });
zip!true(_tmp, _out)[rrb .. $ - vsize, 0 .. $ - hsize / 2].each!((w) { vskernel(w.a, vmask._iterator, w.b, vsize, rowstride); });
zip!true(_in, _tmp)[rrb .. $, 0 .. $ - ksize].each!((w) { hskernel(w.a, hmask._iterator, w.b, ksize); });
zip!true(_tmp, _out)[rrb .. $ - ksize, 0 .. $ - ksize / 2].each!((w) { vskernel(w.a, vmask._iterator, w.b, ksize, rowstride); });
}

// right columns
if (crb < cols - vsize)
if (crb < cols - ksize)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code duplication. Move it separate function.

{
zip!true(_in, _tmp)[0 .. $ - vsize, crb .. $ - hsize].each!((w) { hskernel(w.a, hmask._iterator, w.b, hsize); });
zip!true(_tmp, _out)[0 .. $ - vsize, crb .. $ - hsize / 2].each!((w) { vskernel(w.a, vmask._iterator, w.b, vsize, rowstride); });
zip!true(_in, _tmp)[0 .. $ - ksize, crb .. $ - ksize].each!((w) { hskernel(w.a, hmask._iterator, w.b, ksize); });
zip!true(_tmp, _out)[0 .. $ - ksize, crb .. $ - ksize / 2].each!((w) { vskernel(w.a, vmask._iterator, w.b, ksize, rowstride); });
}
}

pragma(inline, true)
void borders_replicate_impl(T)
(
Slice!(Contiguous, [2], T*) output,
size_t hsize,
size_t vsize
size_t ksize,
)
{
auto top = output[0 .. vsize / 2 + 1, hsize / 2 .. $ - hsize / 2 + 1];
auto bottom = output[$ - vsize / 2 - 2 .. $, hsize / 2 .. $ - hsize / 2 + 1];
auto top = output[0 .. ksize / 2 + 1, ksize / 2 .. $ - ksize / 2 + 1];
auto bottom = output[$ - ksize / 2 - 2 .. $, ksize / 2 .. $ - ksize / 2 + 1];

foreach (c; 0 .. top.length!1)
foreach (r; 0 .. top.length!0 - 1)
{
top[r, c] = top[hsize / 2 + 1, c];
top[r, c] = top[ksize / 2 + 1, c];
bottom[r + 1, c] = bottom[0, c];
}

auto left = output[0 .. $, 0 .. hsize / 2 + 1];
auto right = output[0 .. $, $ - hsize / 2 - 2 .. $];
auto left = output[0 .. $, 0 .. ksize / 2 + 1];
auto right = output[0 .. $, $ - ksize / 2 - 2 .. $];

foreach (r; 0 .. left.length!0)
foreach (c; 0 .. left.length!1 - 1)
{
left[r, c] = left[r, hsize / 2 + 1];
left[r, c] = left[r, ksize / 2 + 1];
right[r, c + 1] = right[r, 0];
}
}
Expand Down