Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
80 changes: 52 additions & 28 deletions src/avx2/fdct.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,10 +64,11 @@ pub fn fdct_avx2(data: &mut [i16; 64]) {
}

#[target_feature(enable = "avx2")]
unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
fn fdct_avx2_internal(data: &mut [i16; 64]) {
#[target_feature(enable = "avx2")]
#[allow(non_snake_case)]
#[inline(always)]
unsafe fn PW_F130_F054_MF130_F054() -> __m256i {
#[inline]
fn PW_F130_F054_MF130_F054() -> __m256i {
_mm256_set_epi16(
F_0_541,
F_0_541 - F_1_847,
Expand All @@ -88,9 +89,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
)
}

#[target_feature(enable = "avx2")]
#[allow(non_snake_case)]
#[inline(always)]
unsafe fn PW_MF078_F117_F078_F117() -> __m256i {
#[inline]
fn PW_MF078_F117_F078_F117() -> __m256i {
_mm256_set_epi16(
F_1_175,
F_1_175 - F_0_390,
Expand All @@ -111,9 +113,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
)
}

#[target_feature(enable = "avx2")]
#[allow(non_snake_case)]
#[inline(always)]
unsafe fn PW_MF060_MF089_MF050_MF256() -> __m256i {
#[inline]
fn PW_MF060_MF089_MF050_MF256() -> __m256i {
_mm256_set_epi16(
-F_2_562,
F_2_053 - F_2_562,
Expand All @@ -134,9 +137,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
)
}

#[target_feature(enable = "avx2")]
#[allow(non_snake_case)]
#[inline(always)]
unsafe fn PW_F050_MF256_F060_MF089() -> __m256i {
#[inline]
fn PW_F050_MF256_F060_MF089() -> __m256i {
_mm256_set_epi16(
-F_0_899,
F_1_501 - F_0_899,
Expand All @@ -157,9 +161,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
)
}

#[target_feature(enable = "avx2")]
#[allow(non_snake_case)]
#[inline(always)]
unsafe fn PD_DESCALE_P(first_pass: bool) -> __m256i {
#[inline]
fn PD_DESCALE_P(first_pass: bool) -> __m256i {
if first_pass {
_mm256_set_epi32(
1 << (DESCALE_P1 - 1),
Expand All @@ -185,9 +190,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
}
}

#[target_feature(enable = "avx2")]
#[allow(non_snake_case)]
#[inline(always)]
unsafe fn PW_DESCALE_P2X() -> __m256i {
#[inline]
fn PW_DESCALE_P2X() -> __m256i {
_mm256_set_epi32(
1 << (PASS1_BITS - 1),
1 << (PASS1_BITS - 1),
Expand All @@ -201,8 +207,9 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
}

// In-place 8x8x16-bit matrix transpose using AVX2 instructions
#[inline(always)]
unsafe fn do_transpose(
#[target_feature(enable = "avx2")]
#[inline]
fn do_transpose(
i1: __m256i,
i2: __m256i,
i3: __m256i,
Expand Down Expand Up @@ -244,8 +251,9 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
}

// In-place 8x8x16-bit accurate integer forward DCT using AVX2 instructions
#[inline(always)]
unsafe fn do_dct(
#[target_feature(enable = "avx2")]
#[inline]
fn do_dct(
first_pass: bool,
i1: __m256i,
i2: __m256i,
Expand Down Expand Up @@ -412,12 +420,10 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
(t1, t2, t3, t4)
}

let in_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr());

let ymm4 = _mm256_loadu_si256(in_data);
let ymm5 = _mm256_loadu_si256(in_data.add(1));
let ymm6 = _mm256_loadu_si256(in_data.add(2));
let ymm7 = _mm256_loadu_si256(in_data.add(3));
let ymm4 = avx_load(&data[0..16]);
let ymm5 = avx_load(&data[16..32]);
let ymm6 = avx_load(&data[32..48]);
let ymm7 = avx_load(&data[48..64]);

// ---- Pass 1: process rows.
// ymm4=(00 01 02 03 04 05 06 07 10 11 12 13 14 15 16 17)
Expand Down Expand Up @@ -451,10 +457,28 @@ unsafe fn fdct_avx2_internal(data: &mut [i16; 64]) {
let ymm6 = _mm256_permute2x128_si256(ymm0, ymm4, 0x31); // ymm6=data4_5
let ymm7 = _mm256_permute2x128_si256(ymm2, ymm4, 0x21); // ymm7=data6_7

let out_data = core::mem::transmute::<*mut i16, *mut __m256i>(data.as_mut_ptr());
avx_store(ymm3, &mut data[0..16]);
avx_store(ymm5, &mut data[16..32]);
avx_store(ymm6, &mut data[32..48]);
avx_store(ymm7, &mut data[48..64]);
}

_mm256_storeu_si256(out_data, ymm3);
_mm256_storeu_si256(out_data.add(1), ymm5);
_mm256_storeu_si256(out_data.add(2), ymm6);
_mm256_storeu_si256(out_data.add(3), ymm7);
/// Safe wrapper for an unaligned AVX load
#[target_feature(enable = "avx2")]
#[inline]
fn avx_load(input: &[i16]) -> __m256i {
assert!(input.len() == 16);
assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
// SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements.
unsafe { _mm256_loadu_si256(input.as_ptr() as *const __m256i) }
}

/// Safe wrapper for an unaligned AVX store
#[target_feature(enable = "avx2")]
#[inline]
fn avx_store(input: __m256i, output: &mut [i16]) {
assert!(output.len() == 16);
assert!(core::mem::size_of::<[i16; 16]>() == core::mem::size_of::<__m256i>());
// SAFETY: we've checked sizes above. The load is unaligned, so no alignment requirements.
unsafe { _mm256_storeu_si256(output.as_mut_ptr() as *mut __m256i, input) }
}
Loading