kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit d38a5aa048a510c6e3a447874a48a91ac8f7d4a3
parent 46038f070e74f5044e2069ea6d5e5f2c55adcd19
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Wed, 14 Nov 2018 00:17:45 +0300

AVX-512 fixes

Diffstat:
Minclude/kfr/base/logical.hpp | 40++++++++++++++++++++--------------------
Minclude/kfr/base/min_max.hpp | 5+++++
Minclude/kfr/base/saturation.hpp | 19+++++++++++++++++++
Minclude/kfr/base/select.hpp | 12++++++------
4 files changed, 50 insertions(+), 26 deletions(-)

diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp @@ -109,28 +109,28 @@ KFR_SINTRIN bool bittestall(const i64avx& x) { return _mm256_testc_si256(*x, *al #if defined CMT_ARCH_AVX512 // horizontal OR -KFR_SINTRIN bool bittestany(const f32avx512& x) { return !_mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const f64avx512& x) { return !_mm512_test_epi64_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u8avx512& x) { return !_mm512_test_epi8_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u16avx512& x) { return !_mm512_test_epi16_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u32avx512& x) { return !_mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const u64avx512& x) { return !_mm512_test_epi64_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i8avx512& x) { return !_mm512_test_epi8_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i16avx512& x) { return !_mm512_test_epi16_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i32avx512& x) { return !_mm512_test_epi32_mask(*x, *x); } -KFR_SINTRIN bool bittestany(const i64avx512& x) { return !_mm512_test_epi64_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const f32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const f64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const u8avx512& x) { return _mm512_test_epi8_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const u16avx512& x) { return _mm512_test_epi16_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const u32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const u64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const i8avx512& x) { return _mm512_test_epi8_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const i16avx512& x) { return _mm512_test_epi16_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const i32avx512& x) { return _mm512_test_epi32_mask(*x, *x); } +KFR_SINTRIN bool bittestany(const i64avx512& x) { return _mm512_test_epi64_mask(*x, *x); } // horizontal AND -KFR_SINTRIN bool bittestall(const f32avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const f64avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const u8avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const u16avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const u32avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const u64avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const i8avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const i16avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const i32avx512& x) { return ~bittestany(~x); } -KFR_SINTRIN bool bittestall(const i64avx512& x) { return ~bittestany(~x); } +KFR_SINTRIN bool bittestall(const f32avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const f64avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const u8avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const u16avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const u32avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const u64avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const i8avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const i16avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const i32avx512& x) { return !bittestany(~x); } +KFR_SINTRIN bool bittestall(const i64avx512& x) { return !bittestany(~x); } #endif diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp @@ -87,6 +87,11 @@ KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(*x, *y); } KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(*x, *y); } KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(*x, *y); } + +KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(*x, *y); } +KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(*x, *y); } +KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(*x, *y); } +KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(*x, *y); } #else KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); } KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); } diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp @@ -103,6 +103,16 @@ KFR_SINTRIN u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_ep KFR_SINTRIN i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(*x, *y); } KFR_SINTRIN u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(*x, *y); } KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(*x, *y); } + +KFR_SINTRIN i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); } +KFR_SINTRIN i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); } +KFR_SINTRIN u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); } +KFR_SINTRIN u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); } + +KFR_SINTRIN i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); } +KFR_SINTRIN i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); } +KFR_SINTRIN u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); } +KFR_SINTRIN u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); } #endif #if defined CMT_ARCH_AVX512 @@ -114,6 +124,15 @@ KFR_SINTRIN u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm51 KFR_SINTRIN i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(*x, *y); } KFR_SINTRIN u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(*x, *y); } KFR_SINTRIN i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(*x, *y); } + +KFR_SINTRIN i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); } +KFR_SINTRIN i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); } +KFR_SINTRIN u32avx512 satadd(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_add(a, b); } +KFR_SINTRIN u64avx512 satadd(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_add(a, b); } +KFR_SINTRIN i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); } +KFR_SINTRIN i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); } +KFR_SINTRIN u32avx512 satsub(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_sub(a, b); } +KFR_SINTRIN u64avx512 satsub(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_sub(a, b); } #endif KFR_HANDLE_ALL_SIZES_2(satadd) diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp @@ -136,15 +136,15 @@ KFR_SINTRIN u8avx512 select(const maskfor<u8avx512>& m, const u8avx512& x, const } KFR_SINTRIN u16avx512 select(const maskfor<u16avx512>& m, const u16avx512& x, const u16avx512& y) { - return _mm512_mask_blend_epi8(_mm512_test_epi16_mask(*m, *m), *y, *x); + return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x); } KFR_SINTRIN u32avx512 select(const maskfor<u32avx512>& m, const u32avx512& x, const u32avx512& y) { - return _mm512_mask_blend_epi8(_mm512_test_epi32_mask(*m, *m), *y, *x); + return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x); } KFR_SINTRIN u64avx512 select(const maskfor<u64avx512>& m, const u64avx512& x, const u64avx512& y) { - return _mm512_mask_blend_epi8(_mm512_test_epi64_mask(*m, *m), *y, *x); + return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x); } KFR_SINTRIN i8avx512 select(const maskfor<i8avx512>& m, const i8avx512& x, const i8avx512& y) { @@ -152,15 +152,15 @@ KFR_SINTRIN i8avx512 select(const maskfor<i8avx512>& m, const i8avx512& x, const } KFR_SINTRIN i16avx512 select(const maskfor<i16avx512>& m, const i16avx512& x, const i16avx512& y) { - return _mm512_mask_blend_epi8(_mm512_test_epi16_mask(*m, *m), *y, *x); + return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x); } KFR_SINTRIN i32avx512 select(const maskfor<i32avx512>& m, const i32avx512& x, const i32avx512& y) { - return _mm512_mask_blend_epi8(_mm512_test_epi32_mask(*m, *m), *y, *x); + return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x); } KFR_SINTRIN i64avx512 select(const maskfor<i64avx512>& m, const i64avx512& x, const i64avx512& y) { - return _mm512_mask_blend_epi8(_mm512_test_epi64_mask(*m, *m), *y, *x); + return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x); } #endif