kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 1ff05879d309acbffc4117521d9303d534d4fdb5
parent 1ee8ac5b3ae8bcb5c49847fe9860b4225e52eba5
Author: [email protected] <[email protected]>
Date:   Tue,  2 Aug 2016 21:52:08 +0300

Improve ARM NEON support

Diffstat:
Minclude/kfr/base/abs.hpp | 37+++++++++++++++++++++++++++++++++++--
Minclude/kfr/base/cpuid.hpp | 10++++++++++
Minclude/kfr/base/logical.hpp | 53+++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/base/min_max.hpp | 31+++++++++++++++++++++++++++++++
Minclude/kfr/base/saturation.hpp | 27+++++++++++++++++++++++++++
Minclude/kfr/base/select.hpp | 47+++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/cident.h | 4++--
7 files changed, 205 insertions(+), 4 deletions(-)

diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp @@ -31,6 +31,9 @@ namespace kfr namespace intrinsics { + +#if defined CID_ARCH_SSSE3 + // floating point template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) @@ -38,8 +41,6 @@ KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) return x & internal::invhighbitmask<T>; } -#if defined CID_ARCH_SSSE3 - KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); } KFR_SINTRIN i32sse abs(const i32sse& x) { return _mm_abs_epi32(*x); } KFR_SINTRIN i16sse abs(const i16sse& x) { return _mm_abs_epi16(*x); } @@ -62,7 +63,39 @@ KFR_SINTRIN u8avx abs(const u8avx& x) { return x; } KFR_HANDLE_ALL_SIZES_NOT_F_1(abs) +#elif defined CID_ARCH_NEON + +KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); } +KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); } +KFR_SINTRIN i32neon abs(const i32neon& x) { return vabsq_s32(*x); } +#if defined CID_ARCH_NEON64 +KFR_SINTRIN i64neon abs(const i64neon& x) { return vabsq_s64(*x); } #else +KFR_SINTRIN i64neon abs(const i64neon& x) { return select(x >= 0, x, -x); } +#endif + +KFR_SINTRIN u8neon abs(const u8neon& x) { return x; } +KFR_SINTRIN u16neon abs(const u16neon& x) { return x; } +KFR_SINTRIN u32neon abs(const u32neon& x) { return x; } +KFR_SINTRIN u64neon abs(const u64neon& x) { return x; } + +KFR_SINTRIN f32neon abs(const f32neon& x) { return vabsq_f32(*x); } +#if defined CID_ARCH_NEON64 +KFR_SINTRIN f64neon abs(const f64neon& x) { return vabsq_f64(*x); } +#else +KFR_SINTRIN f64neon abs(const f64neon& x) { return x & internal::invhighbitmask<f64>; } +#endif + +KFR_HANDLE_ALL_SIZES_1(abs) + +#else + +// floating point +template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> +KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) +{ + return x & internal::invhighbitmask<T>; +} // fallback template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> diff --git a/include/kfr/base/cpuid.hpp b/include/kfr/base/cpuid.hpp @@ -27,6 +27,7 @@ namespace kfr { +#ifdef CID_ARCH_X86 struct cpu_features { @@ -277,4 +278,13 @@ cpu_t detect_cpu() return cpu_t::lowest; } } +#else + +template <size_t = 0> +cpu_t detect_cpu() +{ + return cpu_t::native; +} + +#endif } diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp @@ -158,6 +158,59 @@ KFR_SINTRIN bool bittestany(const vec<T, N>& a) return bittestany(low(a)) || bittestany(high(a)); } +#elif CID_ARCH_NEON + +KFR_SINTRIN bool bittestall(const u32neon& a) +{ + const uint32x2_t tmp = vand_u32(vget_low_u32(*a), vget_high_u32(*a)); + return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu; +} + +KFR_SINTRIN bool bittestany(const u32neon& a) +{ + const uint32x2_t tmp = vorr_u32(vget_low_u32(*a), vget_high_u32(*a)); + return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0; +} +KFR_SINTRIN bool bittestany(const u8neon& a) { return bitcast<u8>(bittestany(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestany(const u16neon& a) { return bitcast<u16>(bittestany(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestany(const u64neon& a) { return bitcast<u64>(bittestany(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestany(const i8neon& a) { return bitcast<i8>(bittestany(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestany(const i16neon& a) { return bitcast<i16>(bittestany(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestany(const i64neon& a) { return bitcast<i64>(bittestany(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestany(const f32neon& a) { return bitcast<f32>(bittestany(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestany(const f64neon& a) { return bitcast<f64>(bittestany(bitcast<u32>(a))); } + +KFR_SINTRIN bool bittestall(const u8neon& a) { return bitcast<u8>(bittestall(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestall(const u16neon& a) { return bitcast<u16>(bittestall(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestall(const u64neon& a) { return bitcast<u64>(bittestall(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestall(const i8neon& a) { return bitcast<i8>(bittestall(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestall(const i16neon& a) { return bitcast<i16>(bittestall(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestall(const i64neon& a) { return bitcast<i64>(bittestall(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestall(const f32neon& a) { return bitcast<f32>(bittestall(bitcast<u32>(a))); } +KFR_SINTRIN bool bittestall(const f64neon& a) { return bitcast<f64>(bittestall(bitcast<u32>(a))); } + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +KFR_SINTRIN bool bittestall(const vec<T, N>& a) +{ + return bittestall(expand_simd(a, internal::maskbits<T>(true))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +KFR_SINTRIN bool bittestall(const vec<T, N>& a) +{ + return bittestall(low(a)) && bittestall(high(a)); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> +KFR_SINTRIN bool bittestany(const vec<T, N>& a) +{ + return bittestany(expand_simd(a, internal::maskbits<T>(false))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> +KFR_SINTRIN bool bittestany(const vec<T, N>& a) +{ + return bittestany(low(a)) || bittestany(high(a)); +} + #else template <typename T, size_t N> diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp @@ -103,6 +103,37 @@ KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, KFR_HANDLE_ALL_SIZES_2(min) KFR_HANDLE_ALL_SIZES_2(max) +#elif defined CID_ARCH_NEON + +KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); } +KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); } +KFR_SINTRIN i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(*x, *y); } +KFR_SINTRIN u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(*x, *y); } +KFR_SINTRIN i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(*x, *y); } +KFR_SINTRIN u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(*x, *y); } + +KFR_SINTRIN i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(*x, *y); } +KFR_SINTRIN u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(*x, *y); } +KFR_SINTRIN i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(*x, *y); } +KFR_SINTRIN u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(*x, *y); } +KFR_SINTRIN i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(*x, *y); } +KFR_SINTRIN u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(*x, *y); } +KFR_SINTRIN i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); } +KFR_SINTRIN u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); } + +KFR_SINTRIN f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(*x, *y); } +KFR_SINTRIN f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(*x, *y); } +#if defined CID_ARCH_NEON64 +KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(*x, *y); } +KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(*x, *y); } +#else +KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); } +KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); } +#endif + +KFR_HANDLE_ALL_SIZES_2(min) +KFR_HANDLE_ALL_SIZES_2(max) + #else // fallback diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp @@ -30,6 +30,8 @@ namespace kfr namespace intrinsics { + +// Generic functions template <typename T, size_t N> KFR_SINTRIN vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b) { @@ -103,6 +105,31 @@ KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs KFR_HANDLE_ALL_SIZES_2(satadd) KFR_HANDLE_ALL_SIZES_2(satsub) +#elif defined CID_ARCH_NEON + +KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); } +KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); } +KFR_SINTRIN u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(*x, *y); } +KFR_SINTRIN i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(*x, *y); } + +KFR_SINTRIN u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(*x, *y); } +KFR_SINTRIN i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(*x, *y); } +KFR_SINTRIN u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(*x, *y); } +KFR_SINTRIN i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(*x, *y); } + +KFR_SINTRIN u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(*a, *b); } +KFR_SINTRIN i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(*a, *b); } +KFR_SINTRIN u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(*a, *b); } +KFR_SINTRIN i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(*a, *b); } + +KFR_SINTRIN i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_u32(*a, *b); } +KFR_SINTRIN i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s32(*a, *b); } +KFR_SINTRIN u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u64(*a, *b); } +KFR_SINTRIN u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_s64(*a, *b); } + +KFR_HANDLE_ALL_SIZES_2(satadd) +KFR_HANDLE_ALL_SIZES_2(satsub) + #else // fallback template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)> diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp @@ -129,6 +129,53 @@ KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec< return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c))); } +#elif defined CID_ARCH_NEON + +KFR_SINTRIN f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y) +{ + return vbslq_f32(*m, *x, *y); +} + +KFR_SINTRIN i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y) { return vbslq_s8(*m, *x, *y); } +KFR_SINTRIN u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y) { return vbslq_u8(*m, *x, *y); } +KFR_SINTRIN i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y) +{ + return vbslq_s16(*m, *x, *y); +} +KFR_SINTRIN u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y) +{ + return vbslq_u16(*m, *x, *y); +} +KFR_SINTRIN i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y) +{ + return vbslq_s32(*m, *x, *y); +} +KFR_SINTRIN u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y) +{ + return vbslq_u32(*m, *x, *y); +} +KFR_SINTRIN i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y) +{ + return vbslq_s64(*m, *x, *y); +} +KFR_SINTRIN u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y) +{ + return vbslq_u64(*m, *x, *y); +} + +#ifdef CID_ARCH_NEON64 +KFR_SINTRIN f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y) +{ + return vbslq_f64(*m, *x, *y); +} +#else +template <typename T, size_t N> +KFR_SINTRIN vec<T, N> select(const mask<T, N>& m, const vec<T, N>& x, const vec<T, N>& y) +{ + return y ^ ((x ^ y) & m); +} +#endif + #else // fallback diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -2,7 +2,7 @@ #if defined(_M_IX86) || defined(__i386__) || defined(_M_X64) || defined(__x86_64__) #define CID_ARCH_X86 1 -#elif defined(__arm__) || defined(__arm64__) || defined(_M_ARM) +#elif defined(__arm__) || defined(__arm64__) || defined(_M_ARM) || defined(__aarch64__) #define CID_ARCH_ARM 1 #endif @@ -118,7 +118,7 @@ #elif defined(CID_ARCH_ARM) -#if defined(__arm64__) +#if defined(__aarch64__) #define CID_ARCH_X64 1 #else #define CID_ARCH_X32 1