kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 52e3c3395ad7c58d4a02c0a0c2dc54b15b669920
parent 90bd5e0bdc190e7b93ca72480fa14dd906c92ce7
Author: [email protected] <[email protected]>
Date:   Mon, 25 Jul 2016 14:02:49 +0300

Initial support for ARM NEON

Diffstat:
M include/kfr/base/function.hpp | 30 +++++++++++++++++++++++++++++-
M include/kfr/base/types.hpp | 34 +++++++++++++++++++++++++++++-----
M include/kfr/cident.h | 11 +++++++++++
3 files changed, 69 insertions(+), 6 deletions(-)

diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp @@ -42,7 +42,7 @@ namespace kfr namespace internal { - +#ifdef CID_ARCH_X86 using f32sse = vec<f32, 4>; using f64sse = vec<f64, 2>; using i8sse = vec<i8, vector_width<i8, cpu_t::sse2>>; @@ -86,11 +86,39 @@ using mu8avx = mask<u8, vector_width<u8, cpu_t::avx2>>; using mu16avx = mask<u16, vector_width<u16, cpu_t::avx2>>; using mu32avx = mask<u32, vector_width<u32, cpu_t::avx2>>; using mu64avx = mask<u64, vector_width<u64, cpu_t::avx2>>; +#else +using f32neon = vec<f32, 4>; +using f64neon = vec<f64, 2>; +using i8neon = vec<i8, 16>; +using i16neon = vec<i16, 8>; +using i32neon = vec<i32, 4>; +using i64neon = vec<i64, 2>; +using u8neon = vec<u8, 16>; +using u16neon = vec<u16, 8>; +using u32neon = vec<u32, 4>; +using u64neon = vec<u64, 2>; + +using mf32neon = mask<f32, 4>; +using mf64neon = mask<f64, 2>; +using mi8neon = mask<i8, 16>; +using mi16neon = mask<i16, 8>; +using mi32neon = mask<i32, 4>; +using mi64neon = mask<i64, 2>; +using mu8neon = mask<u8, 16>; +using mu16neon = mask<u16, 8>; +using mu32neon = mask<u32, 4>; +using mu64neon = mask<u64, 2>; +#endif template <cpu_t c, typename T> constexpr inline size_t next_simd_width(size_t n) { +#ifdef CID_ARCH_X86 return n > vector_width<T, cpu_t::sse2> ? 
vector_width<T, c> : vector_width<T, cpu_t::sse2>; +#endif +#ifdef CID_ARCH_ARM + return vector_width<T, cpu_t::neon>; +#endif } template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)> diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp @@ -323,7 +323,8 @@ constexpr inline ptrdiff_t distance(const void* x, const void* y) enum class cpu_t : int { - common = 0, + common = 0, +#ifdef CID_ARCH_X86 sse2 = 1, sse3 = 2, ssse3 = 3, @@ -332,9 +333,15 @@ enum class cpu_t : int avx1 = 6, avx2 = 7, avx = static_cast<int>(avx1), - native = static_cast<int>(KFR_ARCH_NAME), lowest = static_cast<int>(sse2), highest = static_cast<int>(avx2), +#endif +#ifdef CID_ARCH_ARM + neon = 1, + lowest = static_cast<int>(neon), + highest = static_cast<int>(neon), +#endif + native = static_cast<int>(KFR_ARCH_NAME), runtime = -1, }; @@ -349,8 +356,12 @@ namespace internal constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); } constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); } +#ifdef CID_ARCH_X86 constexpr auto cpu_list = cvals<cpu_t, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3, cpu_t::sse3, cpu_t::sse2>; +#else +constexpr auto cpu_list = cvals<cpu_t, cpu_t::neon>; +#endif } template <cpu_t cpu> @@ -359,8 +370,6 @@ template <cpu_t cpu> constexpr auto cpuval = cpuval_t<cpu>{}; constexpr auto cpu_all = cfilter(internal::cpu_list, internal::cpu_list >= cpuval<cpu_t::native>); -constexpr auto cpu_shuffle = - cfilter(cpu_all, cpu_all != cpuval<cpu_t::sse3> && cpu_all != cpuval<cpu_t::ssse3>); template <typename T> constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value @@ -655,10 +664,20 @@ constexpr size_t common_int_vector_size = 16; template <cpu_t c> constexpr size_t native_float_vector_size = +#ifdef CID_ARCH_X86 c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 
16 : common_float_vector_size; +#endif +#ifdef CID_ARCH_ARM +c == cpu_t::neon ? 16 : common_float_vector_size; +#endif template <cpu_t c> constexpr size_t native_int_vector_size = +#ifdef CID_ARCH_X86 c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size; +#endif +#ifdef CID_ARCH_ARM +c == cpu_t::neon ? 16 : common_int_vector_size; +#endif struct input_expression { @@ -709,7 +728,12 @@ template <cpu_t c> constexpr size_t native_vector_alignment = std::max(native_float_vector_size<c>, native_int_vector_size<c>); template <cpu_t c> -constexpr bool fast_unaligned = c >= cpu_t::avx1; +constexpr bool fast_unaligned = +#ifdef CID_ARCH_X86 + c >= cpu_t::avx1; +#else + false; +#endif template <cpu_t c> constexpr size_t native_vector_alignment_mask = native_vector_alignment<c> - 1; diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -124,6 +124,17 @@ #define CID_ARCH_X32 1 #endif +#ifdef __ARM_NEON__ + +#if __ARM_ARCH >= 8 && defined(__aarch64__) +#define CID_ARCH_NEON64 1 +#define CID_ARCH_NAME neon64 +#else +#define CID_ARCH_NEON 1 +#define CID_ARCH_NAME neon +#endif +#endif + #endif #ifndef CID_ARCH_NAME