Initial support for ARM NEON - kfr - Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 52e3c3395ad7c58d4a02c0a0c2dc54b15b669920
parent 90bd5e0bdc190e7b93ca72480fa14dd906c92ce7
Author: [email protected] <[email protected]>
Date:   Mon, 25 Jul 2016 14:02:49 +0300

Initial support for ARM NEON

Diffstat:
M include/kfr/base/function.hpp  | 30 +++++++++++++++++++++++++++++-
M include/kfr/base/types.hpp  | 34 +++++++++++++++++++++++++++++-----
M include/kfr/cident.h  | 11 +++++++++++

3 files changed, 69 insertions(+), 6 deletions(-)
diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp
@@ -42,7 +42,7 @@ namespace kfr
 
 namespace internal
 {
-
+#ifdef CID_ARCH_X86
 using f32sse = vec<f32, 4>;
 using f64sse = vec<f64, 2>;
 using i8sse  = vec<i8, vector_width<i8, cpu_t::sse2>>;
@@ -86,11 +86,39 @@ using mu8avx  = mask<u8, vector_width<u8, cpu_t::avx2>>;
 using mu16avx = mask<u16, vector_width<u16, cpu_t::avx2>>;
 using mu32avx = mask<u32, vector_width<u32, cpu_t::avx2>>;
 using mu64avx = mask<u64, vector_width<u64, cpu_t::avx2>>;
+#else // not CID_ARCH_X86 (taken for every non-x86 build): NEON vector and mask aliases with literal lane counts
+using f32neon = vec<f32, 4>;
+using f64neon = vec<f64, 2>;
+using  i8neon  = vec<i8, 16>;
+using i16neon = vec<i16, 8>;
+using i32neon = vec<i32, 4>;
+using i64neon = vec<i64, 2>;
+using  u8neon  = vec<u8, 16>;
+using u16neon = vec<u16, 8>;
+using u32neon = vec<u32, 4>;
+using u64neon = vec<u64, 2>;
+
+using mf32neon = mask<f32, 4>;
+using mf64neon = mask<f64, 2>;
+using mi8neon  = mask<i8, 16>;
+using mi16neon = mask<i16, 8>;
+using mi32neon = mask<i32, 4>;
+using mi64neon = mask<i64, 2>;
+using mu8neon  = mask<u8, 16>;
+using mu16neon = mask<u16, 8>;
+using mu32neon = mask<u32, 4>;
+using mu64neon = mask<u64, 2>;
+#endif // CID_ARCH_X86
 
 template <cpu_t c, typename T>
 constexpr inline size_t next_simd_width(size_t n)
 {
+#ifdef CID_ARCH_X86 // x86: the width for c once n exceeds the SSE2 width, otherwise the SSE2 width
     return n > vector_width<T, cpu_t::sse2> ? vector_width<T, c> : vector_width<T, cpu_t::sse2>;
+#endif // CID_ARCH_X86
+#ifdef CID_ARCH_ARM // ARM: always the NEON width; n and c are not consulted
+    return vector_width<T, cpu_t::neon>;
+#endif // CID_ARCH_ARM; NOTE(review): nothing is returned when neither macro is defined
 }
 
 template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)>
diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp
@@ -323,7 +323,8 @@ constexpr inline ptrdiff_t distance(const void* x, const void* y)
 
 enum class cpu_t : int
 {
-    common  = 0,
+    common = 0,
+#ifdef CID_ARCH_X86
     sse2    = 1,
     sse3    = 2,
     ssse3   = 3,
@@ -332,9 +333,15 @@ enum class cpu_t : int
     avx1    = 6,
     avx2    = 7,
     avx     = static_cast<int>(avx1),
-    native  = static_cast<int>(KFR_ARCH_NAME),
     lowest  = static_cast<int>(sse2),
     highest = static_cast<int>(avx2),
+#endif
+#ifdef CID_ARCH_ARM
+    neon    = 1,
+    lowest  = static_cast<int>(neon),
+    highest = static_cast<int>(neon),
+#endif
+    native  = static_cast<int>(KFR_ARCH_NAME),
     runtime = -1,
 };
 
@@ -349,8 +356,12 @@ namespace internal
 constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); }
 constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); }
 
+#ifdef CID_ARCH_X86 // x86 instruction sets, listed from avx2 down to sse2
 constexpr auto cpu_list =
     cvals<cpu_t, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3, cpu_t::sse3, cpu_t::sse2>;
+#else // all other targets; NOTE(review): cpu_t::neon is declared only under CID_ARCH_ARM - confirm the two guards agree
+constexpr auto cpu_list = cvals<cpu_t, cpu_t::neon>;
+#endif // CID_ARCH_X86
 }
 
 template <cpu_t cpu>
@@ -359,8 +370,6 @@ template <cpu_t cpu>
 constexpr auto cpuval = cpuval_t<cpu>{};
 
 constexpr auto cpu_all = cfilter(internal::cpu_list, internal::cpu_list >= cpuval<cpu_t::native>);
-constexpr auto cpu_shuffle =
-    cfilter(cpu_all, cpu_all != cpuval<cpu_t::sse3> && cpu_all != cpuval<cpu_t::ssse3>);
 
 template <typename T>
 constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value
@@ -655,10 +664,20 @@ constexpr size_t common_int_vector_size   = 16;
 
 template <cpu_t c>
 constexpr size_t native_float_vector_size =
+#ifdef CID_ARCH_X86 // 32 from avx1 up, 16 from sse2 up, otherwise the common size
     c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size;
+#endif // CID_ARCH_X86
+#ifdef CID_ARCH_ARM // 16 for neon, otherwise the common size
+c == cpu_t::neon ? 16 : common_float_vector_size;
+#endif // CID_ARCH_ARM; NOTE(review): the initializer is empty when neither macro is defined
 template <cpu_t c>
 constexpr size_t native_int_vector_size =
+#ifdef CID_ARCH_X86 // 32 from avx2 up, 16 from sse2 up, otherwise the common size
     c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size;
+#endif // CID_ARCH_X86
+#ifdef CID_ARCH_ARM // 16 for neon, otherwise the common size
+c == cpu_t::neon ? 16 : common_int_vector_size;
+#endif // CID_ARCH_ARM; NOTE(review): the initializer is empty when neither macro is defined
 
 struct input_expression
 {
@@ -709,7 +728,12 @@ template <cpu_t c>
 constexpr size_t native_vector_alignment = std::max(native_float_vector_size<c>, native_int_vector_size<c>);
 
 template <cpu_t c>
-constexpr bool fast_unaligned = c >= cpu_t::avx1;
+constexpr bool fast_unaligned =
+#ifdef CID_ARCH_X86 // x86: true from avx1 upward, the same condition as the removed line
+    c >= cpu_t::avx1;
+#else // every non-x86 target: always false
+    false;
+#endif // CID_ARCH_X86
 
 template <cpu_t c>
 constexpr size_t native_vector_alignment_mask = native_vector_alignment<c> - 1;
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -124,6 +124,17 @@
 #define CID_ARCH_X32 1
 #endif
 
+#ifdef __ARM_NEON__ // NOTE(review): the headers in this change test CID_ARCH_ARM, which this hunk does not define - confirm it is set elsewhere
+
+#if __ARM_ARCH >= 8 && defined(__aarch64__) // architecture level 8 or later with __aarch64__ defined -> neon64
+#define CID_ARCH_NEON64 1
+#define CID_ARCH_NAME neon64 // NOTE(review): the visible cpu_t hunk adds only a neon enumerator - confirm neon64 resolves
+#else // any other target that defines __ARM_NEON__ -> neon
+#define CID_ARCH_NEON 1
+#define CID_ARCH_NAME neon
+#endif // __ARM_ARCH >= 8 && defined(__aarch64__)
+#endif // __ARM_NEON__
+
 #endif
 
 #ifndef CID_ARCH_NAME

	kfr Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
	Log | Files | Refs | README

M	include/kfr/base/function.hpp	|	30	+++++++++++++++++++++++++++++-
M	include/kfr/base/types.hpp	|	34	+++++++++++++++++++++++++++++-----
M	include/kfr/cident.h	|	11	+++++++++++