commit 52e3c3395ad7c58d4a02c0a0c2dc54b15b669920
parent 90bd5e0bdc190e7b93ca72480fa14dd906c92ce7
Author: [email protected] <[email protected]>
Date: Mon, 25 Jul 2016 14:02:49 +0300
Initial support for ARM NEON
Diffstat:
3 files changed, 69 insertions(+), 6 deletions(-)
diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp
@@ -42,7 +42,7 @@ namespace kfr
namespace internal
{
-
+#ifdef CID_ARCH_X86
using f32sse = vec<f32, 4>;
using f64sse = vec<f64, 2>;
using i8sse = vec<i8, vector_width<i8, cpu_t::sse2>>;
@@ -86,11 +86,39 @@ using mu8avx = mask<u8, vector_width<u8, cpu_t::avx2>>;
using mu16avx = mask<u16, vector_width<u16, cpu_t::avx2>>;
using mu32avx = mask<u32, vector_width<u32, cpu_t::avx2>>;
using mu64avx = mask<u64, vector_width<u64, cpu_t::avx2>>;
+#else
+using f32neon = vec<f32, 4>;
+using f64neon = vec<f64, 2>;
+using i8neon = vec<i8, 16>;
+using i16neon = vec<i16, 8>;
+using i32neon = vec<i32, 4>;
+using i64neon = vec<i64, 2>;
+using u8neon = vec<u8, 16>;
+using u16neon = vec<u16, 8>;
+using u32neon = vec<u32, 4>;
+using u64neon = vec<u64, 2>;
+// Mask aliases mirroring the vector aliases above (same element types and lane counts).
+using mf32neon = mask<f32, 4>;
+using mf64neon = mask<f64, 2>;
+using mi8neon = mask<i8, 16>;
+using mi16neon = mask<i16, 8>;
+using mi32neon = mask<i32, 4>;
+using mi64neon = mask<i64, 2>;
+using mu8neon = mask<u8, 16>;
+using mu16neon = mask<u16, 8>;
+using mu32neon = mask<u32, 4>;
+using mu64neon = mask<u64, 2>;
+#endif
template <cpu_t c, typename T>
constexpr inline size_t next_simd_width(size_t n)
{
+ // x86 steps up from the SSE2 width to c's width once n exceeds it; ARM always uses the NEON width.
+#if defined(CID_ARCH_X86)
return n > vector_width<T, cpu_t::sse2> ? vector_width<T, c> : vector_width<T, cpu_t::sse2>;
+#elif defined(CID_ARCH_ARM)
+ return vector_width<T, cpu_t::neon>;
+#endif
}
template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)>
diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp
@@ -323,7 +323,8 @@ constexpr inline ptrdiff_t distance(const void* x, const void* y)
enum class cpu_t : int
{
- common = 0,
+ common = 0,
+#ifdef CID_ARCH_X86
sse2 = 1,
sse3 = 2,
ssse3 = 3,
@@ -332,9 +333,15 @@ enum class cpu_t : int
avx1 = 6,
avx2 = 7,
avx = static_cast<int>(avx1),
- native = static_cast<int>(KFR_ARCH_NAME),
lowest = static_cast<int>(sse2),
highest = static_cast<int>(avx2),
+#endif
+#ifdef CID_ARCH_ARM
+ neon = 1,
+ lowest = static_cast<int>(neon),
+ highest = static_cast<int>(neon),
+#endif
+ native = static_cast<int>(KFR_ARCH_NAME),
runtime = -1,
};
@@ -349,8 +356,12 @@ namespace internal
constexpr cpu_t older(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) - 1); }
constexpr cpu_t newer(cpu_t x) { return static_cast<cpu_t>(static_cast<int>(x) + 1); }
+#if defined(CID_ARCH_X86)
constexpr auto cpu_list =
cvals<cpu_t, cpu_t::avx2, cpu_t::avx1, cpu_t::sse41, cpu_t::ssse3, cpu_t::sse3, cpu_t::sse2>;
+#else
+constexpr auto cpu_list = cvals<cpu_t, cpu_t::neon>;
+#endif
}
template <cpu_t cpu>
@@ -359,8 +370,6 @@ template <cpu_t cpu>
constexpr auto cpuval = cpuval_t<cpu>{};
constexpr auto cpu_all = cfilter(internal::cpu_list, internal::cpu_list >= cpuval<cpu_t::native>);
-constexpr auto cpu_shuffle =
- cfilter(cpu_all, cpu_all != cpuval<cpu_t::sse3> && cpu_all != cpuval<cpu_t::ssse3>);
template <typename T>
constexpr datatype typeclass = std::is_floating_point<typename compound_type_traits<T>::subtype>::value
@@ -655,10 +664,20 @@ constexpr size_t common_int_vector_size = 16;
template <cpu_t c>
constexpr size_t native_float_vector_size =
+// x86: 32 from AVX up, 16 from SSE2 up; ARM: 16 for NEON; otherwise common_float_vector_size.
+#if defined(CID_ARCH_X86)
c >= cpu_t::avx1 ? 32 : c >= cpu_t::sse2 ? 16 : common_float_vector_size;
+#elif defined(CID_ARCH_ARM)
+c == cpu_t::neon ? 16 : common_float_vector_size;
+#endif
template <cpu_t c>
constexpr size_t native_int_vector_size =
+// x86: 32 from AVX2 up, 16 from SSE2 up; ARM: 16 for NEON; otherwise common_int_vector_size.
+#if defined(CID_ARCH_X86)
c >= cpu_t::avx2 ? 32 : c >= cpu_t::sse2 ? 16 : common_int_vector_size;
+#elif defined(CID_ARCH_ARM)
+c == cpu_t::neon ? 16 : common_int_vector_size;
+#endif
struct input_expression
{
@@ -709,7 +728,12 @@ template <cpu_t c>
constexpr size_t native_vector_alignment = std::max(native_float_vector_size<c>, native_int_vector_size<c>);
template <cpu_t c>
-constexpr bool fast_unaligned = c >= cpu_t::avx1;
+// True only for x86 targets at AVX level or newer; every non-x86 build gets false.
+#ifdef CID_ARCH_X86
+constexpr bool fast_unaligned = c >= cpu_t::avx1;
+#else
+constexpr bool fast_unaligned = false;
+#endif
template <cpu_t c>
constexpr size_t native_vector_alignment_mask = native_vector_alignment<c> - 1;
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -124,6 +124,17 @@
#define CID_ARCH_X32 1
#endif
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+// ACLE names the feature macro __ARM_NEON; __ARM_NEON__ is the older spelling, still accepted for compilers that only set it.
+#if __ARM_ARCH >= 8 && defined(__aarch64__)
+#define CID_ARCH_NEON64 1
+#define CID_ARCH_NAME neon64
+#else
+#define CID_ARCH_NEON 1
+#define CID_ARCH_NAME neon
+#endif
+#endif
+
#endif
#ifndef CID_ARCH_NAME