commit 1ff05879d309acbffc4117521d9303d534d4fdb5
parent 1ee8ac5b3ae8bcb5c49847fe9860b4225e52eba5
Author: [email protected] <[email protected]>
Date: Tue, 2 Aug 2016 21:52:08 +0300
Improve ARM NEON support
Diffstat:
7 files changed, 205 insertions(+), 4 deletions(-)
diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp
@@ -31,6 +31,9 @@ namespace kfr
namespace intrinsics
{
+
+#if defined CID_ARCH_SSSE3
+
// floating point
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
@@ -38,8 +41,6 @@ KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
return x & internal::invhighbitmask<T>;
}
-#if defined CID_ARCH_SSSE3
-
KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); }
KFR_SINTRIN i32sse abs(const i32sse& x) { return _mm_abs_epi32(*x); }
KFR_SINTRIN i16sse abs(const i16sse& x) { return _mm_abs_epi16(*x); }
@@ -62,7 +63,39 @@ KFR_SINTRIN u8avx abs(const u8avx& x) { return x; }
KFR_HANDLE_ALL_SIZES_NOT_F_1(abs)
+#elif defined CID_ARCH_NEON
+
+KFR_SINTRIN i8neon abs(const i8neon& x) { return vabsq_s8(*x); }
+KFR_SINTRIN i16neon abs(const i16neon& x) { return vabsq_s16(*x); }
+KFR_SINTRIN i32neon abs(const i32neon& x) { return vabsq_s32(*x); }
+#if defined CID_ARCH_NEON64
+KFR_SINTRIN i64neon abs(const i64neon& x) { return vabsq_s64(*x); }
#else
+KFR_SINTRIN i64neon abs(const i64neon& x) { return select(x >= 0, x, -x); }
+#endif
+
+KFR_SINTRIN u8neon abs(const u8neon& x) { return x; }
+KFR_SINTRIN u16neon abs(const u16neon& x) { return x; }
+KFR_SINTRIN u32neon abs(const u32neon& x) { return x; }
+KFR_SINTRIN u64neon abs(const u64neon& x) { return x; }
+
+KFR_SINTRIN f32neon abs(const f32neon& x) { return vabsq_f32(*x); }
+#if defined CID_ARCH_NEON64
+KFR_SINTRIN f64neon abs(const f64neon& x) { return vabsq_f64(*x); }
+#else
+KFR_SINTRIN f64neon abs(const f64neon& x) { return x & internal::invhighbitmask<f64>; }
+#endif
+
+KFR_HANDLE_ALL_SIZES_1(abs)
+
+#else
+
+// floating point
+template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
+KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
+{
+ return x & internal::invhighbitmask<T>;
+}
// fallback
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
diff --git a/include/kfr/base/cpuid.hpp b/include/kfr/base/cpuid.hpp
@@ -27,6 +27,7 @@
namespace kfr
{
+#ifdef CID_ARCH_X86
struct cpu_features
{
@@ -277,4 +278,13 @@ cpu_t detect_cpu()
return cpu_t::lowest;
}
}
+#else
+
+template <size_t = 0>
+cpu_t detect_cpu()
+{
+ return cpu_t::native;
+}
+
+#endif
}
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -158,6 +158,59 @@ KFR_SINTRIN bool bittestany(const vec<T, N>& a)
return bittestany(low(a)) || bittestany(high(a));
}
+#elif defined CID_ARCH_NEON
+
+KFR_SINTRIN bool bittestall(const u32neon& a)
+{
+ const uint32x2_t tmp = vand_u32(vget_low_u32(*a), vget_high_u32(*a));
+ return vget_lane_u32(vpmin_u32(tmp, tmp), 0) == 0xFFFFFFFFu;
+}
+
+KFR_SINTRIN bool bittestany(const u32neon& a)
+{
+ const uint32x2_t tmp = vorr_u32(vget_low_u32(*a), vget_high_u32(*a));
+ return vget_lane_u32(vpmax_u32(tmp, tmp), 0) != 0;
+}
+KFR_SINTRIN bool bittestany(const u8neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestany(const u16neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestany(const u64neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestany(const i8neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestany(const i16neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestany(const i64neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestany(const f32neon& a) { return bittestany(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestany(const f64neon& a) { return bittestany(bitcast<u32>(a)); }
+
+KFR_SINTRIN bool bittestall(const u8neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestall(const u16neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestall(const u64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestall(const i8neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestall(const i16neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestall(const i64neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestall(const f32neon& a) { return bittestall(bitcast<u32>(a)); }
+KFR_SINTRIN bool bittestall(const f64neon& a) { return bittestall(bitcast<u32>(a)); }
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+KFR_SINTRIN bool bittestall(const vec<T, N>& a)
+{
+ return bittestall(expand_simd(a, internal::maskbits<T>(true)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
+KFR_SINTRIN bool bittestall(const vec<T, N>& a)
+{
+ return bittestall(low(a)) && bittestall(high(a));
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
+KFR_SINTRIN bool bittestany(const vec<T, N>& a)
+{
+ return bittestany(expand_simd(a, internal::maskbits<T>(false)));
+}
+template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
+KFR_SINTRIN bool bittestany(const vec<T, N>& a)
+{
+ return bittestany(low(a)) || bittestany(high(a));
+}
+
#else
template <typename T, size_t N>
diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp
@@ -103,6 +103,37 @@ KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y,
KFR_HANDLE_ALL_SIZES_2(min)
KFR_HANDLE_ALL_SIZES_2(max)
+#elif defined CID_ARCH_NEON
+
+KFR_SINTRIN i8neon min(const i8neon& x, const i8neon& y) { return vminq_s8(*x, *y); }
+KFR_SINTRIN u8neon min(const u8neon& x, const u8neon& y) { return vminq_u8(*x, *y); }
+KFR_SINTRIN i16neon min(const i16neon& x, const i16neon& y) { return vminq_s16(*x, *y); }
+KFR_SINTRIN u16neon min(const u16neon& x, const u16neon& y) { return vminq_u16(*x, *y); }
+KFR_SINTRIN i32neon min(const i32neon& x, const i32neon& y) { return vminq_s32(*x, *y); }
+KFR_SINTRIN u32neon min(const u32neon& x, const u32neon& y) { return vminq_u32(*x, *y); }
+
+KFR_SINTRIN i8neon max(const i8neon& x, const i8neon& y) { return vmaxq_s8(*x, *y); }
+KFR_SINTRIN u8neon max(const u8neon& x, const u8neon& y) { return vmaxq_u8(*x, *y); }
+KFR_SINTRIN i16neon max(const i16neon& x, const i16neon& y) { return vmaxq_s16(*x, *y); }
+KFR_SINTRIN u16neon max(const u16neon& x, const u16neon& y) { return vmaxq_u16(*x, *y); }
+KFR_SINTRIN i32neon max(const i32neon& x, const i32neon& y) { return vmaxq_s32(*x, *y); }
+KFR_SINTRIN u32neon max(const u32neon& x, const u32neon& y) { return vmaxq_u32(*x, *y); }
+KFR_SINTRIN i64neon min(const i64neon& x, const i64neon& y) { return select(x < y, x, y); }
+KFR_SINTRIN u64neon min(const u64neon& x, const u64neon& y) { return select(x < y, x, y); }
+KFR_SINTRIN i64neon max(const i64neon& x, const i64neon& y) { return select(x > y, x, y); }
+KFR_SINTRIN u64neon max(const u64neon& x, const u64neon& y) { return select(x > y, x, y); }
+
+KFR_SINTRIN f32neon min(const f32neon& x, const f32neon& y) { return vminq_f32(*x, *y); }
+KFR_SINTRIN f32neon max(const f32neon& x, const f32neon& y) { return vmaxq_f32(*x, *y); }
+#if defined CID_ARCH_NEON64
+KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return vminq_f64(*x, *y); }
+KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return vmaxq_f64(*x, *y); }
+#else
+KFR_SINTRIN f64neon min(const f64neon& x, const f64neon& y) { return select(x < y, x, y); }
+KFR_SINTRIN f64neon max(const f64neon& x, const f64neon& y) { return select(x > y, x, y); }
+#endif
+
+KFR_HANDLE_ALL_SIZES_2(min)
+KFR_HANDLE_ALL_SIZES_2(max)
+
#else
// fallback
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -30,6 +30,8 @@ namespace kfr
namespace intrinsics
{
+
+// Generic functions
template <typename T, size_t N>
KFR_SINTRIN vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
{
@@ -103,6 +105,31 @@ KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs
KFR_HANDLE_ALL_SIZES_2(satadd)
KFR_HANDLE_ALL_SIZES_2(satsub)
+#elif defined CID_ARCH_NEON
+
+KFR_SINTRIN u8neon satadd(const u8neon& x, const u8neon& y) { return vqaddq_u8(*x, *y); }
+KFR_SINTRIN i8neon satadd(const i8neon& x, const i8neon& y) { return vqaddq_s8(*x, *y); }
+KFR_SINTRIN u16neon satadd(const u16neon& x, const u16neon& y) { return vqaddq_u16(*x, *y); }
+KFR_SINTRIN i16neon satadd(const i16neon& x, const i16neon& y) { return vqaddq_s16(*x, *y); }
+
+KFR_SINTRIN u8neon satsub(const u8neon& x, const u8neon& y) { return vqsubq_u8(*x, *y); }
+KFR_SINTRIN i8neon satsub(const i8neon& x, const i8neon& y) { return vqsubq_s8(*x, *y); }
+KFR_SINTRIN u16neon satsub(const u16neon& x, const u16neon& y) { return vqsubq_u16(*x, *y); }
+KFR_SINTRIN i16neon satsub(const i16neon& x, const i16neon& y) { return vqsubq_s16(*x, *y); }
+
+KFR_SINTRIN u32neon satadd(const u32neon& a, const u32neon& b) { return vqaddq_u32(*a, *b); }
+KFR_SINTRIN i32neon satadd(const i32neon& a, const i32neon& b) { return vqaddq_s32(*a, *b); }
+KFR_SINTRIN u64neon satadd(const u64neon& a, const u64neon& b) { return vqaddq_u64(*a, *b); }
+KFR_SINTRIN i64neon satadd(const i64neon& a, const i64neon& b) { return vqaddq_s64(*a, *b); }
+
+KFR_SINTRIN i32neon satsub(const i32neon& a, const i32neon& b) { return vqsubq_s32(*a, *b); }
+KFR_SINTRIN i64neon satsub(const i64neon& a, const i64neon& b) { return vqsubq_s64(*a, *b); }
+KFR_SINTRIN u32neon satsub(const u32neon& a, const u32neon& b) { return vqsubq_u32(*a, *b); }
+KFR_SINTRIN u64neon satsub(const u64neon& a, const u64neon& b) { return vqsubq_u64(*a, *b); }
+
+KFR_HANDLE_ALL_SIZES_2(satadd)
+KFR_HANDLE_ALL_SIZES_2(satsub)
+
#else
// fallback
template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -129,6 +129,53 @@ KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<
return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c)));
}
+#elif defined CID_ARCH_NEON
+
+KFR_SINTRIN f32neon select(const mf32neon& m, const f32neon& x, const f32neon& y)
+{
+ return vbslq_f32(*m, *x, *y);
+}
+
+KFR_SINTRIN i8neon select(const mi8neon& m, const i8neon& x, const i8neon& y) { return vbslq_s8(*m, *x, *y); }
+KFR_SINTRIN u8neon select(const mu8neon& m, const u8neon& x, const u8neon& y) { return vbslq_u8(*m, *x, *y); }
+KFR_SINTRIN i16neon select(const mi16neon& m, const i16neon& x, const i16neon& y)
+{
+ return vbslq_s16(*m, *x, *y);
+}
+KFR_SINTRIN u16neon select(const mu16neon& m, const u16neon& x, const u16neon& y)
+{
+ return vbslq_u16(*m, *x, *y);
+}
+KFR_SINTRIN i32neon select(const mi32neon& m, const i32neon& x, const i32neon& y)
+{
+ return vbslq_s32(*m, *x, *y);
+}
+KFR_SINTRIN u32neon select(const mu32neon& m, const u32neon& x, const u32neon& y)
+{
+ return vbslq_u32(*m, *x, *y);
+}
+KFR_SINTRIN i64neon select(const mi64neon& m, const i64neon& x, const i64neon& y)
+{
+ return vbslq_s64(*m, *x, *y);
+}
+KFR_SINTRIN u64neon select(const mu64neon& m, const u64neon& x, const u64neon& y)
+{
+ return vbslq_u64(*m, *x, *y);
+}
+
+#ifdef CID_ARCH_NEON64
+KFR_SINTRIN f64neon select(const mf64neon& m, const f64neon& x, const f64neon& y)
+{
+ return vbslq_f64(*m, *x, *y);
+}
+#else
+template <typename T, size_t N>
+KFR_SINTRIN vec<T, N> select(const mask<T, N>& m, const vec<T, N>& x, const vec<T, N>& y)
+{
+ return y ^ ((x ^ y) & m);
+}
+#endif
+
#else
// fallback
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -2,7 +2,7 @@
#if defined(_M_IX86) || defined(__i386__) || defined(_M_X64) || defined(__x86_64__)
#define CID_ARCH_X86 1
-#elif defined(__arm__) || defined(__arm64__) || defined(_M_ARM)
+#elif defined(__arm__) || defined(__arm64__) || defined(_M_ARM) || defined(__aarch64__)
#define CID_ARCH_ARM 1
#endif
@@ -118,7 +118,7 @@
#elif defined(CID_ARCH_ARM)
-#if defined(__arm64__)
+#if defined(__aarch64__)
#define CID_ARCH_X64 1
#else
#define CID_ARCH_X32 1