commit 6ff1acd1d2216a0655755c48a805cb14c130c150
parent e0edb39a1d4fdeda5ddd097077475415dcafb71a
Author: [email protected] <[email protected]>
Date: Mon, 25 Jul 2016 13:01:16 +0300
Move platform specific types to function.hpp
Diffstat:
2 files changed, 143 insertions(+), 48 deletions(-)
diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp
@@ -48,7 +48,7 @@ namespace kfr
}
#define KFR_HANDLE_SCALAR(fn) \
- template <typename T, typename... Ts, KFR_ENABLE_IF(!is_vec<T>::value)> \
+ template <typename T, typename... Ts, KFR_ENABLE_IF(is_numeric_args<T, Ts...>::value)> \
KFR_SINTRIN auto fn(const T& x, const Ts&... rest) \
{ \
return fn(make_vector(x), make_vector(rest)...)[0]; \
@@ -120,5 +120,137 @@ KFR_INLINE auto handle_all_reduce(vec<T, N> x, Args&&... args)
return handle_all_reduce_f<cur>(redfn, fn, x, std::forward<Args>(args)...);
}
}
+
+namespace internal
+{
+
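+// Platform-specific shorthand: full-width SSE2 and AVX vector and mask types.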
+using f32sse = vec<f32, vector_width<f32, cpu_t::sse2>>;
+using f64sse = vec<f64, vector_width<f64, cpu_t::sse2>>;
+using i8sse = vec<i8, vector_width<i8, cpu_t::sse2>>;
+using i16sse = vec<i16, vector_width<i16, cpu_t::sse2>>;
+using i32sse = vec<i32, vector_width<i32, cpu_t::sse2>>;
+using i64sse = vec<i64, vector_width<i64, cpu_t::sse2>>;
+using u8sse = vec<u8, vector_width<u8, cpu_t::sse2>>;
+using u16sse = vec<u16, vector_width<u16, cpu_t::sse2>>;
+using u32sse = vec<u32, vector_width<u32, cpu_t::sse2>>;
+using u64sse = vec<u64, vector_width<u64, cpu_t::sse2>>;
+
+using mf32sse = mask<f32, vector_width<f32, cpu_t::sse2>>;
+using mf64sse = mask<f64, vector_width<f64, cpu_t::sse2>>;
+using mi8sse = mask<i8, vector_width<i8, cpu_t::sse2>>;
+using mi16sse = mask<i16, vector_width<i16, cpu_t::sse2>>;
+using mi32sse = mask<i32, vector_width<i32, cpu_t::sse2>>;
+using mi64sse = mask<i64, vector_width<i64, cpu_t::sse2>>;
+using mu8sse = mask<u8, vector_width<u8, cpu_t::sse2>>;
+using mu16sse = mask<u16, vector_width<u16, cpu_t::sse2>>;
+using mu32sse = mask<u32, vector_width<u32, cpu_t::sse2>>;
+using mu64sse = mask<u64, vector_width<u64, cpu_t::sse2>>;
+
+using f32avx = vec<f32, vector_width<f32, cpu_t::avx1>>;
+using f64avx = vec<f64, vector_width<f64, cpu_t::avx1>>;
+using i8avx = vec<i8, vector_width<i8, cpu_t::avx2>>;
+using i16avx = vec<i16, vector_width<i16, cpu_t::avx2>>;
+using i32avx = vec<i32, vector_width<i32, cpu_t::avx2>>;
+using i64avx = vec<i64, vector_width<i64, cpu_t::avx2>>;
+using u8avx = vec<u8, vector_width<u8, cpu_t::avx2>>;
+using u16avx = vec<u16, vector_width<u16, cpu_t::avx2>>;
+using u32avx = vec<u32, vector_width<u32, cpu_t::avx2>>;
+using u64avx = vec<u64, vector_width<u64, cpu_t::avx2>>;
+
+using mf32avx = mask<f32, vector_width<f32, cpu_t::avx1>>;
+using mf64avx = mask<f64, vector_width<f64, cpu_t::avx1>>;
+using mi8avx = mask<i8, vector_width<i8, cpu_t::avx2>>;
+using mi16avx = mask<i16, vector_width<i16, cpu_t::avx2>>;
+using mi32avx = mask<i32, vector_width<i32, cpu_t::avx2>>;
+using mi64avx = mask<i64, vector_width<i64, cpu_t::avx2>>;
+using mu8avx = mask<u8, vector_width<u8, cpu_t::avx2>>;
+using mu16avx = mask<u16, vector_width<u16, cpu_t::avx2>>;
+using mu32avx = mask<u32, vector_width<u32, cpu_t::avx2>>;
+using mu64avx = mask<u64, vector_width<u64, cpu_t::avx2>>;
+
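+// Given a requested element count n, pick the width the argument should be
+// widened to: anything that fits in an SSE2 register stays at the SSE2
+// width; larger vectors get the full width of the target cpu_t.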
+template <cpu_t c, typename T>
+constexpr inline size_t next_simd_width(size_t n)
+{
+ return n > vector_width<T, cpu_t::sse2> ? vector_width<T, c> : vector_width<T, cpu_t::sse2>;
+}
+
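+// Widen a short vector to the next SIMD width so the full-width
+// implementation of a function can be reused; the extra lanes are
+// unspecified and are sliced away again by the callers below.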
+template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)>
+KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x)
+{
+ return extend<Nout>(x);
+}
+
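+// KFR_HANDLE_ALL_SIZES_n(fn) generates the missing vector sizes for an
+// n-argument fn that is implemented only at the native width: narrower
+// vectors are widened via expand_simd and sliced back, wider vectors are
+// split into halves recursively. KFR_HANDLE_SCALAR_n(fn) routes scalar
+// arguments through one-element vectors.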
+#define KFR_HANDLE_ALL_SIZES_1(fn) \
+ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
+ KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ { \
+ return slice<0, N>(fn(expand_simd(a))); \
+ } \
+ template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
+ KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ { \
+ return concat(fn(low(a)), fn(high(a))); \
+ }
+#define KFR_HANDLE_SCALAR_1(fn) \
+ template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)> \
+ KFR_SINTRIN T fn(T a) \
+ { \
+ return fn(make_vector(a))[0]; \
+ }
+
+#define KFR_HANDLE_ALL_SIZES_2(fn) \
+ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
+ KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b) \
+ { \
+ return slice<0, N>(fn(expand_simd(a), expand_simd(b))); \
+ } \
+ template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
+ KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b) \
+ { \
+ return concat(fn(low(a), low(b)), fn(high(a), high(b))); \
+ }
+#define KFR_HANDLE_SCALAR_2(fn) \
+ template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)> \
+ KFR_SINTRIN T fn(T a, T b) \
+ { \
+ return fn(make_vector(a), make_vector(b))[0]; \
+ }
+
+#define KFR_HANDLE_ALL_SIZES_3(fn) \
+ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
+ KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c) \
+ { \
+ return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c))); \
+ } \
+ template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
+ KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c) \
+ { \
+ return concat(fn(low(a), low(b), low(c)), fn(high(a), high(b), high(c))); \
+ }
+#define KFR_HANDLE_SCALAR_3(fn) \
+ template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)> \
+ KFR_SINTRIN T fn(T a, T b, T c) \
+ { \
+ return fn(make_vector(a), make_vector(b), make_vector(c))[0]; \
+ }
+
+#define KFR_HANDLE_ALL_SIZES_4(fn) \
+ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
+ KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c, vec<T, N> d) \
+ { \
+ return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c), expand_simd(d))); \
+ } \
+ template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
+ KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c, vec<T, N> d) \
+ { \
+ return concat(fn(low(a), low(b), low(c), low(d)), fn(high(a), high(b), high(c), high(d))); \
+ }
+#define KFR_HANDLE_SCALAR_4(fn) \
+ template <typename T, KFR_ENABLE_IF(is_numeric<T>::value)> \
+ KFR_SINTRIN T fn(T a, T b, T c, T d) \
+ { \
+ return fn(make_vector(a), make_vector(b), make_vector(c), make_vector(d))[0]; \
+ }
+}
}
#pragma clang diagnostic pop
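A minimal sketch of how these helpers are intended to combine (the function
name test_fn and the SSE2-only build are assumptions for illustration, not
part of this commit): one overload is written at the native width, and the
macros generate the remaining sizes.

// Hypothetical sketch; assumes cpu_t::native == cpu_t::sse2, so the
// native width for f32 is 4 and f32sse is the native-width type.
namespace kfr
{
namespace internal
{
KFR_SINTRIN f32sse test_fn(f32sse x, f32sse y) { return x + y; }

KFR_HANDLE_ALL_SIZES_2(test_fn) // vec<f32, 1> / vec<f32, 2>: widen to 4 lanes
                                // vec<f32, 8>, vec<f32, 16>, ...: split in half
KFR_HANDLE_SCALAR_2(test_fn)    // f32: computed as test_fn(make_vector(a), ...)[0]
}
}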
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -717,10 +717,10 @@ struct mask : public vec<T, N>
{
}
- template <typename M, typename = u8[sizeof(T) == sizeof(M)]>
- constexpr KFR_INLINE mask(mask<M, N> value) : base(reinterpret_cast<const vec<T, N>&>(value))
- {
- }
+// template <typename M, typename = u8[sizeof(T) == sizeof(M)]>
+// constexpr KFR_INLINE mask(mask<M, N> value) : base(reinterpret_cast<const vec<T, N>&>(value))
+// {
+// }
constexpr KFR_INLINE mask operator~() const { return bitcast<T>(~ubitcast(this->v)); }
constexpr KFR_INLINE mask operator&(vec<T, N> x) const
{
@@ -744,6 +744,12 @@ struct mask : public vec<T, N>
KFR_INLINE vec<T, N>& asvec() { return ref_cast<mask>(*this); }
KFR_INLINE const vec<T, N>& asvec() const { return ref_cast<mask>(*this); }
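+ // Masks with equally sized element types share one bit pattern, so the
+ // conversion is a plain bitcast (replaces the constructor disabled above).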
+ template <typename U, KFR_ENABLE_IF(sizeof(T) == sizeof(U))>
+ KFR_INLINE operator mask<U, N>() const
+ {
+ return bitcast<U>(*this);
+ }
+
KFR_INLINE bool operator[](size_t index) const { return ibitcast(this->v[index]) < 0; }
};
@@ -1041,49 +1047,6 @@ using double16 = f64x16;
namespace internal
{
-using f32sse = vec<f32, vector_width<f32, cpu_t::sse2>>;
-using f64sse = vec<f64, vector_width<f64, cpu_t::sse2>>;
-using i8sse = vec<i8, vector_width<i8, cpu_t::sse2>>;
-using i16sse = vec<i16, vector_width<i16, cpu_t::sse2>>;
-using i32sse = vec<i32, vector_width<i32, cpu_t::sse2>>;
-using i64sse = vec<i64, vector_width<i64, cpu_t::sse2>>;
-using u8sse = vec<u8, vector_width<u8, cpu_t::sse2>>;
-using u16sse = vec<u16, vector_width<u16, cpu_t::sse2>>;
-using u32sse = vec<u32, vector_width<u32, cpu_t::sse2>>;
-using u64sse = vec<u64, vector_width<u64, cpu_t::sse2>>;
-
-using mf32sse = mask<f32, vector_width<f32, cpu_t::sse2>>;
-using mf64sse = mask<f64, vector_width<f64, cpu_t::sse2>>;
-using mi8sse = mask<i8, vector_width<i8, cpu_t::sse2>>;
-using mi16sse = mask<i16, vector_width<i16, cpu_t::sse2>>;
-using mi32sse = mask<i32, vector_width<i32, cpu_t::sse2>>;
-using mi64sse = mask<i64, vector_width<i64, cpu_t::sse2>>;
-using mu8sse = mask<u8, vector_width<u8, cpu_t::sse2>>;
-using mu16sse = mask<u16, vector_width<u16, cpu_t::sse2>>;
-using mu32sse = mask<u32, vector_width<u32, cpu_t::sse2>>;
-using mu64sse = mask<u64, vector_width<u64, cpu_t::sse2>>;
-
-using f32avx = vec<f32, vector_width<f32, cpu_t::avx1>>;
-using f64avx = vec<f64, vector_width<f64, cpu_t::avx1>>;
-using i8avx = vec<i8, vector_width<i8, cpu_t::avx2>>;
-using i16avx = vec<i16, vector_width<i16, cpu_t::avx2>>;
-using i32avx = vec<i32, vector_width<i32, cpu_t::avx2>>;
-using i64avx = vec<i64, vector_width<i64, cpu_t::avx2>>;
-using u8avx = vec<u8, vector_width<u8, cpu_t::avx2>>;
-using u16avx = vec<u16, vector_width<u16, cpu_t::avx2>>;
-using u32avx = vec<u32, vector_width<u32, cpu_t::avx2>>;
-using u64avx = vec<u64, vector_width<u64, cpu_t::avx2>>;
-
-using mf32avx = mask<f32, vector_width<f32, cpu_t::avx1>>;
-using mf64avx = mask<f64, vector_width<f64, cpu_t::avx1>>;
-using mi8avx = mask<i8, vector_width<i8, cpu_t::avx2>>;
-using mi16avx = mask<i16, vector_width<i16, cpu_t::avx2>>;
-using mi32avx = mask<i32, vector_width<i32, cpu_t::avx2>>;
-using mi64avx = mask<i64, vector_width<i64, cpu_t::avx2>>;
-using mu8avx = mask<u8, vector_width<u8, cpu_t::avx2>>;
-using mu16avx = mask<u16, vector_width<u16, cpu_t::avx2>>;
-using mu32avx = mask<u32, vector_width<u32, cpu_t::avx2>>;
-using mu64avx = mask<u64, vector_width<u64, cpu_t::avx2>>;
template <typename T, size_t N>
struct vec_type
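For the new conversion operator, a minimal usage sketch (variable names are
hypothetical; this also assumes the usual KFR comparison operators on vec,
which yield a mask):

// Hypothetical usage of mask<T, N>'s new operator mask<U, N>().
kfr::vec<kfr::f32, 4>  a(1.f, 2.f, 3.f, 4.f);
kfr::vec<kfr::f32, 4>  b(4.f, 3.f, 2.f, 1.f);
kfr::mask<kfr::f32, 4> m = a > b;  // per-lane comparison result

kfr::mask<kfr::i32, 4> mi = m;     // ok: sizeof(f32) == sizeof(i32), pure bitcast
kfr::mask<kfr::u32, 4> mu = m;     // ok: sizeof(f32) == sizeof(u32)
// kfr::mask<kfr::f64, 4> bad = m; // ill-formed: sizeof(f32) != sizeof(f64)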