commit 8ca9e4785460793e5af8107fc87cd1a62b59f6ac
parent 64569318f21ece90d0632df3e14cbcbc793437be
Author: [email protected] <[email protected]>
Date: Sat, 8 Oct 2016 03:07:59 +0300
New implementation for SIMD read/write
Diffstat:
5 files changed, 115 insertions(+), 172 deletions(-)
diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp
@@ -35,13 +35,13 @@ namespace kfr
template <size_t N, bool A = false, typename T>
CMT_INLINE vec<T, N> read(const T* src)
{
- return internal_read_write::read<N, A, T>(src);
+ return simd_read<N * compound_type_traits<T>::width, A>(ptr_cast<subtype<T>>(src)); // lane count is N * width of T; src is passed through ptr_cast<subtype<T>> and A is forwarded to simd_read
}
template <bool A = false, size_t N, typename T>
CMT_INLINE void write(T* dest, const vec<T, N>& value)
{
- internal_read_write::write<A, N, T>(dest, value);
+ simd_write<A, N * compound_type_traits<T>::width>(ptr_cast<subtype<T>>(dest), *value); // hands the underlying simd value (*value) to simd_write; dest is passed through ptr_cast<subtype<T>>
}
template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)>
diff --git a/include/kfr/base/simd.hpp b/include/kfr/base/simd.hpp
@@ -31,53 +31,72 @@
namespace kfr
{
+constexpr size_t index_undefined = static_cast<size_t>(-1); // sentinel shuffle index; simd_shuffle below maps it to -1 before calling the builtin
+
#ifdef CMT_COMPILER_CLANG
+#define KFR_NATIVE_SIMD 1 // only defined when CMT_COMPILER_CLANG is defined
+#endif
-using simdindex = int;
+#ifdef KFR_NATIVE_SIMD
-template <typename T, simdindex N>
+template <typename T, size_t N> // lane count is now a size_t; the simdindex alias is removed by this commit
using simd = T __attribute__((ext_vector_type(N)));
-#define KFT_CONVERT_VECTOR(X, T, N) __builtin_convertvector(X, ::kfr::simd<T, N>)
-
-#define KFR_SIMD_PARAM_ARE_DEDUCIBLE 1
-#define KFR_SIMD_FROM_SCALAR(X, T, N) (X)
-#define KFR_BUILTIN_SHUFFLEVECTOR(T, N, X, Y, I) __builtin_shufflevector(X, Y, I)
-
-#elif defined CMT_COMPILER_GNU
+template <typename T, size_t N, bool A>
+using simd_storage = internal::struct_with_alignment<simd<T, N>, A>; // wrapper holding a simd<T, N> as .value; A selects the struct_with_alignment specialization
-using simdindex = int;
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wattributes"
-
-template <typename T, simdindex N>
-struct simd_gcc
+template <typename T, size_t N, size_t... indices> // two-operand shuffle; the result has sizeof...(indices) lanes
+CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const identity<simd<T, N>>& x,
+ const identity<simd<T, N>>& y, csizes_t<indices...>)
{
- constexpr static size_t NN = next_poweroftwo(N);
- typedef __attribute__((vector_size(NN * sizeof(T)))) T simd_type;
- typedef simd_type type __attribute__((__packed__, __aligned__(sizeof(T))));
-};
-#pragma GCC diagnostic pop
-
-template <typename T, simdindex N>
-using simd = typename simd_gcc<T, N>::type;
+ return __builtin_shufflevector(x, y,
+ ((indices == index_undefined) ? -1 : static_cast<intptr_t>(indices))...); // index_undefined is passed to the builtin as -1; every other index is cast to intptr_t
+}
+template <typename T, size_t N, size_t... indices> // single-operand form of simd_shuffle
+CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const identity<simd<T, N>>& x, csizes_t<indices...>)
+{
+ return __builtin_shufflevector(x, x, // x is supplied as both operands of the builtin
+ ((indices == index_undefined) ? -1 : static_cast<intptr_t>(indices))...);
+}
-#define KFT_CONVERT_VECTOR(X, T, N) static_cast<::kfr::simd<T, N>>(X)
-#define KFR_SIMD_FROM_SCALAR(X, T, N) \
- (__builtin_shuffle(::kfr::simd<T, N>{ X }, ::kfr::simd<int_type<sizeof(T) * 8>, N>{ 0 }))
-#define KFR_BUILTIN_SHUFFLEVECTOR(T, N, X, Y, I) ::kfr::internal::builtin_shufflevector<T, N>(X, Y, I)
+template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> // overload enabled when is_poweroftwo(N)
+CMT_INLINE simd<T, N> simd_read(const T* src)
+{
+ return ptr_cast<simd_storage<T, N, A>>(src)->value; // reads .value through src cast to simd_storage<T, N, A>
+}
-namespace internal
+template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> // overload enabled when !is_poweroftwo(N); the extra "typename = void" presumably keeps this signature distinct from the overload above
+CMT_INLINE simd<T, N> simd_read(const T* src)
{
-template <typename T, size_t N, typename... Int>
-KFR_INTRIN simd<T, sizeof...(Int)> builtin_shufflevector(const simd<T, N>& x, const simd<T, N>& y,
- const Int&... indices)
+ constexpr size_t first = prev_poweroftwo(N); // NOTE(review): presumably the largest power of two below N - confirm against prev_poweroftwo
+ constexpr size_t rest = N - first; // lanes left after the first chunk
+ constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>); // NOTE(review): presumably 0..rest-1 followed by (first - rest) index_undefined entries - confirm csizeseq<count, start, step>
+ constexpr auto concat_indices = csizeseq<N>;
+ return simd_shuffle<T, first>(simd_read<first, A>(src), // head chunk is read with the caller's A
+ simd_shuffle<T, rest>(simd_read<rest, false>(src + first), extend_indices), // tail chunk starts at src + first and is always read with A = false
+ concat_indices); // final shuffle combines both chunks using concat_indices
+}
+
+template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> // overload enabled when is_poweroftwo(N)
+CMT_INLINE void simd_write(T* dest, const identity<simd<T, N>>& value)
{
- return simd<T, sizeof...(Int)>{ (indices < N ? x[indices] : y[indices])... };
+ ptr_cast<simd_storage<T, N, A>>(dest)->value = value; // assigns to .value through dest cast to simd_storage<T, N, A>
}
+
+template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> // overload enabled when !is_poweroftwo(N)
+CMT_INLINE void simd_write(T* dest, const identity<simd<T, N>>& value)
+{
+ constexpr size_t first = prev_poweroftwo(N); // NOTE(review): presumably the largest power of two below N - confirm against prev_poweroftwo
+ constexpr size_t rest = N - first; // lanes left after the first chunk
+ simd_write<A, first>(dest, simd_shuffle(value, csizeseq<first>)); // first chunk is written at dest with the caller's A
+ simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq<rest, first>)); // remaining chunk is written at dest + first, always with A = false
}
+#define KFR_SIMD_CAST(T, N, X) __builtin_convertvector(X, ::kfr::simd<T, N>) // wraps __builtin_convertvector with target type simd<T, N>
+#define KFR_SIMD_BITCAST(T, N, X) ((::kfr::simd<T, N>)(X)) // C-style cast of X to simd<T, N>
+#define KFR_SIMD_BROADCAST(T, N, X) ((::kfr::simd<T, N>)(X)) // same expansion as KFR_SIMD_BITCAST; NOTE(review): presumably relies on Clang's scalar-to-vector cast replicating X into every lane - confirm
+#define KFR_SIMD_SHUFFLE(X, Y, ...) __builtin_shufflevector(X, Y, __VA_ARGS__) // forwards its arguments unchanged to __builtin_shufflevector
+
#endif
template <typename T, size_t N>
@@ -85,6 +104,7 @@ struct vec_op
{
using type = subtype<T>;
using utype = kfr::utype<type>;
+ using iutype = conditional<kfr::is_i_class<T>::value, type, utype>; // operand type for shl/shr: type when is_i_class<T>::value is true, otherwise utype
constexpr static size_t w = compound_type_traits<T>::width * N;
CMT_INLINE constexpr static simd<type, w> add(const simd<type, w>& x, const simd<type, w>& y) noexcept
@@ -109,11 +129,13 @@ struct vec_op
}
CMT_INLINE constexpr static simd<type, w> shl(const simd<type, w>& x, const simd<type, w>& y) noexcept
{
- return x << y;
+ return reinterpret_cast<simd<type, w>>(reinterpret_cast<simd<iutype, w>>(x)
+ << reinterpret_cast<simd<iutype, w>>(y)); // both operands are viewed as simd<iutype, w> for the shift, then the result is cast back to simd<type, w>
}
CMT_INLINE constexpr static simd<type, w> shr(const simd<type, w>& x, const simd<type, w>& y) noexcept
{
- return x >> y;
+ return reinterpret_cast<simd<type, w>>(reinterpret_cast<simd<iutype, w>>(x) >>
+ reinterpret_cast<simd<iutype, w>>(y)); // both operands are viewed as simd<iutype, w> for the shift, then the result is cast back to simd<type, w>
}
CMT_INLINE constexpr static simd<type, w> neg(const simd<type, w>& x) noexcept { return -x; }
CMT_INLINE constexpr static simd<type, w> band(const simd<type, w>& x, const simd<type, w>& y) noexcept
diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp
@@ -300,6 +300,29 @@ CMT_INLINE void zeroize(T1& value)
{
builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1));
}
+
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wattributes"
+
+template <typename T, bool A> // primary template: plain wrapper around one T (used for every A not covered by the specialization below)
+struct struct_with_alignment
+{
+ T value;
+ KFR_INTRIN void operator=(T value) { this->value = value; } // stores the argument into the wrapped member; returns void
+};
+
+template <typename T> // A == false: same members, plus the packed/may_alias attributes when CMT_GNU_ATTRIBUTES is defined
+struct struct_with_alignment<T, false>
+{
+ T value;
+ KFR_INTRIN void operator=(T value) { this->value = value; } // stores the argument into the wrapped member; returns void
+}
+#ifdef CMT_GNU_ATTRIBUTES
+__attribute__((__packed__, __may_alias__)) //
+#endif
+;
+
+#pragma GCC diagnostic pop
}
template <typename T>
diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp
@@ -80,50 +80,11 @@ template <typename T, size_t N>
struct is_vec_impl<mask<T, N>> : std::true_type
{
};
-
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wattributes"
-
-template <typename T, bool A>
-struct struct_with_alignment
-{
- T value;
- KFR_INTRIN void operator=(T value) { this->value = value; }
-};
-
-template <typename T>
-struct struct_with_alignment<T, false>
-{
- T value;
- KFR_INTRIN void operator=(T value) { this->value = value; }
-}
-#ifdef CMT_GNU_ATTRIBUTES
-__attribute__((__packed__, __may_alias__)) //
-#endif
-;
}
template <typename T>
using is_vec = internal::is_vec_impl<T>;
-template <typename T, size_t N, bool A>
-using vec_algn = internal::struct_with_alignment<simd<T, N>, A>;
-
-#pragma GCC diagnostic pop
-
-template <typename T, size_t N, bool A>
-struct vec_ptr
-{
- constexpr CMT_INLINE vec_ptr(T* data) noexcept : data(data) {}
- constexpr CMT_INLINE vec_ptr(const T* data) noexcept : data(const_cast<T*>(data)) {}
- CMT_INLINE const vec_algn<T, N, A>& operator[](size_t i) const
- {
- return *static_cast<vec_algn<T, N, A>*>(data + i);
- }
- CMT_INLINE vec_algn<T, N, A>& operator[](size_t i) { return *static_cast<vec_algn<T, N, A>*>(data + i); }
- T* data;
-};
-
template <typename To, typename From, size_t N,
KFR_ENABLE_IF(std::is_same<subtype<From>, subtype<To>>::value),
size_t Nout = N* compound_type_traits<From>::width / compound_type_traits<To>::width>
@@ -151,15 +112,12 @@ get_vec_index(int = 0)
return fn.template operator()<index>();
}
-constexpr size_t index_undefined = static_cast<size_t>(-1);
-
template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(!is_compound<T>::value)>
CMT_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...>, const vec<T, N>& x,
const vec<T, N>& y)
{
- vec<T, sizeof...(Indices)> result = KFR_BUILTIN_SHUFFLEVECTOR(
- T, N, *x, *y,
- static_cast<intptr_t>(Indices == index_undefined ? -1 : static_cast<intptr_t>(Indices))...);
+ vec<T, sizeof...(Indices)> result = KFR_SIMD_SHUFFLE(
+ *x, *y, static_cast<intptr_t>(Indices == index_undefined ? -1 : static_cast<intptr_t>(Indices))...); // index_undefined is translated to -1; KFR_SIMD_SHUFFLE no longer takes the T and N arguments
return result;
}
@@ -282,7 +240,7 @@ template <typename To, typename From, size_t N, typename Tsub = deep_subtype<To>
size_t Nout = N* compound_type_traits<To>::deep_width>
constexpr CMT_INLINE vec<To, N> builtin_convertvector(const vec<From, N>& value) noexcept
{
- return KFT_CONVERT_VECTOR(*value, Tsub, Nout);
+ return KFR_SIMD_CAST(Tsub, Nout, *value); // converts the underlying simd value to simd<Tsub, Nout>; the renamed macro takes (type, width, value)
}
// scalar to scalar
@@ -364,13 +322,17 @@ constexpr CMT_INLINE To bitcast(const From& value) noexcept
template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()>
constexpr CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept
{
- return reinterpret_cast<typename vec<To, Nout>::simd_t>(*value);
+ using Tsub = typename vec<To, Nout>::scalar_type; // scalar element type of the destination vec
+ constexpr size_t width = vec<To, Nout>::scalar_size(); // scalar lane count of the destination vec
+ return KFR_SIMD_BITCAST(Tsub, width, *value); // casts the underlying simd value to simd<Tsub, width>
}
template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()>
constexpr CMT_INLINE mask<To, Nout> bitcast(const mask<From, N>& value) noexcept
{
- return reinterpret_cast<typename mask<To, Nout>::simd_t>(*value);
+ using Tsub = typename mask<To, Nout>::scalar_type; // scalar element type of the destination mask
+ constexpr size_t width = mask<To, Nout>::scalar_size(); // scalar lane count of the destination mask
+ return KFR_SIMD_BITCAST(Tsub, width, *value); // casts the underlying simd value to simd<Tsub, width>
}
template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)>
@@ -450,7 +412,7 @@ constexpr CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x)
return x;
}
KFR_FN(resize)
-
+/* NOTE(review): internal_read_write is commented out here rather than deleted; the call sites touched by this commit now use simd_read/simd_write
namespace internal_read_write
{
@@ -484,13 +446,13 @@ CMT_INLINE void write(T* dest, const vec<T, N>& value)
internal_read_write::write<false, rest>(dest + first,
shufflevector<rest, internal::shuffle_index<first>>(value));
}
-}
+}*/
template <typename T, size_t N>
struct pkd_vec
{
constexpr pkd_vec() noexcept {}
- pkd_vec(const vec<T, N>& value) noexcept { internal_read_write::write(v, value); }
+ pkd_vec(const vec<T, N>& value) noexcept { simd_write<false, vec<T, N>::scalar_size()>(v, *value); } // NOTE(review): v is passed without ptr_cast<subtype<T>>, unlike read/write in read_write.hpp - confirm this is intended when T is a compound type
template <typename... Ts>
constexpr pkd_vec(Ts... init) noexcept : v{ static_cast<T>(init)... }
{
@@ -578,13 +540,13 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty
static_assert(!is_vec<T>::value || is_poweroftwo(size_of<T>()),
"Inner vector size must be a power of two");
+ constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; } // number of scalar lanes: N times the width of T
using UT = utype<T>;
using value_type = T;
using scalar_type = subtype<T>;
- constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; }
- using simd_t = simd<scalar_type, N * compound_type_traits<T>::width>;
- using ref = vec&;
- using cref = const vec&;
+ using simd_t = simd<scalar_type, N * compound_type_traits<T>::width>;
+ using ref = vec&;
+ using cref = const vec&;
constexpr static bool is_pod = true;
@@ -598,7 +560,7 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty
}
template <typename U,
KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width == 1)>
- constexpr CMT_INLINE vec(const U& value) noexcept : v(KFR_SIMD_FROM_SCALAR(static_cast<T>(value), T, N))
+ constexpr CMT_INLINE vec(const U& value) noexcept : v(KFR_SIMD_BROADCAST(T, N, static_cast<T>(value))) // value is converted to T first, then v is initialized through KFR_SIMD_BROADCAST
{
}
template <typename... Ts>
@@ -618,18 +580,16 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty
CMT_INLINE vec& operator=(const vec&) noexcept = default;
CMT_INLINE vec& operator=(vec&&) noexcept = default;
+ friend CMT_INLINE vec operator-(const vec& x) { return vec_op<T, N>::neg(x.v); } // unary minus, delegated to vec_op::neg
+ friend CMT_INLINE vec operator~(const vec& x) { return vec_op<T, N>::bnot(x.v); } // bitwise NOT, delegated to vec_op::bnot
friend CMT_INLINE vec operator+(const vec& x, const vec& y) { return vec_op<T, N>::add(x.v, y.v); }
friend CMT_INLINE vec operator-(const vec& x, const vec& y) { return vec_op<T, N>::sub(x.v, y.v); }
friend CMT_INLINE vec operator*(const vec& x, const vec& y) { return vec_op<T, N>::mul(x.v, y.v); }
friend CMT_INLINE vec operator/(const vec& x, const vec& y) { return vec_op<T, N>::div(x.v, y.v); }
friend CMT_INLINE vec operator%(const vec& x, const vec& y) { return vec_op<T, N>::rem(x.v, y.v); }
- friend CMT_INLINE vec operator-(const vec& x) { return vec_op<T, N>::neg(x.v); }
-
friend CMT_INLINE vec operator&(const vec& x, const vec& y) { return vec_op<T, N>::band(x.v, y.v); }
friend CMT_INLINE vec operator|(const vec& x, const vec& y) { return vec_op<T, N>::bor(x.v, y.v); }
friend CMT_INLINE vec operator^(const vec& x, const vec& y) { return vec_op<T, N>::bxor(x.v, y.v); }
- friend CMT_INLINE vec operator~(const vec& x) { return vec_op<T, N>::bnot(x.v); }
-
friend CMT_INLINE vec operator<<(const vec& x, const vec& y) { return vec_op<T, N>::shl(x.v, y.v); }
friend CMT_INLINE vec operator>>(const vec& x, const vec& y) { return vec_op<T, N>::shr(x.v, y.v); }
@@ -640,56 +600,16 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty
friend CMT_INLINE mask<T, N> operator<=(const vec& x, const vec& y) { return vec_op<T, N>::le(x.v, y.v); }
friend CMT_INLINE mask<T, N> operator>=(const vec& x, const vec& y) { return vec_op<T, N>::ge(x.v, y.v); }
- friend CMT_INLINE vec& operator+=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::add(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator-=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::sub(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator*=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::mul(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator/=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::div(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator%=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::rem(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator&=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::band(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator|=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::bor(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator^=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::bxor(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator<<=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::shl(x.v, y.v);
- return x;
- }
- friend CMT_INLINE vec& operator>>=(vec& x, const vec& y)
- {
- x = vec_op<T, N>::shr(x.v, y.v);
- return x;
- }
+ friend CMT_INLINE vec& operator+=(vec& x, const vec& y) { return x = vec_op<T, N>::add(x.v, y.v); } // each compound assignment below stores the vec_op result into x and returns the result of that assignment
+ friend CMT_INLINE vec& operator-=(vec& x, const vec& y) { return x = vec_op<T, N>::sub(x.v, y.v); }
+ friend CMT_INLINE vec& operator*=(vec& x, const vec& y) { return x = vec_op<T, N>::mul(x.v, y.v); }
+ friend CMT_INLINE vec& operator/=(vec& x, const vec& y) { return x = vec_op<T, N>::div(x.v, y.v); }
+ friend CMT_INLINE vec& operator%=(vec& x, const vec& y) { return x = vec_op<T, N>::rem(x.v, y.v); }
+ friend CMT_INLINE vec& operator&=(vec& x, const vec& y) { return x = vec_op<T, N>::band(x.v, y.v); }
+ friend CMT_INLINE vec& operator|=(vec& x, const vec& y) { return x = vec_op<T, N>::bor(x.v, y.v); }
+ friend CMT_INLINE vec& operator^=(vec& x, const vec& y) { return x = vec_op<T, N>::bxor(x.v, y.v); }
+ friend CMT_INLINE vec& operator<<=(vec& x, const vec& y) { return x = vec_op<T, N>::shl(x.v, y.v); }
+ friend CMT_INLINE vec& operator>>=(vec& x, const vec& y) { return x = vec_op<T, N>::shr(x.v, y.v); }
constexpr CMT_INLINE const simd_t& operator*() const { return v; }
CMT_GNU_CONSTEXPR CMT_INLINE simd_t& operator*() { return v; }
@@ -1359,28 +1279,6 @@ struct compound_type_traits<kfr::vec_t<T, N>>
using deep_rebind = kfr::vec_t<cometa::deep_rebind<subtype, U>, N>;
};
-#ifdef KFR_SIMD_PARAM_ARE_DEDUCIBLE
-template <typename T, size_t N>
-struct compound_type_traits<kfr::simd<T, N>>
-{
- using subtype = T;
- using deep_subtype = cometa::deep_subtype<T>;
- constexpr static size_t width = N;
- constexpr static size_t deep_width = width * compound_type_traits<T>::width;
- constexpr static bool is_scalar = false;
- constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1;
- template <typename U>
- using rebind = kfr::simd<U, N>;
- template <typename U>
- using deep_rebind = kfr::simd<cometa::deep_rebind<subtype, U>, N>;
-
- CMT_INLINE static constexpr const subtype& at(const kfr::simd<T, N>& value, size_t index)
- {
- return value[index];
- }
-};
-#endif
-
template <typename T, size_t N>
struct compound_type_traits<kfr::vec<T, N>>
{
diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp
@@ -109,13 +109,13 @@ using cvec = vec<T, N * 2>;
template <size_t N, bool A = false, typename T>
CMT_INLINE cvec<T, N> cread(const complex<T>* src)
{
- return internal_read_write::read<N * 2, A>(ptr_cast<T>(src));
+ return simd_read<N * 2, A>(ptr_cast<T>(src)); // reads N * 2 scalars of T from src passed through ptr_cast<T>
}
template <size_t N, bool A = false, typename T>
CMT_INLINE void cwrite(complex<T>* dest, cvec<T, N> value)
{
- return internal_read_write::write<A>(ptr_cast<T>(dest), value);
+ return simd_write<A, N * 2>(ptr_cast<T>(dest), *value); // writes the N * 2 lanes of *value to dest passed through ptr_cast<T>; the lane count is now given explicitly
}
template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
@@ -168,7 +168,7 @@ CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, cvec<T, count * N>
template <size_t N, bool A = false, bool split = false, typename T>
CMT_INLINE cvec<T, N> cread_split(const complex<T>* src)
{
- cvec<T, N> temp = internal_read_write::read<N * 2, A>(ptr_cast<T>(src));
+ cvec<T, N> temp = simd_read<N * 2, A>(ptr_cast<T>(src)); // reads N * 2 scalars of T; splitpairs is applied below only when split is true
if (split)
temp = splitpairs(temp);
return temp;
}
@@ -179,7 +179,7 @@ CMT_INLINE void cwrite_split(complex<T>* dest, cvec<T, N> value)
{
if (split)
value = interleavehalfs(value);
- internal_read_write::write<A>(ptr_cast<T>(dest), value);
+ simd_write<A, N * 2>(ptr_cast<T>(dest), *value); // writes the N * 2 lanes of *value after the optional interleavehalfs step above
}
template <>