kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 8ca9e4785460793e5af8107fc87cd1a62b59f6ac
parent 64569318f21ece90d0632df3e14cbcbc793437be
Author: [email protected] <[email protected]>
Date:   Sat,  8 Oct 2016 03:07:59 +0300

New implementation for SIMD read/write

Diffstat:
M include/kfr/base/read_write.hpp | 4 ++--
M include/kfr/base/simd.hpp | 92 +++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
M include/kfr/base/types.hpp | 23 +++++++++++++++++++++++
M include/kfr/base/vec.hpp | 160 +++++++++++++++----------------------------------------------------------------
M include/kfr/dft/ft.hpp | 8 ++++----
5 files changed, 115 insertions(+), 172 deletions(-)

diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp @@ -35,13 +35,13 @@ namespace kfr template <size_t N, bool A = false, typename T> CMT_INLINE vec<T, N> read(const T* src) { - return internal_read_write::read<N, A, T>(src); + return simd_read<N * compound_type_traits<T>::width, A>(ptr_cast<subtype<T>>(src)); } template <bool A = false, size_t N, typename T> CMT_INLINE void write(T* dest, const vec<T, N>& value) { - internal_read_write::write<A, N, T>(dest, value); + simd_write<A, N * compound_type_traits<T>::width>(ptr_cast<subtype<T>>(dest), *value); } template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> diff --git a/include/kfr/base/simd.hpp b/include/kfr/base/simd.hpp @@ -31,53 +31,72 @@ namespace kfr { +constexpr size_t index_undefined = static_cast<size_t>(-1); + #ifdef CMT_COMPILER_CLANG +#define KFR_NATIVE_SIMD 1 +#endif -using simdindex = int; +#ifdef KFR_NATIVE_SIMD -template <typename T, simdindex N> +template <typename T, size_t N> using simd = T __attribute__((ext_vector_type(N))); -#define KFT_CONVERT_VECTOR(X, T, N) __builtin_convertvector(X, ::kfr::simd<T, N>) - -#define KFR_SIMD_PARAM_ARE_DEDUCIBLE 1 -#define KFR_SIMD_FROM_SCALAR(X, T, N) (X) -#define KFR_BUILTIN_SHUFFLEVECTOR(T, N, X, Y, I) __builtin_shufflevector(X, Y, I) - -#elif defined CMT_COMPILER_GNU +template <typename T, size_t N, bool A> +using simd_storage = internal::struct_with_alignment<simd<T, N>, A>; -using simdindex = int; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wattributes" - -template <typename T, simdindex N> -struct simd_gcc +template <typename T, size_t N, size_t... 
indices> +CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const identity<simd<T, N>>& x, + const identity<simd<T, N>>& y, csizes_t<indices...>) { - constexpr static size_t NN = next_poweroftwo(N); - typedef __attribute__((vector_size(NN * sizeof(T)))) T simd_type; - typedef simd_type type __attribute__((__packed__, __aligned__(sizeof(T)))); -}; -#pragma GCC diagnostic pop - -template <typename T, simdindex N> -using simd = typename simd_gcc<T, N>::type; + return __builtin_shufflevector(x, y, + ((indices == index_undefined) ? -1 : static_cast<intptr_t>(indices))...); +} +template <typename T, size_t N, size_t... indices> +CMT_INLINE simd<T, sizeof...(indices)> simd_shuffle(const identity<simd<T, N>>& x, csizes_t<indices...>) +{ + return __builtin_shufflevector(x, x, + ((indices == index_undefined) ? -1 : static_cast<intptr_t>(indices))...); +} -#define KFT_CONVERT_VECTOR(X, T, N) static_cast<::kfr::simd<T, N>>(X) -#define KFR_SIMD_FROM_SCALAR(X, T, N) \ - (__builtin_shuffle(::kfr::simd<T, N>{ X }, ::kfr::simd<int_type<sizeof(T) * 8>, N>{ 0 })) -#define KFR_BUILTIN_SHUFFLEVECTOR(T, N, X, Y, I) ::kfr::internal::builtin_shufflevector<T, N>(X, Y, I) +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +CMT_INLINE simd<T, N> simd_read(const T* src) +{ + return ptr_cast<simd_storage<T, N, A>>(src)->value; +} -namespace internal +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +CMT_INLINE simd<T, N> simd_read(const T* src) { -template <typename T, size_t N, typename... Int> -KFR_INTRIN simd<T, sizeof...(Int)> builtin_shufflevector(const simd<T, N>& x, const simd<T, N>& y, - const Int&... 
indices) + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>); + constexpr auto concat_indices = csizeseq<N>; + return simd_shuffle<T, first>(simd_read<first, A>(src), + simd_shuffle<T, rest>(simd_read<rest, false>(src + first), extend_indices), + concat_indices); +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +CMT_INLINE void simd_write(T* dest, const identity<simd<T, N>>& value) { - return simd<T, sizeof...(Int)>{ (indices < N ? x[indices] : y[indices])... }; + ptr_cast<simd_storage<T, N, A>>(dest)->value = value; } + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +CMT_INLINE void simd_write(T* dest, const identity<simd<T, N>>& value) +{ + constexpr size_t first = prev_poweroftwo(N); + constexpr size_t rest = N - first; + simd_write<A, first>(dest, simd_shuffle(value, csizeseq<first>)); + simd_write<false, rest>(dest + first, simd_shuffle(value, csizeseq<rest, first>)); } +#define KFR_SIMD_CAST(T, N, X) __builtin_convertvector(X, ::kfr::simd<T, N>) +#define KFR_SIMD_BITCAST(T, N, X) ((::kfr::simd<T, N>)(X)) +#define KFR_SIMD_BROADCAST(T, N, X) ((::kfr::simd<T, N>)(X)) +#define KFR_SIMD_SHUFFLE(X, Y, ...) 
__builtin_shufflevector(X, Y, __VA_ARGS__) + #endif template <typename T, size_t N> @@ -85,6 +104,7 @@ struct vec_op { using type = subtype<T>; using utype = kfr::utype<type>; + using iutype = conditional<kfr::is_i_class<T>::value, type, utype>; constexpr static size_t w = compound_type_traits<T>::width * N; CMT_INLINE constexpr static simd<type, w> add(const simd<type, w>& x, const simd<type, w>& y) noexcept @@ -109,11 +129,13 @@ struct vec_op } CMT_INLINE constexpr static simd<type, w> shl(const simd<type, w>& x, const simd<type, w>& y) noexcept { - return x << y; + return reinterpret_cast<simd<type, w>>(reinterpret_cast<simd<iutype, w>>(x) + << reinterpret_cast<simd<iutype, w>>(y)); } CMT_INLINE constexpr static simd<type, w> shr(const simd<type, w>& x, const simd<type, w>& y) noexcept { - return x >> y; + return reinterpret_cast<simd<type, w>>(reinterpret_cast<simd<iutype, w>>(x) >> + reinterpret_cast<simd<iutype, w>>(y)); } CMT_INLINE constexpr static simd<type, w> neg(const simd<type, w>& x) noexcept { return -x; } CMT_INLINE constexpr static simd<type, w> band(const simd<type, w>& x, const simd<type, w>& y) noexcept diff --git a/include/kfr/base/types.hpp b/include/kfr/base/types.hpp @@ -300,6 +300,29 @@ CMT_INLINE void zeroize(T1& value) { builtin_memset(static_cast<void*>(builtin_addressof(value)), 0, sizeof(T1)); } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wattributes" + +template <typename T, bool A> +struct struct_with_alignment +{ + T value; + KFR_INTRIN void operator=(T value) { this->value = value; } +}; + +template <typename T> +struct struct_with_alignment<T, false> +{ + T value; + KFR_INTRIN void operator=(T value) { this->value = value; } +} +#ifdef CMT_GNU_ATTRIBUTES +__attribute__((__packed__, __may_alias__)) // +#endif +; + +#pragma GCC diagnostic pop } template <typename T> diff --git a/include/kfr/base/vec.hpp b/include/kfr/base/vec.hpp @@ -80,50 +80,11 @@ template <typename T, size_t N> struct is_vec_impl<mask<T, N>> 
: std::true_type { }; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wattributes" - -template <typename T, bool A> -struct struct_with_alignment -{ - T value; - KFR_INTRIN void operator=(T value) { this->value = value; } -}; - -template <typename T> -struct struct_with_alignment<T, false> -{ - T value; - KFR_INTRIN void operator=(T value) { this->value = value; } -} -#ifdef CMT_GNU_ATTRIBUTES -__attribute__((__packed__, __may_alias__)) // -#endif -; } template <typename T> using is_vec = internal::is_vec_impl<T>; -template <typename T, size_t N, bool A> -using vec_algn = internal::struct_with_alignment<simd<T, N>, A>; - -#pragma GCC diagnostic pop - -template <typename T, size_t N, bool A> -struct vec_ptr -{ - constexpr CMT_INLINE vec_ptr(T* data) noexcept : data(data) {} - constexpr CMT_INLINE vec_ptr(const T* data) noexcept : data(const_cast<T*>(data)) {} - CMT_INLINE const vec_algn<T, N, A>& operator[](size_t i) const - { - return *static_cast<vec_algn<T, N, A>*>(data + i); - } - CMT_INLINE vec_algn<T, N, A>& operator[](size_t i) { return *static_cast<vec_algn<T, N, A>*>(data + i); } - T* data; -}; - template <typename To, typename From, size_t N, KFR_ENABLE_IF(std::is_same<subtype<From>, subtype<To>>::value), size_t Nout = N* compound_type_traits<From>::width / compound_type_traits<To>::width> @@ -151,15 +112,12 @@ get_vec_index(int = 0) return fn.template operator()<index>(); } -constexpr size_t index_undefined = static_cast<size_t>(-1); - template <typename T, size_t N, size_t... Indices, KFR_ENABLE_IF(!is_compound<T>::value)> CMT_INLINE vec<T, sizeof...(Indices)> shufflevector(csizes_t<Indices...>, const vec<T, N>& x, const vec<T, N>& y) { - vec<T, sizeof...(Indices)> result = KFR_BUILTIN_SHUFFLEVECTOR( - T, N, *x, *y, - static_cast<intptr_t>(Indices == index_undefined ? -1 : static_cast<intptr_t>(Indices))...); + vec<T, sizeof...(Indices)> result = KFR_SIMD_SHUFFLE( + *x, *y, static_cast<intptr_t>(Indices == index_undefined ? 
-1 : static_cast<intptr_t>(Indices))...); return result; } @@ -282,7 +240,7 @@ template <typename To, typename From, size_t N, typename Tsub = deep_subtype<To> size_t Nout = N* compound_type_traits<To>::deep_width> constexpr CMT_INLINE vec<To, N> builtin_convertvector(const vec<From, N>& value) noexcept { - return KFT_CONVERT_VECTOR(*value, Tsub, Nout); + return KFR_SIMD_CAST(Tsub, Nout, *value); } // scalar to scalar @@ -364,13 +322,17 @@ constexpr CMT_INLINE To bitcast(const From& value) noexcept template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()> constexpr CMT_INLINE vec<To, Nout> bitcast(const vec<From, N>& value) noexcept { - return reinterpret_cast<typename vec<To, Nout>::simd_t>(*value); + using Tsub = typename vec<To, Nout>::scalar_type; + constexpr size_t width = vec<To, Nout>::scalar_size(); + return KFR_SIMD_BITCAST(Tsub, width, *value); } template <typename To, typename From, size_t N, size_t Nout = N* size_of<From>() / size_of<To>()> constexpr CMT_INLINE mask<To, Nout> bitcast(const mask<From, N>& value) noexcept { - return reinterpret_cast<typename mask<To, Nout>::simd_t>(*value); + using Tsub = typename mask<To, Nout>::scalar_type; + constexpr size_t width = mask<To, Nout>::scalar_size(); + return KFR_SIMD_BITCAST(Tsub, width, *value); } template <typename From, typename To = utype<From>, KFR_ENABLE_IF(!is_compound<From>::value)> @@ -450,7 +412,7 @@ constexpr CMT_INLINE vec<T, Nout> resize(const vec<T, N>& x) return x; } KFR_FN(resize) - +/* namespace internal_read_write { @@ -484,13 +446,13 @@ CMT_INLINE void write(T* dest, const vec<T, N>& value) internal_read_write::write<false, rest>(dest + first, shufflevector<rest, internal::shuffle_index<first>>(value)); } -} +}*/ template <typename T, size_t N> struct pkd_vec { constexpr pkd_vec() noexcept {} - pkd_vec(const vec<T, N>& value) noexcept { internal_read_write::write(v, value); } + pkd_vec(const vec<T, N>& value) noexcept { simd_write<false, vec<T, 
N>::scalar_size()>(v, *value); } template <typename... Ts> constexpr pkd_vec(Ts... init) noexcept : v{ static_cast<T>(init)... } { @@ -578,13 +540,13 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty static_assert(!is_vec<T>::value || is_poweroftwo(size_of<T>()), "Inner vector size must be a power of two"); + constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; } using UT = utype<T>; using value_type = T; using scalar_type = subtype<T>; - constexpr static size_t scalar_size() noexcept { return N * compound_type_traits<T>::width; } - using simd_t = simd<scalar_type, N * compound_type_traits<T>::width>; - using ref = vec&; - using cref = const vec&; + using simd_t = simd<scalar_type, N * compound_type_traits<T>::width>; + using ref = vec&; + using cref = const vec&; constexpr static bool is_pod = true; @@ -598,7 +560,7 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty } template <typename U, KFR_ENABLE_IF(std::is_convertible<U, T>::value&& compound_type_traits<T>::width == 1)> - constexpr CMT_INLINE vec(const U& value) noexcept : v(KFR_SIMD_FROM_SCALAR(static_cast<T>(value), T, N)) + constexpr CMT_INLINE vec(const U& value) noexcept : v(KFR_SIMD_BROADCAST(T, N, static_cast<T>(value))) { } template <typename... 
Ts> @@ -618,18 +580,16 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty CMT_INLINE vec& operator=(const vec&) noexcept = default; CMT_INLINE vec& operator=(vec&&) noexcept = default; + friend CMT_INLINE vec operator-(const vec& x) { return vec_op<T, N>::neg(x.v); } + friend CMT_INLINE vec operator~(const vec& x) { return vec_op<T, N>::bnot(x.v); } friend CMT_INLINE vec operator+(const vec& x, const vec& y) { return vec_op<T, N>::add(x.v, y.v); } friend CMT_INLINE vec operator-(const vec& x, const vec& y) { return vec_op<T, N>::sub(x.v, y.v); } friend CMT_INLINE vec operator*(const vec& x, const vec& y) { return vec_op<T, N>::mul(x.v, y.v); } friend CMT_INLINE vec operator/(const vec& x, const vec& y) { return vec_op<T, N>::div(x.v, y.v); } friend CMT_INLINE vec operator%(const vec& x, const vec& y) { return vec_op<T, N>::rem(x.v, y.v); } - friend CMT_INLINE vec operator-(const vec& x) { return vec_op<T, N>::neg(x.v); } - friend CMT_INLINE vec operator&(const vec& x, const vec& y) { return vec_op<T, N>::band(x.v, y.v); } friend CMT_INLINE vec operator|(const vec& x, const vec& y) { return vec_op<T, N>::bor(x.v, y.v); } friend CMT_INLINE vec operator^(const vec& x, const vec& y) { return vec_op<T, N>::bxor(x.v, y.v); } - friend CMT_INLINE vec operator~(const vec& x) { return vec_op<T, N>::bnot(x.v); } - friend CMT_INLINE vec operator<<(const vec& x, const vec& y) { return vec_op<T, N>::shl(x.v, y.v); } friend CMT_INLINE vec operator>>(const vec& x, const vec& y) { return vec_op<T, N>::shr(x.v, y.v); } @@ -640,56 +600,16 @@ struct CMT_EMPTY_BASES vec : vec_t<T, N>, operators::empty friend CMT_INLINE mask<T, N> operator<=(const vec& x, const vec& y) { return vec_op<T, N>::le(x.v, y.v); } friend CMT_INLINE mask<T, N> operator>=(const vec& x, const vec& y) { return vec_op<T, N>::ge(x.v, y.v); } - friend CMT_INLINE vec& operator+=(vec& x, const vec& y) - { - x = vec_op<T, N>::add(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator-=(vec& x, const 
vec& y) - { - x = vec_op<T, N>::sub(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator*=(vec& x, const vec& y) - { - x = vec_op<T, N>::mul(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator/=(vec& x, const vec& y) - { - x = vec_op<T, N>::div(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator%=(vec& x, const vec& y) - { - x = vec_op<T, N>::rem(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator&=(vec& x, const vec& y) - { - x = vec_op<T, N>::band(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator|=(vec& x, const vec& y) - { - x = vec_op<T, N>::bor(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator^=(vec& x, const vec& y) - { - x = vec_op<T, N>::bxor(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator<<=(vec& x, const vec& y) - { - x = vec_op<T, N>::shl(x.v, y.v); - return x; - } - friend CMT_INLINE vec& operator>>=(vec& x, const vec& y) - { - x = vec_op<T, N>::shr(x.v, y.v); - return x; - } + friend CMT_INLINE vec& operator+=(vec& x, const vec& y) { return x = vec_op<T, N>::add(x.v, y.v); } + friend CMT_INLINE vec& operator-=(vec& x, const vec& y) { return x = vec_op<T, N>::sub(x.v, y.v); } + friend CMT_INLINE vec& operator*=(vec& x, const vec& y) { return x = vec_op<T, N>::mul(x.v, y.v); } + friend CMT_INLINE vec& operator/=(vec& x, const vec& y) { return x = vec_op<T, N>::div(x.v, y.v); } + friend CMT_INLINE vec& operator%=(vec& x, const vec& y) { return x = vec_op<T, N>::rem(x.v, y.v); } + friend CMT_INLINE vec& operator&=(vec& x, const vec& y) { return x = vec_op<T, N>::band(x.v, y.v); } + friend CMT_INLINE vec& operator|=(vec& x, const vec& y) { return x = vec_op<T, N>::bor(x.v, y.v); } + friend CMT_INLINE vec& operator^=(vec& x, const vec& y) { return x = vec_op<T, N>::bxor(x.v, y.v); } + friend CMT_INLINE vec& operator<<=(vec& x, const vec& y) { return x = vec_op<T, N>::shl(x.v, y.v); } + friend CMT_INLINE vec& operator>>=(vec& x, const vec& y) { return x = vec_op<T, N>::shr(x.v, y.v); } 
constexpr CMT_INLINE const simd_t& operator*() const { return v; } CMT_GNU_CONSTEXPR CMT_INLINE simd_t& operator*() { return v; } @@ -1359,28 +1279,6 @@ struct compound_type_traits<kfr::vec_t<T, N>> using deep_rebind = kfr::vec_t<cometa::deep_rebind<subtype, U>, N>; }; -#ifdef KFR_SIMD_PARAM_ARE_DEDUCIBLE -template <typename T, size_t N> -struct compound_type_traits<kfr::simd<T, N>> -{ - using subtype = T; - using deep_subtype = cometa::deep_subtype<T>; - constexpr static size_t width = N; - constexpr static size_t deep_width = width * compound_type_traits<T>::width; - constexpr static bool is_scalar = false; - constexpr static size_t depth = cometa::compound_type_traits<T>::depth + 1; - template <typename U> - using rebind = kfr::simd<U, N>; - template <typename U> - using deep_rebind = kfr::simd<cometa::deep_rebind<subtype, U>, N>; - - CMT_INLINE static constexpr const subtype& at(const kfr::simd<T, N>& value, size_t index) - { - return value[index]; - } -}; -#endif - template <typename T, size_t N> struct compound_type_traits<kfr::vec<T, N>> { diff --git a/include/kfr/dft/ft.hpp b/include/kfr/dft/ft.hpp @@ -109,13 +109,13 @@ using cvec = vec<T, N * 2>; template <size_t N, bool A = false, typename T> CMT_INLINE cvec<T, N> cread(const complex<T>* src) { - return internal_read_write::read<N * 2, A>(ptr_cast<T>(src)); + return simd_read<N * 2, A>(ptr_cast<T>(src)); } template <size_t N, bool A = false, typename T> CMT_INLINE void cwrite(complex<T>* dest, cvec<T, N> value) { - return internal_read_write::write<A>(ptr_cast<T>(dest), value); + return simd_write<A, N * 2>(ptr_cast<T>(dest), *value); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... 
indices> @@ -168,7 +168,7 @@ CMT_INLINE void cwrite_group(complex<T>* dest, size_t stride, cvec<T, count * N> template <size_t N, bool A = false, bool split = false, typename T> CMT_INLINE cvec<T, N> cread_split(const complex<T>* src) { - cvec<T, N> temp = internal_read_write::read<N * 2, A>(ptr_cast<T>(src)); + cvec<T, N> temp = simd_read<N * 2, A>(ptr_cast<T>(src)); if (split) temp = splitpairs(temp); return temp; @@ -179,7 +179,7 @@ CMT_INLINE void cwrite_split(complex<T>* dest, cvec<T, N> value) { if (split) value = interleavehalfs(value); - internal_read_write::write<A>(ptr_cast<T>(dest), value); + simd_write<A, N * 2>(ptr_cast<T>(dest), *value); } template <>