kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)

commit 374e2c03d0d1b6032624c5e7ec073608a35256f1
parent 75ba81ba8421d474003470cd1254654d8cd8c14d
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date:   Sat, 30 Jul 2016 21:33:56 +0300

Pass vec by const reference (intrinsics)
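
The change is mechanical but touches every intrinsic wrapper in the listed headers: SIMD vector arguments previously taken by value are now taken by const reference. Below is a minimal sketch of the pattern, using a simplified stand-in for kfr::vec rather than the real header. Passing a wide vector by value can force a copy through the stack at non-inlined call boundaries; a const reference avoids the copy and binds to temporaries, while the compiler can still keep the value in registers once the call is inlined. One knock-on effect is visible in vilogbp1 and vldexpk in this diff: a parameter that used to be reassigned in place now needs a named local (q becomes qq).

#include <cstddef>

// Simplified stand-in for kfr::vec<T, N> -- illustration only, not the real type.
template <typename T, std::size_t N>
struct vec
{
    T data[N];
};

// Before this commit: the argument is copied on every non-inlined call.
template <typename T, std::size_t N>
vec<T, N> abs_by_value(vec<T, N> x);

// After this commit: no copy; binds to lvalues and temporaries alike.
template <typename T, std::size_t N>
vec<T, N> abs_by_const_ref(const vec<T, N>& x);

// A const-ref parameter can no longer be mutated, so code such as
// "q = q - (m << 2);" becomes a fresh local, as in vldexpk below.
template <typename T, std::size_t N>
vec<T, N> use_local_copy(const vec<T, N>& q)
{
    vec<T, N> qq = q; // was: q = ...; (mutating the by-value parameter)
    return qq;
}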

Diffstat:
M include/kfr/base/abs.hpp            | 36 ++++++++++++++++++------------------
M include/kfr/base/asin_acos.hpp      | 4 ++--
M include/kfr/base/atan.hpp           | 8 ++++----
M include/kfr/base/clamp.hpp          | 4 ++--
M include/kfr/base/complex.hpp        | 8 ++++----
M include/kfr/base/digitreverse.hpp   | 6 +++---
M include/kfr/base/expression.hpp     | 4 ++--
M include/kfr/base/function.hpp       | 40 ++++++++++++++++++++--------------------
M include/kfr/base/gamma.hpp          | 4 ++--
M include/kfr/base/hyperbolic.hpp     | 12 ++++++------
M include/kfr/base/log_exp.hpp        | 60 +++++++++++++++++++++++++++++-------------------------------
M include/kfr/base/logical.hpp        | 164 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-------------------------------------------------------------------------------
M include/kfr/base/min_max.hpp        | 112 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++--------------------------------------------------------
M include/kfr/base/modzerobessel.hpp  | 2 +-
M include/kfr/base/operators.hpp      | 52 ++++++++++++++++++++++++++--------------------------
M include/kfr/base/read_write.hpp     | 16 ++++++++--------
M include/kfr/base/round.hpp          | 78 +++++++++++++++++++++++++++++++++++++++---------------------------------------
M include/kfr/base/saturation.hpp     | 66 +++++++++++++++++++++++++++++++++---------------------------------
M include/kfr/base/select.hpp         | 106 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-----------------------
M include/kfr/base/sin_cos.hpp        | 41 +++++++++++++++++++++--------------------
M include/kfr/base/sqrt.hpp           | 17 ++++++++++-------
M include/kfr/base/tan.hpp            | 6 +++---
M include/kfr/base/univector.hpp      | 2 +-
M include/kfr/dsp/goertzel.hpp        | 4 ++--
M include/kfr/expressions/basic.hpp   | 2 +-
M include/kfr/expressions/reduce.hpp  | 2 +-
M include/kfr/io/file.hpp             | 4 ++--
27 files changed, 464 insertions(+), 396 deletions(-)
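
Several of the headers below (abs.hpp, round.hpp, min_max.hpp among them) route arbitrary vector lengths through the KFR_HANDLE_ALL_SIZES_* macros that this commit updates in function.hpp. The dispatch they implement: a vector narrower than the native SIMD width is widened with expand_simd, processed by the native-width intrinsic, and sliced back to N lanes; a wider vector is split into low and high halves that are handled recursively and concatenated. Below is a minimal sketch of that dispatch written with std::array and if constexpr instead of KFR's macros and KFR_ENABLE_IF overloads; the names native_width, abs_native, and abs_any are illustrative, not KFR's API, and N is assumed to be a power of two, as KFR's vector widths are.

#include <array>
#include <cstddef>

constexpr std::size_t native_width = 4; // assumed native SIMD lane count

template <typename T, std::size_t N>
using vec = std::array<T, N>; // stand-in for kfr::vec

// Native-width kernel: stand-in for a real intrinsic such as _mm_abs_epi32.
template <typename T>
vec<T, native_width> abs_native(const vec<T, native_width>& x)
{
    vec<T, native_width> r{};
    for (std::size_t i = 0; i < native_width; ++i)
        r[i] = x[i] < T(0) ? T(-x[i]) : x[i];
    return r;
}

// One function instead of KFR's two enable_if overloads per macro:
// narrower than native -> widen, run the kernel, slice back (expand_simd + slice);
// native width -> run the kernel directly;
// wider than native -> split into halves, recurse, concatenate (low/high + concat).
template <typename T, std::size_t N>
vec<T, N> abs_any(const vec<T, N>& x)
{
    if constexpr (N < native_width)
    {
        vec<T, native_width> wide{}; // zero-padded: abs(0) == 0, so padding is harmless
        for (std::size_t i = 0; i < N; ++i)
            wide[i] = x[i];
        const vec<T, native_width> r = abs_native(wide);
        vec<T, N> out{};
        for (std::size_t i = 0; i < N; ++i)
            out[i] = r[i];
        return out;
    }
    else if constexpr (N == native_width)
    {
        return abs_native(x);
    }
    else
    {
        vec<T, N / 2> lo{}, hi{};
        for (std::size_t i = 0; i < N / 2; ++i)
        {
            lo[i] = x[i];
            hi[i] = x[i + N / 2];
        }
        const vec<T, N / 2> rlo = abs_any(lo);
        const vec<T, N / 2> rhi = abs_any(hi);
        vec<T, N> out{};
        for (std::size_t i = 0; i < N / 2; ++i)
        {
            out[i]         = rlo[i];
            out[i + N / 2] = rhi[i];
        }
        return out;
    }
}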

diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp @@ -33,31 +33,31 @@ namespace intrinsics { // floating point template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> abs(vec<T, N> x) +KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) { return x & internal::invhighbitmask<T>; } #if defined CID_ARCH_SSSE3 -KFR_SINTRIN i64sse abs(i64sse x) { return select(x >= 0, x, -x); } -KFR_SINTRIN i32sse abs(i32sse x) { return _mm_abs_epi32(*x); } -KFR_SINTRIN i16sse abs(i16sse x) { return _mm_abs_epi16(*x); } -KFR_SINTRIN i8sse abs(i8sse x) { return _mm_abs_epi8(*x); } -KFR_SINTRIN u64sse abs(u64sse x) { return x; } -KFR_SINTRIN u32sse abs(u32sse x) { return x; } -KFR_SINTRIN u16sse abs(u16sse x) { return x; } -KFR_SINTRIN u8sse abs(u8sse x) { return x; } +KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); } +KFR_SINTRIN i32sse abs(const i32sse& x) { return _mm_abs_epi32(*x); } +KFR_SINTRIN i16sse abs(const i16sse& x) { return _mm_abs_epi16(*x); } +KFR_SINTRIN i8sse abs(const i8sse& x) { return _mm_abs_epi8(*x); } +KFR_SINTRIN u64sse abs(const u64sse& x) { return x; } +KFR_SINTRIN u32sse abs(const u32sse& x) { return x; } +KFR_SINTRIN u16sse abs(const u16sse& x) { return x; } +KFR_SINTRIN u8sse abs(const u8sse& x) { return x; } #if defined CID_ARCH_AVX2 -KFR_SINTRIN i64avx abs(i64avx x) { return select(x >= 0, x, -x); } -KFR_SINTRIN i32avx abs(i32avx x) { return _mm256_abs_epi32(*x); } -KFR_SINTRIN i16avx abs(i16avx x) { return _mm256_abs_epi16(*x); } -KFR_SINTRIN i8avx abs(i8avx x) { return _mm256_abs_epi8(*x); } -KFR_SINTRIN u64avx abs(u64avx x) { return x; } -KFR_SINTRIN u32avx abs(u32avx x) { return x; } -KFR_SINTRIN u16avx abs(u16avx x) { return x; } -KFR_SINTRIN u8avx abs(u8avx x) { return x; } +KFR_SINTRIN i64avx abs(const i64avx& x) { return select(x >= 0, x, -x); } +KFR_SINTRIN i32avx abs(const i32avx& x) { return _mm256_abs_epi32(*x); } +KFR_SINTRIN i16avx abs(const i16avx& x) { return _mm256_abs_epi16(*x); } +KFR_SINTRIN i8avx abs(const i8avx& x) { return _mm256_abs_epi8(*x); } +KFR_SINTRIN u64avx abs(const u64avx& x) { return x; } +KFR_SINTRIN u32avx abs(const u32avx& x) { return x; } +KFR_SINTRIN u16avx abs(const u16avx& x) { return x; } +KFR_SINTRIN u8avx abs(const u8avx& x) { return x; } #endif KFR_HANDLE_ALL_SIZES_NOT_F_1(abs) @@ -66,7 +66,7 @@ KFR_HANDLE_ALL_SIZES_NOT_F_1(abs) // fallback template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> abs(vec<T, N> x) +KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x) { return select(x >= T(), x, -x); } diff --git a/include/kfr/base/asin_acos.hpp b/include/kfr/base/asin_acos.hpp @@ -34,14 +34,14 @@ namespace intrinsics { template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> asin(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> asin(const vec<T, N>& x) { const vec<Tout, N> xx = cast<Tout>(x); return atan2(xx, sqrt(Tout(1) - xx * xx)); } template <typename T, size_t N, typename Tout = flt_type<T>> -KFR_SINTRIN vec<Tout, N> acos(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> acos(const vec<T, N>& x) { const vec<Tout, N> xx = cast<Tout>(x); return atan2(sqrt(Tout(1) - xx * xx), xx); diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp @@ -100,7 +100,7 @@ KFR_SINTRIN vec<f64, N> atan2k(vec<f64, N> y, vec<f64, N> x) } template <size_t N> -KFR_SINTRIN vec<f32, N> atan2(vec<f32, N> y, vec<f32, N> x) +KFR_SINTRIN vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x) { vec<f32, N> r = atan2k(abs(y), x); constexpr f32 
pi = 3.1415926535897932384626433832795f; @@ -115,7 +115,7 @@ KFR_SINTRIN vec<f32, N> atan2(vec<f32, N> y, vec<f32, N> x) } template <size_t N> -KFR_SINTRIN vec<f64, N> atan2(vec<f64, N> y, vec<f64, N> x) +KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x) { vec<f64, N> r = atan2k(abs(y), x); constexpr f64 pi = 3.1415926535897932384626433832795; @@ -130,7 +130,7 @@ KFR_SINTRIN vec<f64, N> atan2(vec<f64, N> y, vec<f64, N> x) } template <size_t N> -KFR_SINTRIN vec<f32, N> atan(vec<f32, N> s) +KFR_SINTRIN vec<f32, N> atan(const vec<f32, N>& s) { vec<f32, N> t, u; vec<i32, N> q; @@ -154,7 +154,7 @@ KFR_SINTRIN vec<f32, N> atan(vec<f32, N> s) } template <size_t N> -KFR_SINTRIN vec<f64, N> atan(vec<f64, N> s) +KFR_SINTRIN vec<f64, N> atan(const vec<f64, N>& s) { vec<f64, N> t, u; vec<i64, N> q; diff --git a/include/kfr/base/clamp.hpp b/include/kfr/base/clamp.hpp @@ -31,13 +31,13 @@ namespace intrinsics { template <typename T, size_t N> -KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> lo, vec<T, N> hi) +KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi) { return max(min(x, hi), lo); } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> hi) +KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi) { return max(min(x, hi), zerovector<T, N>()); } diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp @@ -209,7 +209,7 @@ struct is_complex_impl<complex<T>> : std::true_type // real to complex template <typename To, typename From, size_t N, KFR_ENABLE_IF(internal::is_complex_impl<To>::value)> -constexpr KFR_INLINE vec<To, N> cast(vec<From, N> value) noexcept +constexpr KFR_INLINE vec<To, N> cast(const vec<From, N>& value) noexcept { const vec<subtype<To>, N> casted = cast<subtype<To>>(value); return subcast<To>(interleave(casted, zerovector(casted))); @@ -217,14 +217,14 @@ constexpr KFR_INLINE vec<To, N> cast(vec<From, N> value) noexcept // complex to complex template <typename To, typename From, size_t N, KFR_ENABLE_IF(internal::is_complex_impl<To>::value)> -constexpr KFR_INLINE vec<To, N> cast(vec<complex<From>, N> value) noexcept +constexpr KFR_INLINE vec<To, N> cast(const vec<complex<From>, N>& value) noexcept { return subcast<To>(cast<subtype<To>>(subcast<From>(value))); } // complex to real template <typename To, typename From, size_t N, KFR_ENABLE_IF(!internal::is_complex_impl<To>::value)> -constexpr KFR_INLINE vec<To, N> cast(vec<complex<From>, N> value) noexcept +constexpr KFR_INLINE vec<To, N> cast(const vec<complex<From>, N>& value) noexcept { static_assert(sizeof(To) == 0, "Can't cast complex to real"); return {}; @@ -375,7 +375,7 @@ KFR_SINTRIN vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x) } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cabsdup(vec<T, N> x) +KFR_SINTRIN vec<T, N> cabsdup(const vec<T, N>& x) { x = sqr(x); return sqrt(x + swap<2>(x)); diff --git a/include/kfr/base/digitreverse.hpp b/include/kfr/base/digitreverse.hpp @@ -90,19 +90,19 @@ struct shuffle_index_digitreverse } template <size_t radix, size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> digitreverse(vec<T, N> x) +KFR_INLINE vec<T, N> digitreverse(const vec<T, N>& x) { return shufflevector<N, internal::shuffle_index_digitreverse<radix, ilog2(N / groupsize)>, groupsize>(x); } template <size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> bitreverse(vec<T, N> x) +KFR_INLINE vec<T, N> bitreverse(const vec<T, N>& x) { return digitreverse<2, 
groupsize>(x); } template <size_t groupsize = 1, typename T, size_t N> -KFR_INLINE vec<T, N> digitreverse4(vec<T, N> x) +KFR_INLINE vec<T, N> digitreverse4(const vec<T, N>& x) { return digitreverse<4, groupsize>(x); } diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp @@ -143,7 +143,7 @@ struct expression_scalar : input_expression using value_type = T; expression_scalar() = delete; constexpr expression_scalar(const T& val) noexcept : val(val) {} - constexpr expression_scalar(vec<T, width> val) noexcept : val(val) {} + constexpr expression_scalar(const vec<T, width>& val) noexcept : val(val) {} const vec<T, width> val; template <typename U, size_t N> @@ -221,7 +221,7 @@ KFR_INLINE internal::expression_scalar<T> scalar(const T& val) } template <typename T, size_t N> -KFR_INLINE internal::expression_scalar<T, N> scalar(vec<T, N> val) +KFR_INLINE internal::expression_scalar<T, N> scalar(const vec<T, N>& val) { return internal::expression_scalar<T, N>(val); } diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp @@ -126,37 +126,37 @@ constexpr inline size_t next_simd_width(size_t n) } template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)> -KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x) +KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x) { return extend<Nout>(x); } template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)> -KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value) +KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value) { return widen<Nout>(x, value); } #define KFR_HANDLE_ALL_SIZES_1(fn) \ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } #define KFR_HANDLE_ALL_SIZES_FLT_1(fn) \ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ - KFR_SINTRIN vec<flt_type<T>, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(cast<flt_type<T>>(a)))); \ } \ template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ - KFR_SINTRIN vec<flt_type<T>, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(cast<flt_type<T>>(a))), fn(high(cast<flt_type<T>>(a)))); \ } @@ -164,13 +164,13 @@ KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value) #define KFR_HANDLE_ALL_SIZES_F_1(fn) \ template <typename T, size_t N, \ KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_f_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ template <typename T, size_t N, \ KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_f_class<T>::value), typename = void> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } @@ -178,13 +178,13 @@ KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value) #define KFR_HANDLE_ALL_SIZES_I_1(fn) \ template <typename T, size_t N, 
\ KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_i_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ template <typename T, size_t N, \ KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_i_class<T>::value), typename = void> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } @@ -192,13 +192,13 @@ KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value) #define KFR_HANDLE_ALL_SIZES_U_1(fn) \ template <typename T, size_t N, \ KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_u_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ template <typename T, size_t N, \ KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_u_class<T>::value), typename = void> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } @@ -206,49 +206,49 @@ KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value) #define KFR_HANDLE_ALL_SIZES_NOT_F_1(fn) \ template <typename T, size_t N, \ KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && !is_f_class<T>::value)> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return slice<0, N>(fn(expand_simd(a))); \ } \ template <typename T, size_t N, \ KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && !is_f_class<T>::value), typename = void> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \ { \ return concat(fn(low(a)), fn(high(a))); \ } #define KFR_HANDLE_ALL_SIZES_2(fn) \ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \ { \ return slice<0, N>(fn(expand_simd(a), expand_simd(b))); \ } \ template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \ { \ return concat(fn(low(a), low(b)), fn(high(a), high(b))); \ } #define KFR_HANDLE_ALL_SIZES_3(fn) \ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \ { \ return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c))); \ } \ template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \ { \ return concat(fn(low(a), low(b), low(c)), fn(high(a), high(b), high(c))); \ } #define KFR_HANDLE_ALL_SIZES_4(fn) \ template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c, vec<T, N> d) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \ { \ return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c), expand_simd(d))); \ } \ template <typename T, size_t N, 
KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \ - KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c, vec<T, N> d) \ + KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \ { \ return concat(fn(low(a), low(b), low(c), low(d)), fn(high(a), high(b), high(c), high(d))); \ } diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp @@ -42,7 +42,7 @@ constexpr T gamma_precalc[] = { }; template <typename T, size_t N> -KFR_SINTRIN vec<T, N> gamma(vec<T, N> z) +KFR_SINTRIN vec<T, N> gamma(const vec<T, N>& z) { constexpr size_t Count = arraysize(gamma_precalc<T>); vec<T, N> accm = gamma_precalc<T>[0]; @@ -54,7 +54,7 @@ KFR_SINTRIN vec<T, N> gamma(vec<T, N> z) } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> factorial_approx(vec<T, N> x) +KFR_SINTRIN vec<T, N> factorial_approx(const vec<T, N>& x) { return gamma(x + T(1)); } diff --git a/include/kfr/base/hyperbolic.hpp b/include/kfr/base/hyperbolic.hpp @@ -36,33 +36,33 @@ namespace intrinsics { template <typename T, size_t N> -KFR_SINTRIN vec<T, N> sinh(vec<T, N> x) +KFR_SINTRIN vec<T, N> sinh(const vec<T, N>& x) { return (exp(x) - exp(-x)) * T(0.5); } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cosh(vec<T, N> x) +KFR_SINTRIN vec<T, N> cosh(const vec<T, N>& x) { return (exp(x) + exp(-x)) * T(0.5); } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> tanh(vec<T, N> x) +KFR_SINTRIN vec<T, N> tanh(const vec<T, N>& x) { x = -2 * x; return (1 - exp(x)) / (1 + exp(x)); } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> coth(vec<T, N> x) +KFR_SINTRIN vec<T, N> coth(const vec<T, N>& x) { x = -2 * x; return (1 + exp(x)) / (1 - exp(x)); } template <typename T, size_t N, KFR_ENABLE_IF(N > 1)> -KFR_SINTRIN vec<T, N> sinhcosh(vec<T, N> x) +KFR_SINTRIN vec<T, N> sinhcosh(const vec<T, N>& x) { const vec<T, N> a = exp(x); const vec<T, N> b = exp(-x); @@ -70,7 +70,7 @@ KFR_SINTRIN vec<T, N> sinhcosh(vec<T, N> x) } template <typename T, size_t N, KFR_ENABLE_IF(N > 1)> -KFR_SINTRIN vec<T, N> coshsinh(vec<T, N> x) +KFR_SINTRIN vec<T, N> coshsinh(const vec<T, N>& x) { const vec<T, N> a = exp(x); const vec<T, N> b = exp(-x); diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp @@ -39,55 +39,53 @@ namespace intrinsics { template <size_t N> -KFR_SINTRIN vec<i32, N> vilogbp1(vec<f32, N> d) +KFR_SINTRIN vec<i32, N> vilogbp1(const vec<f32, N>& d) { mask<i32, N> m = d < 5.421010862427522E-20f; - d = select(m, 1.8446744073709552E19f * d, d); - vec<i32, N> q = (ibitcast(d) >> 23) & 0xff; + vec<i32, N> q = (ibitcast(select(m, 1.8446744073709552E19f * d, d)) >> 23) & 0xff; q = select(m, q - (64 + 0x7e), q - 0x7e); return q; } template <size_t N> -KFR_SINTRIN vec<i64, N> vilogbp1(vec<f64, N> d) +KFR_SINTRIN vec<i64, N> vilogbp1(const vec<f64, N>& d) { mask<i64, N> m = d < 4.9090934652977266E-91; - d = select(m, 2.037035976334486E90 * d, d); - vec<i64, N> q = (ibitcast(d) >> 52) & 0x7ff; + vec<i64, N> q = (ibitcast(select(m, 2.037035976334486E90 * d, d)) >> 52) & 0x7ff; q = select(m, q - (300 + 0x03fe), q - 0x03fe); return q; } template <size_t N> -KFR_SINTRIN vec<f32, N> vldexpk(vec<f32, N> x, vec<i32, N> q) +KFR_SINTRIN vec<f32, N> vldexpk(const vec<f32, N>& x, const vec<i32, N>& q) { vec<i32, N> m = q >> 31; m = (((m + q) >> 6) - m) << 4; - q = q - (m << 2); + const vec<i32, N> qq = q - (m << 2); m = clamp(m + 0x7f, vec<i32, N>(0xff)); vec<f32, N> u = pow4(bitcast<f32>(cast<i32>(m) << 23)); - return x * u * bitcast<f32>((cast<i32>(q + 
0x7f)) << 23); + return x * u * bitcast<f32>((cast<i32>(qq + 0x7f)) << 23); } template <size_t N> -KFR_SINTRIN vec<f64, N> vldexpk(vec<f64, N> x, vec<i64, N> q) +KFR_SINTRIN vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q) { vec<i64, N> m = q >> 31; m = (((m + q) >> 9) - m) << 7; - q = q - (m << 2); + const vec<i64, N> qq = q - (m << 2); m = clamp(m + 0x3ff, i64(0x7ff)); vec<f64, N> u = pow4(bitcast<f64>(cast<i64>(m) << 52)); - return x * u * bitcast<f64>((cast<i64>(q + 0x3ff)) << 52); + return x * u * bitcast<f64>((cast<i64>(qq + 0x3ff)) << 52); } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> logb(vec<T, N> x) +KFR_SINTRIN vec<T, N> logb(const vec<T, N>& x) { return select(x == T(), -c_infinity<T>, cast<T>(vilogbp1(x) - 1)); } template <size_t N> -KFR_SINTRIN vec<f32, N> log(vec<f32, N> d) +KFR_SINTRIN vec<f32, N> log(const vec<f32, N>& d) { vec<i32, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485f ); vec<f32, N> m = vldexpk(d, -e); @@ -110,7 +108,7 @@ KFR_SINTRIN vec<f32, N> log(vec<f32, N> d) } template <size_t N> -KFR_SINTRIN vec<f64, N> log(vec<f64, N> d) +KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d) { vec<i64, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485 ); vec<f64, N> m = vldexpk(d, -e); @@ -136,18 +134,18 @@ KFR_SINTRIN vec<f64, N> log(vec<f64, N> d) } template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> log2(vec<T, N> x) +KFR_SINTRIN vec<T, N> log2(const vec<T, N>& x) { return log(x) * c_recip_log_2<T>; } template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> log10(vec<T, N> x) +KFR_SINTRIN vec<T, N> log10(const vec<T, N>& x) { return log(x) * c_recip_log_10<T>; } template <size_t N> -KFR_SINTRIN vec<f32, N> exp(vec<f32, N> d) +KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d) { const f32 ln2_part1 = 0.6931457519f; const f32 ln2_part2 = 1.4286067653e-6f; @@ -181,7 +179,7 @@ KFR_SINTRIN vec<f32, N> exp(vec<f32, N> d) } template <size_t N> -KFR_SINTRIN vec<f64, N> exp(vec<f64, N> d) +KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d) { const f64 ln2_part1 = 0.69314717501401901245; const f64 ln2_part2 = 5.545926273775592108e-009; @@ -222,12 +220,12 @@ KFR_SINTRIN vec<f64, N> exp(vec<f64, N> d) return u; } template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> exp2(vec<T, N> x) +KFR_SINTRIN vec<T, N> exp2(const vec<T, N>& x) { return exp(x * c_log_2<T>); } template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_SINTRIN vec<T, N> exp10(vec<T, N> x) +KFR_SINTRIN vec<T, N> exp10(const vec<T, N>& x) { return exp(x * c_log_10<T>); } @@ -257,7 +255,7 @@ KFR_SINTRIN common_type<T1, T2, T3> log_fmadd(const T1& x, const T2& m, const T3 } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> pow(vec<T, N> a, vec<T, N> b) +KFR_SINTRIN vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b) { const vec<T, N> t = exp(b * log(abs(a))); const mask<T, N> isint = floor(b) == b; @@ -267,49 +265,49 @@ KFR_SINTRIN vec<T, N> pow(vec<T, N> a, vec<T, N> b) } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> root(vec<T, N> x, vec<T, N> b) +KFR_SINTRIN vec<T, N> root(const vec<T, N>& x, const vec<T, N>& b) { return exp(reciprocal(b) * log(x)); } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> cbrt(vec<T, N> x) +KFR_SINTRIN vec<T, N> cbrt(const vec<T, N>& x) { return pow<T, N>(x, T(0.333333333333333333333333333333333)); } template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = 
ftype<T>> -KFR_SINTRIN vec<Tout, N> exp(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> exp(const vec<T, N>& x) { return exp(cast<Tout>(x)); } template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>> -KFR_SINTRIN vec<Tout, N> exp2(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> exp2(const vec<T, N>& x) { return exp2(cast<Tout>(x)); } template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>> -KFR_SINTRIN vec<Tout, N> exp10(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> exp10(const vec<T, N>& x) { return exp10(cast<Tout>(x)); } template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>> -KFR_SINTRIN vec<Tout, N> log(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> log(const vec<T, N>& x) { return log(cast<Tout>(x)); } template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>> -KFR_SINTRIN vec<Tout, N> log2(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> log2(const vec<T, N>& x) { return log2(cast<Tout>(x)); } template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>> -KFR_SINTRIN vec<Tout, N> log10(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> log10(const vec<T, N>& x) { return log10(cast<Tout>(x)); } template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>> -KFR_SINTRIN vec<Tout, N> cbrt(vec<T, N> x) +KFR_SINTRIN vec<Tout, N> cbrt(const vec<T, N>& x) { return cbrt(cast<Tout>(x)); } diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp @@ -50,104 +50,110 @@ struct bitmask #if defined CID_ARCH_SSE41 -KFR_SINTRIN bool bittestany(u8sse x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(u16sse x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(u32sse x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(u64sse x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(i8sse x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(i16sse x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(i32sse x) { return !_mm_testz_si128(*x, *x); } -KFR_SINTRIN bool bittestany(i64sse x) { return !_mm_testz_si128(*x, *x); } - -KFR_SINTRIN bool bittestall(u8sse x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(u16sse x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(u32sse x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(u64sse x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(i8sse x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(i16sse x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(i32sse x) { return _mm_testc_si128(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(i64sse x) { return _mm_testc_si128(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestany(const u8sse& x) { return !_mm_testz_si128(*x, *x); } +KFR_SINTRIN bool bittestany(const u16sse& x) { return !_mm_testz_si128(*x, *x); } +KFR_SINTRIN bool bittestany(const u32sse& x) { return !_mm_testz_si128(*x, *x); } +KFR_SINTRIN bool bittestany(const u64sse& x) { return !_mm_testz_si128(*x, *x); } +KFR_SINTRIN bool bittestany(const i8sse& x) { return !_mm_testz_si128(*x, *x); } +KFR_SINTRIN bool bittestany(const i16sse& x) { return !_mm_testz_si128(*x, *x); } +KFR_SINTRIN bool bittestany(const i32sse& x) { return !_mm_testz_si128(*x, *x); } +KFR_SINTRIN bool bittestany(const 
i64sse& x) { return !_mm_testz_si128(*x, *x); } + +KFR_SINTRIN bool bittestall(const u8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const u16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const u32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const u64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const i8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const i16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const i32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const i64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); } #endif #if defined CID_ARCH_AVX -KFR_SINTRIN bool bittestany(f32sse x) { return !_mm_testz_ps(*x, *x); } -KFR_SINTRIN bool bittestany(f64sse x) { return !_mm_testz_pd(*x, *x); } -KFR_SINTRIN bool bittestall(f32sse x) { return _mm_testc_ps(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(f64sse x) { return _mm_testc_pd(*x, *allonesvector(x)); } - -KFR_SINTRIN bool bittestany(f32avx x) { return !_mm256_testz_ps(*x, *x); } -KFR_SINTRIN bool bittestany(f64avx x) { return !_mm256_testz_pd(*x, *x); } - -KFR_SINTRIN bool bittestnall(f32avx x) { return _mm256_testc_ps(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestnall(f64avx x) { return _mm256_testc_pd(*x, *allonesvector(x)); } - -KFR_SINTRIN bool bittestany(u8avx x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(u16avx x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(u32avx x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(u64avx x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(i8avx x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(i16avx x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(i32avx x) { return !_mm256_testz_si256(*x, *x); } -KFR_SINTRIN bool bittestany(i64avx x) { return !_mm256_testz_si256(*x, *x); } - -KFR_SINTRIN bool bittestall(u8avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(u16avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(u32avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(u64avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(i8avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(i16avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(i32avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } -KFR_SINTRIN bool bittestall(i64avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_ps(*x, *x); } +KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_pd(*x, *x); } +KFR_SINTRIN bool bittestall(const f32sse& x) { return _mm_testc_ps(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const f64sse& x) { return _mm_testc_pd(*x, *allonesvector(x)); } + +KFR_SINTRIN bool bittestany(const f32avx& x) { return !_mm256_testz_ps(*x, *x); } +KFR_SINTRIN bool bittestany(const f64avx& x) { return !_mm256_testz_pd(*x, *x); } + +KFR_SINTRIN bool bittestnall(const f32avx& x) { return _mm256_testc_ps(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestnall(const f64avx& x) { 
return _mm256_testc_pd(*x, *allonesvector(x)); } + +KFR_SINTRIN bool bittestany(const u8avx& x) { return !_mm256_testz_si256(*x, *x); } +KFR_SINTRIN bool bittestany(const u16avx& x) { return !_mm256_testz_si256(*x, *x); } +KFR_SINTRIN bool bittestany(const u32avx& x) { return !_mm256_testz_si256(*x, *x); } +KFR_SINTRIN bool bittestany(const u64avx& x) { return !_mm256_testz_si256(*x, *x); } +KFR_SINTRIN bool bittestany(const i8avx& x) { return !_mm256_testz_si256(*x, *x); } +KFR_SINTRIN bool bittestany(const i16avx& x) { return !_mm256_testz_si256(*x, *x); } +KFR_SINTRIN bool bittestany(const i32avx& x) { return !_mm256_testz_si256(*x, *x); } +KFR_SINTRIN bool bittestany(const i64avx& x) { return !_mm256_testz_si256(*x, *x); } + +KFR_SINTRIN bool bittestall(const u8avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const u16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const u32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const u64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const i8avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const i16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const i32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } +KFR_SINTRIN bool bittestall(const i64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); } #elif defined CID_ARCH_SSE41 -KFR_SINTRIN bool bittestany(f32sse x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } -KFR_SINTRIN bool bittestany(f64sse x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } -KFR_SINTRIN bool bittestall(f32sse x) { return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); } -KFR_SINTRIN bool bittestall(f64sse x) { return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); } +KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } +KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); } +KFR_SINTRIN bool bittestall(const f32sse& x) +{ + return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); +} +KFR_SINTRIN bool bittestall(const f64sse& x) +{ + return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); +} #endif #if !defined CID_ARCH_SSE41 -KFR_SINTRIN bool bittestany(f32sse x) { return _mm_movemask_ps(*x); } -KFR_SINTRIN bool bittestany(f64sse x) { return _mm_movemask_pd(*x); } -KFR_SINTRIN bool bittestany(u8sse x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(u16sse x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(u32sse x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(u64sse x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(i8sse x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(i16sse x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(i32sse x) { return _mm_movemask_epi8(*x); } -KFR_SINTRIN bool bittestany(i64sse x) { return _mm_movemask_epi8(*x); } - -KFR_SINTRIN bool bittestall(f32sse x) { return !_mm_movemask_ps(*~x); } -KFR_SINTRIN bool bittestall(f64sse x) { return !_mm_movemask_pd(*~x); } -KFR_SINTRIN bool bittestall(u8sse x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(u16sse x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool 
bittestall(u32sse x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(u64sse x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(i8sse x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(i16sse x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(i32sse x) { return !_mm_movemask_epi8(*~x); } -KFR_SINTRIN bool bittestall(i64sse x) { return !_mm_movemask_epi8(*~x); } +KFR_SINTRIN bool bittestany(const f32sse& x) { return _mm_movemask_ps(*x); } +KFR_SINTRIN bool bittestany(const f64sse& x) { return _mm_movemask_pd(*x); } +KFR_SINTRIN bool bittestany(const u8sse& x) { return _mm_movemask_epi8(*x); } +KFR_SINTRIN bool bittestany(const u16sse& x) { return _mm_movemask_epi8(*x); } +KFR_SINTRIN bool bittestany(const u32sse& x) { return _mm_movemask_epi8(*x); } +KFR_SINTRIN bool bittestany(const u64sse& x) { return _mm_movemask_epi8(*x); } +KFR_SINTRIN bool bittestany(const i8sse& x) { return _mm_movemask_epi8(*x); } +KFR_SINTRIN bool bittestany(const i16sse& x) { return _mm_movemask_epi8(*x); } +KFR_SINTRIN bool bittestany(const i32sse& x) { return _mm_movemask_epi8(*x); } +KFR_SINTRIN bool bittestany(const i64sse& x) { return _mm_movemask_epi8(*x); } + +KFR_SINTRIN bool bittestall(const f32sse& x) { return !_mm_movemask_ps(*~x); } +KFR_SINTRIN bool bittestall(const f64sse& x) { return !_mm_movemask_pd(*~x); } +KFR_SINTRIN bool bittestall(const u8sse& x) { return !_mm_movemask_epi8(*~x); } +KFR_SINTRIN bool bittestall(const u16sse& x) { return !_mm_movemask_epi8(*~x); } +KFR_SINTRIN bool bittestall(const u32sse& x) { return !_mm_movemask_epi8(*~x); } +KFR_SINTRIN bool bittestall(const u64sse& x) { return !_mm_movemask_epi8(*~x); } +KFR_SINTRIN bool bittestall(const i8sse& x) { return !_mm_movemask_epi8(*~x); } +KFR_SINTRIN bool bittestall(const i16sse& x) { return !_mm_movemask_epi8(*~x); } +KFR_SINTRIN bool bittestall(const i32sse& x) { return !_mm_movemask_epi8(*~x); } +KFR_SINTRIN bool bittestall(const i64sse& x) { return !_mm_movemask_epi8(*~x); } #endif template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> -KFR_SINTRIN bool bittestall(vec<T, N> a) +KFR_SINTRIN bool bittestall(const vec<T, N>& a) { return bittestall(expand_simd(a, internal::maskbits<T>(true))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> -KFR_SINTRIN bool bittestall(vec<T, N> a) +KFR_SINTRIN bool bittestall(const vec<T, N>& a) { return bittestall(low(a)) && bittestall(high(a)); } template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> -KFR_SINTRIN bool bittestany(vec<T, N> a) +KFR_SINTRIN bool bittestany(const vec<T, N>& a) { return bittestany(expand_simd(a, internal::maskbits<T>(false))); } template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> -KFR_SINTRIN bool bittestany(vec<T, N> a) +KFR_SINTRIN bool bittestany(const vec<T, N>& a) { return bittestany(low(a)) || bittestany(high(a)); } @@ -155,7 +161,7 @@ KFR_SINTRIN bool bittestany(vec<T, N> a) #else template <typename T, size_t N> -KFR_SINTRIN bitmask<N> getmask(vec<T, N> x) +KFR_SINTRIN bitmask<N> getmask(const vec<T, N>& x) { typename bitmask<N>::type val = 0; for (size_t i = 0; i < N; i++) @@ -166,23 +172,23 @@ KFR_SINTRIN bitmask<N> getmask(vec<T, N> x) } template <typename T, size_t N> -KFR_SINTRIN bool bittestany(vec<T, N> x) +KFR_SINTRIN bool bittestany(const vec<T, N>& x) { return getmask(x).value; } template <typename T, size_t N> -KFR_SINTRIN 
bool bittestany(vec<T, N> x, vec<T, N> y) +KFR_SINTRIN bool bittestany(const vec<T, N>& x, const vec<T, N>& y) { return bittestany(x & y); } template <typename T, size_t N> -KFR_SINTRIN bool bittestall(vec<T, N> x) +KFR_SINTRIN bool bittestall(const vec<T, N>& x) { return !getmask(~x).value; } template <typename T, size_t N> -KFR_SINTRIN bool bittestall(vec<T, N> x, vec<T, N> y) +KFR_SINTRIN bool bittestall(const vec<T, N>& x, const vec<T, N>& y) { return !bittestany(~x & y); } diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp @@ -35,68 +35,68 @@ namespace intrinsics #if defined CID_ARCH_SSE2 -KFR_SINTRIN f32sse min(f32sse x, f32sse y) { return _mm_min_ps(*x, *y); } -KFR_SINTRIN f64sse min(f64sse x, f64sse y) { return _mm_min_pd(*x, *y); } -KFR_SINTRIN u8sse min(u8sse x, u8sse y) { return _mm_min_epu8(*x, *y); } -KFR_SINTRIN i16sse min(i16sse x, i16sse y) { return _mm_min_epi16(*x, *y); } -KFR_SINTRIN i64sse min(i64sse x, i64sse y) { return select(x < y, x, y); } -KFR_SINTRIN u64sse min(u64sse x, u64sse y) { return select(x < y, x, y); } - -KFR_SINTRIN f32sse max(f32sse x, f32sse y) { return _mm_max_ps(*x, *y); } -KFR_SINTRIN f64sse max(f64sse x, f64sse y) { return _mm_max_pd(*x, *y); } -KFR_SINTRIN u8sse max(u8sse x, u8sse y) { return _mm_max_epu8(*x, *y); } -KFR_SINTRIN i16sse max(i16sse x, i16sse y) { return _mm_max_epi16(*x, *y); } -KFR_SINTRIN i64sse max(i64sse x, i64sse y) { return select(x > y, x, y); } -KFR_SINTRIN u64sse max(u64sse x, u64sse y) { return select(x > y, x, y); } +KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); } +KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); } +KFR_SINTRIN u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(*x, *y); } +KFR_SINTRIN i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(*x, *y); } +KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); } +KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); } + +KFR_SINTRIN f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(*x, *y); } +KFR_SINTRIN f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(*x, *y); } +KFR_SINTRIN u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(*x, *y); } +KFR_SINTRIN i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(*x, *y); } +KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); } +KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); } #if defined CID_ARCH_AVX2 -KFR_SINTRIN u8avx min(u8avx x, u8avx y) { return _mm256_min_epu8(*x, *y); } -KFR_SINTRIN i16avx min(i16avx x, i16avx y) { return _mm256_min_epi16(*x, *y); } -KFR_SINTRIN i8avx min(i8avx x, i8avx y) { return _mm256_min_epi8(*x, *y); } -KFR_SINTRIN u16avx min(u16avx x, u16avx y) { return _mm256_min_epu16(*x, *y); } -KFR_SINTRIN i32avx min(i32avx x, i32avx y) { return _mm256_min_epi32(*x, *y); } -KFR_SINTRIN u32avx min(u32avx x, u32avx y) { return _mm256_min_epu32(*x, *y); } - -KFR_SINTRIN u8avx max(u8avx x, u8avx y) { return _mm256_max_epu8(*x, *y); } -KFR_SINTRIN i16avx max(i16avx x, i16avx y) { return _mm256_max_epi16(*x, *y); } -KFR_SINTRIN i8avx max(i8avx x, i8avx y) { return _mm256_max_epi8(*x, *y); } -KFR_SINTRIN u16avx max(u16avx x, u16avx y) { return _mm256_max_epu16(*x, *y); } -KFR_SINTRIN i32avx max(i32avx x, i32avx y) { return _mm256_max_epi32(*x, *y); } -KFR_SINTRIN u32avx max(u32avx x, u32avx 
y) { return _mm256_max_epu32(*x, *y); } - -KFR_SINTRIN i64avx min(i64avx x, i64avx y) { return select(x < y, x, y); } -KFR_SINTRIN u64avx min(u64avx x, u64avx y) { return select(x < y, x, y); } -KFR_SINTRIN i64avx max(i64avx x, i64avx y) { return select(x > y, x, y); } -KFR_SINTRIN u64avx max(u64avx x, u64avx y) { return select(x > y, x, y); } +KFR_SINTRIN u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(*x, *y); } +KFR_SINTRIN i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(*x, *y); } +KFR_SINTRIN i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(*x, *y); } +KFR_SINTRIN u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(*x, *y); } +KFR_SINTRIN i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(*x, *y); } +KFR_SINTRIN u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(*x, *y); } + +KFR_SINTRIN u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(*x, *y); } +KFR_SINTRIN i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(*x, *y); } +KFR_SINTRIN i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(*x, *y); } +KFR_SINTRIN u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(*x, *y); } +KFR_SINTRIN i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(*x, *y); } +KFR_SINTRIN u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(*x, *y); } + +KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); } +KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); } +KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); } +KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); } #endif #if defined CID_ARCH_AVX -KFR_SINTRIN f32avx min(f32avx x, f32avx y) { return _mm256_min_ps(*x, *y); } -KFR_SINTRIN f64avx min(f64avx x, f64avx y) { return _mm256_min_pd(*x, *y); } -KFR_SINTRIN f32avx max(f32avx x, f32avx y) { return _mm256_max_ps(*x, *y); } -KFR_SINTRIN f64avx max(f64avx x, f64avx y) { return _mm256_max_pd(*x, *y); } +KFR_SINTRIN f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(*x, *y); } +KFR_SINTRIN f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(*x, *y); } +KFR_SINTRIN f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(*x, *y); } +KFR_SINTRIN f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(*x, *y); } #endif #if defined CID_ARCH_SSE41 -KFR_SINTRIN i8sse min(i8sse x, i8sse y) { return _mm_min_epi8(*x, *y); } -KFR_SINTRIN u16sse min(u16sse x, u16sse y) { return _mm_min_epu16(*x, *y); } -KFR_SINTRIN i32sse min(i32sse x, i32sse y) { return _mm_min_epi32(*x, *y); } -KFR_SINTRIN u32sse min(u32sse x, u32sse y) { return _mm_min_epu32(*x, *y); } - -KFR_SINTRIN i8sse max(i8sse x, i8sse y) { return _mm_max_epi8(*x, *y); } -KFR_SINTRIN u16sse max(u16sse x, u16sse y) { return _mm_max_epu16(*x, *y); } -KFR_SINTRIN i32sse max(i32sse x, i32sse y) { return _mm_max_epi32(*x, *y); } -KFR_SINTRIN u32sse max(u32sse x, u32sse y) { return _mm_max_epu32(*x, *y); } +KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(*x, *y); } +KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(*x, *y); } +KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(*x, *y); } +KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(*x, 
*y); } + +KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(*x, *y); } +KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(*x, *y); } +KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(*x, *y); } +KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(*x, *y); } #else -KFR_SINTRIN i8sse min(i8sse x, i8sse y) { return select(x < y, x, y); } -KFR_SINTRIN u16sse min(u16sse x, u16sse y) { return select(x < y, x, y); } -KFR_SINTRIN i32sse min(i32sse x, i32sse y) { return select(x < y, x, y); } -KFR_SINTRIN u32sse min(u32sse x, u32sse y) { return select(x < y, x, y); } +KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); } +KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); } +KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); } +KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); } -KFR_SINTRIN i8sse max(i8sse x, i8sse y) { return select(x > y, x, y); } -KFR_SINTRIN u16sse max(u16sse x, u16sse y) { return select(x > y, x, y); } -KFR_SINTRIN i32sse max(i32sse x, i32sse y) { return select(x > y, x, y); } -KFR_SINTRIN u32sse max(u32sse x, u32sse y) { return select(x > y, x, y); } +KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); } +KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); } +KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); } +KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); } #endif @@ -107,12 +107,12 @@ KFR_HANDLE_ALL_SIZES_2(max) // fallback template <typename T, size_t N> -KFR_SINTRIN vec<T, N> min(vec<T, N> x, vec<T, N> y) +KFR_SINTRIN vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y) { return select(x < y, x, y); } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> max(vec<T, N> x, vec<T, N> y) +KFR_SINTRIN vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y) { return select(x > y, x, y); } @@ -143,12 +143,12 @@ KFR_SINTRIN T absmax(initialvalue<T>) } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> absmin(vec<T, N> x, vec<T, N> y) +KFR_SINTRIN vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y) { return min(abs(x), abs(y)); } template <typename T, size_t N> -KFR_SINTRIN vec<T, N> absmax(vec<T, N> x, vec<T, N> y) +KFR_SINTRIN vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y) { return max(abs(x), abs(y)); } diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp @@ -77,7 +77,7 @@ constexpr T bessel_coef[] = { T(0.25), T(1.5021381070956226783e-096) }; template <typename T, size_t N> -KFR_INLINE vec<T, N> modzerobessel(vec<T, N> x) +KFR_INLINE vec<T, N> modzerobessel(const vec<T, N>& x) { const vec<T, N> x_2 = x * 0.5; const vec<T, N> x_2_sqr = x_2 * x_2; diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp @@ -32,18 +32,18 @@ namespace internal { template <typename T, typename ReduceFn> -KFR_INLINE T horizontal_impl(vec<T, 1> value, ReduceFn&&) +KFR_INLINE T horizontal_impl(const vec<T, 1>& value, ReduceFn&&) { return T(value[0]); } template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))> -KFR_INLINE T horizontal_impl(vec<T, N> value, ReduceFn&& reduce) +KFR_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) { return horizontal_impl(reduce(low(value), high(value)), 
std::forward<ReduceFn>(reduce)); } template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))> -KFR_INLINE T horizontal_impl(vec<T, N> value, ReduceFn&& reduce) +KFR_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce) { const T initial = reduce(initialvalue<T>()); return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce)); @@ -51,7 +51,7 @@ KFR_INLINE T horizontal_impl(vec<T, N> value, ReduceFn&& reduce) } template <typename T, size_t N, typename ReduceFn> -KFR_INLINE T horizontal(vec<T, N> value, ReduceFn&& reduce) +KFR_INLINE T horizontal(const vec<T, N>& value, ReduceFn&& reduce) { return internal::horizontal_impl(value, std::forward<ReduceFn>(reduce)); } @@ -486,7 +486,7 @@ constexpr KFR_INLINE T reciprocal(T x) KFR_FN(reciprocal) template <typename T, size_t N> -KFR_INLINE vec<T, N> mulsign(vec<T, N> x, vec<T, N> y) +KFR_INLINE vec<T, N> mulsign(const vec<T, N>& x, const vec<T, N>& y) { return x ^ (y & internal::highbitmask<T>); } @@ -494,13 +494,13 @@ KFR_FN_S(mulsign) KFR_FN(mulsign) template <typename T, size_t N> -constexpr KFR_INLINE vec<T, N> copysign(vec<T, N> x, vec<T, N> y) +constexpr KFR_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y) { return (x & internal::highbitmask<T>) | (y & internal::highbitmask<T>); } template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_INLINE vec<T, N> fmod(vec<T, N> x, vec<T, N> y) +KFR_INLINE vec<T, N> fmod(const vec<T, N>& x, const vec<T, N>& y) { return x - cast<itype<T>>(x / y) * y; } @@ -509,55 +509,55 @@ KFR_FN_S(fmod) KFR_FN(fmod) template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)> -constexpr KFR_INLINE vec<T, N> rem(vec<T, N> x, vec<T, N> y) +constexpr KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) { return x % y; } template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)> -KFR_INLINE vec<T, N> rem(vec<T, N> x, vec<T, N> y) +KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y) { return fmod(x, y); } template <typename T, size_t N> -KFR_INLINE mask<T, N> isnan(vec<T, N> x) +KFR_INLINE mask<T, N> isnan(const vec<T, N>& x) { return x != x; } template <typename T, size_t N> -KFR_INLINE mask<T, N> isinf(vec<T, N> x) +KFR_INLINE mask<T, N> isinf(const vec<T, N>& x) { return x == c_infinity<T> || x == -c_infinity<T>; } template <typename T, size_t N> -KFR_INLINE mask<T, N> isfinite(vec<T, N> x) +KFR_INLINE mask<T, N> isfinite(const vec<T, N>& x) { return !isnan(x) && !isinf(x); } template <typename T, size_t N> -KFR_INLINE mask<T, N> isnegative(vec<T, N> x) +KFR_INLINE mask<T, N> isnegative(const vec<T, N>& x) { return (x & internal::highbitmask<T>) != 0; } template <typename T, size_t N> -KFR_INLINE mask<T, N> ispositive(vec<T, N> x) +KFR_INLINE mask<T, N> ispositive(const vec<T, N>& x) { return !isnegative(x); } template <typename T, size_t N> -KFR_INLINE mask<T, N> iszero(vec<T, N> x) +KFR_INLINE mask<T, N> iszero(const vec<T, N>& x) { return x == T(); } /// Swap byte order template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)> -KFR_INLINE vec<T, N> swapbyteorder(vec<T, N> x) +KFR_INLINE vec<T, N> swapbyteorder(const vec<T, N>& x) { return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x))); } @@ -580,7 +580,7 @@ KFR_FN(swapbyteorder) /// Sum all elements of the vector template <typename T, size_t N> -KFR_INLINE T hadd(vec<T, N> value) +KFR_INLINE T hadd(const vec<T, N>& value) { return horizontal(value, fn_add()); } @@ -588,26 +588,26 @@ KFR_FN(hadd) 
/// Multiply all elements of the vector template <typename T, size_t N> -KFR_INLINE T hmul(vec<T, N> value) +KFR_INLINE T hmul(const vec<T, N>& value) { return horizontal(value, fn_mul()); } KFR_FN(hmul) template <typename T, size_t N> -KFR_INLINE T hbitwiseand(vec<T, N> value) +KFR_INLINE T hbitwiseand(const vec<T, N>& value) { return horizontal(value, fn_bitwiseand()); } KFR_FN(hbitwiseand) template <typename T, size_t N> -KFR_INLINE T hbitwiseor(vec<T, N> value) +KFR_INLINE T hbitwiseor(const vec<T, N>& value) { return horizontal(value, fn_bitwiseor()); } KFR_FN(hbitwiseor) template <typename T, size_t N> -KFR_INLINE T hbitwisexor(vec<T, N> value) +KFR_INLINE T hbitwisexor(const vec<T, N>& value) { return horizontal(value, fn_bitwisexor()); } @@ -615,7 +615,7 @@ KFR_FN(hbitwisexor) /// Calculate the Dot-Product of two vectors template <typename T, size_t N> -KFR_INLINE T dot(vec<T, N> x, vec<T, N> y) +KFR_INLINE T dot(const vec<T, N>& x, const vec<T, N>& y) { return hadd(x * y); } @@ -623,7 +623,7 @@ KFR_FN(dot) /// Calculate the Arithmetic mean of all elements in the vector template <typename T, size_t N> -KFR_INLINE T avg(vec<T, N> value) +KFR_INLINE T avg(const vec<T, N>& value) { return hadd(value) / N; } @@ -631,19 +631,19 @@ KFR_FN(avg) /// Calculate the RMS of all elements in the vector template <typename T, size_t N> -KFR_INLINE T rms(vec<T, N> value) +KFR_INLINE T rms(const vec<T, N>& value) { return internal::builtin_sqrt(hadd(value * value) / N); } KFR_FN(rms) template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INLINE vec<T, N> subadd(vec<T, N> a, vec<T, N> b) +KFR_INLINE vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b) { return blend<1, 0>(a + b, a - b); } template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INLINE vec<T, N> addsub(vec<T, N> a, vec<T, N> b) +KFR_INLINE vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b) { return blend<0, 1>(a + b, a - b); } diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp @@ -36,7 +36,7 @@ KFR_INLINE vec<T, N> read(const T* src) } template <bool A = false, size_t N, typename T> -KFR_INLINE void write(T* dest, vec<T, N> value) +KFR_INLINE void write(T* dest, const vec<T, N>& value) { internal_read_write::write<A, N, T>(dest, value); } @@ -54,7 +54,7 @@ KFR_INLINE vec<T, Nout> gather(const T* base) } template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0> -KFR_INLINE void scatter(const T* base, vec<T, N> value) +KFR_INLINE void scatter(const T* base, const vec<T, N>& value) { base[Index] = value[InIndex]; scatter<Indices..., T, N, InIndex + 1>(base, value); @@ -63,7 +63,7 @@ KFR_INLINE void scatter(const T* base, vec<T, N> value) namespace internal { template <typename T, size_t N, size_t... Indices> -KFR_INLINE vec<T, N> gather(const T* base, vec<u32, N> indices, csizes_t<Indices...>) +KFR_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>) { return make_vector(base[indices[Indices]]...); } @@ -80,7 +80,7 @@ KFR_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<I } template <typename T, size_t N> -KFR_INLINE vec<T, N> gather(const T* base, vec<u32, N> indices) +KFR_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices) { return internal::gather(base, indices, csizeseq<N>); } @@ -98,24 +98,24 @@ KFR_INLINE vec<T, Nout> gather_stride(const T* base) } template <size_t groupsize, typename T, size_t N, typename IT, size_t... 
Indices> -KFR_INLINE vec<T, N * groupsize> gather_helper(const T* base, vec<IT, N> offset, csizes_t<Indices...>) +KFR_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, csizes_t<Indices...>) { return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...); } template <size_t groupsize = 1, typename T, size_t N, typename IT> -KFR_INLINE vec<T, N * groupsize> gather(const T* base, vec<IT, N> offset) +KFR_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset) { return gather_helper<groupsize>(base, offset, csizeseq<N>); } template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices> -KFR_INLINE void scatter_helper(T* base, vec<IT, N> offset, vec<T, Nout> value, csizes_t<Indices...>) +KFR_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value, csizes_t<Indices...>) { swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)), 0)... }; } template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT> -KFR_INLINE void scatter(T* base, vec<IT, N> offset, vec<T, Nout> value) +KFR_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value) { return scatter_helper<groupsize>(base, offset, value, csizeseq<N>); } diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp @@ -53,29 +53,29 @@ namespace intrinsics #if defined CID_ARCH_SSE41 -KFR_SINTRIN f32sse floor(f32sse value) { return _mm_floor_ps(*value); } -KFR_SINTRIN f32sse ceil(f32sse value) { return _mm_ceil_ps(*value); } -KFR_SINTRIN f32sse trunc(f32sse value) { return KFR_mm_trunc_ps(*value); } -KFR_SINTRIN f32sse round(f32sse value) { return KFR_mm_roundnearest_ps(*value); } -KFR_SINTRIN f64sse floor(f64sse value) { return _mm_floor_pd(*value); } -KFR_SINTRIN f64sse ceil(f64sse value) { return _mm_ceil_pd(*value); } -KFR_SINTRIN f64sse trunc(f64sse value) { return KFR_mm_trunc_pd(*value); } -KFR_SINTRIN f64sse round(f64sse value) { return KFR_mm_roundnearest_pd(*value); } -KFR_SINTRIN f32sse fract(f32sse x) { return x - floor(x); } -KFR_SINTRIN f64sse fract(f64sse x) { return x - floor(x); } +KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); } +KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); } +KFR_SINTRIN f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(*value); } +KFR_SINTRIN f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(*value); } +KFR_SINTRIN f64sse floor(const f64sse& value) { return _mm_floor_pd(*value); } +KFR_SINTRIN f64sse ceil(const f64sse& value) { return _mm_ceil_pd(*value); } +KFR_SINTRIN f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(*value); } +KFR_SINTRIN f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(*value); } +KFR_SINTRIN f32sse fract(const f32sse& x) { return x - floor(x); } +KFR_SINTRIN f64sse fract(const f64sse& x) { return x - floor(x); } #if defined CID_ARCH_AVX -KFR_SINTRIN f32avx floor(f32avx value) { return _mm256_floor_ps(*value); } -KFR_SINTRIN f32avx ceil(f32avx value) { return _mm256_ceil_ps(*value); } -KFR_SINTRIN f32avx trunc(f32avx value) { return KFR_mm256_trunc_ps(*value); } -KFR_SINTRIN f32avx round(f32avx value) { return KFR_mm256_roundnearest_ps(*value); } -KFR_SINTRIN f64avx floor(f64avx value) { return _mm256_floor_pd(*value); } -KFR_SINTRIN f64avx ceil(f64avx value) { return _mm256_ceil_pd(*value); } -KFR_SINTRIN f64avx trunc(f64avx value) { 
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -53,29 +53,29 @@ namespace intrinsics
#if defined CID_ARCH_SSE41
-KFR_SINTRIN f32sse floor(f32sse value) { return _mm_floor_ps(*value); }
-KFR_SINTRIN f32sse ceil(f32sse value) { return _mm_ceil_ps(*value); }
-KFR_SINTRIN f32sse trunc(f32sse value) { return KFR_mm_trunc_ps(*value); }
-KFR_SINTRIN f32sse round(f32sse value) { return KFR_mm_roundnearest_ps(*value); }
-KFR_SINTRIN f64sse floor(f64sse value) { return _mm_floor_pd(*value); }
-KFR_SINTRIN f64sse ceil(f64sse value) { return _mm_ceil_pd(*value); }
-KFR_SINTRIN f64sse trunc(f64sse value) { return KFR_mm_trunc_pd(*value); }
-KFR_SINTRIN f64sse round(f64sse value) { return KFR_mm_roundnearest_pd(*value); }
-KFR_SINTRIN f32sse fract(f32sse x) { return x - floor(x); }
-KFR_SINTRIN f64sse fract(f64sse x) { return x - floor(x); }
+KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); }
+KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); }
+KFR_SINTRIN f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(*value); }
+KFR_SINTRIN f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(*value); }
+KFR_SINTRIN f64sse floor(const f64sse& value) { return _mm_floor_pd(*value); }
+KFR_SINTRIN f64sse ceil(const f64sse& value) { return _mm_ceil_pd(*value); }
+KFR_SINTRIN f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(*value); }
+KFR_SINTRIN f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(*value); }
+KFR_SINTRIN f32sse fract(const f32sse& x) { return x - floor(x); }
+KFR_SINTRIN f64sse fract(const f64sse& x) { return x - floor(x); }
#if defined CID_ARCH_AVX
-KFR_SINTRIN f32avx floor(f32avx value) { return _mm256_floor_ps(*value); }
-KFR_SINTRIN f32avx ceil(f32avx value) { return _mm256_ceil_ps(*value); }
-KFR_SINTRIN f32avx trunc(f32avx value) { return KFR_mm256_trunc_ps(*value); }
-KFR_SINTRIN f32avx round(f32avx value) { return KFR_mm256_roundnearest_ps(*value); }
-KFR_SINTRIN f64avx floor(f64avx value) { return _mm256_floor_pd(*value); }
-KFR_SINTRIN f64avx ceil(f64avx value) { return _mm256_ceil_pd(*value); }
-KFR_SINTRIN f64avx trunc(f64avx value) { return KFR_mm256_trunc_pd(*value); }
-KFR_SINTRIN f64avx round(f64avx value) { return KFR_mm256_roundnearest_pd(*value); }
-KFR_SINTRIN f32avx fract(f32avx x) { return x - floor(x); }
-KFR_SINTRIN f64avx fract(f64avx x) { return x - floor(x); }
+KFR_SINTRIN f32avx floor(const f32avx& value) { return _mm256_floor_ps(*value); }
+KFR_SINTRIN f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(*value); }
+KFR_SINTRIN f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(*value); }
+KFR_SINTRIN f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(*value); }
+KFR_SINTRIN f64avx floor(const f64avx& value) { return _mm256_floor_pd(*value); }
+KFR_SINTRIN f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(*value); }
+KFR_SINTRIN f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(*value); }
+KFR_SINTRIN f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(*value); }
+KFR_SINTRIN f32avx fract(const f32avx& x) { return x - floor(x); }
+KFR_SINTRIN f64avx fract(const f64avx& x) { return x - floor(x); }
#endif
KFR_HANDLE_ALL_SIZES_F_1(floor)
@@ -89,104 +89,104 @@ KFR_HANDLE_ALL_SIZES_F_1(fract)
// fallback
template <size_t N>
-KFR_SINTRIN vec<f32, N> floor(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> floor(const vec<f32, N>& x)
{
vec<f32, N> t = cast<f32>(cast<i32>(x));
return t - (bitcast<f32>(x < t) & 1.f);
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> floor(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> floor(const vec<f64, N>& x)
{
vec<f64, N> t = cast<f64>(cast<i64>(x));
return t - (bitcast<f64>(x < t) & 1.0);
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> ceil(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> ceil(const vec<f32, N>& x)
{
vec<f32, N> t = cast<f32>(cast<i32>(x));
return t + (bitcast<f32>(x > t) & 1.f);
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> ceil(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> ceil(const vec<f64, N>& x)
{
vec<f64, N> t = cast<f64>(cast<i64>(x));
return t + (bitcast<f64>(x > t) & 1.0);
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> round(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> round(const vec<f32, N>& x)
{
return cast<f32>(cast<i32>(x + mulsign(broadcast<N>(0.5f), x)));
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> round(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> round(const vec<f64, N>& x)
{
return cast<f64>(cast<i64>(x + mulsign(broadcast<N>(0.5), x)));
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> trunc(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> trunc(const vec<f32, N>& x)
{
return cast<f32>(cast<i32>(x));
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> trunc(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> trunc(const vec<f64, N>& x)
{
return cast<f64>(cast<i64>(x));
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> fract(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> fract(const vec<f32, N>& x)
{
return x - floor(x);
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> fract(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> fract(const vec<f64, N>& x)
{
return x - floor(x);
}
#endif
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> floor(vec<T, N> value)
+KFR_SINTRIN vec<T, N> floor(const vec<T, N>& value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> ceil(vec<T, N> value)
+KFR_SINTRIN vec<T, N> ceil(const vec<T, N>& value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> trunc(vec<T, N> value)
+KFR_SINTRIN vec<T, N> trunc(const vec<T, N>& value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> round(vec<T, N> value)
+KFR_SINTRIN vec<T, N> round(const vec<T, N>& value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fract(vec<T, N>)
+KFR_SINTRIN vec<T, N> fract(const vec<T, N>&)
{
return T(0);
}
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> ifloor(vec<T, N> value)
+KFR_SINTRIN vec<IT, N> ifloor(const vec<T, N>& value)
{
return cast<IT>(floor(value));
}
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> iceil(vec<T, N> value)
+KFR_SINTRIN vec<IT, N> iceil(const vec<T, N>& value)
{
return cast<IT>(ceil(value));
}
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> itrunc(vec<T, N> value)
+KFR_SINTRIN vec<IT, N> itrunc(const vec<T, N>& value)
{
return cast<IT>(trunc(value));
}
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> iround(vec<T, N> value)
+KFR_SINTRIN vec<IT, N> iround(const vec<T, N>& value)
{
return cast<IT>(round(value));
}
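The pre-SSE4.1 fallbacks emulate each rounding mode with a float-int-float round trip plus a branchless fix-up; floor, for instance, truncates toward zero and then subtracts 1 wherever truncation landed above the input. A scalar sketch of the same idea (illustrative only; like the vector code, it assumes the value fits in the integer type):

    #include <cstdint>

    inline float floor_sketch(float x)
    {
        const float t = static_cast<float>(static_cast<int32_t>(x)); // truncate toward zero
        return (x < t) ? t - 1.0f : t; // negative non-integers need the fix-up
    }

The vector version avoids the branch by masking the constant 1.f with the x < t comparison result.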
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -31,7 +31,7 @@ namespace kfr
namespace intrinsics
{
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_signed_add(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
{
using UT = utype<T>;
constexpr size_t shift = typebits<UT>::bits - 1;
@@ -43,7 +43,7 @@ KFR_SINTRIN vec<T, N> saturated_signed_add(vec<T, N> a, vec<T, N> b)
return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= 0, a, bitcast<T>(sum));
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_signed_sub(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b)
{
using UT = utype<T>;
constexpr size_t shift = typebits<UT>::bits - 1;
@@ -55,49 +55,49 @@ KFR_SINTRIN vec<T, N> saturated_signed_sub(vec<T, N> a, vec<T, N> b)
return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < 0, a, bitcast<T>(diff));
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_unsigned_add(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b)
{
const vec<T, N> t = allonesvector(a);
return select(a > t - b, t, a + b);
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_unsigned_sub(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b)
{
return select(a < b, zerovector(a), a - b);
}
#if defined CID_ARCH_SSE2
-KFR_SINTRIN u8sse satadd(u8sse x, u8sse y) { return _mm_adds_epu8(*x, *y); }
-KFR_SINTRIN i8sse satadd(i8sse x, i8sse y) { return _mm_adds_epi8(*x, *y); }
-KFR_SINTRIN u16sse satadd(u16sse x, u16sse y) { return _mm_adds_epu16(*x, *y); }
-KFR_SINTRIN i16sse satadd(i16sse x, i16sse y) { return _mm_adds_epi16(*x, *y); }
+KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); }
+KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); }
+KFR_SINTRIN u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(*x, *y); }
+KFR_SINTRIN i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(*x, *y); }
-KFR_SINTRIN u8sse satsub(u8sse x, u8sse y) { return _mm_subs_epu8(*x, *y); }
-KFR_SINTRIN i8sse satsub(i8sse x, i8sse y) { return _mm_subs_epi8(*x, *y); }
-KFR_SINTRIN u16sse satsub(u16sse x, u16sse y) { return _mm_subs_epu16(*x, *y); }
-KFR_SINTRIN i16sse satsub(i16sse x, i16sse y) { return _mm_subs_epi16(*x, *y); }
+KFR_SINTRIN u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(*x, *y); }
+KFR_SINTRIN i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(*x, *y); }
+KFR_SINTRIN u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(*x, *y); }
+KFR_SINTRIN i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(*x, *y); }
-KFR_SINTRIN i32sse satadd(i32sse a, i32sse b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN i64sse satadd(i64sse a, i64sse b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN u32sse satadd(u32sse a, u32sse b) { return saturated_unsigned_add(a, b); }
-KFR_SINTRIN u64sse satadd(u64sse a, u64sse b) { return saturated_unsigned_add(a, b); }
+KFR_SINTRIN i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); }
+KFR_SINTRIN u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); }
-KFR_SINTRIN i32sse satsub(i32sse a, i32sse b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN i64sse satsub(i64sse a, i64sse b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN u32sse satsub(u32sse a, u32sse b) { return saturated_unsigned_sub(a, b); }
-KFR_SINTRIN u64sse satsub(u64sse a, u64sse b) { return saturated_unsigned_sub(a, b); }
+KFR_SINTRIN i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); }
+KFR_SINTRIN u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); }
#if defined CID_ARCH_AVX2
-KFR_SINTRIN u8avx satadd(u8avx x, u8avx y) { return _mm256_adds_epu8(*x, *y); }
-KFR_SINTRIN i8avx satadd(i8avx x, i8avx y) { return _mm256_adds_epi8(*x, *y); }
-KFR_SINTRIN u16avx satadd(u16avx x, u16avx y) { return _mm256_adds_epu16(*x, *y); }
-KFR_SINTRIN i16avx satadd(i16avx x, i16avx y) { return _mm256_adds_epi16(*x, *y); }
-
-KFR_SINTRIN u8avx satsub(u8avx x, u8avx y) { return _mm256_subs_epu8(*x, *y); }
-KFR_SINTRIN i8avx satsub(i8avx x, i8avx y) { return _mm256_subs_epi8(*x, *y); }
-KFR_SINTRIN u16avx satsub(u16avx x, u16avx y) { return _mm256_subs_epu16(*x, *y); }
-KFR_SINTRIN i16avx satsub(i16avx x, i16avx y) { return _mm256_subs_epi16(*x, *y); }
+KFR_SINTRIN u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(*x, *y); }
+KFR_SINTRIN i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(*x, *y); }
+KFR_SINTRIN u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(*x, *y); }
+KFR_SINTRIN i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(*x, *y); }
+
+KFR_SINTRIN u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(*x, *y); }
+KFR_SINTRIN i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(*x, *y); }
+KFR_SINTRIN u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(*x, *y); }
+KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(*x, *y); }
#endif
KFR_HANDLE_ALL_SIZES_2(satadd)
@@ -106,22 +106,22 @@ KFR_HANDLE_ALL_SIZES_2(satsub)
#else
// fallback
template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
-KFR_SINTRIN vec<T, N> satadd(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
{
return saturated_signed_add(a, b);
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
-KFR_SINTRIN vec<T, N> satadd(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
{
return saturated_unsigned_add(a, b);
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
-KFR_SINTRIN vec<T, N> satsub(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
{
return saturated_signed_sub(a, b);
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
-KFR_SINTRIN vec<T, N> satsub(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
{
return saturated_unsigned_sub(a, b);
}
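saturated_signed_add computes the wrapped sum in the unsigned domain and detects overflow from sign bits alone: overflow occurred exactly when both operands share a sign and the sum's sign differs. A scalar sketch of that test for i32 (illustrative, not the KFR code path):

    #include <cstdint>
    #include <limits>

    inline int32_t satadd_sketch(int32_t a, int32_t b)
    {
        const uint32_t ua = static_cast<uint32_t>(a);
        const uint32_t ub = static_cast<uint32_t>(b);
        const uint32_t sum = ua + ub; // unsigned wraparound is well defined
        // Limit matching a's sign: INT32_MAX if a >= 0, INT32_MIN if a < 0.
        const uint32_t sat =
            (ua >> 31) + static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
        // Sign bit clear => operands share a sign and the sum's sign flipped => overflow.
        if (static_cast<int32_t>((ua ^ ub) | ~(ub ^ sum)) >= 0)
            return static_cast<int32_t>(sat);
        return static_cast<int32_t>(sum);
    }

For the 8- and 16-bit lanes the SSE2/AVX2 adds/subs instructions above do all of this in one operation; only the 32- and 64-bit lanes need the manual test.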
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -31,40 +31,100 @@ namespace intrinsics
#if defined CID_ARCH_SSE41
-KFR_SINTRIN u8sse select(mu8sse m, u8sse x, u8sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u16sse select(mu16sse m, u16sse x, u16sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u32sse select(mu32sse m, u32sse x, u32sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u64sse select(mu64sse m, u64sse x, u64sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i8sse select(mi8sse m, i8sse x, i8sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i16sse select(mi16sse m, i16sse x, i16sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i32sse select(mi32sse m, i32sse x, i32sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i64sse select(mi64sse m, i64sse x, i64sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN f32sse select(mf32sse m, f32sse x, f32sse y) { return _mm_blendv_ps(*y, *x, *m); }
-KFR_SINTRIN f64sse select(mf64sse m, f64sse x, f64sse y) { return _mm_blendv_pd(*y, *x, *m); }
+KFR_SINTRIN u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y)
+{
+ return _mm_blendv_ps(*y, *x, *m);
+}
+KFR_SINTRIN f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y)
+{
+ return _mm_blendv_pd(*y, *x, *m);
+}
#if defined CID_ARCH_AVX
-KFR_SINTRIN f64avx select(mf64avx m, f64avx x, f64avx y) { return _mm256_blendv_pd(*y, *x, *m); }
-KFR_SINTRIN f32avx select(mf32avx m, f32avx x, f32avx y) { return _mm256_blendv_ps(*y, *x, *m); }
+KFR_SINTRIN f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y)
+{
+ return _mm256_blendv_pd(*y, *x, *m);
+}
+KFR_SINTRIN f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y)
+{
+ return _mm256_blendv_ps(*y, *x, *m);
+}
#endif
#if defined CID_ARCH_AVX2
-KFR_SINTRIN u8avx select(mu8avx m, u8avx x, u8avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u16avx select(mu16avx m, u16avx x, u16avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u32avx select(mu32avx m, u32avx x, u32avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u64avx select(mu64avx m, u64avx x, u64avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i8avx select(mi8avx m, i8avx x, i8avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i16avx select(mi16avx m, i16avx x, i16avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i32avx select(mi32avx m, i32avx x, i32avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i64avx select(mi64avx m, i64avx x, i64avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
#endif
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
-KFR_SINTRIN vec<T, N> select(mask<T, N> a, vec<T, N> b, vec<T, N> c)
+KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
-KFR_SINTRIN vec<T, N> select(mask<T, N> a, vec<T, N> b, vec<T, N> c)
+KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c)));
}
@@ -73,7 +133,7 @@ KFR_SINTRIN vec<T, N> select(mask<T, N> a, vec<T, N> b, vec<T, N> c)
// fallback
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> select(mask<T, N> m, vec<T, N> x, vec<T, N> y)
+KFR_SINTRIN vec<T, N> select(mask<T, N> m, const vec<T, N>& x, const vec<T, N>& y)
{
return y ^ ((x ^ y) & m);
}
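The fallback select is a branchless bitwise blend: wherever a mask bit is set the result takes the bit from x, otherwise from y, which is what the SSE4.1/AVX blendv instructions above do per lane. In scalar form (illustrative):

    #include <cstdint>

    inline uint32_t select_sketch(uint32_t m, uint32_t x, uint32_t y)
    {
        return y ^ ((x ^ y) & m); // m == all-ones -> x, m == 0 -> y
    }

Comparison masks are all-ones or all-zeros per lane, so the blend always picks whole lanes rather than mixing bits from both inputs.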
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -52,19 +52,20 @@ template <typename T>
constexpr static T fold_constant_rem2 = choose_const<T>(-0x1.de973ep-32f, 0x1.8469898cc5170p-49);
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> trig_horner(vec<T, N>, mask<T, N> msk, T a0, T b0)
+KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0)
{
return select(msk, a0, b0);
}
template <typename T, size_t N, typename... Ts>
-KFR_SINTRIN vec<T, N> trig_horner(vec<T, N> x, mask<T, N> msk, T a0, T b0, T a1, T b1, Ts... values)
+KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, const T& a0, const T& b0,
+ const T& a1, const T& b1, const Ts&... values)
{
return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0));
}
template <typename T, size_t N, typename Tprecise = f64>
-KFR_SINTRIN vec<T, N> trig_fold(vec<T, N> x, vec<itype<T>, N>& quadrant)
+KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant)
{
const vec<T, N> xabs = abs(x);
constexpr vec<T, N> div = fold_constant_div<T>;
@@ -83,7 +84,7 @@ KFR_SINTRIN vec<T, N> trig_fold(vec<T, N> x, vec<itype<T>, N>& quadrant)
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> trig_sincos(vec<f32, N> folded, mask<f32, N> cosmask)
+KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask)
{
constexpr f32 sin_c2 = -0x2.aaaaacp-4f;
constexpr f32 sin_c4 = 0x2.222334p-8f;
@@ -106,7 +107,7 @@ KFR_SINTRIN vec<f32, N> trig_sincos(vec<f32, N> folded, mask<f32, N> cosmask)
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> trig_sincos(vec<f64, N> folded, mask<f64, N> cosmask)
+KFR_SINTRIN vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask)
{
constexpr f64 sin_c2 = -0x2.aaaaaaaaaaaaap-4;
constexpr f64 sin_c4 = 0x2.22222222220cep-8;
@@ -135,7 +136,7 @@ KFR_SINTRIN vec<f64, N> trig_sincos(vec<f64, N> folded, mask<f64, N> cosmask)
}
template <typename T, size_t N, typename = u8[N > 1]>
-KFR_SINTRIN vec<T, N> sincos_mask(vec<T, N> x_full, mask<T, N> cosmask)
+KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask)
{
vec<itype<T>, N> quadrant;
vec<T, N> folded = trig_fold(x_full, quadrant);
@@ -156,7 +157,7 @@ KFR_SINTRIN vec<T, N> sincos_mask(vec<T, N> x_full, mask<T, N> cosmask)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sin(vec<T, N> x)
+KFR_SINTRIN vec<T, N> sin(const vec<T, N>& x)
{
vec<itype<T>, N> quadrant;
vec<T, N> folded = trig_fold(x, quadrant);
@@ -171,7 +172,7 @@ KFR_SINTRIN vec<T, N> sin(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> cos(vec<T, N> x)
+KFR_SINTRIN vec<T, N> cos(const vec<T, N>& x)
{
vec<itype<T>, N> quadrant;
vec<T, N> folded = trig_fold(x, quadrant);
@@ -187,7 +188,7 @@ KFR_SINTRIN vec<T, N> cos(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fastsin(vec<T, N> x)
+KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x)
{
constexpr vec<T, N> msk = broadcast<N>(internal::highbitmask<T>);
@@ -212,7 +213,7 @@ KFR_SINTRIN vec<T, N> fastsin(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fastcos(vec<T, N> x)
+KFR_SINTRIN vec<T, N> fastcos(const vec<T, N>& x)
{
x += c_pi<T, 1, 2>;
x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x);
@@ -220,61 +221,61 @@ KFR_SINTRIN vec<T, N> fastcos(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sincos(vec<T, N> x)
+KFR_SINTRIN vec<T, N> sincos(const vec<T, N>& x)
{
return sincos_mask(x, internal::oddmask<T, N>());
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> cossin(vec<T, N> x)
+KFR_SINTRIN vec<T, N> cossin(const vec<T, N>& x)
{
return sincos_mask(x, internal::evenmask<T, N>());
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sinc(vec<T, N> x)
+KFR_SINTRIN vec<T, N> sinc(const vec<T, N>& x)
{
return select(abs(x) <= c_epsilon<T>, T(1), sin(x) / x);
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> sin(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> sin(const vec<T, N>& x)
{
return sin(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> cos(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> cos(const vec<T, N>& x)
{
return cos(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> fastsin(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> fastsin(const vec<T, N>& x)
{
return fastsin(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> fastcos(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> fastcos(const vec<T, N>& x)
{
return fastcos(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> sincos(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> sincos(const vec<T, N>& x)
{
return sincos(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> cossin(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> cossin(const vec<T, N>& x)
{
return cossin(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> sinc(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> sinc(const vec<T, N>& x)
{
return sinc(cast<Tout>(x));
}
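sin and cos share one kernel: trig_fold reduces the argument to a quarter-period interval and returns the quadrant, then trig_horner evaluates the sine and cosine polynomials together, with a mask selecting which coefficient set (and sign) each lane takes. The control flow, sketched with scalar code and the standard library (illustrative only; the real kernel folds with extended precision and evaluates its own polynomials):

    #include <cmath>
    #include <cstdint>

    inline float sin_via_quadrant_sketch(float x)
    {
        const float half_pi = 1.5707963267948966f;
        // Round to the nearest multiple of pi/2 (fine for moderate arguments).
        const int64_t q = static_cast<int64_t>(std::nearbyint(x / half_pi));
        const float folded = x - static_cast<float>(q) * half_pi; // ~[-pi/4, pi/4]
        switch (q & 3) // quadrant picks kernel and sign
        {
        case 0: return std::sin(folded);
        case 1: return std::cos(folded);
        case 2: return -std::sin(folded);
        default: return -std::cos(folded);
        }
    }

cos reuses the same reduction since cos(x) = sin(x + pi/2), and sincos/cossin evaluate sine and cosine in alternating lanes of one vector via oddmask/evenmask.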
diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp
@@ -32,14 +32,17 @@ namespace intrinsics
#if defined CID_ARCH_SSE2
-KFR_SINTRIN f32x1 sqrt(f32x1 x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); }
-KFR_SINTRIN f64x1 sqrt(f64x1 x) { return slice<0, 1>(tovec(_mm_sqrt_sd(_mm_setzero_pd(), *extend<2>(x)))); }
-KFR_SINTRIN f32sse sqrt(f32sse x) { return _mm_sqrt_ps(*x); }
-KFR_SINTRIN f64sse sqrt(f64sse x) { return _mm_sqrt_pd(*x); }
+KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); }
+KFR_SINTRIN f64x1 sqrt(const f64x1& x)
+{
+ return slice<0, 1>(tovec(_mm_sqrt_sd(_mm_setzero_pd(), *extend<2>(x))));
+}
+KFR_SINTRIN f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(*x); }
+KFR_SINTRIN f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(*x); }
#if defined CID_ARCH_AVX
-KFR_SINTRIN f32avx sqrt(f32avx x) { return _mm256_sqrt_ps(*x); }
-KFR_SINTRIN f64avx sqrt(f64avx x) { return _mm256_sqrt_pd(*x); }
+KFR_SINTRIN f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(*x); }
+KFR_SINTRIN f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(*x); }
#endif
KFR_HANDLE_ALL_SIZES_FLT_1(sqrt)
@@ -48,7 +51,7 @@ KFR_HANDLE_ALL_SIZES_FLT_1(sqrt)
// fallback
template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> sqrt(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> sqrt(const vec<T, N>& x)
{
return apply([](T x) { return std::sqrt(static_cast<Tout>(x)); }, x);
}
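For widths with no hardware instruction, the fallback applies std::sqrt lane by lane; note it also covers integer inputs by promoting each lane to the matching float type (Tout = flt_type<T>). In scalar terms (illustrative, plain C++):

    #include <cmath>
    #include <cstddef>

    // sqrt over an integer array, promoting each element to float first
    template <typename T, std::size_t N>
    void sqrt_sketch(const T (&in)[N], float (&out)[N])
    {
        for (std::size_t i = 0; i < N; ++i)
            out[i] = std::sqrt(static_cast<float>(in[i]));
    }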
diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp
@@ -35,7 +35,7 @@ namespace intrinsics
{
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<T, N> trig_fold_simple(vec<T, N> x_full, mask<T, N>& inverse)
+KFR_SINTRIN vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inverse)
{
constexpr T pi_14 = c_pi<T, 1, 4>;
@@ -56,7 +56,7 @@ KFR_SINTRIN vec<T, N> trig_fold_simple(vec<T, N> x_full, mask<T, N>& inverse)
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> tan(vec<f32, N> x_full)
+KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full)
{
mask<f32, N> inverse;
const vec<f32, N> x = trig_fold_simple(x_full, inverse);
@@ -84,7 +84,7 @@ KFR_SINTRIN vec<f32, N> tan(vec<f32, N> x_full)
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> tan(vec<f64, N> x_full)
+KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full)
{
mask<f64, N> inverse;
const vec<f64, N> x = trig_fold_simple(x_full, inverse);
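tan needs only a simpler fold: trig_fold_simple reduces the argument to a small interval around zero and flags the lanes where a reciprocal identity must be applied afterwards. A scalar sketch of the idea (illustrative; the real fold works in quarter-pi units with correction constants):

    #include <cmath>
    #include <cstdint>

    inline float tan_fold_sketch(float x)
    {
        const float half_pi = 1.5707963267948966f;
        const int64_t q = static_cast<int64_t>(std::nearbyint(x / half_pi));
        const float folded = x - static_cast<float>(q) * half_pi; // in [-pi/4, pi/4]
        const bool inverse = (q & 1) != 0; // odd quadrant: tan(x) = -1/tan(folded)
        const float t = std::tan(folded);
        return inverse ? -1.0f / t : t;
    }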
diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp
@@ -40,7 +40,7 @@ template <typename T, typename Class>
struct univector_base : input_expression, output_expression
{
template <typename U, size_t N>
- KFR_INLINE void operator()(coutput_t, size_t index, vec<U, N> value)
+ KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value)
{
T* data = derived_cast<Class>(this)->data();
write(ptr_cast<T>(data) + index, cast<T>(value));
diff --git a/include/kfr/dsp/goertzel.hpp b/include/kfr/dsp/goertzel.hpp
@@ -53,7 +53,7 @@ public:
result.imag(q2 * sin(omega));
}
template <typename U, size_t N>
- KFR_INLINE void operator()(coutput_t, size_t index, vec<U, N> x)
+ KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
{
vec<T, N> in = cast<T>(x);
KFR_LOOP_UNROLL
@@ -90,7 +90,7 @@ public:
}
}
template <typename U, size_t N>
- KFR_INLINE void operator()(coutput_t, size_t index, vec<U, N> x)
+ KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
{
const vec<T, N> in = cast<T>(x);
KFR_LOOP_UNROLL
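The expression above implements the Goertzel algorithm: each incoming block feeds a second-order resonator tuned to omega, and at the end the two delayed states q1, q2 combine into the complex bin value, which is where the result.imag(q2 * sin(omega)) line comes from. The scalar recurrence (illustrative; KFR streams samples through this via the expression API):

    #include <cmath>
    #include <complex>
    #include <cstddef>

    inline std::complex<float> goertzel_sketch(const float* x, std::size_t n, float omega)
    {
        const float coeff = 2.0f * std::cos(omega);
        float q1 = 0.0f, q2 = 0.0f;
        for (std::size_t i = 0; i < n; ++i)
        {
            const float q0 = x[i] + coeff * q1 - q2; // resonator update
            q2 = q1;
            q1 = q0;
        }
        return { q1 - q2 * std::cos(omega), q2 * std::sin(omega) }; // final bin value
    }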
diff --git a/include/kfr/expressions/basic.hpp b/include/kfr/expressions/basic.hpp
@@ -346,7 +346,7 @@ struct multioutput : output_expression
{
}
template <typename T, size_t N>
- void operator()(coutput_t, size_t index, vec<T, N> x)
+ void operator()(coutput_t, size_t index, const vec<T, N>& x)
{
cfor(csize<0>, csize<sizeof...(E)>, [&](auto n) { std::get<val_of(n)>(outputs)(coutput, index, x); });
}
diff --git a/include/kfr/expressions/reduce.hpp b/include/kfr/expressions/reduce.hpp
@@ -70,7 +70,7 @@ struct expression_reduce : output_expression
}
template <typename U, size_t N>
- KFR_INLINE void operator()(coutput_t, size_t, vec<U, N> x) const
+ KFR_INLINE void operator()(coutput_t, size_t, const vec<U, N>& x) const
{
counter += N;
process(x);
diff --git a/include/kfr/io/file.hpp b/include/kfr/io/file.hpp
@@ -48,7 +48,7 @@ struct expression_sequential_file_writer : expression_file_base, output_expression
{
using expression_file_base::expression_file_base;
template <typename U, size_t N>
- void operator()(coutput_t, size_t, vec<U, N> value)
+ void operator()(coutput_t, size_t, const vec<U, N>& value)
{
write(value);
}
@@ -81,7 +81,7 @@ struct expression_file_writer : expression_file_base, output_expression
{
using expression_file_base::expression_file_base;
template <typename U, size_t N>
- void operator()(coutput_t, size_t index, vec<U, N> value)
+ void operator()(coutput_t, size_t index, const vec<U, N>& value)
{
if (position != index)
fseeko(file, static_cast<off_t>(index * sizeof(T)), SEEK_SET);