commit 374e2c03d0d1b6032624c5e7ec073608a35256f1
parent 75ba81ba8421d474003470cd1254654d8cd8c14d
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Sat, 30 Jul 2016 21:33:56 +0300
Pass vec by const reference (intrinsics)
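
Every change follows one pattern: SIMD vector parameters previously taken
by value (vec<T, N> and aliases such as f32sse or i64avx) are now taken by
const reference, presumably to avoid copying a 16- or 32-byte vector at
call boundaries the compiler does not inline, while still letting
temporaries bind. The two-argument clamp from clamp.hpp shows the minimal
form of the change:

    // before: x and hi are copied at every non-inlined call
    template <typename T, size_t N>
    KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> hi)
    {
        return max(min(x, hi), zerovector<T, N>());
    }

    // after: the same arguments bind by const reference
    template <typename T, size_t N>
    KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi)
    {
        return max(min(x, hi), zerovector<T, N>());
    }

Functions that used to mutate a by-value parameter are reworked so the
parameter can stay const: vldexpk introduces a local copy (qq), and
vilogbp1 folds the mutation of d into its select expression.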
Diffstat:
27 files changed, 464 insertions(+), 396 deletions(-)
diff --git a/include/kfr/base/abs.hpp b/include/kfr/base/abs.hpp
@@ -33,31 +33,31 @@ namespace intrinsics
{
// floating point
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> abs(vec<T, N> x)
+KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
{
return x & internal::invhighbitmask<T>;
}
#if defined CID_ARCH_SSSE3
-KFR_SINTRIN i64sse abs(i64sse x) { return select(x >= 0, x, -x); }
-KFR_SINTRIN i32sse abs(i32sse x) { return _mm_abs_epi32(*x); }
-KFR_SINTRIN i16sse abs(i16sse x) { return _mm_abs_epi16(*x); }
-KFR_SINTRIN i8sse abs(i8sse x) { return _mm_abs_epi8(*x); }
-KFR_SINTRIN u64sse abs(u64sse x) { return x; }
-KFR_SINTRIN u32sse abs(u32sse x) { return x; }
-KFR_SINTRIN u16sse abs(u16sse x) { return x; }
-KFR_SINTRIN u8sse abs(u8sse x) { return x; }
+KFR_SINTRIN i64sse abs(const i64sse& x) { return select(x >= 0, x, -x); }
+KFR_SINTRIN i32sse abs(const i32sse& x) { return _mm_abs_epi32(*x); }
+KFR_SINTRIN i16sse abs(const i16sse& x) { return _mm_abs_epi16(*x); }
+KFR_SINTRIN i8sse abs(const i8sse& x) { return _mm_abs_epi8(*x); }
+KFR_SINTRIN u64sse abs(const u64sse& x) { return x; }
+KFR_SINTRIN u32sse abs(const u32sse& x) { return x; }
+KFR_SINTRIN u16sse abs(const u16sse& x) { return x; }
+KFR_SINTRIN u8sse abs(const u8sse& x) { return x; }
#if defined CID_ARCH_AVX2
-KFR_SINTRIN i64avx abs(i64avx x) { return select(x >= 0, x, -x); }
-KFR_SINTRIN i32avx abs(i32avx x) { return _mm256_abs_epi32(*x); }
-KFR_SINTRIN i16avx abs(i16avx x) { return _mm256_abs_epi16(*x); }
-KFR_SINTRIN i8avx abs(i8avx x) { return _mm256_abs_epi8(*x); }
-KFR_SINTRIN u64avx abs(u64avx x) { return x; }
-KFR_SINTRIN u32avx abs(u32avx x) { return x; }
-KFR_SINTRIN u16avx abs(u16avx x) { return x; }
-KFR_SINTRIN u8avx abs(u8avx x) { return x; }
+KFR_SINTRIN i64avx abs(const i64avx& x) { return select(x >= 0, x, -x); }
+KFR_SINTRIN i32avx abs(const i32avx& x) { return _mm256_abs_epi32(*x); }
+KFR_SINTRIN i16avx abs(const i16avx& x) { return _mm256_abs_epi16(*x); }
+KFR_SINTRIN i8avx abs(const i8avx& x) { return _mm256_abs_epi8(*x); }
+KFR_SINTRIN u64avx abs(const u64avx& x) { return x; }
+KFR_SINTRIN u32avx abs(const u32avx& x) { return x; }
+KFR_SINTRIN u16avx abs(const u16avx& x) { return x; }
+KFR_SINTRIN u8avx abs(const u8avx& x) { return x; }
#endif
KFR_HANDLE_ALL_SIZES_NOT_F_1(abs)
@@ -66,7 +66,7 @@ KFR_HANDLE_ALL_SIZES_NOT_F_1(abs)
// fallback
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> abs(vec<T, N> x)
+KFR_SINTRIN vec<T, N> abs(const vec<T, N>& x)
{
return select(x >= T(), x, -x);
}
diff --git a/include/kfr/base/asin_acos.hpp b/include/kfr/base/asin_acos.hpp
@@ -34,14 +34,14 @@ namespace intrinsics
{
template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> asin(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> asin(const vec<T, N>& x)
{
const vec<Tout, N> xx = cast<Tout>(x);
return atan2(xx, sqrt(Tout(1) - xx * xx));
}
template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> acos(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> acos(const vec<T, N>& x)
{
const vec<Tout, N> xx = cast<Tout>(x);
return atan2(sqrt(Tout(1) - xx * xx), xx);
diff --git a/include/kfr/base/atan.hpp b/include/kfr/base/atan.hpp
@@ -100,7 +100,7 @@ KFR_SINTRIN vec<f64, N> atan2k(vec<f64, N> y, vec<f64, N> x)
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> atan2(vec<f32, N> y, vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> atan2(const vec<f32, N>& y, const vec<f32, N>& x)
{
vec<f32, N> r = atan2k(abs(y), x);
constexpr f32 pi = 3.1415926535897932384626433832795f;
@@ -115,7 +115,7 @@ KFR_SINTRIN vec<f32, N> atan2(vec<f32, N> y, vec<f32, N> x)
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> atan2(vec<f64, N> y, vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> atan2(const vec<f64, N>& y, const vec<f64, N>& x)
{
vec<f64, N> r = atan2k(abs(y), x);
constexpr f64 pi = 3.1415926535897932384626433832795;
@@ -130,7 +130,7 @@ KFR_SINTRIN vec<f64, N> atan2(vec<f64, N> y, vec<f64, N> x)
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> atan(vec<f32, N> s)
+KFR_SINTRIN vec<f32, N> atan(const vec<f32, N>& s)
{
vec<f32, N> t, u;
vec<i32, N> q;
@@ -154,7 +154,7 @@ KFR_SINTRIN vec<f32, N> atan(vec<f32, N> s)
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> atan(vec<f64, N> s)
+KFR_SINTRIN vec<f64, N> atan(const vec<f64, N>& s)
{
vec<f64, N> t, u;
vec<i64, N> q;
diff --git a/include/kfr/base/clamp.hpp b/include/kfr/base/clamp.hpp
@@ -31,13 +31,13 @@ namespace intrinsics
{
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> lo, vec<T, N> hi)
+KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& lo, const vec<T, N>& hi)
{
return max(min(x, hi), lo);
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> clamp(vec<T, N> x, vec<T, N> hi)
+KFR_SINTRIN vec<T, N> clamp(const vec<T, N>& x, const vec<T, N>& hi)
{
return max(min(x, hi), zerovector<T, N>());
}
diff --git a/include/kfr/base/complex.hpp b/include/kfr/base/complex.hpp
@@ -209,7 +209,7 @@ struct is_complex_impl<complex<T>> : std::true_type
// real to complex
template <typename To, typename From, size_t N, KFR_ENABLE_IF(internal::is_complex_impl<To>::value)>
-constexpr KFR_INLINE vec<To, N> cast(vec<From, N> value) noexcept
+constexpr KFR_INLINE vec<To, N> cast(const vec<From, N>& value) noexcept
{
const vec<subtype<To>, N> casted = cast<subtype<To>>(value);
return subcast<To>(interleave(casted, zerovector(casted)));
@@ -217,14 +217,14 @@ constexpr KFR_INLINE vec<To, N> cast(vec<From, N> value) noexcept
// complex to complex
template <typename To, typename From, size_t N, KFR_ENABLE_IF(internal::is_complex_impl<To>::value)>
-constexpr KFR_INLINE vec<To, N> cast(vec<complex<From>, N> value) noexcept
+constexpr KFR_INLINE vec<To, N> cast(const vec<complex<From>, N>& value) noexcept
{
return subcast<To>(cast<subtype<To>>(subcast<From>(value)));
}
// complex to real
template <typename To, typename From, size_t N, KFR_ENABLE_IF(!internal::is_complex_impl<To>::value)>
-constexpr KFR_INLINE vec<To, N> cast(vec<complex<From>, N> value) noexcept
+constexpr KFR_INLINE vec<To, N> cast(const vec<complex<From>, N>& value) noexcept
{
static_assert(sizeof(To) == 0, "Can't cast complex to real");
return {};
@@ -375,7 +375,7 @@ KFR_SINTRIN vec<complex<T>, N> cartesian(const vec<complex<T>, N>& x)
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> cabsdup(vec<T, N> x)
+KFR_SINTRIN vec<T, N> cabsdup(const vec<T, N>& x)
{
-    x = sqr(x);
-    return sqrt(x + swap<2>(x));
+    const vec<T, N> xx = sqr(x);
+    return sqrt(xx + swap<2>(xx));
diff --git a/include/kfr/base/digitreverse.hpp b/include/kfr/base/digitreverse.hpp
@@ -90,19 +90,19 @@ struct shuffle_index_digitreverse
}
template <size_t radix, size_t groupsize = 1, typename T, size_t N>
-KFR_INLINE vec<T, N> digitreverse(vec<T, N> x)
+KFR_INLINE vec<T, N> digitreverse(const vec<T, N>& x)
{
return shufflevector<N, internal::shuffle_index_digitreverse<radix, ilog2(N / groupsize)>, groupsize>(x);
}
template <size_t groupsize = 1, typename T, size_t N>
-KFR_INLINE vec<T, N> bitreverse(vec<T, N> x)
+KFR_INLINE vec<T, N> bitreverse(const vec<T, N>& x)
{
return digitreverse<2, groupsize>(x);
}
template <size_t groupsize = 1, typename T, size_t N>
-KFR_INLINE vec<T, N> digitreverse4(vec<T, N> x)
+KFR_INLINE vec<T, N> digitreverse4(const vec<T, N>& x)
{
return digitreverse<4, groupsize>(x);
}
diff --git a/include/kfr/base/expression.hpp b/include/kfr/base/expression.hpp
@@ -143,7 +143,7 @@ struct expression_scalar : input_expression
using value_type = T;
expression_scalar() = delete;
constexpr expression_scalar(const T& val) noexcept : val(val) {}
- constexpr expression_scalar(vec<T, width> val) noexcept : val(val) {}
+ constexpr expression_scalar(const vec<T, width>& val) noexcept : val(val) {}
const vec<T, width> val;
template <typename U, size_t N>
@@ -221,7 +221,7 @@ KFR_INLINE internal::expression_scalar<T> scalar(const T& val)
}
template <typename T, size_t N>
-KFR_INLINE internal::expression_scalar<T, N> scalar(vec<T, N> val)
+KFR_INLINE internal::expression_scalar<T, N> scalar(const vec<T, N>& val)
{
return internal::expression_scalar<T, N>(val);
}
diff --git a/include/kfr/base/function.hpp b/include/kfr/base/function.hpp
@@ -126,37 +126,37 @@ constexpr inline size_t next_simd_width(size_t n)
}
template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)>
-KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x)
+KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x)
{
return extend<Nout>(x);
}
template <typename T, size_t N, size_t Nout = next_simd_width<cpu_t::native, T>(N)>
-KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value)
+KFR_SINTRIN vec<T, Nout> expand_simd(const vec<T, N>& x, identity<T> value)
{
return widen<Nout>(x, value);
}
#define KFR_HANDLE_ALL_SIZES_1(fn) \
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return slice<0, N>(fn(expand_simd(a))); \
} \
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return concat(fn(low(a)), fn(high(a))); \
}
#define KFR_HANDLE_ALL_SIZES_FLT_1(fn) \
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
- KFR_SINTRIN vec<flt_type<T>, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \
{ \
return slice<0, N>(fn(expand_simd(cast<flt_type<T>>(a)))); \
} \
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
- KFR_SINTRIN vec<flt_type<T>, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<flt_type<T>, N> fn(const vec<T, N>& a) \
{ \
return concat(fn(low(cast<flt_type<T>>(a))), fn(high(cast<flt_type<T>>(a)))); \
}
@@ -164,13 +164,13 @@ KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value)
#define KFR_HANDLE_ALL_SIZES_F_1(fn) \
template <typename T, size_t N, \
KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_f_class<T>::value)> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return slice<0, N>(fn(expand_simd(a))); \
} \
template <typename T, size_t N, \
KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_f_class<T>::value), typename = void> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return concat(fn(low(a)), fn(high(a))); \
}
@@ -178,13 +178,13 @@ KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value)
#define KFR_HANDLE_ALL_SIZES_I_1(fn) \
template <typename T, size_t N, \
KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_i_class<T>::value)> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return slice<0, N>(fn(expand_simd(a))); \
} \
template <typename T, size_t N, \
KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_i_class<T>::value), typename = void> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return concat(fn(low(a)), fn(high(a))); \
}
@@ -192,13 +192,13 @@ KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value)
#define KFR_HANDLE_ALL_SIZES_U_1(fn) \
template <typename T, size_t N, \
KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && is_u_class<T>::value)> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return slice<0, N>(fn(expand_simd(a))); \
} \
template <typename T, size_t N, \
KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && is_u_class<T>::value), typename = void> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return concat(fn(low(a)), fn(high(a))); \
}
@@ -206,49 +206,49 @@ KFR_SINTRIN vec<T, Nout> expand_simd(vec<T, N> x, identity<T> value)
#define KFR_HANDLE_ALL_SIZES_NOT_F_1(fn) \
template <typename T, size_t N, \
KFR_ENABLE_IF(N < vector_width<T, cpu_t::native> && !is_f_class<T>::value)> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return slice<0, N>(fn(expand_simd(a))); \
} \
template <typename T, size_t N, \
KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native> && !is_f_class<T>::value), typename = void> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a) \
{ \
return concat(fn(low(a)), fn(high(a))); \
}
#define KFR_HANDLE_ALL_SIZES_2(fn) \
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \
{ \
return slice<0, N>(fn(expand_simd(a), expand_simd(b))); \
} \
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b) \
{ \
return concat(fn(low(a), low(b)), fn(high(a), high(b))); \
}
#define KFR_HANDLE_ALL_SIZES_3(fn) \
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \
{ \
return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c))); \
} \
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c) \
{ \
return concat(fn(low(a), low(b), low(c)), fn(high(a), high(b), high(c))); \
}
#define KFR_HANDLE_ALL_SIZES_4(fn) \
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c, vec<T, N> d) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \
{ \
return slice<0, N>(fn(expand_simd(a), expand_simd(b), expand_simd(c), expand_simd(d))); \
} \
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void> \
- KFR_SINTRIN vec<T, N> fn(vec<T, N> a, vec<T, N> b, vec<T, N> c, vec<T, N> d) \
+ KFR_SINTRIN vec<T, N> fn(const vec<T, N>& a, const vec<T, N>& b, const vec<T, N>& c, const vec<T, N>& d) \
{ \
return concat(fn(low(a), low(b), low(c), low(d)), fn(high(a), high(b), high(c), high(d))); \
}
diff --git a/include/kfr/base/gamma.hpp b/include/kfr/base/gamma.hpp
@@ -42,7 +42,7 @@ constexpr T gamma_precalc[] = {
};
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> gamma(vec<T, N> z)
+KFR_SINTRIN vec<T, N> gamma(const vec<T, N>& z)
{
constexpr size_t Count = arraysize(gamma_precalc<T>);
vec<T, N> accm = gamma_precalc<T>[0];
@@ -54,7 +54,7 @@ KFR_SINTRIN vec<T, N> gamma(vec<T, N> z)
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> factorial_approx(vec<T, N> x)
+KFR_SINTRIN vec<T, N> factorial_approx(const vec<T, N>& x)
{
return gamma(x + T(1));
}
diff --git a/include/kfr/base/hyperbolic.hpp b/include/kfr/base/hyperbolic.hpp
@@ -36,33 +36,33 @@ namespace intrinsics
{
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> sinh(vec<T, N> x)
+KFR_SINTRIN vec<T, N> sinh(const vec<T, N>& x)
{
return (exp(x) - exp(-x)) * T(0.5);
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> cosh(vec<T, N> x)
+KFR_SINTRIN vec<T, N> cosh(const vec<T, N>& x)
{
return (exp(x) + exp(-x)) * T(0.5);
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> tanh(vec<T, N> x)
+KFR_SINTRIN vec<T, N> tanh(const vec<T, N>& x)
{
-    x = -2 * x;
-    return (1 - exp(x)) / (1 + exp(x));
+    const vec<T, N> xx = -2 * x;
+    return (1 - exp(xx)) / (1 + exp(xx));
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> coth(vec<T, N> x)
+KFR_SINTRIN vec<T, N> coth(const vec<T, N>& x)
{
-    x = -2 * x;
-    return (1 + exp(x)) / (1 - exp(x));
+    const vec<T, N> xx = -2 * x;
+    return (1 + exp(xx)) / (1 - exp(xx));
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
-KFR_SINTRIN vec<T, N> sinhcosh(vec<T, N> x)
+KFR_SINTRIN vec<T, N> sinhcosh(const vec<T, N>& x)
{
const vec<T, N> a = exp(x);
const vec<T, N> b = exp(-x);
@@ -70,7 +70,7 @@ KFR_SINTRIN vec<T, N> sinhcosh(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 1)>
-KFR_SINTRIN vec<T, N> coshsinh(vec<T, N> x)
+KFR_SINTRIN vec<T, N> coshsinh(const vec<T, N>& x)
{
const vec<T, N> a = exp(x);
const vec<T, N> b = exp(-x);
diff --git a/include/kfr/base/log_exp.hpp b/include/kfr/base/log_exp.hpp
@@ -39,55 +39,53 @@ namespace intrinsics
{
template <size_t N>
-KFR_SINTRIN vec<i32, N> vilogbp1(vec<f32, N> d)
+KFR_SINTRIN vec<i32, N> vilogbp1(const vec<f32, N>& d)
{
mask<i32, N> m = d < 5.421010862427522E-20f;
- d = select(m, 1.8446744073709552E19f * d, d);
- vec<i32, N> q = (ibitcast(d) >> 23) & 0xff;
+ vec<i32, N> q = (ibitcast(select(m, 1.8446744073709552E19f * d, d)) >> 23) & 0xff;
q = select(m, q - (64 + 0x7e), q - 0x7e);
return q;
}
template <size_t N>
-KFR_SINTRIN vec<i64, N> vilogbp1(vec<f64, N> d)
+KFR_SINTRIN vec<i64, N> vilogbp1(const vec<f64, N>& d)
{
mask<i64, N> m = d < 4.9090934652977266E-91;
- d = select(m, 2.037035976334486E90 * d, d);
- vec<i64, N> q = (ibitcast(d) >> 52) & 0x7ff;
+ vec<i64, N> q = (ibitcast(select(m, 2.037035976334486E90 * d, d)) >> 52) & 0x7ff;
q = select(m, q - (300 + 0x03fe), q - 0x03fe);
return q;
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> vldexpk(vec<f32, N> x, vec<i32, N> q)
+KFR_SINTRIN vec<f32, N> vldexpk(const vec<f32, N>& x, const vec<i32, N>& q)
{
vec<i32, N> m = q >> 31;
m = (((m + q) >> 6) - m) << 4;
- q = q - (m << 2);
+ const vec<i32, N> qq = q - (m << 2);
m = clamp(m + 0x7f, vec<i32, N>(0xff));
vec<f32, N> u = pow4(bitcast<f32>(cast<i32>(m) << 23));
- return x * u * bitcast<f32>((cast<i32>(q + 0x7f)) << 23);
+ return x * u * bitcast<f32>((cast<i32>(qq + 0x7f)) << 23);
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> vldexpk(vec<f64, N> x, vec<i64, N> q)
+KFR_SINTRIN vec<f64, N> vldexpk(const vec<f64, N>& x, const vec<i64, N>& q)
{
vec<i64, N> m = q >> 31;
m = (((m + q) >> 9) - m) << 7;
- q = q - (m << 2);
+ const vec<i64, N> qq = q - (m << 2);
m = clamp(m + 0x3ff, i64(0x7ff));
vec<f64, N> u = pow4(bitcast<f64>(cast<i64>(m) << 52));
- return x * u * bitcast<f64>((cast<i64>(q + 0x3ff)) << 52);
+ return x * u * bitcast<f64>((cast<i64>(qq + 0x3ff)) << 52);
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> logb(vec<T, N> x)
+KFR_SINTRIN vec<T, N> logb(const vec<T, N>& x)
{
return select(x == T(), -c_infinity<T>, cast<T>(vilogbp1(x) - 1));
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> log(vec<f32, N> d)
+KFR_SINTRIN vec<f32, N> log(const vec<f32, N>& d)
{
vec<i32, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485f );
vec<f32, N> m = vldexpk(d, -e);
@@ -110,7 +108,7 @@ KFR_SINTRIN vec<f32, N> log(vec<f32, N> d)
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> log(vec<f64, N> d)
+KFR_SINTRIN vec<f64, N> log(const vec<f64, N>& d)
{
vec<i64, N> e = vilogbp1(d * 0.7071); // 0678118654752440084436210485 );
vec<f64, N> m = vldexpk(d, -e);
@@ -136,18 +134,18 @@ KFR_SINTRIN vec<f64, N> log(vec<f64, N> d)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> log2(vec<T, N> x)
+KFR_SINTRIN vec<T, N> log2(const vec<T, N>& x)
{
return log(x) * c_recip_log_2<T>;
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> log10(vec<T, N> x)
+KFR_SINTRIN vec<T, N> log10(const vec<T, N>& x)
{
return log(x) * c_recip_log_10<T>;
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> exp(vec<f32, N> d)
+KFR_SINTRIN vec<f32, N> exp(const vec<f32, N>& d)
{
const f32 ln2_part1 = 0.6931457519f;
const f32 ln2_part2 = 1.4286067653e-6f;
@@ -181,7 +179,7 @@ KFR_SINTRIN vec<f32, N> exp(vec<f32, N> d)
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> exp(vec<f64, N> d)
+KFR_SINTRIN vec<f64, N> exp(const vec<f64, N>& d)
{
const f64 ln2_part1 = 0.69314717501401901245;
const f64 ln2_part2 = 5.545926273775592108e-009;
@@ -222,12 +220,12 @@ KFR_SINTRIN vec<f64, N> exp(vec<f64, N> d)
return u;
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> exp2(vec<T, N> x)
+KFR_SINTRIN vec<T, N> exp2(const vec<T, N>& x)
{
return exp(x * c_log_2<T>);
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> exp10(vec<T, N> x)
+KFR_SINTRIN vec<T, N> exp10(const vec<T, N>& x)
{
return exp(x * c_log_10<T>);
}
@@ -257,7 +255,7 @@ KFR_SINTRIN common_type<T1, T2, T3> log_fmadd(const T1& x, const T2& m, const T3
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> pow(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> pow(const vec<T, N>& a, const vec<T, N>& b)
{
const vec<T, N> t = exp(b * log(abs(a)));
const mask<T, N> isint = floor(b) == b;
@@ -267,49 +265,49 @@ KFR_SINTRIN vec<T, N> pow(vec<T, N> a, vec<T, N> b)
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> root(vec<T, N> x, vec<T, N> b)
+KFR_SINTRIN vec<T, N> root(const vec<T, N>& x, const vec<T, N>& b)
{
return exp(reciprocal(b) * log(x));
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> cbrt(vec<T, N> x)
+KFR_SINTRIN vec<T, N> cbrt(const vec<T, N>& x)
{
return pow<T, N>(x, T(0.333333333333333333333333333333333));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> exp(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> exp(const vec<T, N>& x)
{
return exp(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> exp2(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> exp2(const vec<T, N>& x)
{
return exp2(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> exp10(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> exp10(const vec<T, N>& x)
{
return exp10(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> log(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> log(const vec<T, N>& x)
{
return log(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> log2(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> log2(const vec<T, N>& x)
{
return log2(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> log10(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> log10(const vec<T, N>& x)
{
return log10(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> cbrt(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> cbrt(const vec<T, N>& x)
{
return cbrt(cast<Tout>(x));
}
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -50,104 +50,110 @@ struct bitmask
#if defined CID_ARCH_SSE41
-KFR_SINTRIN bool bittestany(u8sse x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(u16sse x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(u32sse x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(u64sse x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(i8sse x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(i16sse x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(i32sse x) { return !_mm_testz_si128(*x, *x); }
-KFR_SINTRIN bool bittestany(i64sse x) { return !_mm_testz_si128(*x, *x); }
-
-KFR_SINTRIN bool bittestall(u8sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(u16sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(u32sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(u64sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(i8sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(i16sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(i32sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(i64sse x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestany(const u8sse& x) { return !_mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestany(const u16sse& x) { return !_mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestany(const u32sse& x) { return !_mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestany(const u64sse& x) { return !_mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestany(const i8sse& x) { return !_mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestany(const i16sse& x) { return !_mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestany(const i32sse& x) { return !_mm_testz_si128(*x, *x); }
+KFR_SINTRIN bool bittestany(const i64sse& x) { return !_mm_testz_si128(*x, *x); }
+
+KFR_SINTRIN bool bittestall(const u8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const u16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const u32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const u64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const i8sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const i16sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const i32sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const i64sse& x) { return _mm_testc_si128(*x, *allonesvector(x)); }
#endif
#if defined CID_ARCH_AVX
-KFR_SINTRIN bool bittestany(f32sse x) { return !_mm_testz_ps(*x, *x); }
-KFR_SINTRIN bool bittestany(f64sse x) { return !_mm_testz_pd(*x, *x); }
-KFR_SINTRIN bool bittestall(f32sse x) { return _mm_testc_ps(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(f64sse x) { return _mm_testc_pd(*x, *allonesvector(x)); }
-
-KFR_SINTRIN bool bittestany(f32avx x) { return !_mm256_testz_ps(*x, *x); }
-KFR_SINTRIN bool bittestany(f64avx x) { return !_mm256_testz_pd(*x, *x); }
-
-KFR_SINTRIN bool bittestnall(f32avx x) { return _mm256_testc_ps(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestnall(f64avx x) { return _mm256_testc_pd(*x, *allonesvector(x)); }
-
-KFR_SINTRIN bool bittestany(u8avx x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(u16avx x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(u32avx x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(u64avx x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(i8avx x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(i16avx x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(i32avx x) { return !_mm256_testz_si256(*x, *x); }
-KFR_SINTRIN bool bittestany(i64avx x) { return !_mm256_testz_si256(*x, *x); }
-
-KFR_SINTRIN bool bittestall(u8avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(u16avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(u32avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(u64avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(i8avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(i16avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(i32avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
-KFR_SINTRIN bool bittestall(i64avx x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_ps(*x, *x); }
+KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_pd(*x, *x); }
+KFR_SINTRIN bool bittestall(const f32sse& x) { return _mm_testc_ps(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const f64sse& x) { return _mm_testc_pd(*x, *allonesvector(x)); }
+
+KFR_SINTRIN bool bittestany(const f32avx& x) { return !_mm256_testz_ps(*x, *x); }
+KFR_SINTRIN bool bittestany(const f64avx& x) { return !_mm256_testz_pd(*x, *x); }
+
+KFR_SINTRIN bool bittestnall(const f32avx& x) { return _mm256_testc_ps(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestnall(const f64avx& x) { return _mm256_testc_pd(*x, *allonesvector(x)); }
+
+KFR_SINTRIN bool bittestany(const u8avx& x) { return !_mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestany(const u16avx& x) { return !_mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestany(const u32avx& x) { return !_mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestany(const u64avx& x) { return !_mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestany(const i8avx& x) { return !_mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestany(const i16avx& x) { return !_mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestany(const i32avx& x) { return !_mm256_testz_si256(*x, *x); }
+KFR_SINTRIN bool bittestany(const i64avx& x) { return !_mm256_testz_si256(*x, *x); }
+
+KFR_SINTRIN bool bittestall(const u8avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const u16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const u32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const u64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const i8avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const i16avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const i32avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
+KFR_SINTRIN bool bittestall(const i64avx& x) { return _mm256_testc_si256(*x, *allonesvector(x)); }
#elif defined CID_ARCH_SSE41
-KFR_SINTRIN bool bittestany(f32sse x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); }
-KFR_SINTRIN bool bittestany(f64sse x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); }
-KFR_SINTRIN bool bittestall(f32sse x) { return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); }
-KFR_SINTRIN bool bittestall(f64sse x) { return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x))); }
+KFR_SINTRIN bool bittestany(const f32sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); }
+KFR_SINTRIN bool bittestany(const f64sse& x) { return !_mm_testz_si128(*bitcast<u8>(x), *bitcast<u8>(x)); }
+KFR_SINTRIN bool bittestall(const f32sse& x)
+{
+ return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x)));
+}
+KFR_SINTRIN bool bittestall(const f64sse& x)
+{
+ return _mm_testc_si128(*bitcast<u8>(x), *allonesvector(bitcast<u8>(x)));
+}
#endif
#if !defined CID_ARCH_SSE41
-KFR_SINTRIN bool bittestany(f32sse x) { return _mm_movemask_ps(*x); }
-KFR_SINTRIN bool bittestany(f64sse x) { return _mm_movemask_pd(*x); }
-KFR_SINTRIN bool bittestany(u8sse x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(u16sse x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(u32sse x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(u64sse x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(i8sse x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(i16sse x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(i32sse x) { return _mm_movemask_epi8(*x); }
-KFR_SINTRIN bool bittestany(i64sse x) { return _mm_movemask_epi8(*x); }
-
-KFR_SINTRIN bool bittestall(f32sse x) { return !_mm_movemask_ps(*~x); }
-KFR_SINTRIN bool bittestall(f64sse x) { return !_mm_movemask_pd(*~x); }
-KFR_SINTRIN bool bittestall(u8sse x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(u16sse x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(u32sse x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(u64sse x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(i8sse x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(i16sse x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(i32sse x) { return !_mm_movemask_epi8(*~x); }
-KFR_SINTRIN bool bittestall(i64sse x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestany(const f32sse& x) { return _mm_movemask_ps(*x); }
+KFR_SINTRIN bool bittestany(const f64sse& x) { return _mm_movemask_pd(*x); }
+KFR_SINTRIN bool bittestany(const u8sse& x) { return _mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestany(const u16sse& x) { return _mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestany(const u32sse& x) { return _mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestany(const u64sse& x) { return _mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestany(const i8sse& x) { return _mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestany(const i16sse& x) { return _mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestany(const i32sse& x) { return _mm_movemask_epi8(*x); }
+KFR_SINTRIN bool bittestany(const i64sse& x) { return _mm_movemask_epi8(*x); }
+
+KFR_SINTRIN bool bittestall(const f32sse& x) { return !_mm_movemask_ps(*~x); }
+KFR_SINTRIN bool bittestall(const f64sse& x) { return !_mm_movemask_pd(*~x); }
+KFR_SINTRIN bool bittestall(const u8sse& x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(const u16sse& x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(const u32sse& x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(const u64sse& x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(const i8sse& x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(const i16sse& x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(const i32sse& x) { return !_mm_movemask_epi8(*~x); }
+KFR_SINTRIN bool bittestall(const i64sse& x) { return !_mm_movemask_epi8(*~x); }
#endif
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
-KFR_SINTRIN bool bittestall(vec<T, N> a)
+KFR_SINTRIN bool bittestall(const vec<T, N>& a)
{
return bittestall(expand_simd(a, internal::maskbits<T>(true)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
-KFR_SINTRIN bool bittestall(vec<T, N> a)
+KFR_SINTRIN bool bittestall(const vec<T, N>& a)
{
return bittestall(low(a)) && bittestall(high(a));
}
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
-KFR_SINTRIN bool bittestany(vec<T, N> a)
+KFR_SINTRIN bool bittestany(const vec<T, N>& a)
{
return bittestany(expand_simd(a, internal::maskbits<T>(false)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
-KFR_SINTRIN bool bittestany(vec<T, N> a)
+KFR_SINTRIN bool bittestany(const vec<T, N>& a)
{
return bittestany(low(a)) || bittestany(high(a));
}
@@ -155,7 +161,7 @@ KFR_SINTRIN bool bittestany(vec<T, N> a)
#else
template <typename T, size_t N>
-KFR_SINTRIN bitmask<N> getmask(vec<T, N> x)
+KFR_SINTRIN bitmask<N> getmask(const vec<T, N>& x)
{
typename bitmask<N>::type val = 0;
for (size_t i = 0; i < N; i++)
@@ -166,23 +172,23 @@ KFR_SINTRIN bitmask<N> getmask(vec<T, N> x)
}
template <typename T, size_t N>
-KFR_SINTRIN bool bittestany(vec<T, N> x)
+KFR_SINTRIN bool bittestany(const vec<T, N>& x)
{
return getmask(x).value;
}
template <typename T, size_t N>
-KFR_SINTRIN bool bittestany(vec<T, N> x, vec<T, N> y)
+KFR_SINTRIN bool bittestany(const vec<T, N>& x, const vec<T, N>& y)
{
return bittestany(x & y);
}
template <typename T, size_t N>
-KFR_SINTRIN bool bittestall(vec<T, N> x)
+KFR_SINTRIN bool bittestall(const vec<T, N>& x)
{
return !getmask(~x).value;
}
template <typename T, size_t N>
-KFR_SINTRIN bool bittestall(vec<T, N> x, vec<T, N> y)
+KFR_SINTRIN bool bittestall(const vec<T, N>& x, const vec<T, N>& y)
{
return !bittestany(~x & y);
}
diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp
@@ -35,68 +35,68 @@ namespace intrinsics
#if defined CID_ARCH_SSE2
-KFR_SINTRIN f32sse min(f32sse x, f32sse y) { return _mm_min_ps(*x, *y); }
-KFR_SINTRIN f64sse min(f64sse x, f64sse y) { return _mm_min_pd(*x, *y); }
-KFR_SINTRIN u8sse min(u8sse x, u8sse y) { return _mm_min_epu8(*x, *y); }
-KFR_SINTRIN i16sse min(i16sse x, i16sse y) { return _mm_min_epi16(*x, *y); }
-KFR_SINTRIN i64sse min(i64sse x, i64sse y) { return select(x < y, x, y); }
-KFR_SINTRIN u64sse min(u64sse x, u64sse y) { return select(x < y, x, y); }
-
-KFR_SINTRIN f32sse max(f32sse x, f32sse y) { return _mm_max_ps(*x, *y); }
-KFR_SINTRIN f64sse max(f64sse x, f64sse y) { return _mm_max_pd(*x, *y); }
-KFR_SINTRIN u8sse max(u8sse x, u8sse y) { return _mm_max_epu8(*x, *y); }
-KFR_SINTRIN i16sse max(i16sse x, i16sse y) { return _mm_max_epi16(*x, *y); }
-KFR_SINTRIN i64sse max(i64sse x, i64sse y) { return select(x > y, x, y); }
-KFR_SINTRIN u64sse max(u64sse x, u64sse y) { return select(x > y, x, y); }
+KFR_SINTRIN f32sse min(const f32sse& x, const f32sse& y) { return _mm_min_ps(*x, *y); }
+KFR_SINTRIN f64sse min(const f64sse& x, const f64sse& y) { return _mm_min_pd(*x, *y); }
+KFR_SINTRIN u8sse min(const u8sse& x, const u8sse& y) { return _mm_min_epu8(*x, *y); }
+KFR_SINTRIN i16sse min(const i16sse& x, const i16sse& y) { return _mm_min_epi16(*x, *y); }
+KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); }
+KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); }
+
+KFR_SINTRIN f32sse max(const f32sse& x, const f32sse& y) { return _mm_max_ps(*x, *y); }
+KFR_SINTRIN f64sse max(const f64sse& x, const f64sse& y) { return _mm_max_pd(*x, *y); }
+KFR_SINTRIN u8sse max(const u8sse& x, const u8sse& y) { return _mm_max_epu8(*x, *y); }
+KFR_SINTRIN i16sse max(const i16sse& x, const i16sse& y) { return _mm_max_epi16(*x, *y); }
+KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return select(x > y, x, y); }
+KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return select(x > y, x, y); }
#if defined CID_ARCH_AVX2
-KFR_SINTRIN u8avx min(u8avx x, u8avx y) { return _mm256_min_epu8(*x, *y); }
-KFR_SINTRIN i16avx min(i16avx x, i16avx y) { return _mm256_min_epi16(*x, *y); }
-KFR_SINTRIN i8avx min(i8avx x, i8avx y) { return _mm256_min_epi8(*x, *y); }
-KFR_SINTRIN u16avx min(u16avx x, u16avx y) { return _mm256_min_epu16(*x, *y); }
-KFR_SINTRIN i32avx min(i32avx x, i32avx y) { return _mm256_min_epi32(*x, *y); }
-KFR_SINTRIN u32avx min(u32avx x, u32avx y) { return _mm256_min_epu32(*x, *y); }
-
-KFR_SINTRIN u8avx max(u8avx x, u8avx y) { return _mm256_max_epu8(*x, *y); }
-KFR_SINTRIN i16avx max(i16avx x, i16avx y) { return _mm256_max_epi16(*x, *y); }
-KFR_SINTRIN i8avx max(i8avx x, i8avx y) { return _mm256_max_epi8(*x, *y); }
-KFR_SINTRIN u16avx max(u16avx x, u16avx y) { return _mm256_max_epu16(*x, *y); }
-KFR_SINTRIN i32avx max(i32avx x, i32avx y) { return _mm256_max_epi32(*x, *y); }
-KFR_SINTRIN u32avx max(u32avx x, u32avx y) { return _mm256_max_epu32(*x, *y); }
-
-KFR_SINTRIN i64avx min(i64avx x, i64avx y) { return select(x < y, x, y); }
-KFR_SINTRIN u64avx min(u64avx x, u64avx y) { return select(x < y, x, y); }
-KFR_SINTRIN i64avx max(i64avx x, i64avx y) { return select(x > y, x, y); }
-KFR_SINTRIN u64avx max(u64avx x, u64avx y) { return select(x > y, x, y); }
+KFR_SINTRIN u8avx min(const u8avx& x, const u8avx& y) { return _mm256_min_epu8(*x, *y); }
+KFR_SINTRIN i16avx min(const i16avx& x, const i16avx& y) { return _mm256_min_epi16(*x, *y); }
+KFR_SINTRIN i8avx min(const i8avx& x, const i8avx& y) { return _mm256_min_epi8(*x, *y); }
+KFR_SINTRIN u16avx min(const u16avx& x, const u16avx& y) { return _mm256_min_epu16(*x, *y); }
+KFR_SINTRIN i32avx min(const i32avx& x, const i32avx& y) { return _mm256_min_epi32(*x, *y); }
+KFR_SINTRIN u32avx min(const u32avx& x, const u32avx& y) { return _mm256_min_epu32(*x, *y); }
+
+KFR_SINTRIN u8avx max(const u8avx& x, const u8avx& y) { return _mm256_max_epu8(*x, *y); }
+KFR_SINTRIN i16avx max(const i16avx& x, const i16avx& y) { return _mm256_max_epi16(*x, *y); }
+KFR_SINTRIN i8avx max(const i8avx& x, const i8avx& y) { return _mm256_max_epi8(*x, *y); }
+KFR_SINTRIN u16avx max(const u16avx& x, const u16avx& y) { return _mm256_max_epu16(*x, *y); }
+KFR_SINTRIN i32avx max(const i32avx& x, const i32avx& y) { return _mm256_max_epi32(*x, *y); }
+KFR_SINTRIN u32avx max(const u32avx& x, const u32avx& y) { return _mm256_max_epu32(*x, *y); }
+
+KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return select(x < y, x, y); }
+KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return select(x < y, x, y); }
+KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return select(x > y, x, y); }
+KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return select(x > y, x, y); }
#endif
#if defined CID_ARCH_AVX
-KFR_SINTRIN f32avx min(f32avx x, f32avx y) { return _mm256_min_ps(*x, *y); }
-KFR_SINTRIN f64avx min(f64avx x, f64avx y) { return _mm256_min_pd(*x, *y); }
-KFR_SINTRIN f32avx max(f32avx x, f32avx y) { return _mm256_max_ps(*x, *y); }
-KFR_SINTRIN f64avx max(f64avx x, f64avx y) { return _mm256_max_pd(*x, *y); }
+KFR_SINTRIN f32avx min(const f32avx& x, const f32avx& y) { return _mm256_min_ps(*x, *y); }
+KFR_SINTRIN f64avx min(const f64avx& x, const f64avx& y) { return _mm256_min_pd(*x, *y); }
+KFR_SINTRIN f32avx max(const f32avx& x, const f32avx& y) { return _mm256_max_ps(*x, *y); }
+KFR_SINTRIN f64avx max(const f64avx& x, const f64avx& y) { return _mm256_max_pd(*x, *y); }
#endif
#if defined CID_ARCH_SSE41
-KFR_SINTRIN i8sse min(i8sse x, i8sse y) { return _mm_min_epi8(*x, *y); }
-KFR_SINTRIN u16sse min(u16sse x, u16sse y) { return _mm_min_epu16(*x, *y); }
-KFR_SINTRIN i32sse min(i32sse x, i32sse y) { return _mm_min_epi32(*x, *y); }
-KFR_SINTRIN u32sse min(u32sse x, u32sse y) { return _mm_min_epu32(*x, *y); }
-
-KFR_SINTRIN i8sse max(i8sse x, i8sse y) { return _mm_max_epi8(*x, *y); }
-KFR_SINTRIN u16sse max(u16sse x, u16sse y) { return _mm_max_epu16(*x, *y); }
-KFR_SINTRIN i32sse max(i32sse x, i32sse y) { return _mm_max_epi32(*x, *y); }
-KFR_SINTRIN u32sse max(u32sse x, u32sse y) { return _mm_max_epu32(*x, *y); }
+KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return _mm_min_epi8(*x, *y); }
+KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return _mm_min_epu16(*x, *y); }
+KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return _mm_min_epi32(*x, *y); }
+KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return _mm_min_epu32(*x, *y); }
+
+KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return _mm_max_epi8(*x, *y); }
+KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return _mm_max_epu16(*x, *y); }
+KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return _mm_max_epi32(*x, *y); }
+KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return _mm_max_epu32(*x, *y); }
#else
-KFR_SINTRIN i8sse min(i8sse x, i8sse y) { return select(x < y, x, y); }
-KFR_SINTRIN u16sse min(u16sse x, u16sse y) { return select(x < y, x, y); }
-KFR_SINTRIN i32sse min(i32sse x, i32sse y) { return select(x < y, x, y); }
-KFR_SINTRIN u32sse min(u32sse x, u32sse y) { return select(x < y, x, y); }
+KFR_SINTRIN i8sse min(const i8sse& x, const i8sse& y) { return select(x < y, x, y); }
+KFR_SINTRIN u16sse min(const u16sse& x, const u16sse& y) { return select(x < y, x, y); }
+KFR_SINTRIN i32sse min(const i32sse& x, const i32sse& y) { return select(x < y, x, y); }
+KFR_SINTRIN u32sse min(const u32sse& x, const u32sse& y) { return select(x < y, x, y); }

-KFR_SINTRIN i8sse max(i8sse x, i8sse y) { return select(x > y, x, y); }
-KFR_SINTRIN u16sse max(u16sse x, u16sse y) { return select(x > y, x, y); }
-KFR_SINTRIN i32sse max(i32sse x, i32sse y) { return select(x > y, x, y); }
-KFR_SINTRIN u32sse max(u32sse x, u32sse y) { return select(x > y, x, y); }
+KFR_SINTRIN i8sse max(const i8sse& x, const i8sse& y) { return select(x > y, x, y); }
+KFR_SINTRIN u16sse max(const u16sse& x, const u16sse& y) { return select(x > y, x, y); }
+KFR_SINTRIN i32sse max(const i32sse& x, const i32sse& y) { return select(x > y, x, y); }
+KFR_SINTRIN u32sse max(const u32sse& x, const u32sse& y) { return select(x > y, x, y); }
#endif
@@ -107,12 +107,12 @@ KFR_HANDLE_ALL_SIZES_2(max)
// fallback
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> min(vec<T, N> x, vec<T, N> y)
+KFR_SINTRIN vec<T, N> min(const vec<T, N>& x, const vec<T, N>& y)
{
return select(x < y, x, y);
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> max(vec<T, N> x, vec<T, N> y)
+KFR_SINTRIN vec<T, N> max(const vec<T, N>& x, const vec<T, N>& y)
{
return select(x > y, x, y);
}
@@ -143,12 +143,12 @@ KFR_SINTRIN T absmax(initialvalue<T>)
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> absmin(vec<T, N> x, vec<T, N> y)
+KFR_SINTRIN vec<T, N> absmin(const vec<T, N>& x, const vec<T, N>& y)
{
return min(abs(x), abs(y));
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> absmax(vec<T, N> x, vec<T, N> y)
+KFR_SINTRIN vec<T, N> absmax(const vec<T, N>& x, const vec<T, N>& y)
{
return max(abs(x), abs(y));
}
diff --git a/include/kfr/base/modzerobessel.hpp b/include/kfr/base/modzerobessel.hpp
@@ -77,7 +77,7 @@ constexpr T bessel_coef[] = { T(0.25),
T(1.5021381070956226783e-096) };
template <typename T, size_t N>
-KFR_INLINE vec<T, N> modzerobessel(vec<T, N> x)
+KFR_INLINE vec<T, N> modzerobessel(const vec<T, N>& x)
{
const vec<T, N> x_2 = x * 0.5;
const vec<T, N> x_2_sqr = x_2 * x_2;
diff --git a/include/kfr/base/operators.hpp b/include/kfr/base/operators.hpp
@@ -32,18 +32,18 @@ namespace internal
{
template <typename T, typename ReduceFn>
-KFR_INLINE T horizontal_impl(vec<T, 1> value, ReduceFn&&)
+KFR_INLINE T horizontal_impl(const vec<T, 1>& value, ReduceFn&&)
{
return T(value[0]);
}
template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && is_poweroftwo(N))>
-KFR_INLINE T horizontal_impl(vec<T, N> value, ReduceFn&& reduce)
+KFR_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce)
{
return horizontal_impl(reduce(low(value), high(value)), std::forward<ReduceFn>(reduce));
}
template <typename T, size_t N, typename ReduceFn, KFR_ENABLE_IF(N > 1 && !is_poweroftwo(N))>
-KFR_INLINE T horizontal_impl(vec<T, N> value, ReduceFn&& reduce)
+KFR_INLINE T horizontal_impl(const vec<T, N>& value, ReduceFn&& reduce)
{
const T initial = reduce(initialvalue<T>());
return horizontal_impl(widen<next_poweroftwo(N)>(value, initial), std::forward<ReduceFn>(reduce));
@@ -51,7 +51,7 @@ KFR_INLINE T horizontal_impl(vec<T, N> value, ReduceFn&& reduce)
}
template <typename T, size_t N, typename ReduceFn>
-KFR_INLINE T horizontal(vec<T, N> value, ReduceFn&& reduce)
+KFR_INLINE T horizontal(const vec<T, N>& value, ReduceFn&& reduce)
{
return internal::horizontal_impl(value, std::forward<ReduceFn>(reduce));
}
@@ -486,7 +486,7 @@ constexpr KFR_INLINE T reciprocal(T x)
KFR_FN(reciprocal)
template <typename T, size_t N>
-KFR_INLINE vec<T, N> mulsign(vec<T, N> x, vec<T, N> y)
+KFR_INLINE vec<T, N> mulsign(const vec<T, N>& x, const vec<T, N>& y)
{
return x ^ (y & internal::highbitmask<T>);
}
@@ -494,13 +494,13 @@ KFR_FN_S(mulsign)
KFR_FN(mulsign)
template <typename T, size_t N>
-constexpr KFR_INLINE vec<T, N> copysign(vec<T, N> x, vec<T, N> y)
+constexpr KFR_INLINE vec<T, N> copysign(const vec<T, N>& x, const vec<T, N>& y)
{
return (x & internal::invhighbitmask<T>) | (y & internal::highbitmask<T>);
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_INLINE vec<T, N> fmod(vec<T, N> x, vec<T, N> y)
+KFR_INLINE vec<T, N> fmod(const vec<T, N>& x, const vec<T, N>& y)
{
return x - cast<itype<T>>(x / y) * y;
}
@@ -509,55 +509,55 @@ KFR_FN_S(fmod)
KFR_FN(fmod)
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-constexpr KFR_INLINE vec<T, N> rem(vec<T, N> x, vec<T, N> y)
+constexpr KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
{
return x % y;
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_INLINE vec<T, N> rem(vec<T, N> x, vec<T, N> y)
+KFR_INLINE vec<T, N> rem(const vec<T, N>& x, const vec<T, N>& y)
{
return fmod(x, y);
}
template <typename T, size_t N>
-KFR_INLINE mask<T, N> isnan(vec<T, N> x)
+KFR_INLINE mask<T, N> isnan(const vec<T, N>& x)
{
return x != x;
}
template <typename T, size_t N>
-KFR_INLINE mask<T, N> isinf(vec<T, N> x)
+KFR_INLINE mask<T, N> isinf(const vec<T, N>& x)
{
return x == c_infinity<T> || x == -c_infinity<T>;
}
template <typename T, size_t N>
-KFR_INLINE mask<T, N> isfinite(vec<T, N> x)
+KFR_INLINE mask<T, N> isfinite(const vec<T, N>& x)
{
return !isnan(x) && !isinf(x);
}
template <typename T, size_t N>
-KFR_INLINE mask<T, N> isnegative(vec<T, N> x)
+KFR_INLINE mask<T, N> isnegative(const vec<T, N>& x)
{
return (x & internal::highbitmask<T>) != 0;
}
template <typename T, size_t N>
-KFR_INLINE mask<T, N> ispositive(vec<T, N> x)
+KFR_INLINE mask<T, N> ispositive(const vec<T, N>& x)
{
return !isnegative(x);
}
template <typename T, size_t N>
-KFR_INLINE mask<T, N> iszero(vec<T, N> x)
+KFR_INLINE mask<T, N> iszero(const vec<T, N>& x)
{
return x == T();
}
/// Swap byte order
template <typename T, size_t N, KFR_ENABLE_IF(sizeof(vec<T, N>) > 8)>
-KFR_INLINE vec<T, N> swapbyteorder(vec<T, N> x)
+KFR_INLINE vec<T, N> swapbyteorder(const vec<T, N>& x)
{
return bitcast<T>(swap<sizeof(T)>(bitcast<u8>(x)));
}
@@ -580,7 +580,7 @@ KFR_FN(swapbyteorder)
/// Sum all elements of the vector
template <typename T, size_t N>
-KFR_INLINE T hadd(vec<T, N> value)
+KFR_INLINE T hadd(const vec<T, N>& value)
{
return horizontal(value, fn_add());
}
@@ -588,26 +588,26 @@ KFR_FN(hadd)
/// Multiply all elements of the vector
template <typename T, size_t N>
-KFR_INLINE T hmul(vec<T, N> value)
+KFR_INLINE T hmul(const vec<T, N>& value)
{
return horizontal(value, fn_mul());
}
KFR_FN(hmul)
template <typename T, size_t N>
-KFR_INLINE T hbitwiseand(vec<T, N> value)
+KFR_INLINE T hbitwiseand(const vec<T, N>& value)
{
return horizontal(value, fn_bitwiseand());
}
KFR_FN(hbitwiseand)
template <typename T, size_t N>
-KFR_INLINE T hbitwiseor(vec<T, N> value)
+KFR_INLINE T hbitwiseor(const vec<T, N>& value)
{
return horizontal(value, fn_bitwiseor());
}
KFR_FN(hbitwiseor)
template <typename T, size_t N>
-KFR_INLINE T hbitwisexor(vec<T, N> value)
+KFR_INLINE T hbitwisexor(const vec<T, N>& value)
{
return horizontal(value, fn_bitwisexor());
}
@@ -615,7 +615,7 @@ KFR_FN(hbitwisexor)
/// Calculate the Dot-Product of two vectors
template <typename T, size_t N>
-KFR_INLINE T dot(vec<T, N> x, vec<T, N> y)
+KFR_INLINE T dot(const vec<T, N>& x, const vec<T, N>& y)
{
return hadd(x * y);
}
@@ -623,7 +623,7 @@ KFR_FN(dot)
/// Calculate the Arithmetic mean of all elements in the vector
template <typename T, size_t N>
-KFR_INLINE T avg(vec<T, N> value)
+KFR_INLINE T avg(const vec<T, N>& value)
{
return hadd(value) / N;
}
@@ -631,19 +631,19 @@ KFR_FN(avg)
/// Calculate the RMS of all elements in the vector
template <typename T, size_t N>
-KFR_INLINE T rms(vec<T, N> value)
+KFR_INLINE T rms(const vec<T, N>& value)
{
return internal::builtin_sqrt(hadd(value * value) / N);
}
KFR_FN(rms)
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-KFR_INLINE vec<T, N> subadd(vec<T, N> a, vec<T, N> b)
+KFR_INLINE vec<T, N> subadd(const vec<T, N>& a, const vec<T, N>& b)
{
return blend<1, 0>(a + b, a - b);
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
-KFR_INLINE vec<T, N> addsub(vec<T, N> a, vec<T, N> b)
+KFR_INLINE vec<T, N> addsub(const vec<T, N>& a, const vec<T, N>& b)
{
return blend<0, 1>(a + b, a - b);
}
diff --git a/include/kfr/base/read_write.hpp b/include/kfr/base/read_write.hpp
@@ -36,7 +36,7 @@ KFR_INLINE vec<T, N> read(const T* src)
}
template <bool A = false, size_t N, typename T>
-KFR_INLINE void write(T* dest, vec<T, N> value)
+KFR_INLINE void write(T* dest, const vec<T, N>& value)
{
internal_read_write::write<A, N, T>(dest, value);
}
@@ -54,7 +54,7 @@ KFR_INLINE vec<T, Nout> gather(const T* base)
}
template <size_t Index, size_t... Indices, typename T, size_t N, size_t InIndex = 0>
-KFR_INLINE void scatter(const T* base, vec<T, N> value)
+KFR_INLINE void scatter(const T* base, const vec<T, N>& value)
{
base[Index] = value[InIndex];
scatter<Indices..., T, N, InIndex + 1>(base, value);
@@ -63,7 +63,7 @@ KFR_INLINE void scatter(const T* base, vec<T, N> value)
namespace internal
{
template <typename T, size_t N, size_t... Indices>
-KFR_INLINE vec<T, N> gather(const T* base, vec<u32, N> indices, csizes_t<Indices...>)
+KFR_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices, csizes_t<Indices...>)
{
return make_vector(base[indices[Indices]]...);
}
@@ -80,7 +80,7 @@ KFR_INLINE vec<T, Nout> gather_stride_s(const T* base, size_t stride, csizes_t<I
}
template <typename T, size_t N>
-KFR_INLINE vec<T, N> gather(const T* base, vec<u32, N> indices)
+KFR_INLINE vec<T, N> gather(const T* base, const vec<u32, N>& indices)
{
return internal::gather(base, indices, csizeseq<N>);
}
@@ -98,24 +98,24 @@ KFR_INLINE vec<T, Nout> gather_stride(const T* base)
}
template <size_t groupsize, typename T, size_t N, typename IT, size_t... Indices>
-KFR_INLINE vec<T, N * groupsize> gather_helper(const T* base, vec<IT, N> offset, csizes_t<Indices...>)
+KFR_INLINE vec<T, N * groupsize> gather_helper(const T* base, const vec<IT, N>& offset, csizes_t<Indices...>)
{
return concat(read<groupsize>(base + groupsize * (*offset)[Indices])...);
}
template <size_t groupsize = 1, typename T, size_t N, typename IT>
-KFR_INLINE vec<T, N * groupsize> gather(const T* base, vec<IT, N> offset)
+KFR_INLINE vec<T, N * groupsize> gather(const T* base, const vec<IT, N>& offset)
{
return gather_helper<groupsize>(base, offset, csizeseq<N>);
}
template <size_t groupsize, typename T, size_t N, size_t Nout = N* groupsize, typename IT, size_t... Indices>
-KFR_INLINE void scatter_helper(T* base, vec<IT, N> offset, vec<T, Nout> value, csizes_t<Indices...>)
+KFR_INLINE void scatter_helper(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value, csizes_t<Indices...>)
{
swallow{ (write(base + groupsize * (*offset)[Indices], slice<Indices * groupsize, groupsize>(value)),
0)... };
}
template <size_t groupsize = 1, typename T, size_t N, size_t Nout = N* groupsize, typename IT>
-KFR_INLINE void scatter(T* base, vec<IT, N> offset, vec<T, Nout> value)
+KFR_INLINE void scatter(T* base, const vec<IT, N>& offset, const vec<T, Nout>& value)
{
return scatter_helper<groupsize>(base, offset, value, csizeseq<N>);
}
diff --git a/include/kfr/base/round.hpp b/include/kfr/base/round.hpp
@@ -53,29 +53,29 @@ namespace intrinsics
#if defined CID_ARCH_SSE41
-KFR_SINTRIN f32sse floor(f32sse value) { return _mm_floor_ps(*value); }
-KFR_SINTRIN f32sse ceil(f32sse value) { return _mm_ceil_ps(*value); }
-KFR_SINTRIN f32sse trunc(f32sse value) { return KFR_mm_trunc_ps(*value); }
-KFR_SINTRIN f32sse round(f32sse value) { return KFR_mm_roundnearest_ps(*value); }
-KFR_SINTRIN f64sse floor(f64sse value) { return _mm_floor_pd(*value); }
-KFR_SINTRIN f64sse ceil(f64sse value) { return _mm_ceil_pd(*value); }
-KFR_SINTRIN f64sse trunc(f64sse value) { return KFR_mm_trunc_pd(*value); }
-KFR_SINTRIN f64sse round(f64sse value) { return KFR_mm_roundnearest_pd(*value); }
-KFR_SINTRIN f32sse fract(f32sse x) { return x - floor(x); }
-KFR_SINTRIN f64sse fract(f64sse x) { return x - floor(x); }
+KFR_SINTRIN f32sse floor(const f32sse& value) { return _mm_floor_ps(*value); }
+KFR_SINTRIN f32sse ceil(const f32sse& value) { return _mm_ceil_ps(*value); }
+KFR_SINTRIN f32sse trunc(const f32sse& value) { return KFR_mm_trunc_ps(*value); }
+KFR_SINTRIN f32sse round(const f32sse& value) { return KFR_mm_roundnearest_ps(*value); }
+KFR_SINTRIN f64sse floor(const f64sse& value) { return _mm_floor_pd(*value); }
+KFR_SINTRIN f64sse ceil(const f64sse& value) { return _mm_ceil_pd(*value); }
+KFR_SINTRIN f64sse trunc(const f64sse& value) { return KFR_mm_trunc_pd(*value); }
+KFR_SINTRIN f64sse round(const f64sse& value) { return KFR_mm_roundnearest_pd(*value); }
+KFR_SINTRIN f32sse fract(const f32sse& x) { return x - floor(x); }
+KFR_SINTRIN f64sse fract(const f64sse& x) { return x - floor(x); }
#if defined CID_ARCH_AVX
-KFR_SINTRIN f32avx floor(f32avx value) { return _mm256_floor_ps(*value); }
-KFR_SINTRIN f32avx ceil(f32avx value) { return _mm256_ceil_ps(*value); }
-KFR_SINTRIN f32avx trunc(f32avx value) { return KFR_mm256_trunc_ps(*value); }
-KFR_SINTRIN f32avx round(f32avx value) { return KFR_mm256_roundnearest_ps(*value); }
-KFR_SINTRIN f64avx floor(f64avx value) { return _mm256_floor_pd(*value); }
-KFR_SINTRIN f64avx ceil(f64avx value) { return _mm256_ceil_pd(*value); }
-KFR_SINTRIN f64avx trunc(f64avx value) { return KFR_mm256_trunc_pd(*value); }
-KFR_SINTRIN f64avx round(f64avx value) { return KFR_mm256_roundnearest_pd(*value); }
-KFR_SINTRIN f32avx fract(f32avx x) { return x - floor(x); }
-KFR_SINTRIN f64avx fract(f64avx x) { return x - floor(x); }
+KFR_SINTRIN f32avx floor(const f32avx& value) { return _mm256_floor_ps(*value); }
+KFR_SINTRIN f32avx ceil(const f32avx& value) { return _mm256_ceil_ps(*value); }
+KFR_SINTRIN f32avx trunc(const f32avx& value) { return KFR_mm256_trunc_ps(*value); }
+KFR_SINTRIN f32avx round(const f32avx& value) { return KFR_mm256_roundnearest_ps(*value); }
+KFR_SINTRIN f64avx floor(const f64avx& value) { return _mm256_floor_pd(*value); }
+KFR_SINTRIN f64avx ceil(const f64avx& value) { return _mm256_ceil_pd(*value); }
+KFR_SINTRIN f64avx trunc(const f64avx& value) { return KFR_mm256_trunc_pd(*value); }
+KFR_SINTRIN f64avx round(const f64avx& value) { return KFR_mm256_roundnearest_pd(*value); }
+KFR_SINTRIN f32avx fract(const f32avx& x) { return x - floor(x); }
+KFR_SINTRIN f64avx fract(const f64avx& x) { return x - floor(x); }
#endif
KFR_HANDLE_ALL_SIZES_F_1(floor)
@@ -89,104 +89,104 @@ KFR_HANDLE_ALL_SIZES_F_1(fract)
// fallback
template <size_t N>
-KFR_SINTRIN vec<f32, N> floor(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> floor(const vec<f32, N>& x)
{
vec<f32, N> t = cast<f32>(cast<i32>(x));
return t - (bitcast<f32>(x < t) & 1.f);
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> floor(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> floor(const vec<f64, N>& x)
{
vec<f64, N> t = cast<f64>(cast<i64>(x));
return t - (bitcast<f64>(x < t) & 1.0);
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> ceil(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> ceil(const vec<f32, N>& x)
{
vec<f32, N> t = cast<f32>(cast<i32>(x));
return t + (bitcast<f32>(x > t) & 1.f);
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> ceil(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> ceil(const vec<f64, N>& x)
{
vec<f64, N> t = cast<f64>(cast<i64>(x));
return t + (bitcast<f64>(x > t) & 1.0);
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> round(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> round(const vec<f32, N>& x)
{
return cast<f32>(cast<i32>(x + mulsign(broadcast<N>(0.5f), x)));
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> round(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> round(const vec<f64, N>& x)
{
return cast<f64>(cast<i64>(x + mulsign(broadcast<N>(0.5), x)));
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> trunc(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> trunc(const vec<f32, N>& x)
{
return cast<f32>(cast<i32>(x));
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> trunc(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> trunc(const vec<f64, N>& x)
{
return cast<f64>(cast<i64>(x));
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> fract(vec<f32, N> x)
+KFR_SINTRIN vec<f32, N> fract(const vec<f32, N>& x)
{
return x - floor(x);
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> fract(vec<f64, N> x)
+KFR_SINTRIN vec<f64, N> fract(const vec<f64, N>& x)
{
return x - floor(x);
}
#endif
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> floor(vec<T, N> value)
+KFR_SINTRIN vec<T, N> floor(const vec<T, N>& value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> ceil(vec<T, N> value)
+KFR_SINTRIN vec<T, N> ceil(const vec<T, N>& value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> trunc(vec<T, N> value)
+KFR_SINTRIN vec<T, N> trunc(const vec<T, N>& value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> round(vec<T, N> value)
+KFR_SINTRIN vec<T, N> round(const vec<T, N>& value)
{
return value;
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fract(vec<T, N>)
+KFR_SINTRIN vec<T, N> fract(const vec<T, N>&)
{
return T(0);
}
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> ifloor(vec<T, N> value)
+KFR_SINTRIN vec<IT, N> ifloor(const vec<T, N>& value)
{
return cast<IT>(floor(value));
}
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> iceil(vec<T, N> value)
+KFR_SINTRIN vec<IT, N> iceil(const vec<T, N>& value)
{
return cast<IT>(ceil(value));
}
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> itrunc(vec<T, N> value)
+KFR_SINTRIN vec<IT, N> itrunc(const vec<T, N>& value)
{
return cast<IT>(trunc(value));
}
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<IT, N> iround(vec<T, N> value)
+KFR_SINTRIN vec<IT, N> iround(const vec<T, N>& value)
{
return cast<IT>(round(value));
}
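
Note: the floor/ceil fallbacks above are branchless: truncate through an integer cast, then subtract or add 1 where truncation landed on the wrong side, using the comparison mask as the 0/1 correction. Like the vector code, this is only valid while x fits in the integer type. A scalar sketch (illustrative, not library API):

    #include <cstdint>

    // Scalar model of the branchless floor fallback.
    inline float floor_model(float x)
    {
        const float t = static_cast<float>(static_cast<int32_t>(x)); // trunc toward zero
        return t - (x < t ? 1.0f : 0.0f); // the vector code ANDs the mask with 1.0f instead
    }

    // Scalar model of the round fallback: add 0.5 with x's sign, then trunc.
    inline float round_model(float x)
    {
        return static_cast<float>(static_cast<int32_t>(x + (x < 0 ? -0.5f : 0.5f)));
    }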
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -31,7 +31,7 @@ namespace kfr
namespace intrinsics
{
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_signed_add(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> saturated_signed_add(const vec<T, N>& a, const vec<T, N>& b)
{
using UT = utype<T>;
constexpr size_t shift = typebits<UT>::bits - 1;
@@ -43,7 +43,7 @@ KFR_SINTRIN vec<T, N> saturated_signed_add(vec<T, N> a, vec<T, N> b)
return select(bitcast<T>((aa ^ bb) | ~(bb ^ sum)) >= 0, a, bitcast<T>(sum));
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_signed_sub(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> saturated_signed_sub(const vec<T, N>& a, const vec<T, N>& b)
{
using UT = utype<T>;
constexpr size_t shift = typebits<UT>::bits - 1;
@@ -55,49 +55,49 @@ KFR_SINTRIN vec<T, N> saturated_signed_sub(vec<T, N> a, vec<T, N> b)
return select(bitcast<T>((aa ^ bb) & (aa ^ diff)) < 0, a, bitcast<T>(diff));
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_unsigned_add(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> saturated_unsigned_add(const vec<T, N>& a, const vec<T, N>& b)
{
const vec<T, N> t = allonesvector(a);
return select(a > t - b, t, a + b);
}
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> saturated_unsigned_sub(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> saturated_unsigned_sub(const vec<T, N>& a, const vec<T, N>& b)
{
return select(a < b, zerovector(a), a - b);
}
#if defined CID_ARCH_SSE2
-KFR_SINTRIN u8sse satadd(u8sse x, u8sse y) { return _mm_adds_epu8(*x, *y); }
-KFR_SINTRIN i8sse satadd(i8sse x, i8sse y) { return _mm_adds_epi8(*x, *y); }
-KFR_SINTRIN u16sse satadd(u16sse x, u16sse y) { return _mm_adds_epu16(*x, *y); }
-KFR_SINTRIN i16sse satadd(i16sse x, i16sse y) { return _mm_adds_epi16(*x, *y); }
+KFR_SINTRIN u8sse satadd(const u8sse& x, const u8sse& y) { return _mm_adds_epu8(*x, *y); }
+KFR_SINTRIN i8sse satadd(const i8sse& x, const i8sse& y) { return _mm_adds_epi8(*x, *y); }
+KFR_SINTRIN u16sse satadd(const u16sse& x, const u16sse& y) { return _mm_adds_epu16(*x, *y); }
+KFR_SINTRIN i16sse satadd(const i16sse& x, const i16sse& y) { return _mm_adds_epi16(*x, *y); }
-KFR_SINTRIN u8sse satsub(u8sse x, u8sse y) { return _mm_subs_epu8(*x, *y); }
-KFR_SINTRIN i8sse satsub(i8sse x, i8sse y) { return _mm_subs_epi8(*x, *y); }
-KFR_SINTRIN u16sse satsub(u16sse x, u16sse y) { return _mm_subs_epu16(*x, *y); }
-KFR_SINTRIN i16sse satsub(i16sse x, i16sse y) { return _mm_subs_epi16(*x, *y); }
+KFR_SINTRIN u8sse satsub(const u8sse& x, const u8sse& y) { return _mm_subs_epu8(*x, *y); }
+KFR_SINTRIN i8sse satsub(const i8sse& x, const i8sse& y) { return _mm_subs_epi8(*x, *y); }
+KFR_SINTRIN u16sse satsub(const u16sse& x, const u16sse& y) { return _mm_subs_epu16(*x, *y); }
+KFR_SINTRIN i16sse satsub(const i16sse& x, const i16sse& y) { return _mm_subs_epi16(*x, *y); }
-KFR_SINTRIN i32sse satadd(i32sse a, i32sse b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN i64sse satadd(i64sse a, i64sse b) { return saturated_signed_add(a, b); }
-KFR_SINTRIN u32sse satadd(u32sse a, u32sse b) { return saturated_unsigned_add(a, b); }
-KFR_SINTRIN u64sse satadd(u64sse a, u64sse b) { return saturated_unsigned_add(a, b); }
+KFR_SINTRIN i32sse satadd(const i32sse& a, const i32sse& b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN i64sse satadd(const i64sse& a, const i64sse& b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN u32sse satadd(const u32sse& a, const u32sse& b) { return saturated_unsigned_add(a, b); }
+KFR_SINTRIN u64sse satadd(const u64sse& a, const u64sse& b) { return saturated_unsigned_add(a, b); }
-KFR_SINTRIN i32sse satsub(i32sse a, i32sse b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN i64sse satsub(i64sse a, i64sse b) { return saturated_signed_sub(a, b); }
-KFR_SINTRIN u32sse satsub(u32sse a, u32sse b) { return saturated_unsigned_sub(a, b); }
-KFR_SINTRIN u64sse satsub(u64sse a, u64sse b) { return saturated_unsigned_sub(a, b); }
+KFR_SINTRIN i32sse satsub(const i32sse& a, const i32sse& b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN i64sse satsub(const i64sse& a, const i64sse& b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN u32sse satsub(const u32sse& a, const u32sse& b) { return saturated_unsigned_sub(a, b); }
+KFR_SINTRIN u64sse satsub(const u64sse& a, const u64sse& b) { return saturated_unsigned_sub(a, b); }
#if defined CID_ARCH_AVX2
-KFR_SINTRIN u8avx satadd(u8avx x, u8avx y) { return _mm256_adds_epu8(*x, *y); }
-KFR_SINTRIN i8avx satadd(i8avx x, i8avx y) { return _mm256_adds_epi8(*x, *y); }
-KFR_SINTRIN u16avx satadd(u16avx x, u16avx y) { return _mm256_adds_epu16(*x, *y); }
-KFR_SINTRIN i16avx satadd(i16avx x, i16avx y) { return _mm256_adds_epi16(*x, *y); }
-
-KFR_SINTRIN u8avx satsub(u8avx x, u8avx y) { return _mm256_subs_epu8(*x, *y); }
-KFR_SINTRIN i8avx satsub(i8avx x, i8avx y) { return _mm256_subs_epi8(*x, *y); }
-KFR_SINTRIN u16avx satsub(u16avx x, u16avx y) { return _mm256_subs_epu16(*x, *y); }
-KFR_SINTRIN i16avx satsub(i16avx x, i16avx y) { return _mm256_subs_epi16(*x, *y); }
+KFR_SINTRIN u8avx satadd(const u8avx& x, const u8avx& y) { return _mm256_adds_epu8(*x, *y); }
+KFR_SINTRIN i8avx satadd(const i8avx& x, const i8avx& y) { return _mm256_adds_epi8(*x, *y); }
+KFR_SINTRIN u16avx satadd(const u16avx& x, const u16avx& y) { return _mm256_adds_epu16(*x, *y); }
+KFR_SINTRIN i16avx satadd(const i16avx& x, const i16avx& y) { return _mm256_adds_epi16(*x, *y); }
+
+KFR_SINTRIN u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_epu8(*x, *y); }
+KFR_SINTRIN i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(*x, *y); }
+KFR_SINTRIN u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(*x, *y); }
+KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(*x, *y); }
#endif
KFR_HANDLE_ALL_SIZES_2(satadd)
@@ -106,22 +106,22 @@ KFR_HANDLE_ALL_SIZES_2(satsub)
#else
// fallback
template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
-KFR_SINTRIN vec<T, N> satadd(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
{
return saturated_signed_add(a, b);
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
-KFR_SINTRIN vec<T, N> satadd(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> satadd(const vec<T, N>& a, const vec<T, N>& b)
{
return saturated_unsigned_add(a, b);
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_signed<T>::value)>
-KFR_SINTRIN vec<T, N> satsub(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
{
return saturated_signed_sub(a, b);
}
template <typename T, size_t N, KFR_ENABLE_IF(std::is_unsigned<T>::value)>
-KFR_SINTRIN vec<T, N> satsub(vec<T, N> a, vec<T, N> b)
+KFR_SINTRIN vec<T, N> satsub(const vec<T, N>& a, const vec<T, N>& b)
{
return saturated_unsigned_sub(a, b);
}
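
Note: the signed saturation helpers work purely from sign bits: signed add can only wrap when a and b share a sign that the wrapped sum loses, which is exactly the (aa ^ bb) | ~(bb ^ sum) >= 0 test visible above. A scalar i32 sketch of that style of test (illustrative; here the clamp value is recovered from a's sign):

    #include <cstdint>
    #include <limits>

    inline int32_t satadd_model(int32_t a, int32_t b)
    {
        const uint32_t ua  = static_cast<uint32_t>(a);
        const uint32_t ub  = static_cast<uint32_t>(b);
        const uint32_t sum = ua + ub; // wraps; well-defined on unsigned
        // INT32_MAX if a >= 0, the INT32_MIN bit pattern if a < 0
        const uint32_t sat = (ua >> 31) + static_cast<uint32_t>(std::numeric_limits<int32_t>::max());
        // top bit clear <=> inputs share a sign and the sum flipped it <=> overflow
        const bool overflow = static_cast<int32_t>((ua ^ ub) | ~(ub ^ sum)) >= 0;
        return static_cast<int32_t>(overflow ? sat : sum);
    }

The unsigned variants are simpler: satadd clamps to all-ones when a > max - b and satsub clamps to zero when a < b, exactly as in saturated_unsigned_add/sub above.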
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -31,40 +31,100 @@ namespace intrinsics
#if defined CID_ARCH_SSE41
-KFR_SINTRIN u8sse select(mu8sse m, u8sse x, u8sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u16sse select(mu16sse m, u16sse x, u16sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u32sse select(mu32sse m, u32sse x, u32sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u64sse select(mu64sse m, u64sse x, u64sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i8sse select(mi8sse m, i8sse x, i8sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i16sse select(mi16sse m, i16sse x, i16sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i32sse select(mi32sse m, i32sse x, i32sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i64sse select(mi64sse m, i64sse x, i64sse y) { return _mm_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN f32sse select(mf32sse m, f32sse x, f32sse y) { return _mm_blendv_ps(*y, *x, *m); }
-KFR_SINTRIN f64sse select(mf64sse m, f64sse x, f64sse y) { return _mm_blendv_pd(*y, *x, *m); }
+KFR_SINTRIN u8sse select(const mu8sse& m, const u8sse& x, const u8sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u16sse select(const mu16sse& m, const u16sse& x, const u16sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u32sse select(const mu32sse& m, const u32sse& x, const u32sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u64sse select(const mu64sse& m, const u64sse& x, const u64sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i8sse select(const mi8sse& m, const i8sse& x, const i8sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i16sse select(const mi16sse& m, const i16sse& x, const i16sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i32sse select(const mi32sse& m, const i32sse& x, const i32sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i64sse select(const mi64sse& m, const i64sse& x, const i64sse& y)
+{
+ return _mm_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN f32sse select(const mf32sse& m, const f32sse& x, const f32sse& y)
+{
+ return _mm_blendv_ps(*y, *x, *m);
+}
+KFR_SINTRIN f64sse select(const mf64sse& m, const f64sse& x, const f64sse& y)
+{
+ return _mm_blendv_pd(*y, *x, *m);
+}
#if defined CID_ARCH_AVX
-KFR_SINTRIN f64avx select(mf64avx m, f64avx x, f64avx y) { return _mm256_blendv_pd(*y, *x, *m); }
-KFR_SINTRIN f32avx select(mf32avx m, f32avx x, f32avx y) { return _mm256_blendv_ps(*y, *x, *m); }
+KFR_SINTRIN f64avx select(const mf64avx& m, const f64avx& x, const f64avx& y)
+{
+ return _mm256_blendv_pd(*y, *x, *m);
+}
+KFR_SINTRIN f32avx select(const mf32avx& m, const f32avx& x, const f32avx& y)
+{
+ return _mm256_blendv_ps(*y, *x, *m);
+}
#endif
#if defined CID_ARCH_AVX2
-KFR_SINTRIN u8avx select(mu8avx m, u8avx x, u8avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u16avx select(mu16avx m, u16avx x, u16avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u32avx select(mu32avx m, u32avx x, u32avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN u64avx select(mu64avx m, u64avx x, u64avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i8avx select(mi8avx m, i8avx x, i8avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i16avx select(mi16avx m, i16avx x, i16avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i32avx select(mi32avx m, i32avx x, i32avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
-KFR_SINTRIN i64avx select(mi64avx m, i64avx x, i64avx y) { return _mm256_blendv_epi8(*y, *x, *m); }
+KFR_SINTRIN u8avx select(const mu8avx& m, const u8avx& x, const u8avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u16avx select(const mu16avx& m, const u16avx& x, const u16avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u32avx select(const mu32avx& m, const u32avx& x, const u32avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN u64avx select(const mu64avx& m, const u64avx& x, const u64avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i8avx select(const mi8avx& m, const i8avx& x, const i8avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i16avx select(const mi16avx& m, const i16avx& x, const i16avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i32avx select(const mi32avx& m, const i32avx& x, const i32avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
+KFR_SINTRIN i64avx select(const mi64avx& m, const i64avx& x, const i64avx& y)
+{
+ return _mm256_blendv_epi8(*y, *x, *m);
+}
#endif
template <typename T, size_t N, KFR_ENABLE_IF(N < vector_width<T, cpu_t::native>)>
-KFR_SINTRIN vec<T, N> select(mask<T, N> a, vec<T, N> b, vec<T, N> c)
+KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
return slice<0, N>(select(expand_simd(a).asmask(), expand_simd(b), expand_simd(c)));
}
template <typename T, size_t N, KFR_ENABLE_IF(N >= vector_width<T, cpu_t::native>), typename = void>
-KFR_SINTRIN vec<T, N> select(mask<T, N> a, vec<T, N> b, vec<T, N> c)
+KFR_SINTRIN vec<T, N> select(const mask<T, N>& a, const vec<T, N>& b, const vec<T, N>& c)
{
return concat(select(low(a).asmask(), low(b), low(c)), select(high(a).asmask(), high(b), high(c)));
}
@@ -73,7 +133,7 @@ KFR_SINTRIN vec<T, N> select(mask<T, N> a, vec<T, N> b, vec<T, N> c)
// fallback
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> select(mask<T, N> m, vec<T, N> x, vec<T, N> y)
+KFR_SINTRIN vec<T, N> select(mask<T, N> m, const vec<T, N>& x, const vec<T, N>& y)
{
return y ^ ((x ^ y) & m);
}
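
Note: the _mm_blendv_* intrinsics take the value selected on a set mask bit as their second argument, hence the reversed (*y, *x, *m) order above; _mm_blendv_epi8 blends per byte, the ps/pd forms per lane. The fallback is pure bit math. A scalar sketch (illustrative):

    #include <cstdint>

    // Scalar model of the fallback select: m all-ones yields x, m all-zeros
    // yields y; per bit this equals (x & m) | (y & ~m), with one operation fewer.
    inline uint32_t select_model(uint32_t m, uint32_t x, uint32_t y)
    {
        return y ^ ((x ^ y) & m);
    }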
diff --git a/include/kfr/base/sin_cos.hpp b/include/kfr/base/sin_cos.hpp
@@ -52,19 +52,20 @@ template <typename T>
constexpr static T fold_constant_rem2 = choose_const<T>(-0x1.de973ep-32f, 0x1.8469898cc5170p-49);
template <typename T, size_t N>
-KFR_SINTRIN vec<T, N> trig_horner(vec<T, N>, mask<T, N> msk, T a0, T b0)
+KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>&, const mask<T, N>& msk, const T& a0, const T& b0)
{
return select(msk, a0, b0);
}
template <typename T, size_t N, typename... Ts>
-KFR_SINTRIN vec<T, N> trig_horner(vec<T, N> x, mask<T, N> msk, T a0, T b0, T a1, T b1, Ts... values)
+KFR_SINTRIN vec<T, N> trig_horner(const vec<T, N>& x, const mask<T, N>& msk, const T& a0, const T& b0,
+ const T& a1, const T& b1, const Ts&... values)
{
return fmadd(trig_horner(x, msk, a1, b1, values...), x, select(msk, a0, b0));
}
template <typename T, size_t N, typename Tprecise = f64>
-KFR_SINTRIN vec<T, N> trig_fold(vec<T, N> x, vec<itype<T>, N>& quadrant)
+KFR_SINTRIN vec<T, N> trig_fold(const vec<T, N>& x, vec<itype<T>, N>& quadrant)
{
const vec<T, N> xabs = abs(x);
constexpr vec<T, N> div = fold_constant_div<T>;
@@ -83,7 +84,7 @@ KFR_SINTRIN vec<T, N> trig_fold(vec<T, N> x, vec<itype<T>, N>& quadrant)
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> trig_sincos(vec<f32, N> folded, mask<f32, N> cosmask)
+KFR_SINTRIN vec<f32, N> trig_sincos(const vec<f32, N>& folded, const mask<f32, N>& cosmask)
{
constexpr f32 sin_c2 = -0x2.aaaaacp-4f;
constexpr f32 sin_c4 = 0x2.222334p-8f;
@@ -106,7 +107,7 @@ KFR_SINTRIN vec<f32, N> trig_sincos(vec<f32, N> folded, mask<f32, N> cosmask)
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> trig_sincos(vec<f64, N> folded, mask<f64, N> cosmask)
+KFR_SINTRIN vec<f64, N> trig_sincos(const vec<f64, N>& folded, const mask<f64, N>& cosmask)
{
constexpr f64 sin_c2 = -0x2.aaaaaaaaaaaaap-4;
constexpr f64 sin_c4 = 0x2.22222222220cep-8;
@@ -135,7 +136,7 @@ KFR_SINTRIN vec<f64, N> trig_sincos(vec<f64, N> folded, mask<f64, N> cosmask)
}
template <typename T, size_t N, typename = u8[N > 1]>
-KFR_SINTRIN vec<T, N> sincos_mask(vec<T, N> x_full, mask<T, N> cosmask)
+KFR_SINTRIN vec<T, N> sincos_mask(const vec<T, N>& x_full, const mask<T, N>& cosmask)
{
vec<itype<T>, N> quadrant;
vec<T, N> folded = trig_fold(x_full, quadrant);
@@ -156,7 +157,7 @@ KFR_SINTRIN vec<T, N> sincos_mask(vec<T, N> x_full, mask<T, N> cosmask)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sin(vec<T, N> x)
+KFR_SINTRIN vec<T, N> sin(const vec<T, N>& x)
{
vec<itype<T>, N> quadrant;
vec<T, N> folded = trig_fold(x, quadrant);
@@ -171,7 +172,7 @@ KFR_SINTRIN vec<T, N> sin(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> cos(vec<T, N> x)
+KFR_SINTRIN vec<T, N> cos(const vec<T, N>& x)
{
vec<itype<T>, N> quadrant;
vec<T, N> folded = trig_fold(x, quadrant);
@@ -187,7 +188,7 @@ KFR_SINTRIN vec<T, N> cos(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fastsin(vec<T, N> x)
+KFR_SINTRIN vec<T, N> fastsin(const vec<T, N>& x)
{
constexpr vec<T, N> msk = broadcast<N>(internal::highbitmask<T>);
@@ -212,7 +213,7 @@ KFR_SINTRIN vec<T, N> fastsin(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> fastcos(vec<T, N> x)
+KFR_SINTRIN vec<T, N> fastcos(const vec<T, N>& x_full)
{
-x += c_pi<T, 1, 2>;
-x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x);
+vec<T, N> x = x_full + c_pi<T, 1, 2>;
+x = select(x >= c_pi<T, 2>, x - c_pi<T, 2>, x);
@@ -220,61 +221,61 @@ KFR_SINTRIN vec<T, N> fastcos(vec<T, N> x)
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sincos(vec<T, N> x)
+KFR_SINTRIN vec<T, N> sincos(const vec<T, N>& x)
{
return sincos_mask(x, internal::oddmask<T, N>());
}
template <typename T, size_t N, KFR_ENABLE_IF(N > 1 && is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> cossin(vec<T, N> x)
+KFR_SINTRIN vec<T, N> cossin(const vec<T, N>& x)
{
return sincos_mask(x, internal::evenmask<T, N>());
}
template <typename T, size_t N, KFR_ENABLE_IF(is_f_class<T>::value)>
-KFR_SINTRIN vec<T, N> sinc(vec<T, N> x)
+KFR_SINTRIN vec<T, N> sinc(const vec<T, N>& x)
{
return select(abs(x) <= c_epsilon<T>, T(1), sin(x) / x);
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> sin(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> sin(const vec<T, N>& x)
{
return sin(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> cos(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> cos(const vec<T, N>& x)
{
return cos(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> fastsin(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> fastsin(const vec<T, N>& x)
{
return fastsin(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> fastcos(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> fastcos(const vec<T, N>& x)
{
return fastcos(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> sincos(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> sincos(const vec<T, N>& x)
{
return sincos(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> cossin(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> cossin(const vec<T, N>& x)
{
return cossin(cast<Tout>(x));
}
template <typename T, size_t N, KFR_ENABLE_IF(!is_f_class<T>::value), typename Tout = ftype<T>>
-KFR_SINTRIN vec<Tout, N> sinc(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> sinc(const vec<T, N>& x)
{
return sinc(cast<Tout>(x));
}
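
Note: trig_horner above evaluates the sin and cos polynomials in a single Horner pass by selecting, per lane, between the two coefficient sets with the mask; that is how sincos/cossin produce both functions at once on interleaved lanes. A scalar sketch of the recursion (illustrative):

    #include <cstddef>

    // Scalar model of trig_horner: one Horner pass over coefficients chosen
    // by a flag (a set vs b set, lowest order first); fmadd in the vector code.
    template <typename T, std::size_t M>
    T trig_horner_model(T x, bool msk, const T (&a)[M], const T (&b)[M])
    {
        T acc = msk ? a[M - 1] : b[M - 1];
        for (std::size_t i = M - 1; i-- > 0;)
            acc = acc * x + (msk ? a[i] : b[i]);
        return acc;
    }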
diff --git a/include/kfr/base/sqrt.hpp b/include/kfr/base/sqrt.hpp
@@ -32,14 +32,17 @@ namespace intrinsics
#if defined CID_ARCH_SSE2
-KFR_SINTRIN f32x1 sqrt(f32x1 x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); }
-KFR_SINTRIN f64x1 sqrt(f64x1 x) { return slice<0, 1>(tovec(_mm_sqrt_sd(_mm_setzero_pd(), *extend<2>(x)))); }
-KFR_SINTRIN f32sse sqrt(f32sse x) { return _mm_sqrt_ps(*x); }
-KFR_SINTRIN f64sse sqrt(f64sse x) { return _mm_sqrt_pd(*x); }
+KFR_SINTRIN f32x1 sqrt(const f32x1& x) { return slice<0, 1>(tovec(_mm_sqrt_ss(*extend<4>(x)))); }
+KFR_SINTRIN f64x1 sqrt(const f64x1& x)
+{
+ return slice<0, 1>(tovec(_mm_sqrt_sd(_mm_setzero_pd(), *extend<2>(x))));
+}
+KFR_SINTRIN f32sse sqrt(const f32sse& x) { return _mm_sqrt_ps(*x); }
+KFR_SINTRIN f64sse sqrt(const f64sse& x) { return _mm_sqrt_pd(*x); }
#if defined CID_ARCH_AVX
-KFR_SINTRIN f32avx sqrt(f32avx x) { return _mm256_sqrt_ps(*x); }
-KFR_SINTRIN f64avx sqrt(f64avx x) { return _mm256_sqrt_pd(*x); }
+KFR_SINTRIN f32avx sqrt(const f32avx& x) { return _mm256_sqrt_ps(*x); }
+KFR_SINTRIN f64avx sqrt(const f64avx& x) { return _mm256_sqrt_pd(*x); }
#endif
KFR_HANDLE_ALL_SIZES_FLT_1(sqrt)
@@ -48,7 +51,7 @@ KFR_HANDLE_ALL_SIZES_FLT_1(sqrt)
// fallback
template <typename T, size_t N, typename Tout = flt_type<T>>
-KFR_SINTRIN vec<Tout, N> sqrt(vec<T, N> x)
+KFR_SINTRIN vec<Tout, N> sqrt(const vec<T, N>& x)
{
return apply([](T x) { return std::sqrt(static_cast<Tout>(x)); }, x);
}
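
Note: the one-lane overloads above widen to a full register for _mm_sqrt_ss/_mm_sqrt_sd and slice the first lane back out, while the non-SSE2 fallback maps std::sqrt across the lanes through apply. A scalar model of that fallback shape (illustrative):

    #include <array>
    #include <cmath>
    #include <cstddef>

    // Model of the fallback: apply std::sqrt lane by lane.
    template <typename T, std::size_t N>
    std::array<T, N> sqrt_model(const std::array<T, N>& x)
    {
        std::array<T, N> out{};
        for (std::size_t i = 0; i < N; ++i)
            out[i] = std::sqrt(x[i]);
        return out;
    }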
diff --git a/include/kfr/base/tan.hpp b/include/kfr/base/tan.hpp
@@ -35,7 +35,7 @@ namespace intrinsics
{
template <typename T, size_t N, typename IT = itype<T>>
-KFR_SINTRIN vec<T, N> trig_fold_simple(vec<T, N> x_full, mask<T, N>& inverse)
+KFR_SINTRIN vec<T, N> trig_fold_simple(const vec<T, N>& x_full, mask<T, N>& inverse)
{
constexpr T pi_14 = c_pi<T, 1, 4>;
@@ -56,7 +56,7 @@ KFR_SINTRIN vec<T, N> trig_fold_simple(vec<T, N> x_full, mask<T, N>& inverse)
}
template <size_t N>
-KFR_SINTRIN vec<f32, N> tan(vec<f32, N> x_full)
+KFR_SINTRIN vec<f32, N> tan(const vec<f32, N>& x_full)
{
mask<f32, N> inverse;
const vec<f32, N> x = trig_fold_simple(x_full, inverse);
@@ -84,7 +84,7 @@ KFR_SINTRIN vec<f32, N> tan(vec<f32, N> x_full)
}
template <size_t N>
-KFR_SINTRIN vec<f64, N> tan(vec<f64, N> x_full)
+KFR_SINTRIN vec<f64, N> tan(const vec<f64, N>& x_full)
{
mask<f64, N> inverse;
const vec<f64, N> x = trig_fold_simple(x_full, inverse);
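
Note: tan folds its argument before the polynomial runs. The usual reduction behind a trig_fold_simple-style helper (a sketch of the general technique, not necessarily this exact fold) maps x to a remainder r in [-pi/4, pi/4] about the nearest multiple of pi/2 and flags lanes where the reciprocal identity applies:

    #include <cmath>

    // Scalar sketch: tan(k*pi/2 + r) = tan(r) for even k, -1/tan(r) for odd k.
    inline double tan_model(double x)
    {
        const double half_pi = 1.5707963267948966;
        const double k = std::nearbyint(x / half_pi);
        const double r = x - k * half_pi; // r in [-pi/4, pi/4]
        const double t = std::tan(r);     // stands in for the polynomial on folded r
        const bool inverse = (static_cast<long long>(k) & 1) != 0;
        return inverse ? -1.0 / t : t;
    }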
diff --git a/include/kfr/base/univector.hpp b/include/kfr/base/univector.hpp
@@ -40,7 +40,7 @@ template <typename T, typename Class>
struct univector_base : input_expression, output_expression
{
template <typename U, size_t N>
- KFR_INLINE void operator()(coutput_t, size_t index, vec<U, N> value)
+ KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& value)
{
T* data = derived_cast<Class>(this)->data();
write(ptr_cast<T>(data) + index, cast<T>(value));
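
Note: this operator() is the output-expression contract that the remaining hunks all touch: the evaluator hands each sink a vector of N computed lanes plus the starting index, and the sink stores them (here via write into the univector's data). A minimal model of a sink honoring that contract (names illustrative, not library API):

    #include <cstddef>

    // Model of an output-expression sink: store N lanes starting at index.
    template <typename T>
    struct sink_model
    {
        T* data;
        template <typename U, std::size_t N>
        void operator()(std::size_t index, const U (&value)[N])
        {
            for (std::size_t i = 0; i < N; ++i)
                data[index + i] = static_cast<T>(value[i]); // cast<T>(value) in the real code
        }
    };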
diff --git a/include/kfr/dsp/goertzel.hpp b/include/kfr/dsp/goertzel.hpp
@@ -53,7 +53,7 @@ public:
result.imag(q2 * sin(omega));
}
template <typename U, size_t N>
- KFR_INLINE void operator()(coutput_t, size_t index, vec<U, N> x)
+ KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
{
vec<T, N> in = cast<T>(x);
KFR_LOOP_UNROLL
@@ -90,7 +90,7 @@ public:
}
}
template <typename U, size_t N>
- KFR_INLINE void operator()(coutput_t, size_t index, vec<U, N> x)
+ KFR_INLINE void operator()(coutput_t, size_t index, const vec<U, N>& x)
{
const vec<T, N> in = cast<T>(x);
KFR_LOOP_UNROLL
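
Note: the surrounding class implements the Goertzel single-bin DFT; the consumed lanes feed the standard recurrence, and the result.imag(q2 * sin(omega)) line visible above is the final phase correction. A scalar sketch of the whole algorithm (illustrative):

    #include <cmath>
    #include <complex>
    #include <cstddef>

    // Scalar Goertzel: single-bin DFT via q0 = 2*cos(omega)*q1 - q2 + x[i].
    inline std::complex<double> goertzel_model(const double* x, std::size_t n, double omega)
    {
        double q1 = 0, q2 = 0;
        const double coeff = 2 * std::cos(omega);
        for (std::size_t i = 0; i < n; ++i)
        {
            const double q0 = coeff * q1 - q2 + x[i];
            q2 = q1;
            q1 = q0;
        }
        return { q1 - q2 * std::cos(omega), q2 * std::sin(omega) };
    }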
diff --git a/include/kfr/expressions/basic.hpp b/include/kfr/expressions/basic.hpp
@@ -346,7 +346,7 @@ struct multioutput : output_expression
{
}
template <typename T, size_t N>
- void operator()(coutput_t, size_t index, vec<T, N> x)
+ void operator()(coutput_t, size_t index, const vec<T, N>& x)
{
cfor(csize<0>, csize<sizeof...(E)>, [&](auto n) { std::get<val_of(n)>(outputs)(coutput, index, x); });
}
diff --git a/include/kfr/expressions/reduce.hpp b/include/kfr/expressions/reduce.hpp
@@ -70,7 +70,7 @@ struct expression_reduce : output_expression
}
template <typename U, size_t N>
- KFR_INLINE void operator()(coutput_t, size_t, vec<U, N> x) const
+ KFR_INLINE void operator()(coutput_t, size_t, const vec<U, N>& x) const
{
counter += N;
process(x);
diff --git a/include/kfr/io/file.hpp b/include/kfr/io/file.hpp
@@ -48,7 +48,7 @@ struct expression_sequential_file_writer : expression_file_base, output_expressi
{
using expression_file_base::expression_file_base;
template <typename U, size_t N>
- void operator()(coutput_t, size_t, vec<U, N> value)
+ void operator()(coutput_t, size_t, const vec<U, N>& value)
{
write(value);
}
@@ -81,7 +81,7 @@ struct expression_file_writer : expression_file_base, output_expression
{
using expression_file_base::expression_file_base;
template <typename U, size_t N>
- void operator()(coutput_t, size_t index, vec<U, N> value)
+ void operator()(coutput_t, size_t index, const vec<U, N>& value)
{
if (position != index)
fseeko(file, static_cast<off_t>(index * sizeof(T)), SEEK_SET);