commit aa603f76838c450537b1864177cc81255ad70d3e
parent 9ec57529ec94779c6c708497659d74685c53674f
Author: [email protected] <[email protected]>
Date: Fri, 1 Mar 2019 14:17:28 +0000
Unaligned read
Diffstat:
15 files changed, 696 insertions(+), 198 deletions(-)
diff --git a/cmake/target_set_arch.cmake b/cmake/target_set_arch.cmake
@@ -6,9 +6,10 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
set(ARCH_FLAGS_GNU_sse3 -msse3)
set(ARCH_FLAGS_GNU_ssse3 -mssse3)
set(ARCH_FLAGS_GNU_sse41 -msse4.1)
- set(ARCH_FLAGS_GNU_avx -msse4.1 -mavx)
- set(ARCH_FLAGS_GNU_avx2 -msse4.1 -mavx2 -mfma)
- set(ARCH_FLAGS_GNU_avx512 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl)
+ set(ARCH_FLAGS_GNU_sse42 -msse4.2)
+ set(ARCH_FLAGS_GNU_avx -msse4.2 -mavx)
+ set(ARCH_FLAGS_GNU_avx2 -msse4.2 -mavx2 -mfma)
+ set(ARCH_FLAGS_GNU_avx512 -msse4.2 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl)
if (CMAKE_SIZEOF_VOID_P EQUAL 8)
# SSE2 is part of x86_64
@@ -22,6 +23,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)")
set(ARCH_FLAGS_MS_sse3 ${ARCH_FLAG_MS_SSE2} -D__SSE3__)
set(ARCH_FLAGS_MS_ssse3 ${ARCH_FLAG_MS_SSE2} -D__SSSE3__)
set(ARCH_FLAGS_MS_sse41 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__)
+ set(ARCH_FLAGS_MS_sse42 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__)
set(ARCH_FLAGS_MS_avx -arch:AVX)
set(ARCH_FLAGS_MS_avx2 -arch:AVX2)
set(ARCH_FLAGS_MS_avx512 -arch:AVX512)
diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp
@@ -7,6 +7,7 @@
#include <cstdint>
#include <cstdlib>
+#include <cstring>
#include <limits>
#include <random>
#include <type_traits>
@@ -714,7 +715,7 @@ template <typename... List>
using indicesfor_t = cvalseq_t<size_t, sizeof...(List), 0>;
template <size_t group, size_t... indices, size_t N = group * sizeof...(indices)>
-constexpr inline auto scale(csizes_t<indices...> i) CMT_NOEXCEPT
+constexpr inline auto scale(csizes_t<indices...>) CMT_NOEXCEPT
{
return cconcat(csizeseq_t<group, group * indices>()...);
// return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() %
@@ -1941,10 +1942,10 @@ using overload_generic = overload_priority<0>;
#define CMT_GEN_LIST(c, m, ...) CMT_GEN_LIST##c(m, __VA_ARGS__)
template <typename Tout, typename Tin>
-constexpr CMT_INLINE Tout bitcast_anything(const Tin& in)
+CMT_INLINE Tout bitcast_anything(const Tin& in)
{
static_assert(sizeof(Tin) == sizeof(Tout), "Invalid arguments for bitcast_anything");
-#ifdef CMT_COMPILER_INTEL
+#if defined CMT_COMPILER_INTEL
const union {
const Tin in;
Tout out;
@@ -1971,6 +1972,19 @@ constexpr T just_value(T value)
return value;
}
+template <typename Tout, typename>
+CMT_INTRINSIC constexpr Tout pack_elements()
+{
+ return 0;
+}
+
+template <typename Tout, typename Arg, typename... Args>
+CMT_INTRINSIC constexpr Tout pack_elements(Arg x, Args... args)
+{
+ return static_cast<typename std::make_unsigned<Arg>::type>(x) |
+ (pack_elements<Tout, Arg>(args...) << (sizeof(Arg) * 8));
+}
+
enum class special_constant
{
undefined,
diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp
@@ -127,7 +127,7 @@ KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value)
template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>)
{
- return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...);
+ return concat(read(cbool<A>, csize<N * 2>, ptr_cast<T>(src + stride * indices))...);
}
template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>)
@@ -138,7 +138,7 @@ KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>&
template <size_t count, size_t N, bool A, typename T, size_t... indices>
KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>)
{
- return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...);
+ return concat(read(cbool<A>, csize<N * 2>, ptr_cast<T>(src + stride * indices))...);
}
template <size_t count, size_t N, bool A, typename T, size_t... indices>
KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value,
@@ -1459,7 +1459,7 @@ KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T
template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
{
- vec<T, Nout> temp = read<Nout>(ptr_cast<T>(ptr));
+ vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr));
if (transposed)
temp = ctranspose<sizeof...(N)>(temp);
split(temp, w...);
diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp
@@ -173,41 +173,6 @@ KFR_INTRINSIC simd<T, N> simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x)
template <typename T, size_t N, bool A>
using simd_storage = struct_with_alignment<simd<T, N>, A>;
-template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
-KFR_INTRINSIC simd<T, N> simd_read(const T* src)
-{
- return ptr_cast<simd_storage<T, N, A>>(src)->value;
-}
-
-template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
-KFR_INTRINSIC simd<T, N> simd_read(const T* src)
-{
- constexpr size_t first = prev_poweroftwo(N);
- constexpr size_t rest = N - first;
- constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>);
- constexpr auto concat_indices = cvalseq_t<size_t, N>();
- return simd_shuffle(
- simd2_t<T, first, first>{}, simd_read<first, A>(src),
- simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto),
- concat_indices, overload_auto);
-}
-
-template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
-KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value)
-{
- ptr_cast<simd_storage<T, N, A>>(dest)->value = value;
-}
-
-template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
-KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value)
-{
- constexpr size_t first = prev_poweroftwo(N);
- constexpr size_t rest = N - first;
- simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq<first>, overload_auto));
- simd_write<false, rest>(dest + first,
- simd_shuffle(simd_t<T, N>{}, value, csizeseq<rest, first>, overload_auto));
-}
-
template <typename T, size_t N>
KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index)
{
diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp
@@ -54,17 +54,32 @@ using simd = typename simd_type<T, N>::type;
template <typename T, size_t N, typename U>
union simd_small_array {
static_assert(sizeof(T) * N == sizeof(U), "");
- T arr[N];
U whole;
- KFR_INTRINSIC static constexpr simd_small_array from(U whole)
+ using value_type = T;
+ constexpr static size_t size = N;
+ using packed_type = U;
+
+ KFR_INTRINSIC constexpr simd_small_array() CMT_NOEXCEPT = default;
+
+ KFR_INTRINSIC constexpr simd_small_array(U whole) CMT_NOEXCEPT : whole(whole) {}
+
+ template <typename... Args>
+ KFR_INTRINSIC constexpr simd_small_array(T a, T b, Args... args) CMT_NOEXCEPT
+ : whole(pack_elements<U, T>(a, b, args...))
{
- union {
- const U w;
- simd_small_array r;
- } u{ whole };
- return u.r;
}
+
+ KFR_INTRINSIC static constexpr simd_small_array from(U whole) CMT_NOEXCEPT { return { whole }; }
+};
+
+template <typename T>
+struct is_simd_small_array : cfalse_t
+{
+};
+template <typename T, size_t N, typename U>
+struct is_simd_small_array<simd_small_array<T, N, U>> : ctrue_t
+{
};
#define KFR_SIMD_TYPE(T, N, ...) \
@@ -108,8 +123,6 @@ KFR_SIMD_SMALL_TYPE(i8, 8, u64)
KFR_SIMD_SMALL_TYPE(i16, 4, u64)
KFR_SIMD_SMALL_TYPE(i32, 2, u64)
-KFR_SIMD_SMALL_TYPE(f32, 2, f64)
-
#ifdef CMT_ARCH_SSE
KFR_SIMD_TYPE(f32, 4, __m128)
KFR_SIMD_TYPE(f64, 2, __m128d)
@@ -207,11 +220,15 @@ KFR_SIMD_TYPE(f64, 2, float64x2_t);
#ifdef CMT_ARCH_SSE2
inline __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT { return _mm_set_epi64x(q1, q0); }
+inline __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT
+{
+ return _mm_set_epi32(q3, q2, q1, q0);
+}
KFR_INTRIN_MAKE(2, i64, KFR_mm_setr_epi64x)
KFR_INTRIN_MAKE(2, u64, KFR_mm_setr_epi64x)
KFR_INTRIN_MAKE(2, f64, _mm_setr_pd)
-KFR_INTRIN_MAKE(4, i32, _mm_setr_epi32)
-KFR_INTRIN_MAKE(4, u32, _mm_setr_epi32)
+KFR_INTRIN_MAKE(4, i32, KFR_mm_setr_epi32)
+KFR_INTRIN_MAKE(4, u32, KFR_mm_setr_epi32)
KFR_INTRIN_MAKE(4, f32, _mm_setr_ps)
KFR_INTRIN_MAKE(8, i16, _mm_setr_epi16)
KFR_INTRIN_MAKE(8, u16, _mm_setr_epi16)
@@ -301,7 +318,7 @@ KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x))
KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x))
KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x))
KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 2>>(_mm_cvtsd_f64(_mm_castps_pd(x))))
-KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole)))
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_unpacklo_ps(_mm_set_ss(x.low), _mm_set_ss(x.high)))
KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x))
KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x))))
@@ -333,11 +350,24 @@ KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDE
return __VA_ARGS__; \
}
+#define KFR_INTRIN_CONVERT_NOOP_REF(Tout, Tin, N) \
+ KFR_INTRINSIC const simd<Tout, N>& simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) \
+ CMT_NOEXCEPT \
+ { \
+ return x; \
+ }
+#define KFR_INTRIN_CONVERT_NOOP(Tout, Tin, N) \
+ KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \
+ { \
+ return x; \
+ }
+
KFR_INTRIN_CONVERT(f32, i32, 4, _mm_cvtepi32_ps(x))
KFR_INTRIN_CONVERT(i32, f32, 4, _mm_cvttps_epi32(x))
KFR_INTRIN_CONVERT(i32, f64, 2, simd<i32, 2>::from(_mm_cvtsi128_si64(_mm_cvttpd_epi32(x))))
KFR_INTRIN_CONVERT(f64, i32, 2, _mm_cvtepi32_pd(KFR_mm_setr_epi64x(x.whole, 0)))
-KFR_INTRIN_CONVERT(i64, f64, 2, _mm_set_epi64x(_mm_cvttsd_si64(_mm_unpackhi_pd(x, x)), _mm_cvttsd_si64(x)))
+KFR_INTRIN_CONVERT(i64, f64, 2,
+ KFR_mm_setr_epi64x(_mm_cvttsd_si64(x), _mm_cvttsd_si64(_mm_unpackhi_pd(x, x))))
KFR_INTRIN_CONVERT(f64, i64, 2,
_mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), _mm_cvtsi128_si64(x)),
_mm_cvtsi64_sd(_mm_setzero_pd(), KFR_i64sse_INDEX(x, 1))))
@@ -355,6 +385,25 @@ KFR_INTRIN_CONVERT(f32, f64, 4,
simd<f32, 4>{ _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtpd_ps(x.low)),
_mm_castps_pd(_mm_cvtpd_ps(x.high)))) })
#endif
+
+KFR_INTRIN_CONVERT_NOOP(u8, i8, 1)
+KFR_INTRIN_CONVERT_NOOP(i8, u8, 1)
+KFR_INTRIN_CONVERT_NOOP(u16, i16, 1)
+KFR_INTRIN_CONVERT_NOOP(i16, u16, 1)
+KFR_INTRIN_CONVERT_NOOP(u32, i32, 1)
+KFR_INTRIN_CONVERT_NOOP(i32, u32, 1)
+KFR_INTRIN_CONVERT_NOOP(u64, i64, 1)
+KFR_INTRIN_CONVERT_NOOP(i64, u64, 1)
+
+KFR_INTRIN_CONVERT_NOOP_REF(u8, i8, 16)
+KFR_INTRIN_CONVERT_NOOP_REF(i8, u8, 16)
+KFR_INTRIN_CONVERT_NOOP_REF(u16, i16, 8)
+KFR_INTRIN_CONVERT_NOOP_REF(i16, u16, 8)
+KFR_INTRIN_CONVERT_NOOP_REF(u32, i32, 4)
+KFR_INTRIN_CONVERT_NOOP_REF(i32, u32, 4)
+KFR_INTRIN_CONVERT_NOOP_REF(u64, i64, 2)
+KFR_INTRIN_CONVERT_NOOP_REF(i64, u64, 2)
+
#endif // CMT_ARCH_SSE2
#ifdef CMT_ARCH_SSE41
@@ -707,12 +756,34 @@ KFR_INTRINSIC simd_array<T, N> to_simd_array(const simd<T, N>& x) CMT_NOEXCEPT
return bitcast_anything<simd_array<T, N>>(x);
}
+#if defined CMT_COMPILER_MSVC
+
+template <typename T, size_t N, KFR_ENABLE_IF(!is_simd_small_array<simd<T, N>>::value)>
+KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
+{
+ return bitcast_anything<simd<T, N>>(x);
+}
+
+template <typename T, size_t N, size_t... indices>
+KFR_INTRINSIC simd<T, N> from_simd_array_impl(const simd_array<T, N>& x, csizes_t<indices...>) CMT_NOEXCEPT
+{
+ return { x.val[indices]... };
+}
+
+template <typename T, size_t N, KFR_ENABLE_IF(is_simd_small_array<simd<T, N>>::value)>
+KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
+{
+ return from_simd_array_impl(x, csizeseq<N>);
+}
+#else
template <typename T, size_t N>
KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT
{
return bitcast_anything<simd<T, N>>(x);
}
+#endif
+
#define KFR_COMPONENTWISE_RET(code) \
vec<T, N> result; \
for (size_t i = 0; i < N; i++) \
@@ -815,8 +886,7 @@ KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_t<T, T, N>, const simd<T,
template <typename T, size_t N, size_t index>
KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) CMT_NOEXCEPT
{
- not_optimized(CMT_FUNC_SIGNATURE);
- return to_simd_array<T, N>(value).val[index];
+ return simd_shuffle(simd_t<T, N>{}, value, csizes<index>, overload_auto);
}
template <typename T, size_t N, size_t index>
@@ -1022,53 +1092,15 @@ using simd_storage = struct_with_alignment<simd<T, N>, A>;
CMT_PRAGMA_GNU(GCC diagnostic pop)
-template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
-KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT
-{
- return reinterpret_cast<typename simd_storage<T, N, A>::const_pointer>(src)->value;
-}
-
-template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
-KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT
-{
- constexpr size_t first = prev_poweroftwo(N);
- constexpr size_t rest = N - first;
- constexpr auto extend_indices =
- cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>());
- constexpr auto concat_indices = cvalseq_t<size_t, N>();
- return simd_shuffle(
- simd2_t<T, first, first>{}, simd_read<first, A>(src),
- simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto),
- concat_indices, overload_auto);
-}
-
-template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
-KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT
-{
- reinterpret_cast<typename simd_storage<T, N, A>::pointer>(dest)->value = value;
-}
-
-template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
-KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT
-{
- constexpr size_t first = prev_poweroftwo(N);
- constexpr size_t rest = N - first;
- simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<first>(), overload_auto));
- simd_write<false, rest>(dest + first,
- simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<rest, first>(), overload_auto));
-}
-
template <typename T, size_t N>
KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) CMT_NOEXCEPT
{
- not_optimized(CMT_FUNC_SIGNATURE);
return to_simd_array<T, N>(value).val[index];
}
template <typename T, size_t N>
KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index, T x) CMT_NOEXCEPT
{
- not_optimized(CMT_FUNC_SIGNATURE);
simd_array<T, N> arr = to_simd_array<T, N>(value);
arr.val[index] = x;
return from_simd_array(arr);
diff --git a/include/kfr/simd/impl/read_write.hpp b/include/kfr/simd/impl/read_write.hpp
@@ -0,0 +1,397 @@
+/** @addtogroup read_write
+ * @{
+ */
+/*
+ Copyright (C) 2016 D Levin (https://www.kfrlib.com)
+ This file is part of KFR
+
+ KFR is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ KFR is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with KFR.
+
+ If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
+ Buying a commercial license is mandatory as soon as you develop commercial activities without
+ disclosing the source code of your own applications.
+ See https://www.kfrlib.com for details.
+ */
+#pragma once
+
+#include "../shuffle.hpp"
+#include "../types.hpp"
+#include "../vec.hpp"
+
+namespace kfr
+{
+inline namespace CMT_ARCH_NAME
+{
+namespace intrinsics
+{
+
+#ifndef CMT_CLANG_EXT
+
+#ifdef CMT_ARCH_SSE2
+
+template <typename T>
+KFR_INTRINSIC vec<T, 1> read(cunaligned_t, csize_t<1>, const T* ptr)
+{
+ return *ptr;
+}
+
+KFR_INTRINSIC f32x2 read(cunaligned_t, csize_t<2>, const f32* ptr)
+{
+ return f32x2::simd_type{ ptr[0], ptr[1] };
+}
+
+#if !defined(CMT_COMPILER_GCC)
+
+KFR_INTRINSIC u8x2 read(cunaligned_t, csize_t<2>, const u8* ptr)
+{
+ return u8x2::simd_type::from(*reinterpret_cast<const u16*>(ptr));
+}
+KFR_INTRINSIC i8x2 read(cunaligned_t, csize_t<2>, const i8* ptr)
+{
+ return i8x2::simd_type::from(*reinterpret_cast<const u16*>(ptr));
+}
+KFR_INTRINSIC u8x4 read(cunaligned_t, csize_t<4>, const u8* ptr)
+{
+ return u8x4::simd_type::from(*reinterpret_cast<const u32*>(ptr));
+}
+KFR_INTRINSIC i8x4 read(cunaligned_t, csize_t<4>, const i8* ptr)
+{
+ return i8x4::simd_type::from(*reinterpret_cast<const u32*>(ptr));
+}
+KFR_INTRINSIC u16x2 read(cunaligned_t, csize_t<2>, const u16* ptr)
+{
+ return u16x2::simd_type::from(*reinterpret_cast<const u32*>(ptr));
+}
+KFR_INTRINSIC i16x2 read(cunaligned_t, csize_t<2>, const i16* ptr)
+{
+ return i16x2::simd_type::from(*reinterpret_cast<const u32*>(ptr));
+}
+KFR_INTRINSIC u8x8 read(cunaligned_t, csize_t<8>, const u8* ptr)
+{
+ return u8x8::simd_type::from(*reinterpret_cast<const u64*>(ptr));
+}
+KFR_INTRINSIC i8x8 read(cunaligned_t, csize_t<8>, const i8* ptr)
+{
+ return i8x8::simd_type::from(*reinterpret_cast<const u64*>(ptr));
+}
+KFR_INTRINSIC u16x4 read(cunaligned_t, csize_t<4>, const u16* ptr)
+{
+ return u16x4::simd_type::from(*reinterpret_cast<const u64*>(ptr));
+}
+KFR_INTRINSIC i16x4 read(cunaligned_t, csize_t<4>, const i16* ptr)
+{
+ return i16x4::simd_type::from(*reinterpret_cast<const u64*>(ptr));
+}
+KFR_INTRINSIC u32x2 read(cunaligned_t, csize_t<2>, const u32* ptr)
+{
+ return u32x2::simd_type::from(*reinterpret_cast<const u64*>(ptr));
+}
+KFR_INTRINSIC i32x2 read(cunaligned_t, csize_t<2>, const i32* ptr)
+{
+ return i32x2::simd_type::from(*reinterpret_cast<const u64*>(ptr));
+}
+
+#endif
+
+KFR_INTRINSIC f32sse read(cunaligned_t, csize_t<4>, const f32* ptr) { return _mm_loadu_ps(ptr); }
+KFR_INTRINSIC f64sse read(cunaligned_t, csize_t<2>, const f64* ptr) { return _mm_loadu_pd(ptr); }
+KFR_INTRINSIC u8sse read(cunaligned_t, csize_t<16>, const u8* ptr)
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+}
+KFR_INTRINSIC i8sse read(cunaligned_t, csize_t<16>, const i8* ptr)
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+}
+KFR_INTRINSIC u16sse read(cunaligned_t, csize_t<8>, const u16* ptr)
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+}
+KFR_INTRINSIC i16sse read(cunaligned_t, csize_t<8>, const i16* ptr)
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+}
+KFR_INTRINSIC u32sse read(cunaligned_t, csize_t<4>, const u32* ptr)
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+}
+KFR_INTRINSIC i32sse read(cunaligned_t, csize_t<4>, const i32* ptr)
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+}
+KFR_INTRINSIC u64sse read(cunaligned_t, csize_t<2>, const u64* ptr)
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+}
+KFR_INTRINSIC i64sse read(cunaligned_t, csize_t<2>, const i64* ptr)
+{
+ return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
+}
+
+template <typename T>
+KFR_INTRINSIC void write(cunaligned_t, T* ptr, const vec<T, 1>& x)
+{
+ *ptr = x.front();
+}
+KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32x2& x)
+{
+ ptr[0] = x.v.low;
+ ptr[1] = x.v.high;
+}
+
+KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8x2& x) { *reinterpret_cast<u16*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8x2& x) { *reinterpret_cast<u16*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8x4& x) { *reinterpret_cast<u32*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8x4& x) { *reinterpret_cast<u32*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16x2& x) { *reinterpret_cast<u32*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16x2& x) { *reinterpret_cast<u32*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8x8& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8x8& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16x4& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16x4& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, u32* ptr, const u32x2& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; }
+KFR_INTRINSIC void write(cunaligned_t, i32* ptr, const i32x2& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; }
+
+KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32sse& x) { _mm_storeu_ps(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, f64* ptr, const f64sse& x) { _mm_storeu_pd(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8sse& x)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8sse& x)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16sse& x)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16sse& x)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, u32* ptr, const u32sse& x)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, i32* ptr, const i32sse& x)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, u64* ptr, const u64sse& x)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, i64* ptr, const i64sse& x)
+{
+ _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v);
+}
+
+#if defined CMT_ARCH_AVX
+
+KFR_INTRINSIC f32avx read(cunaligned_t, csize_t<8>, const f32* ptr) { return _mm256_loadu_ps(ptr); }
+KFR_INTRINSIC f64avx read(cunaligned_t, csize_t<4>, const f64* ptr) { return _mm256_loadu_pd(ptr); }
+
+KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32avx& x) { _mm256_storeu_ps(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, f64* ptr, const f64avx& x) { _mm256_storeu_pd(ptr, x.v); }
+
+#if defined CMT_ARCH_AVX2
+
+KFR_INTRINSIC u8avx read(cunaligned_t, csize_t<32>, const u8* ptr)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+}
+KFR_INTRINSIC i8avx read(cunaligned_t, csize_t<32>, const i8* ptr)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+}
+KFR_INTRINSIC u16avx read(cunaligned_t, csize_t<16>, const u16* ptr)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+}
+KFR_INTRINSIC i16avx read(cunaligned_t, csize_t<16>, const i16* ptr)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+}
+KFR_INTRINSIC u32avx read(cunaligned_t, csize_t<8>, const u32* ptr)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+}
+KFR_INTRINSIC i32avx read(cunaligned_t, csize_t<8>, const i32* ptr)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+}
+KFR_INTRINSIC u64avx read(cunaligned_t, csize_t<4>, const u64* ptr)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+}
+KFR_INTRINSIC i64avx read(cunaligned_t, csize_t<4>, const i64* ptr)
+{
+ return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr));
+}
+
+KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8avx& x)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8avx& x)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16avx& x)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16avx& x)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, u32* ptr, const u32avx& x)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, i32* ptr, const i32avx& x)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, u64* ptr, const u64avx& x)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v);
+}
+KFR_INTRINSIC void write(cunaligned_t, i64* ptr, const i64avx& x)
+{
+ _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v);
+}
+
+#if defined CMT_ARCH_AVX512
+
+KFR_INTRINSIC f32avx512 read(cunaligned_t, csize_t<16>, const f32* ptr) { return _mm512_loadu_ps(ptr); }
+KFR_INTRINSIC f64avx512 read(cunaligned_t, csize_t<8>, const f64* ptr) { return _mm512_loadu_pd(ptr); }
+
+KFR_INTRINSIC u8avx512 read(cunaligned_t, csize_t<64>, const u8* ptr) { return _mm512_loadu_epi8(ptr); }
+KFR_INTRINSIC i8avx512 read(cunaligned_t, csize_t<64>, const i8* ptr) { return _mm512_loadu_epi8(ptr); }
+KFR_INTRINSIC u16avx512 read(cunaligned_t, csize_t<32>, const u16* ptr) { return _mm512_loadu_epi16(ptr); }
+KFR_INTRINSIC i16avx512 read(cunaligned_t, csize_t<32>, const i16* ptr) { return _mm512_loadu_epi16(ptr); }
+KFR_INTRINSIC u32avx512 read(cunaligned_t, csize_t<16>, const u32* ptr) { return _mm512_loadu_epi32(ptr); }
+KFR_INTRINSIC i32avx512 read(cunaligned_t, csize_t<16>, const i32* ptr) { return _mm512_loadu_epi32(ptr); }
+KFR_INTRINSIC u64avx512 read(cunaligned_t, csize_t<8>, const u64* ptr) { return _mm512_loadu_epi64(ptr); }
+KFR_INTRINSIC i64avx512 read(cunaligned_t, csize_t<8>, const i64* ptr) { return _mm512_loadu_epi64(ptr); }
+
+KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32avx512& x) { _mm512_storeu_ps(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, f64* ptr, const f64avx512& x) { _mm512_storeu_pd(ptr, x.v); }
+
+KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8avx512& x) { _mm512_storeu_epi8(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8avx512& x) { _mm512_storeu_epi8(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16avx512& x) { _mm512_storeu_epi16(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16avx512& x) { _mm512_storeu_epi16(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, u32* ptr, const u32avx512& x) { _mm512_storeu_epi32(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, i32* ptr, const i32avx512& x) { _mm512_storeu_epi32(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, u64* ptr, const u64avx512& x) { _mm512_storeu_epi64(ptr, x.v); }
+KFR_INTRINSIC void write(cunaligned_t, i64* ptr, const i64avx512& x) { _mm512_storeu_epi64(ptr, x.v); }
+
+#endif
+#endif
+#endif
+#else
+
+// fallback
+
+template <size_t N, typename T, KFR_ENABLE_IF(N == 1 || is_simd_size<T>(N))>
+KFR_INTRINSIC vec<T, N> read(cunaligned_t, csize_t<N>, const T* ptr) CMT_NOEXCEPT
+{
+ vec<T, N> result{};
+ for (size_t i = 0; i < N; i++)
+ result[i] = ptr[i];
+ return result;
+}
+
+template <size_t N, typename T, KFR_ENABLE_IF(N == 1 || is_simd_size<T>(N))>
+KFR_INTRINSIC void write(cunaligned_t, T* ptr, const vec<T, N>& x) CMT_NOEXCEPT
+{
+ for (size_t i = 0; i < N; i++)
+ ptr[i] = x[i];
+}
+
+#endif
+
+template <size_t N, typename T, KFR_ENABLE_IF(N != 1 && !is_simd_size<T>(N)),
+ size_t Nlow = prev_poweroftwo(N - 1)>
+KFR_INTRINSIC vec<T, N> read(cunaligned_t, csize_t<N>, const T* ptr) CMT_NOEXCEPT
+{
+ return concat(read(cunaligned, csize<Nlow>, ptr), read(cunaligned, csize<N - Nlow>, ptr + Nlow));
+}
+
+template <size_t N, typename T, KFR_ENABLE_IF(N != 1 && !is_simd_size<T>(N)),
+ size_t Nlow = prev_poweroftwo(N - 1)>
+KFR_INTRINSIC void write(cunaligned_t, T* ptr, const vec<T, N>& x) CMT_NOEXCEPT
+{
+ write(cunaligned, ptr, x.shuffle(csizeseq<Nlow>));
+ write(cunaligned, ptr + Nlow, x.shuffle(csizeseq<N - Nlow, Nlow>));
+}
+
+#else
+
+template <size_t N, typename T>
+KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT
+{
+ return reinterpret_cast<typename simd_storage<T, N, false>::const_pointer>(src)->value;
+}
+
+template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
+KFR_INTRINSIC vec<T, N> read(cunaligned_t, csize_t<N>, const T* src) CMT_NOEXCEPT
+{
+ // Clang requires a separate function returning vector (simd).
+ // Direct returning vec causes aligned read instruction
+ return simd_read<N>(src);
+}
+
+template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void>
+KFR_INTRINSIC vec<T, N> read(cunaligned_t, csize_t<N>, const T* src) CMT_NOEXCEPT
+{
+ constexpr size_t first = prev_poweroftwo(N);
+ return concat(read(cunaligned, csize<first>, src), read(cunaligned, csize<N - first>, src + first));
+}
+
+template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))>
+KFR_INTRINSIC void write(cunaligned_t, T* dest, const vec<T, N>& x) CMT_NOEXCEPT
+{
+ reinterpret_cast<typename simd_storage<T, N, A>::pointer>(dest)->value = x.v;
+}
+
+template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)),
+ size_t Nlow = prev_poweroftwo(N - 1)>
+KFR_INTRINSIC void write(cunaligned_t, T* dest, const vec<T, N>& x) CMT_NOEXCEPT
+{
+ write(cunaligned, dest, x.shuffle(csizeseq<Nlow>));
+ write(cunaligned, dest + Nlow, x.shuffle(csizeseq<N - Nlow, Nlow>));
+}
+
+#endif
+
+template <size_t N, typename T>
+KFR_INTRINSIC vec<T, N> read(caligned_t, csize_t<N>, const T* __restrict ptr) CMT_NOEXCEPT
+{
+ return *reinterpret_cast<const typename vec<T, N>::simd_type*>(ptr);
+}
+
+template <size_t N, typename T>
+KFR_INTRINSIC void write(caligned_t, T* __restrict ptr, const vec<T, N>& __restrict x) CMT_NOEXCEPT
+{
+ *reinterpret_cast<typename vec<T, N>::simd_type*>(ptr) = x.v;
+}
+
+} // namespace intrinsics
+
+} // namespace CMT_ARCH_NAME
+} // namespace kfr
diff --git a/include/kfr/simd/read_write.hpp b/include/kfr/simd/read_write.hpp
@@ -25,9 +25,7 @@
*/
#pragma once
-#include "shuffle.hpp"
-#include "types.hpp"
-#include "vec.hpp"
+#include "impl/read_write.hpp"
namespace kfr
{
@@ -35,15 +33,16 @@ inline namespace CMT_ARCH_NAME
{
template <size_t N, bool A = false, typename T>
-KFR_INTRINSIC static vec<T, N> read(const T* src)
+KFR_INTRINSIC vec<T, N> read(const T* src)
{
- return vec<T, N>(src, cbool_t<A>());
+ return vec<T, N>::from_flatten(intrinsics::read(cbool<A>, csize<N * compound_type_traits<T>::deep_width>,
+ ptr_cast<deep_subtype<T>>(src)));
}
template <bool A = false, size_t N, typename T>
-KFR_INTRINSIC static void write(T* dest, const vec<T, N>& value)
+KFR_INTRINSIC void write(T* dest, const vec<T, N>& value)
{
- value.write(dest, cbool_t<A>());
+ intrinsics::write(cbool<A>, ptr_cast<deep_subtype<T>>(dest), value.flatten());
}
template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)>
@@ -239,5 +238,24 @@ KFR_INTRINSIC vec<T, N> partial_mask(size_t index, vec_shape<T, N>)
{
return partial_mask<T, N>(index);
}
+
+// read/write
+template <typename T, size_t N>
+template <bool aligned>
+KFR_MEM_INTRINSIC constexpr vec<T, N>::vec(const value_type* src, cbool_t<aligned>) CMT_NOEXCEPT
+ : vec(vec<T, N>::from_flatten(intrinsics::read(cbool<aligned>,
+ csize<N * compound_type_traits<T>::deep_width>,
+ ptr_cast<deep_subtype<T>>(src))))
+{
+}
+
+template <typename T, size_t N>
+template <bool aligned>
+KFR_MEM_INTRINSIC const vec<T, N>& vec<T, N>::write(value_type* dest, cbool_t<aligned>) const CMT_NOEXCEPT
+{
+ intrinsics::write(cbool<aligned>, ptr_cast<deep_subtype<T>>(dest), flatten());
+ return *this;
+}
+
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/simd/types.hpp b/include/kfr/simd/types.hpp
@@ -41,6 +41,7 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-qualifiers")
#ifdef KFR_TESTING
#include "../testo/testo.hpp"
+#include "../cometa/function.hpp"
#endif
#include "../cometa.hpp"
@@ -149,12 +150,20 @@ constexpr ctypes_t<i8, i16, i32, i64, u8, u16, u32, u64, f32
constexpr csizes_t<1, 2, 3, 4, 8, 16, 32, 64> test_vector_sizes{};
+#ifdef CMT_ARCH_AVX512
+constexpr size_t max_test_size = 128;
+#elif defined CMT_ARCH_AVX
+constexpr size_t max_test_size = 64;
+#else
+constexpr size_t max_test_size = 32;
+#endif
+
template <template <typename, size_t> class vec_tpl, typename T,
typename sizes =
#ifdef KFR_EXTENDED_TESTS
- cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<64 / sizeof(T)>)>
+ cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<max_test_size / sizeof(T)>)>
#else
- csizes_t<1>
+ csizes_t<1, 2>
#endif
>
struct vector_types_for_size_t_impl;
diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp
@@ -143,18 +143,23 @@ template <typename T>
struct compoundcast
{
static vec<T, 1> to_flat(const T& x) { return vec<T, 1>(x); }
+
static T from_flat(const vec<T, 1>& x) { return x.front(); }
};
+
template <typename T, size_t N>
struct compoundcast<vec<T, N>>
{
static const vec<T, N>& to_flat(const vec<T, N>& x) { return x; }
+
static const vec<T, N>& from_flat(const vec<T, N>& x) { return x; }
};
+
template <typename T, size_t N1, size_t N2>
struct compoundcast<vec<vec<T, N1>, N2>>
{
static vec<T, N1 * N2> to_flat(const vec<vec<T, N1>, N2>& x) { return x; }
+
static vec<vec<T, N1>, N2> from_flat(const vec<T, N1 * N2>& x) { return x; }
};
} // namespace internal
@@ -219,6 +224,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
: v(intrinsics::simd_broadcast(intrinsics::simd_t<ST, SN>{}, static_cast<ST>(s)))
{
}
+
template <typename U,
KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)>
KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT
@@ -234,6 +240,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
: v(intrinsics::simd_make(ctype<T>, s0, s1, static_cast<value_type>(rest)...))
{
}
+
template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && !compound_type_traits<T>::is_scalar)>
KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... rest) CMT_NOEXCEPT
: v(intrinsics::simd_concat<ST, size_t(SW), size_t(SW), just_value<Us, size_t>(SW)...>(
@@ -249,6 +256,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
: v(intrinsics::simd_convert(intrinsics::simd_cvt_t<ST, deep_subtype<U>, SN>{}, x.v))
{
}
+
template <typename U,
KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)>
KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT
@@ -315,6 +323,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
{
return intrinsics::simd_get_element<T, N>(v, index);
}
+
template <int dummy = 0, typename = void,
KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)>
KFR_MEM_INTRINSIC constexpr value_type get(size_t index) const CMT_NOEXCEPT
@@ -327,6 +336,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
{
return intrinsics::simd_get_element<T, N>(v, csize<index>);
}
+
template <size_t index, typename = void,
KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)>
KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const CMT_NOEXCEPT
@@ -340,6 +350,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
{
v = intrinsics::simd_set_element<T, N>(v, index, s);
}
+
template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)>
KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT
{
@@ -351,6 +362,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
{
v = intrinsics::simd_set_element<T, N>(v, csize<index>, s);
}
+
template <size_t index, typename = void,
KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)>
KFR_MEM_INTRINSIC constexpr void set(csize_t<index>, const value_type& s) CMT_NOEXCEPT
@@ -388,20 +400,14 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits<
// read/write
template <bool aligned = false>
KFR_MEM_INTRINSIC explicit constexpr vec(const value_type* src,
- cbool_t<aligned> = cbool_t<aligned>()) CMT_NOEXCEPT
- : v(intrinsics::simd_read<SN, aligned>(ptr_cast<ST>(src)))
- {
- }
+ cbool_t<aligned> = cbool_t<aligned>()) CMT_NOEXCEPT;
template <bool aligned = false>
KFR_MEM_INTRINSIC const vec& write(value_type* dest,
- cbool_t<aligned> = cbool_t<aligned>()) const CMT_NOEXCEPT
- {
- intrinsics::simd_write<aligned, SN>(ptr_cast<ST>(dest), v);
- return *this;
- }
+ cbool_t<aligned> = cbool_t<aligned>()) const CMT_NOEXCEPT;
KFR_MEM_INTRINSIC vec<ST, SN> flatten() const CMT_NOEXCEPT { return v; }
+
KFR_MEM_INTRINSIC static vec from_flatten(const vec<ST, SN>& x) { return vec(x.v); }
KFR_MEM_INTRINSIC constexpr mask_t asmask() const CMT_NOEXCEPT { return mask_t(v); }
@@ -1073,8 +1079,11 @@ constexpr cint_t<2> vectors{};
constexpr cint_t<3> all{};
constexpr inline auto types(cint_t<0>) { return ctypes_t<>{}; }
+
constexpr inline auto types(cint_t<1>) { return cconcat(numeric_types); }
+
constexpr inline auto types(cint_t<2>) { return cconcat(numeric_vector_types<vec>); }
+
constexpr inline auto types(cint_t<3>) { return cconcat(numeric_types, numeric_vector_types<vec>); }
} // namespace test_catogories
@@ -1089,15 +1098,19 @@ template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_retur
void test_function1(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isapplicable = IsApplicable{})
{
testo::matrix(
- named("type") = test_catogories::types(cat), named("value") = special_values(),
- [&](auto type, special_value value) {
+ named("value") = special_values(), named("type") = test_catogories::types(cat),
+ [&](special_value value, auto type) {
using T = type_of<decltype(type)>;
if (isapplicable(ctype<T>, value))
{
const T x(value);
CHECK(std::is_same<decltype(fn(x)), typename compound_type_traits<T>::template rebind<
decltype(reffn(std::declval<subtype<T>>()))>>::value);
- CHECK(fn(x) == apply(reffn, x));
+ const auto fn_x = fn(x);
+ const auto ref_x = apply(reffn, x);
+ ::testo::active_test()->check(testo::deep_is_equal(ref_x, fn_x),
+ as_string(fn_x, " == ", ref_x), "fn(x) == apply(reffn, x)");
+ // CHECK(fn(x) == apply(reffn, x));
}
});
@@ -1112,9 +1125,9 @@ template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_retur
void test_function2(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isapplicable = IsApplicable{})
{
testo::matrix(
- named("type") = test_catogories::types(cat),
named("value1") = special_values(), //
- named("value2") = special_values(), [&](auto type, special_value value1, special_value value2) {
+ named("value2") = special_values(), named("type") = test_catogories::types(cat),
+ [&](special_value value1, special_value value2, auto type) {
using T = type_of<decltype(type)>;
const T x1(value1);
const T x2(value2);
@@ -1146,6 +1159,7 @@ struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>>
static_assert(N1 == Ns1, "");
static_assert(!is_compound<To>::value, "");
static_assert(!is_compound<From>::value, "");
+
static vec<vec<To, N1>, N2> cast(const vec<From, N1>& value)
{
return vec<vec<To, N1>, N2>::from_flatten(
@@ -1153,6 +1167,7 @@ struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>>
.shuffle(csizeseq<N2 * vec<From, N1>::scalar_size()> % csize<N2>));
}
};
+
// vector<vector> to vector<vector>
template <typename To, typename From, size_t N1, size_t N2, size_t NN1, size_t NN2>
struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, NN1>, NN2>>
@@ -1161,6 +1176,7 @@ struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, NN1>, NN2>>
static_assert(N2 == NN2, "");
static_assert(!is_compound<To>::value, "");
static_assert(!is_compound<From>::value, "");
+
static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value)
{
return vec<vec<To, N1>, N2>::from_flatten(kfr::innercast<To>(value.flatten()));
diff --git a/include/kfr/testo/comparison.hpp b/include/kfr/testo/comparison.hpp
@@ -79,55 +79,46 @@ struct eplison_scope<void>
eplison_scope<long double> ld;
};
-template <>
-struct equality_comparer<float, float>
-{
- bool operator()(const float& l, const float& r) const
- {
- return !(std::abs(l - r) > current_epsilon<float>());
- }
-};
-template <>
-struct equality_comparer<double, double>
-{
- bool operator()(const double& l, const double& r) const
- {
- return !(std::abs(l - r) > current_epsilon<double>());
- }
-};
-template <>
-struct equality_comparer<long double, long double>
+CMT_PRAGMA_GNU(GCC diagnostic pop)
+
+template <typename T1, typename T2,
+ CMT_ENABLE_IF(compound_type_traits<T1>::is_scalar&& compound_type_traits<T2>::is_scalar &&
+ (std::is_floating_point<T1>::value || std::is_floating_point<T2>::value))>
+constexpr bool deep_is_equal(const T1& x, const T2& y)
{
- bool operator()(const long double& l, const long double& r) const
- {
- return !(std::abs(l - r) > current_epsilon<long double>());
- }
-};
+ using C = std::common_type_t<T1, T2>;
+ const C xx = static_cast<C>(x);
+ const C yy = static_cast<C>(y);
+ if (std::isnan(xx) && std::isnan(yy))
+ return true;
+ if (std::isnan(xx) || std::isnan(yy))
+ return false;
-CMT_PRAGMA_GNU(GCC diagnostic pop)
+ return !(std::abs(xx - yy) > current_epsilon<C>());
+}
-template <typename L, typename R>
-struct equality_comparer<L, R, void_t<enable_if<!compound_type_traits<L>::is_scalar>>>
+template <typename T1, typename T2,
+ CMT_ENABLE_IF(compound_type_traits<T1>::is_scalar&& compound_type_traits<T2>::is_scalar &&
+ !std::is_floating_point<T1>::value && !std::is_floating_point<T2>::value)>
+constexpr bool deep_is_equal(const T1& x, const T2& y)
{
- using Tsubtype = subtype<L>;
- constexpr static static_assert_type_eq<subtype<L>, subtype<R>> assert{};
+ return x == y;
+}
- bool operator()(const L& l, const R& r) const
+template <typename T1, typename T2,
+ CMT_ENABLE_IF(!compound_type_traits<T1>::is_scalar || !compound_type_traits<T2>::is_scalar)>
+constexpr bool deep_is_equal(const T1& x, const T2& y)
+{
+ static_assert(compound_type_traits<T1>::width == compound_type_traits<T2>::width ||
+ compound_type_traits<T1>::is_scalar || compound_type_traits<T2>::is_scalar,
+ "");
+ for (size_t i = 0; i < std::max(+compound_type_traits<T1>::width, +compound_type_traits<T2>::width); i++)
{
- if (compound_type_traits<L>::width != compound_type_traits<R>::width)
+ if (!deep_is_equal(compound_type_traits<T1>::at(x, i), compound_type_traits<T2>::at(y, i)))
return false;
-
- compound_type_traits<L> itl;
- compound_type_traits<R> itr;
- for (size_t i = 0; i < compound_type_traits<L>::width; i++)
- {
- equality_comparer<Tsubtype, Tsubtype> cmp;
- if (!cmp(itl.at(l, i), itr.at(r, i)))
- return false;
- }
- return true;
}
-};
+ return true;
+}
struct cmp_eq
{
@@ -136,8 +127,7 @@ struct cmp_eq
template <typename L, typename R>
bool operator()(L&& left, R&& right) const
{
- equality_comparer<decay<L>, decay<R>> eq;
- return eq(left, right);
+ return deep_is_equal(left, right);
}
};
diff --git a/include/kfr/testo/testo.hpp b/include/kfr/testo/testo.hpp
@@ -346,6 +346,7 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar
CMT_UNUSED static int run_all(const std::string& name = std::string(), bool show_successful = false)
{
+ console_color c(White);
std::vector<test_case*> success;
std::vector<test_case*> failed;
int success_checks = 0;
@@ -381,6 +382,7 @@ CMT_UNUSED static int run_all(const std::string& name = std::string(), bool show
return static_cast<int>(failed.size());
}
+
template <typename T1, typename T2>
void assert_is_same()
{
diff --git a/sources.cmake b/sources.cmake
@@ -134,6 +134,7 @@ set(
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_generic.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/function.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp
+ ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/read_write.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h
${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specializations.i
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -20,6 +20,10 @@ cmake_minimum_required(VERSION 3.1)
add_definitions(-DKFR_TESTING=1)
add_definitions(-DKFR_SRC_DIR=\"${CMAKE_SOURCE_DIR}\")
+if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ add_compile_options(-Wno-parentheses)
+endif ()
+
# Binary output directories
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin)
@@ -29,8 +33,11 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/")
if (ENABLE_ASMTEST)
add_executable(asm_test asm_test.cpp)
target_link_libraries(asm_test kfr)
- target_set_arch(asm_test PRIVATE avx)
+ target_set_arch(asm_test PRIVATE avx2)
target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED)
+ if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
+ target_compile_options(asm_test PRIVATE -fno-stack-protector)
+ endif ()
add_custom_command(TARGET asm_test POST_BUILD COMMAND objconv -fyasm $<TARGET_FILE:asm_test>)
endif()
@@ -113,30 +120,32 @@ function(add_x86_test ARCH)
endfunction()
+message(STATUS "ARCH_TESTS = ${ARCH_TESTS}")
+
+if (ARCH_TESTS AND ARCH_TESTS STREQUAL "ON")
+ set (ARCH_LIST generic sse2 ssse3 sse42 avx avx2 avx512)
+else ()
+ string (REPLACE "," ";" ARCH_LIST "${ARCH_TESTS}")
+endif ()
+
+if (MSVC AND NOT CLANG)
+ list(REMOVE_ITEM ARCH_LIST generic)
+endif ()
+
+message(STATUS "Testing for ${ARCH_LIST}")
+
if (ARCH_TESTS)
- if (NOT MSVC OR CLANG)
- add_x86_test(generic)
- endif ()
- add_x86_test(sse2)
- add_x86_test(sse3)
- add_x86_test(ssse3)
- add_x86_test(sse41)
- add_x86_test(avx)
- add_x86_test(avx2)
- add_x86_test(avx512)
+
+ foreach(A IN LISTS ARCH_LIST)
+ add_x86_test(${A})
+ endforeach()
if (ARCH_TESTS_MULTI)
add_executable(all_tests_multiarch all_tests.cpp)
target_compile_definitions(all_tests_multiarch PRIVATE KFR_MULTI_ARCH)
- target_link_libraries(all_tests_multiarch
- all_tests_multiarch_sse2
- all_tests_multiarch_sse3
- all_tests_multiarch_ssse3
- all_tests_multiarch_sse41
- all_tests_multiarch_avx
- all_tests_multiarch_avx2
- all_tests_multiarch_avx512
- )
+ foreach(A IN LISTS ARCH_LIST)
+ target_link_libraries(all_tests_multiarch all_tests_multiarch_${A})
+ endforeach()
endif ()
endif()
@@ -162,13 +171,8 @@ if (NOT IOS)
COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests)
if (ARCH_TESTS)
- add_test(NAME generic COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_generic )
- add_test(NAME sse2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse2 )
- add_test(NAME sse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse3 )
- add_test(NAME ssse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_ssse3 )
- add_test(NAME sse41 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse41 )
- add_test(NAME avx COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx )
- add_test(NAME avx2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx2 )
- add_test(NAME avx512 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx512 )
+ foreach(A IN LISTS ARCH_LIST)
+ add_test(NAME ${A} COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_${A})
+ endforeach()
endif ()
endif ()
diff --git a/tests/asm_test.cpp b/tests/asm_test.cpp
@@ -4,12 +4,20 @@
* See LICENSE.txt for details
*/
+#define KFR_EXTENDED_TESTS
+
#include <kfr/base.hpp>
#include <kfr/io.hpp>
#include <kfr/testo/console_colors.hpp>
using namespace kfr;
+#ifdef CMT_COMPILER_MSVC
+#define KFR_PUBLIC CMT_PUBLIC_C CMT_DLL_EXPORT
+#else
+#define KFR_PUBLIC CMT_PUBLIC_C
+#endif
+
#define TEST_ASM_8(fn, ty, MACRO) \
MACRO(fn, ty, 1) \
MACRO(fn, ty, 2) \
@@ -43,12 +51,6 @@ using namespace kfr;
MACRO(fn, ty, 8) \
MACRO(fn, ty, 16)
-#ifdef CMT_COMPILER_MSVC
-#define KFR_PUBLIC CMT_PUBLIC_C CMT_DLL_EXPORT
-#else
-#define KFR_PUBLIC CMT_PUBLIC_C
-#endif
-
#define TEST_ASM_VTY1(fn, ty, n) \
KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x) { r = kfr::fn(x); }
@@ -121,6 +123,28 @@ using namespace kfr;
#define TEST_ASM_DOUBLE1(fn, ty, n) \
KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x) { r = kfr::fn(x); }
+#define TEST_READ(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__aligned(vec<ty, n>& __restrict r, \
+ const ty* __restrict x) \
+ { \
+ r = kfr::fn<n, true>(x); \
+ } \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__unaligned(vec<ty, n> & __restrict r, \
+ const ty* __restrict x) \
+ { \
+ r = kfr::fn<n, false>(x); \
+ }
+
+#define TEST_WRITE(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__aligned(ty* __restrict p, const vec<ty, n>& x) \
+ { \
+ kfr::fn<true>(p, x); \
+ } \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__unaligned(ty * __restrict p, const vec<ty, n>& x) \
+ { \
+ kfr::fn<false>(p, x); \
+ }
+
#define TEST_ASM_U(fn, MACRO) \
TEST_ASM_8(fn, u8, MACRO) \
TEST_ASM_16(fn, u16, MACRO) \
@@ -203,10 +227,14 @@ TEST_ASM_UIF(make_vector, TEST_ASM_MAKE_VECTOR)
TEST_ASM_UIF(broadcast, TEST_ASM_BROADCAST)
+TEST_ASM_UIF(read, TEST_READ)
+
+TEST_ASM_UIF(write, TEST_WRITE)
+
namespace kfr
{
#ifdef KFR_SHOW_NOT_OPTIMIZED
-CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); }
+KFR_PUBLIC void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); }
#endif
} // namespace kfr
diff --git a/tests/unit/simd/vec.cpp b/tests/unit/simd/vec.cpp
@@ -110,5 +110,25 @@ TEST(cast)
[](auto t, special_value x) { return is_in_range_of<f64>(x.get<subtype<type_of<decltype(t)>>>()); });
}
+TEST(unaligned_read)
+{
+ testo::matrix(named("type") = numeric_vector_types<vec>, [](auto type) {
+ using T = type_of<decltype(type)>;
+ using Tsub = subtype<T>;
+ constexpr static size_t N = T::size();
+ Tsub data[N * 2];
+ for (size_t i = 0; i < arraysize(data); i++)
+ {
+ data[i] = static_cast<Tsub>(i);
+ }
+
+ for (size_t i = 0; i < N; i++)
+ {
+// testo::scope sc(as_string("i = ", i));
+ CHECK(read<N, false>(data + i) == (enumerate<Tsub, N>() + static_cast<Tsub>(i)));
+ }
+ });
+}
+
} // namespace CMT_ARCH_NAME
} // namespace kfr