kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit aa603f76838c450537b1864177cc81255ad70d3e
parent 9ec57529ec94779c6c708497659d74685c53674f
Author: [email protected] <[email protected]>
Date:   Fri,  1 Mar 2019 14:17:28 +0000

Unaligned read

Diffstat:
Mcmake/target_set_arch.cmake | 8+++++---
Minclude/kfr/cometa.hpp | 20+++++++++++++++++---
Minclude/kfr/dft/impl/ft.hpp | 6+++---
Minclude/kfr/simd/impl/backend_clang.hpp | 35-----------------------------------
Minclude/kfr/simd/impl/backend_generic.hpp | 138+++++++++++++++++++++++++++++++++++++++++++++++++------------------------------
Ainclude/kfr/simd/impl/read_write.hpp | 397+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Minclude/kfr/simd/read_write.hpp | 32+++++++++++++++++++++++++-------
Minclude/kfr/simd/types.hpp | 13+++++++++++--
Minclude/kfr/simd/vec.hpp | 44++++++++++++++++++++++++++++++--------------
Minclude/kfr/testo/comparison.hpp | 76+++++++++++++++++++++++++++++++++-------------------------------------------
Minclude/kfr/testo/testo.hpp | 2++
Msources.cmake | 1+
Mtests/CMakeLists.txt | 60++++++++++++++++++++++++++++++++----------------------------
Mtests/asm_test.cpp | 42+++++++++++++++++++++++++++++++++++-------
Mtests/unit/simd/vec.cpp | 20++++++++++++++++++++
15 files changed, 696 insertions(+), 198 deletions(-)

diff --git a/cmake/target_set_arch.cmake b/cmake/target_set_arch.cmake @@ -6,9 +6,10 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") set(ARCH_FLAGS_GNU_sse3 -msse3) set(ARCH_FLAGS_GNU_ssse3 -mssse3) set(ARCH_FLAGS_GNU_sse41 -msse4.1) - set(ARCH_FLAGS_GNU_avx -msse4.1 -mavx) - set(ARCH_FLAGS_GNU_avx2 -msse4.1 -mavx2 -mfma) - set(ARCH_FLAGS_GNU_avx512 -msse4.1 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl) + set(ARCH_FLAGS_GNU_sse42 -msse4.2) + set(ARCH_FLAGS_GNU_avx -msse4.2 -mavx) + set(ARCH_FLAGS_GNU_avx2 -msse4.2 -mavx2 -mfma) + set(ARCH_FLAGS_GNU_avx512 -msse4.2 -mavx2 -mfma -mavx512f -mavx512cd -mavx512bw -mavx512dq -mavx512vl) if (CMAKE_SIZEOF_VOID_P EQUAL 8) # SSE2 is part of x86_64 @@ -22,6 +23,7 @@ if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)") set(ARCH_FLAGS_MS_sse3 ${ARCH_FLAG_MS_SSE2} -D__SSE3__) set(ARCH_FLAGS_MS_ssse3 ${ARCH_FLAG_MS_SSE2} -D__SSSE3__) set(ARCH_FLAGS_MS_sse41 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__) + set(ARCH_FLAGS_MS_sse42 ${ARCH_FLAG_MS_SSE2} -D__SSE3__ -D__SSSE3__ -D__SSE4_1__ -D__SSE4_2__) set(ARCH_FLAGS_MS_avx -arch:AVX) set(ARCH_FLAGS_MS_avx2 -arch:AVX2) set(ARCH_FLAGS_MS_avx512 -arch:AVX512) diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp @@ -7,6 +7,7 @@ #include <cstdint> #include <cstdlib> +#include <cstring> #include <limits> #include <random> #include <type_traits> @@ -714,7 +715,7 @@ template <typename... List> using indicesfor_t = cvalseq_t<size_t, sizeof...(List), 0>; template <size_t group, size_t... indices, size_t N = group * sizeof...(indices)> -constexpr inline auto scale(csizes_t<indices...> i) CMT_NOEXCEPT +constexpr inline auto scale(csizes_t<indices...>) CMT_NOEXCEPT { return cconcat(csizeseq_t<group, group * indices>()...); // return i[csizeseq_t<N>() / csize_t<group>()] * csize_t<group>() + csizeseq_t<N>() % @@ -1941,10 +1942,10 @@ using overload_generic = overload_priority<0>; #define CMT_GEN_LIST(c, m, ...) CMT_GEN_LIST##c(m, __VA_ARGS__) template <typename Tout, typename Tin> -constexpr CMT_INLINE Tout bitcast_anything(const Tin& in) +CMT_INLINE Tout bitcast_anything(const Tin& in) { static_assert(sizeof(Tin) == sizeof(Tout), "Invalid arguments for bitcast_anything"); -#ifdef CMT_COMPILER_INTEL +#if defined CMT_COMPILER_INTEL const union { const Tin in; Tout out; @@ -1971,6 +1972,19 @@ constexpr T just_value(T value) return value; } +template <typename Tout, typename> +CMT_INTRINSIC constexpr Tout pack_elements() +{ + return 0; +} + +template <typename Tout, typename Arg, typename... Args> +CMT_INTRINSIC constexpr Tout pack_elements(Arg x, Args... args) +{ + return static_cast<typename std::make_unsigned<Arg>::type>(x) | + (pack_elements<Tout, Arg>(args...) << (sizeof(Arg) * 8)); +} + enum class special_constant { undefined, diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp @@ -127,7 +127,7 @@ KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value) template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, csizes_t<indices...>) { - return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); + return concat(read(cbool<A>, csize<N * 2>, ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& value, csizes_t<indices...>) @@ -138,7 +138,7 @@ KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, const cvec<T, count * N>& template <size_t count, size_t N, bool A, typename T, size_t... indices> KFR_INTRINSIC cvec<T, count * N> cread_group_impl(const complex<T>* src, size_t stride, csizes_t<indices...>) { - return concat(read<N * 2, A>(ptr_cast<T>(src + stride * indices))...); + return concat(read(cbool<A>, csize<N * 2>, ptr_cast<T>(src + stride * indices))...); } template <size_t count, size_t N, bool A, typename T, size_t... indices> KFR_INTRINSIC void cwrite_group_impl(complex<T>* dest, size_t stride, const cvec<T, count * N>& value, @@ -1459,7 +1459,7 @@ KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) { - vec<T, Nout> temp = read<Nout>(ptr_cast<T>(ptr)); + vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr)); if (transposed) temp = ctranspose<sizeof...(N)>(temp); split(temp, w...); diff --git a/include/kfr/simd/impl/backend_clang.hpp b/include/kfr/simd/impl/backend_clang.hpp @@ -173,41 +173,6 @@ KFR_INTRINSIC simd<T, N> simd_convert(simd_cvt_t<T, T, N>, const simd<T, N>& x) template <typename T, size_t N, bool A> using simd_storage = struct_with_alignment<simd<T, N>, A>; -template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -KFR_INTRINSIC simd<T, N> simd_read(const T* src) -{ - return ptr_cast<simd_storage<T, N, A>>(src)->value; -} - -template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> -KFR_INTRINSIC simd<T, N> simd_read(const T* src) -{ - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - constexpr auto extend_indices = cconcat(csizeseq<rest>, csizeseq<first - rest, index_undefined, 0>); - constexpr auto concat_indices = cvalseq_t<size_t, N>(); - return simd_shuffle( - simd2_t<T, first, first>{}, simd_read<first, A>(src), - simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto), - concat_indices, overload_auto); -} - -template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) -{ - ptr_cast<simd_storage<T, N, A>>(dest)->value = value; -} - -template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> -KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) -{ - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq<first>, overload_auto)); - simd_write<false, rest>(dest + first, - simd_shuffle(simd_t<T, N>{}, value, csizeseq<rest, first>, overload_auto)); -} - template <typename T, size_t N> KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) { diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp @@ -54,17 +54,32 @@ using simd = typename simd_type<T, N>::type; template <typename T, size_t N, typename U> union simd_small_array { static_assert(sizeof(T) * N == sizeof(U), ""); - T arr[N]; U whole; - KFR_INTRINSIC static constexpr simd_small_array from(U whole) + using value_type = T; + constexpr static size_t size = N; + using packed_type = U; + + KFR_INTRINSIC constexpr simd_small_array() CMT_NOEXCEPT = default; + + KFR_INTRINSIC constexpr simd_small_array(U whole) CMT_NOEXCEPT : whole(whole) {} + + template <typename... Args> + KFR_INTRINSIC constexpr simd_small_array(T a, T b, Args... args) CMT_NOEXCEPT + : whole(pack_elements<U, T>(a, b, args...)) { - union { - const U w; - simd_small_array r; - } u{ whole }; - return u.r; } + + KFR_INTRINSIC static constexpr simd_small_array from(U whole) CMT_NOEXCEPT { return { whole }; } +}; + +template <typename T> +struct is_simd_small_array : cfalse_t +{ +}; +template <typename T, size_t N, typename U> +struct is_simd_small_array<simd_small_array<T, N, U>> : ctrue_t +{ }; #define KFR_SIMD_TYPE(T, N, ...) \ @@ -108,8 +123,6 @@ KFR_SIMD_SMALL_TYPE(i8, 8, u64) KFR_SIMD_SMALL_TYPE(i16, 4, u64) KFR_SIMD_SMALL_TYPE(i32, 2, u64) -KFR_SIMD_SMALL_TYPE(f32, 2, f64) - #ifdef CMT_ARCH_SSE KFR_SIMD_TYPE(f32, 4, __m128) KFR_SIMD_TYPE(f64, 2, __m128d) @@ -207,11 +220,15 @@ KFR_SIMD_TYPE(f64, 2, float64x2_t); #ifdef CMT_ARCH_SSE2 inline __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT { return _mm_set_epi64x(q1, q0); } +inline __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT +{ + return _mm_set_epi32(q3, q2, q1, q0); +} KFR_INTRIN_MAKE(2, i64, KFR_mm_setr_epi64x) KFR_INTRIN_MAKE(2, u64, KFR_mm_setr_epi64x) KFR_INTRIN_MAKE(2, f64, _mm_setr_pd) -KFR_INTRIN_MAKE(4, i32, _mm_setr_epi32) -KFR_INTRIN_MAKE(4, u32, _mm_setr_epi32) +KFR_INTRIN_MAKE(4, i32, KFR_mm_setr_epi32) +KFR_INTRIN_MAKE(4, u32, KFR_mm_setr_epi32) KFR_INTRIN_MAKE(4, f32, _mm_setr_ps) KFR_INTRIN_MAKE(8, i16, _mm_setr_epi16) KFR_INTRIN_MAKE(8, u16, _mm_setr_epi16) @@ -301,7 +318,7 @@ KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x)) KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x)) KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x)) KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 2>>(_mm_cvtsd_f64(_mm_castps_pd(x)))) -KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole))) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_unpacklo_ps(_mm_set_ss(x.low), _mm_set_ss(x.high))) KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x)) KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x)))) @@ -333,11 +350,24 @@ KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDE return __VA_ARGS__; \ } +#define KFR_INTRIN_CONVERT_NOOP_REF(Tout, Tin, N) \ + KFR_INTRINSIC const simd<Tout, N>& simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) \ + CMT_NOEXCEPT \ + { \ + return x; \ + } +#define KFR_INTRIN_CONVERT_NOOP(Tout, Tin, N) \ + KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ + { \ + return x; \ + } + KFR_INTRIN_CONVERT(f32, i32, 4, _mm_cvtepi32_ps(x)) KFR_INTRIN_CONVERT(i32, f32, 4, _mm_cvttps_epi32(x)) KFR_INTRIN_CONVERT(i32, f64, 2, simd<i32, 2>::from(_mm_cvtsi128_si64(_mm_cvttpd_epi32(x)))) KFR_INTRIN_CONVERT(f64, i32, 2, _mm_cvtepi32_pd(KFR_mm_setr_epi64x(x.whole, 0))) -KFR_INTRIN_CONVERT(i64, f64, 2, _mm_set_epi64x(_mm_cvttsd_si64(_mm_unpackhi_pd(x, x)), _mm_cvttsd_si64(x))) +KFR_INTRIN_CONVERT(i64, f64, 2, + KFR_mm_setr_epi64x(_mm_cvttsd_si64(x), _mm_cvttsd_si64(_mm_unpackhi_pd(x, x)))) KFR_INTRIN_CONVERT(f64, i64, 2, _mm_unpacklo_pd(_mm_cvtsi64_sd(_mm_setzero_pd(), _mm_cvtsi128_si64(x)), _mm_cvtsi64_sd(_mm_setzero_pd(), KFR_i64sse_INDEX(x, 1)))) @@ -355,6 +385,25 @@ KFR_INTRIN_CONVERT(f32, f64, 4, simd<f32, 4>{ _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(_mm_cvtpd_ps(x.low)), _mm_castps_pd(_mm_cvtpd_ps(x.high)))) }) #endif + +KFR_INTRIN_CONVERT_NOOP(u8, i8, 1) +KFR_INTRIN_CONVERT_NOOP(i8, u8, 1) +KFR_INTRIN_CONVERT_NOOP(u16, i16, 1) +KFR_INTRIN_CONVERT_NOOP(i16, u16, 1) +KFR_INTRIN_CONVERT_NOOP(u32, i32, 1) +KFR_INTRIN_CONVERT_NOOP(i32, u32, 1) +KFR_INTRIN_CONVERT_NOOP(u64, i64, 1) +KFR_INTRIN_CONVERT_NOOP(i64, u64, 1) + +KFR_INTRIN_CONVERT_NOOP_REF(u8, i8, 16) +KFR_INTRIN_CONVERT_NOOP_REF(i8, u8, 16) +KFR_INTRIN_CONVERT_NOOP_REF(u16, i16, 8) +KFR_INTRIN_CONVERT_NOOP_REF(i16, u16, 8) +KFR_INTRIN_CONVERT_NOOP_REF(u32, i32, 4) +KFR_INTRIN_CONVERT_NOOP_REF(i32, u32, 4) +KFR_INTRIN_CONVERT_NOOP_REF(u64, i64, 2) +KFR_INTRIN_CONVERT_NOOP_REF(i64, u64, 2) + #endif // CMT_ARCH_SSE2 #ifdef CMT_ARCH_SSE41 @@ -707,12 +756,34 @@ KFR_INTRINSIC simd_array<T, N> to_simd_array(const simd<T, N>& x) CMT_NOEXCEPT return bitcast_anything<simd_array<T, N>>(x); } +#if defined CMT_COMPILER_MSVC + +template <typename T, size_t N, KFR_ENABLE_IF(!is_simd_small_array<simd<T, N>>::value)> +KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT +{ + return bitcast_anything<simd<T, N>>(x); +} + +template <typename T, size_t N, size_t... indices> +KFR_INTRINSIC simd<T, N> from_simd_array_impl(const simd_array<T, N>& x, csizes_t<indices...>) CMT_NOEXCEPT +{ + return { x.val[indices]... }; +} + +template <typename T, size_t N, KFR_ENABLE_IF(is_simd_small_array<simd<T, N>>::value)> +KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT +{ + return from_simd_array_impl(x, csizeseq<N>); +} +#else template <typename T, size_t N> KFR_INTRINSIC simd<T, N> from_simd_array(const simd_array<T, N>& x) CMT_NOEXCEPT { return bitcast_anything<simd<T, N>>(x); } +#endif + #define KFR_COMPONENTWISE_RET(code) \ vec<T, N> result; \ for (size_t i = 0; i < N; i++) \ @@ -815,8 +886,7 @@ KFR_INTRINSIC const simd<T, N>& simd_bitcast(simd_cvt_t<T, T, N>, const simd<T, template <typename T, size_t N, size_t index> KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, csize_t<index>) CMT_NOEXCEPT { - not_optimized(CMT_FUNC_SIGNATURE); - return to_simd_array<T, N>(value).val[index]; + return simd_shuffle(simd_t<T, N>{}, value, csizes<index>, overload_auto); } template <typename T, size_t N, size_t index> @@ -1022,53 +1092,15 @@ using simd_storage = struct_with_alignment<simd<T, N>, A>; CMT_PRAGMA_GNU(GCC diagnostic pop) -template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT -{ - return reinterpret_cast<typename simd_storage<T, N, A>::const_pointer>(src)->value; -} - -template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> -KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT -{ - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - constexpr auto extend_indices = - cconcat(csizeseq_t<rest>(), csizeseq_t<first - rest, index_undefined, 0>()); - constexpr auto concat_indices = cvalseq_t<size_t, N>(); - return simd_shuffle( - simd2_t<T, first, first>{}, simd_read<first, A>(src), - simd_shuffle(simd_t<T, rest>{}, simd_read<rest, false>(src + first), extend_indices, overload_auto), - concat_indices, overload_auto); -} - -template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> -KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT -{ - reinterpret_cast<typename simd_storage<T, N, A>::pointer>(dest)->value = value; -} - -template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> -KFR_INTRINSIC void simd_write(T* dest, const simd<T, N>& value) CMT_NOEXCEPT -{ - constexpr size_t first = prev_poweroftwo(N); - constexpr size_t rest = N - first; - simd_write<A, first>(dest, simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<first>(), overload_auto)); - simd_write<false, rest>(dest + first, - simd_shuffle(simd_t<T, N>{}, value, csizeseq_t<rest, first>(), overload_auto)); -} - template <typename T, size_t N> KFR_INTRINSIC T simd_get_element(const simd<T, N>& value, size_t index) CMT_NOEXCEPT { - not_optimized(CMT_FUNC_SIGNATURE); return to_simd_array<T, N>(value).val[index]; } template <typename T, size_t N> KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index, T x) CMT_NOEXCEPT { - not_optimized(CMT_FUNC_SIGNATURE); simd_array<T, N> arr = to_simd_array<T, N>(value); arr.val[index] = x; return from_simd_array(arr); diff --git a/include/kfr/simd/impl/read_write.hpp b/include/kfr/simd/impl/read_write.hpp @@ -0,0 +1,397 @@ +/** @addtogroup read_write + * @{ + */ +/* + Copyright (C) 2016 D Levin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#pragma once + +#include "../shuffle.hpp" +#include "../types.hpp" +#include "../vec.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +namespace intrinsics +{ + +#ifndef CMT_CLANG_EXT + +#ifdef CMT_ARCH_SSE2 + +template <typename T> +KFR_INTRINSIC vec<T, 1> read(cunaligned_t, csize_t<1>, const T* ptr) +{ + return *ptr; +} + +KFR_INTRINSIC f32x2 read(cunaligned_t, csize_t<2>, const f32* ptr) +{ + return f32x2::simd_type{ ptr[0], ptr[1] }; +} + +#if !defined(CMT_COMPILER_GCC) + +KFR_INTRINSIC u8x2 read(cunaligned_t, csize_t<2>, const u8* ptr) +{ + return u8x2::simd_type::from(*reinterpret_cast<const u16*>(ptr)); +} +KFR_INTRINSIC i8x2 read(cunaligned_t, csize_t<2>, const i8* ptr) +{ + return i8x2::simd_type::from(*reinterpret_cast<const u16*>(ptr)); +} +KFR_INTRINSIC u8x4 read(cunaligned_t, csize_t<4>, const u8* ptr) +{ + return u8x4::simd_type::from(*reinterpret_cast<const u32*>(ptr)); +} +KFR_INTRINSIC i8x4 read(cunaligned_t, csize_t<4>, const i8* ptr) +{ + return i8x4::simd_type::from(*reinterpret_cast<const u32*>(ptr)); +} +KFR_INTRINSIC u16x2 read(cunaligned_t, csize_t<2>, const u16* ptr) +{ + return u16x2::simd_type::from(*reinterpret_cast<const u32*>(ptr)); +} +KFR_INTRINSIC i16x2 read(cunaligned_t, csize_t<2>, const i16* ptr) +{ + return i16x2::simd_type::from(*reinterpret_cast<const u32*>(ptr)); +} +KFR_INTRINSIC u8x8 read(cunaligned_t, csize_t<8>, const u8* ptr) +{ + return u8x8::simd_type::from(*reinterpret_cast<const u64*>(ptr)); +} +KFR_INTRINSIC i8x8 read(cunaligned_t, csize_t<8>, const i8* ptr) +{ + return i8x8::simd_type::from(*reinterpret_cast<const u64*>(ptr)); +} +KFR_INTRINSIC u16x4 read(cunaligned_t, csize_t<4>, const u16* ptr) +{ + return u16x4::simd_type::from(*reinterpret_cast<const u64*>(ptr)); +} +KFR_INTRINSIC i16x4 read(cunaligned_t, csize_t<4>, const i16* ptr) +{ + return i16x4::simd_type::from(*reinterpret_cast<const u64*>(ptr)); +} +KFR_INTRINSIC u32x2 read(cunaligned_t, csize_t<2>, const u32* ptr) +{ + return u32x2::simd_type::from(*reinterpret_cast<const u64*>(ptr)); +} +KFR_INTRINSIC i32x2 read(cunaligned_t, csize_t<2>, const i32* ptr) +{ + return i32x2::simd_type::from(*reinterpret_cast<const u64*>(ptr)); +} + +#endif + +KFR_INTRINSIC f32sse read(cunaligned_t, csize_t<4>, const f32* ptr) { return _mm_loadu_ps(ptr); } +KFR_INTRINSIC f64sse read(cunaligned_t, csize_t<2>, const f64* ptr) { return _mm_loadu_pd(ptr); } +KFR_INTRINSIC u8sse read(cunaligned_t, csize_t<16>, const u8* ptr) +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); +} +KFR_INTRINSIC i8sse read(cunaligned_t, csize_t<16>, const i8* ptr) +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); +} +KFR_INTRINSIC u16sse read(cunaligned_t, csize_t<8>, const u16* ptr) +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); +} +KFR_INTRINSIC i16sse read(cunaligned_t, csize_t<8>, const i16* ptr) +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); +} +KFR_INTRINSIC u32sse read(cunaligned_t, csize_t<4>, const u32* ptr) +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); +} +KFR_INTRINSIC i32sse read(cunaligned_t, csize_t<4>, const i32* ptr) +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); +} +KFR_INTRINSIC u64sse read(cunaligned_t, csize_t<2>, const u64* ptr) +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); +} +KFR_INTRINSIC i64sse read(cunaligned_t, csize_t<2>, const i64* ptr) +{ + return _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr)); +} + +template <typename T> +KFR_INTRINSIC void write(cunaligned_t, T* ptr, const vec<T, 1>& x) +{ + *ptr = x.front(); +} +KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32x2& x) +{ + ptr[0] = x.v.low; + ptr[1] = x.v.high; +} + +KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8x2& x) { *reinterpret_cast<u16*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8x2& x) { *reinterpret_cast<u16*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8x4& x) { *reinterpret_cast<u32*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8x4& x) { *reinterpret_cast<u32*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16x2& x) { *reinterpret_cast<u32*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16x2& x) { *reinterpret_cast<u32*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8x8& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8x8& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16x4& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16x4& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, u32* ptr, const u32x2& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; } +KFR_INTRINSIC void write(cunaligned_t, i32* ptr, const i32x2& x) { *reinterpret_cast<u64*>(ptr) = x.v.whole; } + +KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32sse& x) { _mm_storeu_ps(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, f64* ptr, const f64sse& x) { _mm_storeu_pd(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8sse& x) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8sse& x) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16sse& x) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16sse& x) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, u32* ptr, const u32sse& x) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, i32* ptr, const i32sse& x) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, u64* ptr, const u64sse& x) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, i64* ptr, const i64sse& x) +{ + _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), x.v); +} + +#if defined CMT_ARCH_AVX + +KFR_INTRINSIC f32avx read(cunaligned_t, csize_t<8>, const f32* ptr) { return _mm256_loadu_ps(ptr); } +KFR_INTRINSIC f64avx read(cunaligned_t, csize_t<4>, const f64* ptr) { return _mm256_loadu_pd(ptr); } + +KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32avx& x) { _mm256_storeu_ps(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, f64* ptr, const f64avx& x) { _mm256_storeu_pd(ptr, x.v); } + +#if defined CMT_ARCH_AVX2 + +KFR_INTRINSIC u8avx read(cunaligned_t, csize_t<32>, const u8* ptr) +{ + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); +} +KFR_INTRINSIC i8avx read(cunaligned_t, csize_t<32>, const i8* ptr) +{ + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); +} +KFR_INTRINSIC u16avx read(cunaligned_t, csize_t<16>, const u16* ptr) +{ + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); +} +KFR_INTRINSIC i16avx read(cunaligned_t, csize_t<16>, const i16* ptr) +{ + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); +} +KFR_INTRINSIC u32avx read(cunaligned_t, csize_t<8>, const u32* ptr) +{ + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); +} +KFR_INTRINSIC i32avx read(cunaligned_t, csize_t<8>, const i32* ptr) +{ + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); +} +KFR_INTRINSIC u64avx read(cunaligned_t, csize_t<4>, const u64* ptr) +{ + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); +} +KFR_INTRINSIC i64avx read(cunaligned_t, csize_t<4>, const i64* ptr) +{ + return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(ptr)); +} + +KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8avx& x) +{ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8avx& x) +{ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16avx& x) +{ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16avx& x) +{ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, u32* ptr, const u32avx& x) +{ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, i32* ptr, const i32avx& x) +{ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, u64* ptr, const u64avx& x) +{ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v); +} +KFR_INTRINSIC void write(cunaligned_t, i64* ptr, const i64avx& x) +{ + _mm256_storeu_si256(reinterpret_cast<__m256i*>(ptr), x.v); +} + +#if defined CMT_ARCH_AVX512 + +KFR_INTRINSIC f32avx512 read(cunaligned_t, csize_t<16>, const f32* ptr) { return _mm512_loadu_ps(ptr); } +KFR_INTRINSIC f64avx512 read(cunaligned_t, csize_t<8>, const f64* ptr) { return _mm512_loadu_pd(ptr); } + +KFR_INTRINSIC u8avx512 read(cunaligned_t, csize_t<64>, const u8* ptr) { return _mm512_loadu_epi8(ptr); } +KFR_INTRINSIC i8avx512 read(cunaligned_t, csize_t<64>, const i8* ptr) { return _mm512_loadu_epi8(ptr); } +KFR_INTRINSIC u16avx512 read(cunaligned_t, csize_t<32>, const u16* ptr) { return _mm512_loadu_epi16(ptr); } +KFR_INTRINSIC i16avx512 read(cunaligned_t, csize_t<32>, const i16* ptr) { return _mm512_loadu_epi16(ptr); } +KFR_INTRINSIC u32avx512 read(cunaligned_t, csize_t<16>, const u32* ptr) { return _mm512_loadu_epi32(ptr); } +KFR_INTRINSIC i32avx512 read(cunaligned_t, csize_t<16>, const i32* ptr) { return _mm512_loadu_epi32(ptr); } +KFR_INTRINSIC u64avx512 read(cunaligned_t, csize_t<8>, const u64* ptr) { return _mm512_loadu_epi64(ptr); } +KFR_INTRINSIC i64avx512 read(cunaligned_t, csize_t<8>, const i64* ptr) { return _mm512_loadu_epi64(ptr); } + +KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32avx512& x) { _mm512_storeu_ps(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, f64* ptr, const f64avx512& x) { _mm512_storeu_pd(ptr, x.v); } + +KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8avx512& x) { _mm512_storeu_epi8(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, i8* ptr, const i8avx512& x) { _mm512_storeu_epi8(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, u16* ptr, const u16avx512& x) { _mm512_storeu_epi16(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, i16* ptr, const i16avx512& x) { _mm512_storeu_epi16(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, u32* ptr, const u32avx512& x) { _mm512_storeu_epi32(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, i32* ptr, const i32avx512& x) { _mm512_storeu_epi32(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, u64* ptr, const u64avx512& x) { _mm512_storeu_epi64(ptr, x.v); } +KFR_INTRINSIC void write(cunaligned_t, i64* ptr, const i64avx512& x) { _mm512_storeu_epi64(ptr, x.v); } + +#endif +#endif +#endif +#else + +// fallback + +template <size_t N, typename T, KFR_ENABLE_IF(N == 1 || is_simd_size<T>(N))> +KFR_INTRINSIC vec<T, N> read(cunaligned_t, csize_t<N>, const T* ptr) CMT_NOEXCEPT +{ + vec<T, N> result{}; + for (size_t i = 0; i < N; i++) + result[i] = ptr[i]; + return result; +} + +template <size_t N, typename T, KFR_ENABLE_IF(N == 1 || is_simd_size<T>(N))> +KFR_INTRINSIC void write(cunaligned_t, T* ptr, const vec<T, N>& x) CMT_NOEXCEPT +{ + for (size_t i = 0; i < N; i++) + ptr[i] = x[i]; +} + +#endif + +template <size_t N, typename T, KFR_ENABLE_IF(N != 1 && !is_simd_size<T>(N)), + size_t Nlow = prev_poweroftwo(N - 1)> +KFR_INTRINSIC vec<T, N> read(cunaligned_t, csize_t<N>, const T* ptr) CMT_NOEXCEPT +{ + return concat(read(cunaligned, csize<Nlow>, ptr), read(cunaligned, csize<N - Nlow>, ptr + Nlow)); +} + +template <size_t N, typename T, KFR_ENABLE_IF(N != 1 && !is_simd_size<T>(N)), + size_t Nlow = prev_poweroftwo(N - 1)> +KFR_INTRINSIC void write(cunaligned_t, T* ptr, const vec<T, N>& x) CMT_NOEXCEPT +{ + write(cunaligned, ptr, x.shuffle(csizeseq<Nlow>)); + write(cunaligned, ptr + Nlow, x.shuffle(csizeseq<N - Nlow, Nlow>)); +} + +#else + +template <size_t N, typename T> +KFR_INTRINSIC simd<T, N> simd_read(const T* src) CMT_NOEXCEPT +{ + return reinterpret_cast<typename simd_storage<T, N, false>::const_pointer>(src)->value; +} + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC vec<T, N> read(cunaligned_t, csize_t<N>, const T* src) CMT_NOEXCEPT +{ + // Clang requires a separate function returning vector (simd). + // Direct returning vec causes aligned read instruction + return simd_read<N>(src); +} + +template <size_t N, bool A = false, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), typename = void> +KFR_INTRINSIC vec<T, N> read(cunaligned_t, csize_t<N>, const T* src) CMT_NOEXCEPT +{ + constexpr size_t first = prev_poweroftwo(N); + return concat(read(cunaligned, csize<first>, src), read(cunaligned, csize<N - first>, src + first)); +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(is_poweroftwo(N))> +KFR_INTRINSIC void write(cunaligned_t, T* dest, const vec<T, N>& x) CMT_NOEXCEPT +{ + reinterpret_cast<typename simd_storage<T, N, A>::pointer>(dest)->value = x.v; +} + +template <bool A = false, size_t N, typename T, KFR_ENABLE_IF(!is_poweroftwo(N)), + size_t Nlow = prev_poweroftwo(N - 1)> +KFR_INTRINSIC void write(cunaligned_t, T* dest, const vec<T, N>& x) CMT_NOEXCEPT +{ + write(cunaligned, dest, x.shuffle(csizeseq<Nlow>)); + write(cunaligned, dest + Nlow, x.shuffle(csizeseq<N - Nlow, Nlow>)); +} + +#endif + +template <size_t N, typename T> +KFR_INTRINSIC vec<T, N> read(caligned_t, csize_t<N>, const T* __restrict ptr) CMT_NOEXCEPT +{ + return *reinterpret_cast<const typename vec<T, N>::simd_type*>(ptr); +} + +template <size_t N, typename T> +KFR_INTRINSIC void write(caligned_t, T* __restrict ptr, const vec<T, N>& __restrict x) CMT_NOEXCEPT +{ + *reinterpret_cast<typename vec<T, N>::simd_type*>(ptr) = x.v; +} + +} // namespace intrinsics + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/simd/read_write.hpp b/include/kfr/simd/read_write.hpp @@ -25,9 +25,7 @@ */ #pragma once -#include "shuffle.hpp" -#include "types.hpp" -#include "vec.hpp" +#include "impl/read_write.hpp" namespace kfr { @@ -35,15 +33,16 @@ inline namespace CMT_ARCH_NAME { template <size_t N, bool A = false, typename T> -KFR_INTRINSIC static vec<T, N> read(const T* src) +KFR_INTRINSIC vec<T, N> read(const T* src) { - return vec<T, N>(src, cbool_t<A>()); + return vec<T, N>::from_flatten(intrinsics::read(cbool<A>, csize<N * compound_type_traits<T>::deep_width>, + ptr_cast<deep_subtype<T>>(src))); } template <bool A = false, size_t N, typename T> -KFR_INTRINSIC static void write(T* dest, const vec<T, N>& value) +KFR_INTRINSIC void write(T* dest, const vec<T, N>& value) { - value.write(dest, cbool_t<A>()); + intrinsics::write(cbool<A>, ptr_cast<deep_subtype<T>>(dest), value.flatten()); } template <typename... Indices, typename T, size_t Nout = 1 + sizeof...(Indices)> @@ -239,5 +238,24 @@ KFR_INTRINSIC vec<T, N> partial_mask(size_t index, vec_shape<T, N>) { return partial_mask<T, N>(index); } + +// read/write +template <typename T, size_t N> +template <bool aligned> +KFR_MEM_INTRINSIC constexpr vec<T, N>::vec(const value_type* src, cbool_t<aligned>) CMT_NOEXCEPT + : vec(vec<T, N>::from_flatten(intrinsics::read(cbool<aligned>, + csize<N * compound_type_traits<T>::deep_width>, + ptr_cast<deep_subtype<T>>(src)))) +{ +} + +template <typename T, size_t N> +template <bool aligned> +KFR_MEM_INTRINSIC const vec<T, N>& vec<T, N>::write(value_type* dest, cbool_t<aligned>) const CMT_NOEXCEPT +{ + intrinsics::write(cbool<aligned>, ptr_cast<deep_subtype<T>>(dest), flatten()); + return *this; +} + } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/simd/types.hpp b/include/kfr/simd/types.hpp @@ -41,6 +41,7 @@ CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wignored-qualifiers") #ifdef KFR_TESTING #include "../testo/testo.hpp" +#include "../cometa/function.hpp" #endif #include "../cometa.hpp" @@ -149,12 +150,20 @@ constexpr ctypes_t<i8, i16, i32, i64, u8, u16, u32, u64, f32 constexpr csizes_t<1, 2, 3, 4, 8, 16, 32, 64> test_vector_sizes{}; +#ifdef CMT_ARCH_AVX512 +constexpr size_t max_test_size = 128; +#elif defined CMT_ARCH_AVX +constexpr size_t max_test_size = 64; +#else +constexpr size_t max_test_size = 32; +#endif + template <template <typename, size_t> class vec_tpl, typename T, typename sizes = #ifdef KFR_EXTENDED_TESTS - cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<64 / sizeof(T)>)> + cfilter_t<decltype(test_vector_sizes), decltype(test_vector_sizes <= csize<max_test_size / sizeof(T)>)> #else - csizes_t<1> + csizes_t<1, 2> #endif > struct vector_types_for_size_t_impl; diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp @@ -143,18 +143,23 @@ template <typename T> struct compoundcast { static vec<T, 1> to_flat(const T& x) { return vec<T, 1>(x); } + static T from_flat(const vec<T, 1>& x) { return x.front(); } }; + template <typename T, size_t N> struct compoundcast<vec<T, N>> { static const vec<T, N>& to_flat(const vec<T, N>& x) { return x; } + static const vec<T, N>& from_flat(const vec<T, N>& x) { return x; } }; + template <typename T, size_t N1, size_t N2> struct compoundcast<vec<vec<T, N1>, N2>> { static vec<T, N1 * N2> to_flat(const vec<vec<T, N1>, N2>& x) { return x; } + static vec<vec<T, N1>, N2> from_flat(const vec<T, N1 * N2>& x) { return x; } }; } // namespace internal @@ -219,6 +224,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< : v(intrinsics::simd_broadcast(intrinsics::simd_t<ST, SN>{}, static_cast<ST>(s))) { } + template <typename U, KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC vec(const U& s) CMT_NOEXCEPT @@ -234,6 +240,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< : v(intrinsics::simd_make(ctype<T>, s0, s1, static_cast<value_type>(rest)...)) { } + template <typename... Us, KFR_ENABLE_IF(sizeof...(Us) <= 1022 && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC vec(const value_type& s0, const value_type& s1, const Us&... rest) CMT_NOEXCEPT : v(intrinsics::simd_concat<ST, size_t(SW), size_t(SW), just_value<Us, size_t>(SW)...>( @@ -249,6 +256,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< : v(intrinsics::simd_convert(intrinsics::simd_cvt_t<ST, deep_subtype<U>, SN>{}, x.v)) { } + template <typename U, KFR_ENABLE_IF(std::is_convertible<U, value_type>::value && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC vec(const vec<U, N>& x) CMT_NOEXCEPT @@ -315,6 +323,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< { return intrinsics::simd_get_element<T, N>(v, index); } + template <int dummy = 0, typename = void, KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC constexpr value_type get(size_t index) const CMT_NOEXCEPT @@ -327,6 +336,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< { return intrinsics::simd_get_element<T, N>(v, csize<index>); } + template <size_t index, typename = void, KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC constexpr value_type get(csize_t<index>) const CMT_NOEXCEPT @@ -340,6 +350,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< { v = intrinsics::simd_set_element<T, N>(v, index, s); } + template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT { @@ -351,6 +362,7 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< { v = intrinsics::simd_set_element<T, N>(v, csize<index>, s); } + template <size_t index, typename = void, KFR_ENABLE_IF(index < 1024 && !compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC constexpr void set(csize_t<index>, const value_type& s) CMT_NOEXCEPT @@ -388,20 +400,14 @@ struct alignas(const_max(alignof(intrinsics::simd<typename compound_type_traits< // read/write template <bool aligned = false> KFR_MEM_INTRINSIC explicit constexpr vec(const value_type* src, - cbool_t<aligned> = cbool_t<aligned>()) CMT_NOEXCEPT - : v(intrinsics::simd_read<SN, aligned>(ptr_cast<ST>(src))) - { - } + cbool_t<aligned> = cbool_t<aligned>()) CMT_NOEXCEPT; template <bool aligned = false> KFR_MEM_INTRINSIC const vec& write(value_type* dest, - cbool_t<aligned> = cbool_t<aligned>()) const CMT_NOEXCEPT - { - intrinsics::simd_write<aligned, SN>(ptr_cast<ST>(dest), v); - return *this; - } + cbool_t<aligned> = cbool_t<aligned>()) const CMT_NOEXCEPT; KFR_MEM_INTRINSIC vec<ST, SN> flatten() const CMT_NOEXCEPT { return v; } + KFR_MEM_INTRINSIC static vec from_flatten(const vec<ST, SN>& x) { return vec(x.v); } KFR_MEM_INTRINSIC constexpr mask_t asmask() const CMT_NOEXCEPT { return mask_t(v); } @@ -1073,8 +1079,11 @@ constexpr cint_t<2> vectors{}; constexpr cint_t<3> all{}; constexpr inline auto types(cint_t<0>) { return ctypes_t<>{}; } + constexpr inline auto types(cint_t<1>) { return cconcat(numeric_types); } + constexpr inline auto types(cint_t<2>) { return cconcat(numeric_vector_types<vec>); } + constexpr inline auto types(cint_t<3>) { return cconcat(numeric_types, numeric_vector_types<vec>); } } // namespace test_catogories @@ -1089,15 +1098,19 @@ template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_retur void test_function1(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isapplicable = IsApplicable{}) { testo::matrix( - named("type") = test_catogories::types(cat), named("value") = special_values(), - [&](auto type, special_value value) { + named("value") = special_values(), named("type") = test_catogories::types(cat), + [&](special_value value, auto type) { using T = type_of<decltype(type)>; if (isapplicable(ctype<T>, value)) { const T x(value); CHECK(std::is_same<decltype(fn(x)), typename compound_type_traits<T>::template rebind< decltype(reffn(std::declval<subtype<T>>()))>>::value); - CHECK(fn(x) == apply(reffn, x)); + const auto fn_x = fn(x); + const auto ref_x = apply(reffn, x); + ::testo::active_test()->check(testo::deep_is_equal(ref_x, fn_x), + as_string(fn_x, " == ", ref_x), "fn(x) == apply(reffn, x)"); + // CHECK(fn(x) == apply(reffn, x)); } }); @@ -1112,9 +1125,9 @@ template <int Cat, typename Fn, typename RefFn, typename IsApplicable = fn_retur void test_function2(cint_t<Cat> cat, Fn&& fn, RefFn&& reffn, IsApplicable&& isapplicable = IsApplicable{}) { testo::matrix( - named("type") = test_catogories::types(cat), named("value1") = special_values(), // - named("value2") = special_values(), [&](auto type, special_value value1, special_value value2) { + named("value2") = special_values(), named("type") = test_catogories::types(cat), + [&](special_value value1, special_value value2, auto type) { using T = type_of<decltype(type)>; const T x1(value1); const T x2(value2); @@ -1146,6 +1159,7 @@ struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>> static_assert(N1 == Ns1, ""); static_assert(!is_compound<To>::value, ""); static_assert(!is_compound<From>::value, ""); + static vec<vec<To, N1>, N2> cast(const vec<From, N1>& value) { return vec<vec<To, N1>, N2>::from_flatten( @@ -1153,6 +1167,7 @@ struct conversion<vec<vec<To, N1>, N2>, vec<From, Ns1>> .shuffle(csizeseq<N2 * vec<From, N1>::scalar_size()> % csize<N2>)); } }; + // vector<vector> to vector<vector> template <typename To, typename From, size_t N1, size_t N2, size_t NN1, size_t NN2> struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, NN1>, NN2>> @@ -1161,6 +1176,7 @@ struct conversion<vec<vec<To, N1>, N2>, vec<vec<From, NN1>, NN2>> static_assert(N2 == NN2, ""); static_assert(!is_compound<To>::value, ""); static_assert(!is_compound<From>::value, ""); + static vec<vec<To, N1>, N2> cast(const vec<vec<From, N1>, N2>& value) { return vec<vec<To, N1>, N2>::from_flatten(kfr::innercast<To>(value.flatten())); diff --git a/include/kfr/testo/comparison.hpp b/include/kfr/testo/comparison.hpp @@ -79,55 +79,46 @@ struct eplison_scope<void> eplison_scope<long double> ld; }; -template <> -struct equality_comparer<float, float> -{ - bool operator()(const float& l, const float& r) const - { - return !(std::abs(l - r) > current_epsilon<float>()); - } -}; -template <> -struct equality_comparer<double, double> -{ - bool operator()(const double& l, const double& r) const - { - return !(std::abs(l - r) > current_epsilon<double>()); - } -}; -template <> -struct equality_comparer<long double, long double> +CMT_PRAGMA_GNU(GCC diagnostic pop) + +template <typename T1, typename T2, + CMT_ENABLE_IF(compound_type_traits<T1>::is_scalar&& compound_type_traits<T2>::is_scalar && + (std::is_floating_point<T1>::value || std::is_floating_point<T2>::value))> +constexpr bool deep_is_equal(const T1& x, const T2& y) { - bool operator()(const long double& l, const long double& r) const - { - return !(std::abs(l - r) > current_epsilon<long double>()); - } -}; + using C = std::common_type_t<T1, T2>; + const C xx = static_cast<C>(x); + const C yy = static_cast<C>(y); + if (std::isnan(xx) && std::isnan(yy)) + return true; + if (std::isnan(xx) || std::isnan(yy)) + return false; -CMT_PRAGMA_GNU(GCC diagnostic pop) + return !(std::abs(xx - yy) > current_epsilon<C>()); +} -template <typename L, typename R> -struct equality_comparer<L, R, void_t<enable_if<!compound_type_traits<L>::is_scalar>>> +template <typename T1, typename T2, + CMT_ENABLE_IF(compound_type_traits<T1>::is_scalar&& compound_type_traits<T2>::is_scalar && + !std::is_floating_point<T1>::value && !std::is_floating_point<T2>::value)> +constexpr bool deep_is_equal(const T1& x, const T2& y) { - using Tsubtype = subtype<L>; - constexpr static static_assert_type_eq<subtype<L>, subtype<R>> assert{}; + return x == y; +} - bool operator()(const L& l, const R& r) const +template <typename T1, typename T2, + CMT_ENABLE_IF(!compound_type_traits<T1>::is_scalar || !compound_type_traits<T2>::is_scalar)> +constexpr bool deep_is_equal(const T1& x, const T2& y) +{ + static_assert(compound_type_traits<T1>::width == compound_type_traits<T2>::width || + compound_type_traits<T1>::is_scalar || compound_type_traits<T2>::is_scalar, + ""); + for (size_t i = 0; i < std::max(+compound_type_traits<T1>::width, +compound_type_traits<T2>::width); i++) { - if (compound_type_traits<L>::width != compound_type_traits<R>::width) + if (!deep_is_equal(compound_type_traits<T1>::at(x, i), compound_type_traits<T2>::at(y, i))) return false; - - compound_type_traits<L> itl; - compound_type_traits<R> itr; - for (size_t i = 0; i < compound_type_traits<L>::width; i++) - { - equality_comparer<Tsubtype, Tsubtype> cmp; - if (!cmp(itl.at(l, i), itr.at(r, i))) - return false; - } - return true; } -}; + return true; +} struct cmp_eq { @@ -136,8 +127,7 @@ struct cmp_eq template <typename L, typename R> bool operator()(L&& left, R&& right) const { - equality_comparer<decay<L>, decay<R>> eq; - return eq(left, right); + return deep_is_equal(left, right); } }; diff --git a/include/kfr/testo/testo.hpp b/include/kfr/testo/testo.hpp @@ -346,6 +346,7 @@ void matrix(named_arg<Arg0>&& arg0, named_arg<Arg1>&& arg1, named_arg<Arg2>&& ar CMT_UNUSED static int run_all(const std::string& name = std::string(), bool show_successful = false) { + console_color c(White); std::vector<test_case*> success; std::vector<test_case*> failed; int success_checks = 0; @@ -381,6 +382,7 @@ CMT_UNUSED static int run_all(const std::string& name = std::string(), bool show return static_cast<int>(failed.size()); } + template <typename T1, typename T2> void assert_is_same() { diff --git a/sources.cmake b/sources.cmake @@ -134,6 +134,7 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/basicoperators_generic.hpp ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/function.hpp ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/operators.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/read_write.hpp ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/simd.hpp ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/intrinsics.h ${PROJECT_SOURCE_DIR}/include/kfr/simd/impl/specializations.i diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -20,6 +20,10 @@ cmake_minimum_required(VERSION 3.1) add_definitions(-DKFR_TESTING=1) add_definitions(-DKFR_SRC_DIR=\"${CMAKE_SOURCE_DIR}\") +if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + add_compile_options(-Wno-parentheses) +endif () + # Binary output directories set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) @@ -29,8 +33,11 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/") if (ENABLE_ASMTEST) add_executable(asm_test asm_test.cpp) target_link_libraries(asm_test kfr) - target_set_arch(asm_test PRIVATE avx) + target_set_arch(asm_test PRIVATE avx2) target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED) + if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") + target_compile_options(asm_test PRIVATE -fno-stack-protector) + endif () add_custom_command(TARGET asm_test POST_BUILD COMMAND objconv -fyasm $<TARGET_FILE:asm_test>) endif() @@ -113,30 +120,32 @@ function(add_x86_test ARCH) endfunction() +message(STATUS "ARCH_TESTS = ${ARCH_TESTS}") + +if (ARCH_TESTS AND ARCH_TESTS STREQUAL "ON") + set (ARCH_LIST generic sse2 ssse3 sse42 avx avx2 avx512) +else () + string (REPLACE "," ";" ARCH_LIST "${ARCH_TESTS}") +endif () + +if (MSVC AND NOT CLANG) + list(REMOVE_ITEM ARCH_LIST generic) +endif () + +message(STATUS "Testing for ${ARCH_LIST}") + if (ARCH_TESTS) - if (NOT MSVC OR CLANG) - add_x86_test(generic) - endif () - add_x86_test(sse2) - add_x86_test(sse3) - add_x86_test(ssse3) - add_x86_test(sse41) - add_x86_test(avx) - add_x86_test(avx2) - add_x86_test(avx512) + + foreach(A IN LISTS ARCH_LIST) + add_x86_test(${A}) + endforeach() if (ARCH_TESTS_MULTI) add_executable(all_tests_multiarch all_tests.cpp) target_compile_definitions(all_tests_multiarch PRIVATE KFR_MULTI_ARCH) - target_link_libraries(all_tests_multiarch - all_tests_multiarch_sse2 - all_tests_multiarch_sse3 - all_tests_multiarch_ssse3 - all_tests_multiarch_sse41 - all_tests_multiarch_avx - all_tests_multiarch_avx2 - all_tests_multiarch_avx512 - ) + foreach(A IN LISTS ARCH_LIST) + target_link_libraries(all_tests_multiarch all_tests_multiarch_${A}) + endforeach() endif () endif() @@ -162,13 +171,8 @@ if (NOT IOS) COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests) if (ARCH_TESTS) - add_test(NAME generic COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_generic ) - add_test(NAME sse2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse2 ) - add_test(NAME sse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse3 ) - add_test(NAME ssse3 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_ssse3 ) - add_test(NAME sse41 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_sse41 ) - add_test(NAME avx COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx ) - add_test(NAME avx2 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx2 ) - add_test(NAME avx512 COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_avx512 ) + foreach(A IN LISTS ARCH_LIST) + add_test(NAME ${A} COMMAND ${EMULATOR} ${PROJECT_BINARY_DIR}/bin/all_tests_${A}) + endforeach() endif () endif () diff --git a/tests/asm_test.cpp b/tests/asm_test.cpp @@ -4,12 +4,20 @@ * See LICENSE.txt for details */ +#define KFR_EXTENDED_TESTS + #include <kfr/base.hpp> #include <kfr/io.hpp> #include <kfr/testo/console_colors.hpp> using namespace kfr; +#ifdef CMT_COMPILER_MSVC +#define KFR_PUBLIC CMT_PUBLIC_C CMT_DLL_EXPORT +#else +#define KFR_PUBLIC CMT_PUBLIC_C +#endif + #define TEST_ASM_8(fn, ty, MACRO) \ MACRO(fn, ty, 1) \ MACRO(fn, ty, 2) \ @@ -43,12 +51,6 @@ using namespace kfr; MACRO(fn, ty, 8) \ MACRO(fn, ty, 16) -#ifdef CMT_COMPILER_MSVC -#define KFR_PUBLIC CMT_PUBLIC_C CMT_DLL_EXPORT -#else -#define KFR_PUBLIC CMT_PUBLIC_C -#endif - #define TEST_ASM_VTY1(fn, ty, n) \ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n>& r, const vec<ty, n>& x) { r = kfr::fn(x); } @@ -121,6 +123,28 @@ using namespace kfr; #define TEST_ASM_DOUBLE1(fn, ty, n) \ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x) { r = kfr::fn(x); } +#define TEST_READ(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__aligned(vec<ty, n>& __restrict r, \ + const ty* __restrict x) \ + { \ + r = kfr::fn<n, true>(x); \ + } \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__unaligned(vec<ty, n> & __restrict r, \ + const ty* __restrict x) \ + { \ + r = kfr::fn<n, false>(x); \ + } + +#define TEST_WRITE(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__aligned(ty* __restrict p, const vec<ty, n>& x) \ + { \ + kfr::fn<true>(p, x); \ + } \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n##__unaligned(ty * __restrict p, const vec<ty, n>& x) \ + { \ + kfr::fn<false>(p, x); \ + } + #define TEST_ASM_U(fn, MACRO) \ TEST_ASM_8(fn, u8, MACRO) \ TEST_ASM_16(fn, u16, MACRO) \ @@ -203,10 +227,14 @@ TEST_ASM_UIF(make_vector, TEST_ASM_MAKE_VECTOR) TEST_ASM_UIF(broadcast, TEST_ASM_BROADCAST) +TEST_ASM_UIF(read, TEST_READ) + +TEST_ASM_UIF(write, TEST_WRITE) + namespace kfr { #ifdef KFR_SHOW_NOT_OPTIMIZED -CMT_PUBLIC_C CMT_DLL_EXPORT void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); } +KFR_PUBLIC void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); } #endif } // namespace kfr diff --git a/tests/unit/simd/vec.cpp b/tests/unit/simd/vec.cpp @@ -110,5 +110,25 @@ TEST(cast) [](auto t, special_value x) { return is_in_range_of<f64>(x.get<subtype<type_of<decltype(t)>>>()); }); } +TEST(unaligned_read) +{ + testo::matrix(named("type") = numeric_vector_types<vec>, [](auto type) { + using T = type_of<decltype(type)>; + using Tsub = subtype<T>; + constexpr static size_t N = T::size(); + Tsub data[N * 2]; + for (size_t i = 0; i < arraysize(data); i++) + { + data[i] = static_cast<Tsub>(i); + } + + for (size_t i = 0; i < N; i++) + { +// testo::scope sc(as_string("i = ", i)); + CHECK(read<N, false>(data + i) == (enumerate<Tsub, N>() + static_cast<Tsub>(i))); + } + }); +} + } // namespace CMT_ARCH_NAME } // namespace kfr