kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 10faf3feba5ec538d51e2072844e4745fa6a1f59
parent e06725ed1fd70a142acdcd52cbe9cdeb7a2731cf
Author: [email protected] <[email protected]>
Date:   Tue, 19 Nov 2019 15:18:46 +0000

KFR 4.0

Diffstat:
MCMakeLists.txt | 20+++++++++++++-------
Minclude/kfr/cident.h | 2+-
Minclude/kfr/cometa.hpp | 36++++++++++++++++++++++++++++++++++++
Minclude/kfr/dft/impl/dft-fft.hpp | 9+++++++++
Minclude/kfr/dft/impl/fft-impl.hpp | 17+++--------------
Minclude/kfr/dft/impl/ft.hpp | 2+-
Minclude/kfr/kfr.h | 4++--
Minclude/kfr/simd/impl/backend_generic.hpp | 628+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
Minclude/kfr/simd/impl/read_write.hpp | 4++++
Minclude/kfr/simd/platform.hpp | 5+++++
Minclude/kfr/simd/vec.hpp | 25++++++++++++++++++++++++-
Mtests/CMakeLists.txt | 3+--
Mtests/asm_test.cpp | 210+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++--
13 files changed, 921 insertions(+), 44 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -17,15 +17,11 @@ cmake_minimum_required(VERSION 3.1) -message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}) - set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS}" CACHE STRING "compile flags" FORCE) -message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS}) - project(kfr CXX) -set(CMAKE_CXX_STANDARD 14) +set(CMAKE_CXX_STANDARD 17) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS ON) @@ -56,11 +52,15 @@ endif() include(sources.cmake) option(ENABLE_TESTS "Enable tests and examples" OFF) -option(ENABLE_DFT "Enable DFT and related algorithms." ON) if (CLANG) + option(ENABLE_DFT "Enable DFT and related algorithms." ON) + option(ENABLE_DFT_NP "Enable Non-power of 2 DFT" ON) if (X86) option(ENABLE_DFT_MULTIARCH "Build DFT static libraries for various architectures. Requires Clang" OFF) endif () +else () + option(ENABLE_DFT "Enable DFT and related algorithms." OFF) + option(ENABLE_DFT_NP "Enable Non-power of 2 DFT" OFF) endif() option(ENABLE_ASMTEST "Enable writing disassembly" OFF) option(REGENERATE_TESTS "Regenerate auto tests" OFF) @@ -81,7 +81,7 @@ if (CPU_ARCH STREQUAL "detect" AND X86) RUN_RESULT COMPILE_RESULT "${CMAKE_CURRENT_BINARY_DIR}/tmpdir" ${CMAKE_CURRENT_SOURCE_DIR}/cmake/detect_cpu.cpp - CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CMAKE_CURRENT_SOURCE_DIR}/include" -DCMAKE_CXX_STANDARD=14 + CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CMAKE_CURRENT_SOURCE_DIR}/include" -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_STANDARD_REQUIRED=ON -DCMAKE_CXX_EXTENSIONS=ON COMPILE_OUTPUT_VARIABLE COMPILE_OUT RUN_OUTPUT_VARIABLE RUN_OUT @@ -177,6 +177,12 @@ if (ENABLE_DFT) target_compile_options(kfr_dft PRIVATE -ffast-math) endif() + if (ENABLE_DFT_NP) + target_compile_definitions(kfr_dft PUBLIC -DKFR_DFT_NPo2) + else () + target_compile_definitions(kfr_dft PUBLIC -DKFR_DFT_NO_NPo2) + endif () + if (ENABLE_DFT_MULTIARCH) add_subdirectory(dft) endif () diff --git a/include/kfr/cident.h b/include/kfr/cident.h @@ -625,7 +625,7 @@ extern char* gets(char* __s); "clang-msvc-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \ __clang_patchlevel__) #else -#define CMT_COMPILER_NAME "clang" +#define CMT_COMPILER_NAME "clang-mingw" #define CMT_COMPILER_FULL_NAME \ "clang-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \ __clang_patchlevel__) diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp @@ -428,6 +428,19 @@ struct cvals_t : ops::empty { return {}; } + + constexpr bool operator==(cvals_t<T, values...>) const noexcept { return true; } + template <T... values2> + constexpr bool operator==(cvals_t<T, values2...>) const noexcept + { + return false; + } + + template <T... values2> + constexpr bool operator!=(cvals_t<T, values...> ind) const noexcept + { + return !operator==(ind); + } }; template <typename T> @@ -1849,6 +1862,29 @@ constexpr conditional<std::is_scalar<T>::value, T, const T&> const_min(const T& return x < y ? x : y; } +template <typename T> +constexpr T cminof(cvals_t<T>) +{ + return std::numeric_limits<T>::max(); +} +template <typename T, T val, T... vals> +constexpr T cminof(cvals_t<T, val, vals...>) +{ + T m = cminof(cvals<T, vals...>); + return val < m ? val : m; +} +template <typename T> +constexpr T cmaxof(cvals_t<T>) +{ + return std::numeric_limits<T>::min(); +} +template <typename T, T val, T... vals> +constexpr T cmaxof(cvals_t<T, val, vals...>) +{ + T m = cmaxof(cvals<T, vals...>); + return val > m ? val : m; +} + template <int n = 10> struct overload_priority : overload_priority<n - 1> { diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp @@ -39,6 +39,15 @@ namespace kfr { #define DFT_STAGE_FN \ + KFR_MEM_INTRINSIC void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) final \ + { \ + return do_execute<false>(out, in, temp); \ + } \ + KFR_MEM_INTRINSIC void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) final \ + { \ + return do_execute<true>(out, in, temp); \ + } +#define DFT_STAGE_FN_NONFINAL \ void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ { \ return do_execute<false>(out, in, temp); \ diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp @@ -329,7 +329,7 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cf transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3); - cwrite<32, aligned>(out, bitreverse<2>(concat(z0, z1, z2, z3))); + cwrite<32, aligned>(out, bitreverse<2>(concat(concat(z0, z1), concat(z2, z3)))); out += 32; } return {}; @@ -430,7 +430,6 @@ struct fft_stage_impl : dft_stage<T> align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment); } -protected: constexpr static bool prefetch = true; constexpr static bool aligned = false; constexpr static size_t width = fft_vector_width<T>; @@ -470,7 +469,6 @@ struct fft_final_stage_impl : dft_stage<T> this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment); } -protected: constexpr static size_t width = fft_vector_width<T>; constexpr static bool is_even = cometa::is_even(ilog2(size)); constexpr static bool use_br2 = !is_even; @@ -496,7 +494,7 @@ protected: init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle); } - DFT_STAGE_FN + DFT_STAGE_FN_NONFINAL template <bool inverse> KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) { @@ -563,7 +561,6 @@ struct fft_reorder_stage_impl : dft_stage<T> this->data_size = 0; } -protected: virtual void do_initialize(size_t) override final {} DFT_STAGE_FN @@ -582,7 +579,6 @@ struct fft_specialization<T, 1> : dft_stage<T> { fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } -protected: constexpr static bool aligned = false; DFT_STAGE_FN @@ -600,7 +596,6 @@ struct fft_specialization<T, 2> : dft_stage<T> { fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } -protected: constexpr static bool aligned = false; DFT_STAGE_FN template <bool inverse> @@ -609,7 +604,7 @@ protected: cvec<T, 1> a0, a1, a2, a3; split(cread<4>(in), a0, a1, a2, a3); butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3); - cwrite<4>(out, concat(a0, a1, a2, a3)); + cwrite<4>(out, concat(concat(a0, a1), concat(a2, a3))); } }; @@ -618,7 +613,6 @@ struct fft_specialization<T, 3> : dft_stage<T> { fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } -protected: constexpr static bool aligned = false; DFT_STAGE_FN template <bool inverse> @@ -635,7 +629,6 @@ struct fft_specialization<T, 4> : dft_stage<T> { fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } -protected: constexpr static bool aligned = false; DFT_STAGE_FN template <bool inverse> @@ -652,7 +645,6 @@ struct fft_specialization<T, 5> : dft_stage<T> { fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } -protected: constexpr static bool aligned = false; DFT_STAGE_FN template <bool inverse> @@ -669,7 +661,6 @@ struct fft_specialization<T, 6> : dft_stage<T> { fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); } -protected: constexpr static bool aligned = false; DFT_STAGE_FN template <bool inverse> @@ -689,7 +680,6 @@ struct fft_specialization<T, 7> : dft_stage<T> this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment); } -protected: constexpr static bool aligned = false; constexpr static size_t width = vector_width<T>; constexpr static bool use_br2 = true; @@ -748,7 +738,6 @@ struct fft_specialization<float, 8> : dft_stage<float> this->temp_size = sizeof(complex<float>) * 256; } -protected: using T = float; DFT_STAGE_FN template <bool inverse> diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp @@ -121,7 +121,7 @@ KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src) template <size_t N, bool A = false, typename T> KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value) { - value.write(ptr_cast<T>(dest)); + value.write(ptr_cast<T>(dest), cbool_t<A>()); } template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices> diff --git a/include/kfr/kfr.h b/include/kfr/kfr.h @@ -8,9 +8,9 @@ #include "cident.h" -#define KFR_VERSION_MAJOR 3 +#define KFR_VERSION_MAJOR 4 #define KFR_VERSION_MINOR 0 -#define KFR_VERSION_PATCH 9 +#define KFR_VERSION_PATCH 0 #define KFR_VERSION_LABEL "" #define KFR_VERSION_STRING \ diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp @@ -49,10 +49,12 @@ namespace intrinsics { template <typename T, size_t N> -using simd = typename simd_type<unwrap_bit<T>, N>::type; +using simd = typename simd_type<unwrap_bit<T>, next_poweroftwo(static_cast<size_t>(N))>::type; template <typename T, size_t N, typename U> -union simd_small_array { +struct simd_small_array +{ + static_assert(is_poweroftwo(N), ""); static_assert(sizeof(T) * N == sizeof(U), ""); U whole; @@ -60,7 +62,11 @@ union simd_small_array { constexpr static size_t size = N; using packed_type = U; +#ifdef _MSC_VER KFR_INTRINSIC constexpr simd_small_array() CMT_NOEXCEPT = default; +#else + KFR_INTRINSIC simd_small_array() CMT_NOEXCEPT {} +#endif KFR_INTRINSIC constexpr simd_small_array(U whole) CMT_NOEXCEPT : whole(whole) {} @@ -73,6 +79,57 @@ union simd_small_array { KFR_INTRINSIC static constexpr simd_small_array from(U whole) CMT_NOEXCEPT { return { whole }; } }; +template <> +struct simd_small_array<f32, 2, f64> +{ + f64 whole; + + using value_type = f32; + constexpr static size_t size = 2; + using packed_type = f64; + +#ifdef _MSC_VER + KFR_INTRINSIC constexpr simd_small_array() CMT_NOEXCEPT = default; +#else + KFR_INTRINSIC simd_small_array() CMT_NOEXCEPT {} +#endif + + KFR_INTRINSIC constexpr simd_small_array(f64 whole) CMT_NOEXCEPT : whole(whole) {} + + KFR_INTRINSIC simd_small_array(f32 x, f32 y) CMT_NOEXCEPT + { +#ifdef _MSC_VER +#ifdef CMT_ARCH_SSE2 + whole = _mm_cvtsd_f64(_mm_castps_pd(_mm_setr_ps(x, y, x, y))); +#else + union { + struct + { + f32 x; + f32 y; + }; + f64 r; + } u; + u.x = x; + u.y = y; + whole = u.r; +#endif +#else + union { + struct + { + f32 x; + f32 y; + }; + f64 r; + } u{ { x, y } }; + whole = u.r; +#endif + } + + KFR_INTRINSIC static constexpr simd_small_array from(f64 whole) CMT_NOEXCEPT { return { whole }; } +}; + template <typename T> struct is_simd_small_array : cfalse_t { @@ -124,6 +181,10 @@ KFR_SIMD_SMALL_TYPE(i16, 4, u64) KFR_SIMD_SMALL_TYPE(i32, 2, u64) #ifdef CMT_ARCH_SSE +#ifndef KFR_f32x2_array +KFR_SIMD_SMALL_TYPE(f32, 2, f64) +#endif + KFR_SIMD_TYPE(f32, 4, __m128) KFR_SIMD_TYPE(f64, 2, __m128d) #endif // CMT_ARCH_SSE @@ -206,6 +267,9 @@ KFR_SIMD_TYPE(f64, 2, float64x2_t); // specializations +template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<T, Nout> universal_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>); + #ifdef KFR_NATIVE_INTRINSICS #define KFR_GEN_ty(n, ty) ty(n) @@ -219,8 +283,14 @@ KFR_SIMD_TYPE(f64, 2, float64x2_t); } #ifdef CMT_ARCH_SSE2 -inline __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT { return _mm_set_epi64x(q1, q0); } -inline __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT + +KFR_INTRINSIC double take_hi_sd(__m128d x) { return _mm_cvtsd_f64(_mm_unpackhi_pd(x, x)); } + +KFR_INTRINSIC __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT +{ + return _mm_set_epi64x(q1, q0); +} +KFR_INTRINSIC __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT { return _mm_set_epi32(q3, q2, q1, q0); } @@ -263,6 +333,36 @@ KFR_INTRIN_BROADCAST(i32, 2, simd<i32, 2>(value, value)) KFR_INTRIN_BROADCAST(u32, 2, simd<u32, 2>(value, value)) KFR_INTRIN_BROADCAST(f32, 2, simd<f32, 2>{ value, value }) +template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<float, Nout> simd_shuffle(simd_t<float, N>, const simd<float, N>& x, + csizes_t<indices...> ind, overload_priority<2>) CMT_NOEXCEPT +{ + return universal_shuffle(simd_t<float, N>{}, x, ind); +} + +template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<double, Nout> simd_shuffle(simd_t<double, N>, const simd<double, N>& x, + csizes_t<indices...> ind, overload_priority<2>) CMT_NOEXCEPT +{ + return universal_shuffle(simd_t<double, N>{}, x, ind); +} + +template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<float, Nout> simd_shuffle(simd2_t<float, N, N>, const simd<float, N>& x, + const simd<float, N>& y, csizes_t<indices...> ind, + overload_priority<2>) CMT_NOEXCEPT +{ + return universal_shuffle(simd_t<float, 2 * N>{}, simd_from_halves(simd_t<float, 2 * N>{}, x, y), ind); +} + +template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)> +KFR_INTRINSIC simd<double, Nout> simd_shuffle(simd2_t<double, N, N>, const simd<double, N>& x, + const simd<double, N>& y, csizes_t<indices...> ind, + overload_priority<2>) CMT_NOEXCEPT +{ + return universal_shuffle(simd_t<double, 2 * N>{}, simd_from_halves(simd_t<double, 2 * N>{}, x, y), ind); +} + #define KFR_INTRIN_SHUFFLE_DUPHALVES(T, N, ...) \ KFR_INTRINSIC simd<T, N * 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, \ decltype(csizeseq<N * 2> % csize<N>), overload_priority<9>) \ @@ -300,6 +400,16 @@ KFR_INTRIN_BROADCAST(f32, 2, simd<f32, 2>{ value, value }) return __VA_ARGS__; \ } +KFR_INTRINSIC __m128 KFR_swap_ps(__m128 x) { return _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)); } + +#ifndef KFR_f32x2_array +// KFR_INTRIN_SHUFFLE_CONCAT(f32, 2, _mm_castpd_ps(_mm_setr_pd(x.whole, y.whole))) +KFR_INTRIN_SHUFFLE_SWAP(f32, 2, _mm_cvtsd_f64(_mm_castps_pd(KFR_swap_ps(_mm_castpd_ps(_mm_set_sd(x.whole)))))) +#else +KFR_INTRIN_SHUFFLE_CONCAT(f32, 2, _mm_setr_ps(x.low, x.high, y.low, y.high)) +KFR_INTRIN_SHUFFLE_SWAP(f32, 2, simd<f32, 2>(x.high, x.low)) +#endif + #if defined CMT_COMPILER_MSVC && !defined CMT_COMPILER_CLANG && defined CMT_ARCH_X32 KFR_INTRINSIC __m128i _mm_cvtsi64_si128(int64_t u) { @@ -364,7 +474,11 @@ KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x)) KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x)) KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x)) KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 2>>(_mm_cvtsd_f64(_mm_castps_pd(x)))) +#ifndef KFR_f32x2_array +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole))) +#else KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_unpacklo_ps(_mm_set_ss(x.low), _mm_set_ss(x.high))) +#endif KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x)) KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x)))) @@ -390,6 +504,13 @@ KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 4, 8, 4, simd<i16, 4>::from(KFR_u64sse_INDE KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 2, 4, 2, simd<u32, 2>::from(KFR_u64sse_INDEX(x, 1))) KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDEX(x, 1))) +#ifndef KFR_f32x2_array +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 4, 2, simd<f32, 2>::from(take_hi_sd(_mm_castps_pd(x)))) +#else +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 4, 2, + simd_halves<f32, 2>{ KFR_f32sse_INDEX(x, 2), KFR_f32sse_INDEX(x, 3) }) +#endif + #define KFR_INTRIN_CONVERT(Tout, Tin, N, ...) \ KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \ { \ @@ -492,14 +613,14 @@ KFR_INTRIN_BITCAST(f64, i64, 4, _mm256_castsi256_pd(x)) KFR_INTRIN_BITCAST(i64, f64, 4, _mm256_castpd_si256(x)) #ifndef CMT_ARCH_AVX2 -KFR_INTRIN_SHUFFLE_DUPHALVES(i8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) ) -KFR_INTRIN_SHUFFLE_DUPHALVES(u8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) ) -KFR_INTRIN_SHUFFLE_DUPHALVES(i16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) ) -KFR_INTRIN_SHUFFLE_DUPHALVES(u16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) ) -KFR_INTRIN_SHUFFLE_DUPHALVES(i32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) ) -KFR_INTRIN_SHUFFLE_DUPHALVES(u32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) ) -KFR_INTRIN_SHUFFLE_DUPHALVES(i64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) ) -KFR_INTRIN_SHUFFLE_DUPHALVES(u64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) ) +KFR_INTRIN_SHUFFLE_DUPHALVES(i8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(i16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(i32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(i64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) +KFR_INTRIN_SHUFFLE_DUPHALVES(u64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1)) #endif KFR_INTRINSIC __m256 KFR_mm256_setr_m128(__m128 x, __m128 y) @@ -562,9 +683,15 @@ KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 16, _mm256_castsi256_si128(x)) KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 8, _mm256_castsi256_si128(x)) KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 4, _mm256_castsi256_si128(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 8, _mm_cvtsd_f64(_mm_castps_pd(_mm256_castps256_ps128(x)))) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 2, take_hi_sd(_mm_castps_pd(_mm256_castps256_ps128(x)))) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 4, _mm_cvtsd_f64(_mm_castps_pd(_mm256_extractf128_ps(x, 1)))) +KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 6, take_hi_sd(_mm_castps_pd(_mm256_extractf128_ps(x, 1)))) + // extend KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 4, _mm256_castps128_ps256(x)) KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 2, _mm256_castpd128_pd256(x)) +KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 2, _mm256_castps128_ps256(_mm_castpd_ps(_mm_set_sd(x.whole)))) // high KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4, 8, 4, _mm256_extractf128_ps(x, 1)) @@ -1222,6 +1349,483 @@ KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index, arr.val[index] = x; return from_simd_array(arr); } + +#define SIMD_TYPE_INTRIN(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO) \ + KFR_INTRINSIC T simd_to_scalar(simd_t<T, N>, const simd<T, N>& x) { return TO_SCALAR; } \ + KFR_INTRINSIC simd<T, N> simd_from_scalar(simd_t<T, N>, T x) { return FROM_SCALAR; } \ + KFR_INTRINSIC simd<T, N> simd_from_broadcast(simd_t<T, N>, T x) { return FROM_BROADCAST; } \ + KFR_INTRINSIC simd<T, N> simd_from_zero(simd_t<T, N>) { return FROM_ZERO; } + +#define SIMD_TYPE_INTRIN_EX(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO, GET_LOW, GET_HIGH, \ + FROM_HALVES) \ + SIMD_TYPE_INTRIN(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO) \ + KFR_INTRINSIC simd<T, N / 2> simd_get_low(simd_t<T, N>, const simd<T, N>& x) { return GET_LOW; } \ + KFR_INTRINSIC simd<T, N / 2> simd_get_high(simd_t<T, N>, const simd<T, N>& x) { return GET_HIGH; } \ + KFR_INTRINSIC simd<T, N> simd_from_halves(simd_t<T, N>, const simd<T, N / 2>& x, \ + const simd<T, N / 2>& y) \ + { \ + return FROM_HALVES; \ + } + +template <typename T, size_t Nout, size_t Nin> +KFR_INTRINSIC simd<T, Nout> simd_from_partial(simd2_t<T, Nout, Nin>, const simd<T, Nin>& x) +{ +#ifdef _MSC_VER + union { + simd<T, Nin> in; + simd<T, Nout> out; + } u; + u.in = x; + return u.out; +#else + union { + simd<T, Nin> in; + simd<T, Nout> out; + } u{ x }; + return u.out; +#endif +} +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N / 2> simd_get_low(simd_t<T, N>, const simd<T, N>& x) +{ + return x.low; +} +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N / 2> simd_get_high(simd_t<T, N>, const simd<T, N>& x) +{ + return x.high; +} +template <typename T, size_t N> +KFR_INTRINSIC simd<T, N> simd_from_halves(simd_t<T, N>, const simd<T, N / 2>& x, const simd<T, N / 2>& y) +{ + return { x, y }; +} + +KFR_INTRINSIC simd<float, 4> simd_from_halves(simd_t<float, 4>, const simd<float, 2>& x, + const simd<float, 2>& y) +{ + return _mm_castpd_ps(_mm_setr_pd(x.whole, y.whole)); +} + +KFR_INTRINSIC simd<double, 2> simd_from_halves(simd_t<double, 2>, const simd<double, 1>& x, + const simd<double, 1>& y) +{ + return _mm_setr_pd(x, y); +} + +SIMD_TYPE_INTRIN(f32, 4, _mm_cvtss_f32(x), _mm_set_ss(x), _mm_set1_ps(x), _mm_setzero_ps()) +SIMD_TYPE_INTRIN(f64, 2, _mm_cvtsd_f64(x), _mm_set_sd(x), _mm_set1_pd(x), _mm_setzero_pd()) + +#ifdef CMT_ARCH_AVX +SIMD_TYPE_INTRIN_EX(f32, 8, _mm256_cvtss_f32(x), _mm256_castps128_ps256(_mm_set_ss(x)), _mm256_set1_ps(x), + _mm256_setzero_ps(), _mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1), + KFR_mm256_setr_m128(x, y)) +SIMD_TYPE_INTRIN_EX(f64, 4, _mm256_cvtsd_f64(x), _mm256_castpd128_pd256(_mm_set_sd(x)), _mm256_set1_pd(x), + _mm256_setzero_pd(), _mm256_castpd256_pd128(x), _mm256_extractf128_pd(x, 1), + KFR_mm256_setr_m128d(x, y)) +#endif + +#ifdef CMT_ARCH_AVX512 +SIMD_TYPE_INTRIN_EX(f32, 16, _mm512_cvtss_f32(x), _mm512_castps128_ps512(_mm_set_ss(x)), _mm512_set1_ps(x), + _mm512_setzero_ps(), _mm512_castps512_ps256(x), _mm512_extractf32x8_ps(x, 1), + KFR_mm512_setr_m256(x, y)) +SIMD_TYPE_INTRIN_EX(f64, 8, _mm512_cvtsd_f64(x), _mm512_castpd128_pd512(_mm_set_sd(x)), _mm512_set1_pd(x), + _mm512_setzero_pd(), _mm512_castpd512_pd256(x), _mm512_extractf64x4_pd(x, 1), + KFR_mm512_setr_m256d(x, y)) +#endif + +template <size_t bits, size_t...> +struct shuffle_mask; + +template <size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7> +struct shuffle_mask<8, i0, i1, i2, i3, i4, i5, i6, i7> +{ + constexpr static inline size_t Nmax = 1; + constexpr static inline size_t value = (const_min(i7, Nmax) << 7) | (const_min(i6, Nmax) << 6) | + (const_min(i5, Nmax) << 5) | (const_min(i4, Nmax) << 4) | + (const_min(i3, Nmax) << 3) | (const_min(i2, Nmax) << 2) | + (const_min(i1, Nmax) << 1) | const_min(i0, Nmax); +}; + +template <size_t i0, size_t i1, size_t i2, size_t i3> +struct shuffle_mask<8, i0, i1, i2, i3> +{ + constexpr static inline size_t Nmax = 3; + constexpr static inline size_t value = (const_min(i3, Nmax) << 6) | (const_min(i2, Nmax) << 4) | + (const_min(i1, Nmax) << 2) | const_min(i0, Nmax); +}; + +template <size_t i0, size_t i1, size_t i2, size_t i3> +struct shuffle_mask<4, i0, i1, i2, i3> +{ + constexpr static inline size_t Nmax = 1; + constexpr static inline size_t value = (const_min(i3, Nmax) << 3) | (const_min(i2, Nmax) << 2) | + (const_min(i1, Nmax) << 1) | const_min(i0, Nmax); +}; + +template <size_t i0, size_t i1> +struct shuffle_mask<2, i0, i1> +{ + constexpr static inline size_t Nmax = 1; + constexpr static inline size_t value = (const_min(i1, Nmax) << 1) | const_min(i0, Nmax); +}; + +#ifdef CMT_ARCH_SSE2 + +template <size_t I0, size_t I1, size_t I2, size_t I3> +KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 4>, const simd<float, 4>& x, + csizes_t<I0, I1, I2, I3>) +{ + // SSE -> SSE + return _mm_shuffle_ps(x, x, shuffle_mask<8, I0, I1, I2, I3>::value); +} + +template <size_t I0, size_t I1> +KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x, csizes_t<I0, I1>) +{ + // SSE -> SSE + return _mm_shuffle_pd(x, x, shuffle_mask<2, I0, I1>::value); +} +#endif + +#ifdef CMT_ARCH_AVX512 + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8, + size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15> +KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( + simd_t<float, 16>, const simd<float, 16>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>) +{ + // AVX512 -> AVX512 + return _mm512_permutexvar_ps( + _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), x); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> +KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) +{ + // AVX512 -> AVX512 + return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7), x); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> +KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 16>, const simd<float, 16>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) +{ + // AVX512 -> AVX + return _mm512_castps512_ps256(_mm512_permutexvar_ps( + _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I0, I1, I2, I3, I4, I5, I6, I7), x)); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3> +KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 16>, const simd<float, 16>& x, + csizes_t<I0, I1, I2, I3>) +{ + // AVX512 -> SSE + return _mm512_castps512_ps128(_mm512_permutexvar_ps( + _mm512_setr_epi32(I0, I1, I2, I3, I0, I1, I2, I3, I0, I1, I2, I3, I0, I1, I2, I3), x)); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3> +KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x, + csizes_t<I0, I1, I2, I3>) +{ + // AVX512 -> AVX + return _mm512_castpd512_pd256( + _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I0, I1, I2, I3), x)); +} + +template <size_t I0, size_t I1> +KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x, csizes_t<I0, I1>) +{ + // AVX512 -> SSE + return _mm512_castpd512_pd128( + _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I0, I1, I0, I1, I0, I1), x)); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8, + size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15> +KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( + simd_t<float, 8>, const simd<float, 8>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>) +{ + // AVX -> AVX512 + return _mm512_permutexvar_ps( + _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), + _mm512_castps256_ps512(x)); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8, + size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15> +KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( + simd_t<float, 4>, const simd<float, 4>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>) +{ + // SSE -> AVX512 + return _mm512_permutexvar_ps( + _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), + _mm512_castps128_ps512(x)); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> +KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) +{ + // AVX -> AVX512 + return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7), + _mm512_castpd256_pd512(x)); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> +KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) +{ + // SSE -> AVX512 + return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7), + _mm512_castpd128_pd512(x)); +} + +#endif + +#ifdef CMT_ARCH_AVX + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> +KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 8>, const simd<float, 8>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) +{ + // AVX -> AVX + if constexpr (cmaxof(csizes<I0, I1, I2, I3>) < 4 && csizes<I0, I1, I2, I3> == csizes<I4, I5, I6, I7>) + { + const simd<float, 4> tmp = universal_shuffle(simd_t<float, 4>{}, simd_get_low(simd_t<float, 8>{}, x), + csizes<I0, I1, I2, I3>); + return simd_from_halves(simd_t<float, 8>{}, tmp, tmp); + } + else if constexpr (cmaxof(csizes<I0, I1, I2, I3>) < 4 && cminof(csizes<I4, I5, I6, I7>) >= 4) + { + if constexpr (csizes<I0, I1, I2, I3, I4, I5, I6, I7> == + csizes<I0, I1, I2, I3, I0 + 4, I1 + 4, I2 + 4, I3 + 4>) + { + return _mm256_shuffle_ps(x, x, shuffle_mask<8, I0, I1, I2, I3>::value); + } + else + { + return simd_from_halves(simd_t<float, 8>{}, + universal_shuffle(simd_t<float, 4>{}, simd_get_low(simd_t<float, 8>{}, x), + shuffle_mask<8, I0, I1, I2, I3>::value), + universal_shuffle(simd_t<float, 4>{}, + simd_get_high(simd_t<float, 8>{}, x), + shuffle_mask<8, I4, I5, I6, I7>::value)); + } + } + else + { + const __m256 sw = _mm256_permute2f128_ps(x, x, 1); // swap lanes + const __m256 t1 = _mm256_permutevar_ps( + x, _mm256_setr_epi32(I0 % 4, I1 % 4, I2 % 4, I3 % 4, I4 % 4, I5 % 4, I6 % 4, I7 % 4)); + const __m256 t2 = _mm256_permutevar_ps( + sw, _mm256_setr_epi32(I0 % 4, I1 % 4, I2 % 4, I3 % 4, I4 % 4, I5 % 4, I6 % 4, I7 % 4)); + return _mm256_blend_ps(t1, t2, + shuffle_mask<8, I0 / 4, I1 / 4, I2 / 4, I3 / 4, 1 - I4 / 4, 1 - I5 / 4, + 1 - I6 / 4, 1 - I7 / 4>::value); + } +} + +template <size_t I0, size_t I1, size_t I2, size_t I3> +KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x, + csizes_t<I0, I1, I2, I3>) +{ + // AVX -> AVX + if constexpr (cmaxof(csizes<I0, I1>) < 2 && csizes<I0, I1> == csizes<I2, I3>) + { + const simd<double, 2> tmp = + universal_shuffle(simd_t<double, 2>{}, simd_get_low(simd_t<double, 4>{}, x), csizes<I0, I1>); + return simd_from_halves(simd_t<double, 4>{}, tmp, tmp); + } + else if constexpr (cmaxof(csizes<I0, I1>) < 4 && cminof(csizes<I2, I3>) >= 4) + { + if constexpr (csizes<I0, I1, I2, I3> == csizes<I0, I1, I2 + 2, I3 + 2>) + { + return _mm256_shuffle_ps(x, x, shuffle_mask<2, I0, I1>::value); + } + else + { + return simd_from_halves( + simd_t<double, 4>{}, + universal_shuffle(simd_t<double, 2>{}, simd_get_low(simd_t<double, 4>{}, x), + shuffle_mask<2, I0, I1>::value), + universal_shuffle(simd_t<double, 2>{}, simd_get_high(simd_t<double, 4>{}, x), + shuffle_mask<2, I2, I3>::value)); + } + } + else + { + const __m256d sw = _mm256_permute2f128_pd(x, x, 1); // swap lanes + const __m256d t1 = _mm256_permutevar_pd( + x, _mm256_setr_epi64x((I0 % 2) << 1, (I1 % 2) << 1, (I2 % 2) << 1, (I3 % 2) << 1)); + const __m256d t2 = _mm256_permutevar_pd( + sw, _mm256_setr_epi64x((I0 % 2) << 1, (I1 % 2) << 1, (I2 % 2) << 1, (I3 % 2) << 1)); + return _mm256_blend_pd(t1, t2, shuffle_mask<4, I0 / 2, I1 / 2, 1 - I2 / 2, 1 - I3 / 2>::value); + } +} + +template <size_t I0, size_t I1, size_t I2, size_t I3> +KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 8>, const simd<float, 8>& x, + csizes_t<I0, I1, I2, I3>) +{ + // AVX -> SSE + if constexpr (I0 % 4 == 0 && I1 % 4 == 1 && I2 % 4 == 2 && I3 % 4 == 3) + { + __m128 t1 = simd_get_low(simd_t<float, 8>{}, x); + __m128 t2 = simd_get_high(simd_t<float, 8>{}, x); + return _mm_blend_ps(t1, t2, shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value); + } + else + { + __m128 t1 = simd_get_low(simd_t<float, 8>{}, x); + __m128 t2 = simd_get_high(simd_t<float, 8>{}, x); + t1 = _mm_permute_ps(t1, shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value); + t2 = _mm_permute_ps(t2, shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value); + return _mm_blend_ps(t1, t2, shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value); + } +} + +template <size_t I0, size_t I1> +KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x, csizes_t<I0, I1>) +{ + // AVX -> SSE + if constexpr (I0 % 2 == 0 && I1 % 2 == 1) + { + __m128d t1 = simd_get_low(simd_t<double, 4>{}, x); + __m128d t2 = simd_get_high(simd_t<double, 4>{}, x); + return _mm_blend_pd(t1, t2, shuffle_mask<2, I0 / 2, I1 / 2>::value); + } + else + { + __m128d t1 = simd_get_low(simd_t<double, 4>{}, x); + __m128d t2 = simd_get_high(simd_t<double, 4>{}, x); + t1 = _mm_permute_pd(t1, shuffle_mask<2, I0 % 2, I1 % 2>::value); + t2 = _mm_permute_pd(t2, shuffle_mask<2, I0 % 2, I1 % 2>::value); + return _mm_blend_pd(t1, t2, shuffle_mask<2, I0 / 2, I1 / 2>::value); + } +} + +template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> +KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 4>, const simd<float, 4>& x, + csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) +{ + // SSE -> AVX + return KFR_mm256_setr_m128(_mm_shuffle_ps(x, x, shuffle_mask<8, I0, I1, I2, I3>::value), + _mm_shuffle_ps(x, x, shuffle_mask<8, I4, I5, I6, I7>::value)); +} + +template <size_t I0, size_t I1, size_t I2, size_t I3> +KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x, + csizes_t<I0, I1, I2, I3>) +{ + // SSE -> AVX + return KFR_mm256_setr_m128d(_mm_shuffle_pd(x, x, shuffle_mask<2, I0, I1>::value), + _mm_shuffle_pd(x, x, shuffle_mask<2, I2, I3>::value)); +} + +#endif + +template <typename T, size_t Nin, size_t... indices, size_t Nout> +KFR_INTRINSIC simd<T, Nout> universal_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, csizes_t<indices...>) +{ + using Indices = csizes_t<indices...>; + + constexpr bool floating = typeclass<T> == datatype::f; + + constexpr size_t minwidth = minimum_vector_width<T>; + constexpr size_t maxwidth = vector_width<T>; + constexpr size_t minindex = cminof(Indices{}); + constexpr size_t maxindex = cmaxof(csizes<(indices >= Nin ? 0 : indices)...>); + + if constexpr (Nin == 1 && Nout == 1) + { + return x; + } + else if constexpr (next_poweroftwo(Nin) == next_poweroftwo(Nout) && Indices{} == csizeseq<Nout>) + { + return x; + } + else if constexpr (!is_poweroftwo(Nin) || !is_poweroftwo(Nout)) + { + // Fix if not power of two + return universal_shuffle( + simd_t<T, next_poweroftwo(Nin)>{}, x, + cconcat(Indices{}, csizeseq<next_poweroftwo(Nout) - Nout, index_undefined, 0>)); + } + else if constexpr (Nout < minwidth) + { + // Expand indices if less than vector + const simd<T, minwidth> tmp = universal_shuffle( + simd_t<T, Nin>{}, x, cconcat(Indices{}, csizeseq<minwidth - Nout, index_undefined, 0>)); + + if constexpr (Nout == 1) + { + return simd_to_scalar(simd_t<T, minwidth>{}, tmp); + } + else + { + union { + simd<T, minwidth> tmp; + simd<T, Nout> r; + } u{ tmp }; + return u.r; + } + } + else if constexpr (Nout > maxwidth) + { + auto lowi = Indices{}[csizeseq<Nout / 2, 0>]; + auto highi = Indices{}[csizeseq<Nout / 2, Nout / 2>]; + if constexpr (lowi == highi) + { + auto tmp = universal_shuffle(simd_t<T, Nin>{}, x, lowi); + return { tmp, tmp }; + } + else + { + return { universal_shuffle(simd_t<T, Nin>{}, x, lowi), + universal_shuffle(simd_t<T, Nin>{}, x, highi) }; + } + } + else if constexpr (minindex >= Nin) + { + return simd_from_zero(simd_t<T, Nout>{}); + } + else if constexpr (Nin == 1) + { + return simd_from_broadcast(simd_t<T, Nout>{}, x); + } + else if constexpr (Nin < minwidth) + { + return universal_shuffle(simd_t<T, minwidth>{}, simd_from_partial(simd2_t<T, minwidth, Nin>{}, x), + Indices{}); + } + else if constexpr (Nin > Nout && maxindex < Nin / 2) + { + return universal_shuffle(simd_t<T, Nin / 2>{}, simd_get_low(simd_t<T, Nin>{}, x), Indices{}); + } + else if constexpr (Nin > Nout && minindex >= Nin / 2) + { + return universal_shuffle(simd_t<T, Nin / 2>{}, simd_get_high(simd_t<T, Nin>{}, x), + csizes<(indices < Nin ? indices - csize<Nin / 2> : indices)...>); + } + else if constexpr (Nin >= minwidth && Nin <= maxwidth && Nout >= minwidth && Nout <= maxwidth) + { + return simd_vec_shuffle(simd_t<T, Nin>{}, x, Indices{}); + } + else + { + not_optimized(CMT_FUNC_SIGNATURE); + const simd_array<T, Nin> xx = to_simd_array<T, Nin>(x); + constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... }; + return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, Nin>(xx, indices_array)); + } +} + } // namespace intrinsics } // namespace CMT_ARCH_NAME } // namespace kfr diff --git a/include/kfr/simd/impl/read_write.hpp b/include/kfr/simd/impl/read_write.hpp @@ -147,8 +147,12 @@ KFR_INTRINSIC void write(cunaligned_t, T* ptr, const vec<T, 1>& x) } KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32x2& x) { +#ifndef KFR_f32x2_array + *reinterpret_cast<f64*>(ptr) = x.v.whole; +#else ptr[0] = x.v.low; ptr[1] = x.v.high; +#endif } KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8x2& x) { *reinterpret_cast<u16*>(ptr) = x.v.whole; } diff --git a/include/kfr/simd/platform.hpp b/include/kfr/simd/platform.hpp @@ -260,6 +260,11 @@ constexpr static size_t vector_width = (const_max(size_t(1), typeclass<T> == datatype::f ? platform<>::native_float_vector_size / sizeof(T) : platform<>::native_int_vector_size / sizeof(T))); +template <typename T, cpu_t cpu> +constexpr static size_t vector_width_for = + (const_max(size_t(1), typeclass<T> == datatype::f ? platform<cpu>::native_float_vector_size / sizeof(T) + : platform<cpu>::native_int_vector_size / sizeof(T))); + template <typename T> constexpr static size_t minimum_vector_width = (const_max(size_t(1), typeclass<T> == datatype::f ? platform<>::minimum_float_vector_size / sizeof(T) diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp @@ -207,7 +207,7 @@ struct alignas(force_compiletime_size_t< // from SIMD KFR_MEM_INTRINSIC vec(const simd_type& simd) CMT_NOEXCEPT : v(simd) {} // default - KFR_MEM_INTRINSIC constexpr vec() CMT_NOEXCEPT = default; + KFR_MEM_INTRINSIC vec() CMT_NOEXCEPT {} // copy KFR_MEM_INTRINSIC constexpr vec(const vec& value) CMT_NOEXCEPT = default; // move @@ -345,6 +345,12 @@ struct alignas(force_compiletime_size_t< intrinsics::simd_t<unwrap_bit<ST>, SN>{}, v, csizeseq<SW, SW * index>, overload_auto)); } + template <size_t index> + KFR_MEM_INTRINSIC constexpr value_type get() const CMT_NOEXCEPT + { + return this->get(csize_t<index>{}); + } + template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && compound_type_traits<T>::is_scalar)> KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT { @@ -1318,3 +1324,19 @@ struct flt_type_impl<kfr::vec<T, N>> CMT_PRAGMA_GNU(GCC diagnostic pop) CMT_PRAGMA_MSVC(warning(pop)) + +namespace std +{ + +template <typename T, size_t N> +class tuple_size<kfr::vec<T, N>> : public integral_constant<size_t, N> +{ +}; + +template <size_t I, class T, size_t N> +struct tuple_element<I, kfr::vec<T, N>> +{ + using type = T; +}; + +} // namespace std +\ No newline at end of file diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -33,13 +33,12 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/") if (ENABLE_ASMTEST) add_executable(asm_test asm_test.cpp) target_link_libraries(asm_test kfr) - target_set_arch(asm_test PRIVATE avx) + target_set_arch(asm_test PRIVATE avx2) target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED) target_compile_definitions(asm_test PRIVATE KFR_FUNCTION_IS_INTRINSIC) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") target_compile_options(asm_test PRIVATE -fno-stack-protector) endif () - message("CMAKE_CXX_COMPILER_ID = ${CMAKE_CXX_COMPILER_ID}") if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") target_compile_options(asm_test PRIVATE -GS-) target_compile_options(asm_test PRIVATE -Gs16384) diff --git a/tests/asm_test.cpp b/tests/asm_test.cpp @@ -7,6 +7,7 @@ #define KFR_EXTENDED_TESTS #include <kfr/base.hpp> +#include <kfr/dft/impl/fft-impl.hpp> #include <kfr/io.hpp> #include <kfr/testo/console_colors.hpp> @@ -119,6 +120,13 @@ using namespace kfr; { \ r = kfr::fn(x, y); \ } +#define TEST_ASM_QUAD4(fn, ty, n) \ + KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 4>& r, const vec<ty, n>& x, \ + const vec<ty, n>& y, const vec<ty, n>& z, \ + const vec<ty, n>& w) \ + { \ + r = kfr::fn(x, y, z, w); \ + } #define TEST_ASM_DOUBLE1(fn, ty, n) \ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x) { r = kfr::fn(x); } @@ -167,6 +175,8 @@ using namespace kfr; #define TEST_ASM_IF(fn, MACRO) TEST_ASM_I(fn, MACRO) TEST_ASM_F(fn, MACRO) +#if 1 + TEST_ASM_UIF(add, TEST_ASM_VTY2) TEST_ASM_UIF(sub, TEST_ASM_VTY2) @@ -207,6 +217,14 @@ TEST_ASM_UIF(high, TEST_ASM_HALF1) TEST_ASM_UIF(concat, TEST_ASM_DOUBLE2) +template <typename... Args> +KFR_INTRINSIC decltype(auto) concat4(const Args&... args) +{ + return concat(args...); +} + +TEST_ASM_UIF(concat4, TEST_ASM_QUAD4) + TEST_ASM_UIF(shl, TEST_ASM_SHIFT) TEST_ASM_UIF(shr, TEST_ASM_SHIFT) @@ -217,10 +235,6 @@ TEST_ASM_UIF(shr, TEST_ASM_SHIFT_SCALAR) TEST_ASM_UIF(duphalfs, TEST_ASM_DOUBLE1) -TEST_ASM_F(sin, TEST_ASM_VTY1_F) - -TEST_ASM_F(cos, TEST_ASM_VTY1_F) - TEST_ASM_UIF(sqr, TEST_ASM_VTY1) TEST_ASM_UIF(make_vector, TEST_ASM_MAKE_VECTOR) @@ -231,11 +245,199 @@ TEST_ASM_UIF(read, TEST_READ) TEST_ASM_UIF(write, TEST_WRITE) +#define TEST_FFT_SPEC(ty, size) \ + static intrinsics::fft_specialization<ty, size> fft__##ty##__##size(static_cast<size_t>(1 << size)); \ + KFR_PUBLIC void asm__test__fft__##ty##__##size(complex<ty>* out, const complex<ty>* in, u8* temp) \ + { \ + fft__##ty##__##size.do_execute<false>(out, in, temp); \ + } \ + KFR_PUBLIC void asm__test__ifft__##ty##__##size(complex<ty>* out, const complex<ty>* in, u8* temp) \ + { \ + fft__##ty##__##size.do_execute<true>(out, in, temp); \ + } +#define TEST_FFT_GEN(ty) \ + static intrinsics::fft_stage_impl<ty, true, true> fft__##ty##__##size(static_cast<size_t>(65526)); \ + KFR_PUBLIC void asm__test__fft__##ty##__gen(complex<ty>* out, const complex<ty>* in, u8* temp) \ + { \ + fft__##ty##__##size.do_execute<false>(out, in, temp); \ + } \ + KFR_PUBLIC void asm__test__ifft__##ty##__gen(complex<ty>* out, const complex<ty>* in, u8* temp) \ + { \ + fft__##ty##__##size.do_execute<true>(out, in, temp); \ + } + +TEST_FFT_SPEC(f32, 1) +TEST_FFT_SPEC(f32, 2) +TEST_FFT_SPEC(f32, 3) +TEST_FFT_SPEC(f32, 4) +TEST_FFT_SPEC(f64, 1) +TEST_FFT_SPEC(f64, 2) +TEST_FFT_SPEC(f64, 3) +TEST_FFT_SPEC(f64, 4) + +TEST_FFT_GEN(f32) +TEST_FFT_GEN(f64) + +#endif + +TEST_ASM_F(sin, TEST_ASM_VTY1_F) + +TEST_ASM_F(cos, TEST_ASM_VTY1_F) + namespace kfr { + #ifdef KFR_SHOW_NOT_OPTIMIZED KFR_PUBLIC void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); } #endif } // namespace kfr +KFR_PUBLIC void test_shuffle_old1(f32x1& x, const f32x4& y) +{ + x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<2>, overload_auto); +} + +KFR_PUBLIC void test_shuffle_old2(f32x4& x, const f32x4& y) +{ + x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<3, 2, 1, 0>, + overload_auto); +} + +KFR_PUBLIC void test_shuffle_old3(f32x4& x, const f32x4& y) +{ + x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<0, 1, 2, 3>, + overload_auto); +} + +KFR_PUBLIC void test_shuffle_old4(f32x2& x, const f32x4& y) +{ + x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<2, 3>, overload_auto); +} + +KFR_PUBLIC void test_shuffle_old5(f32x8& x, const f32x4& y) +{ + x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, + csizes<3, 2, 1, 0, 0, 1, 2, 3>, overload_auto); +} + +KFR_PUBLIC void test_shuffle_old6(f32x8& x, const f32x4& y) +{ + x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, + csizes<7, 6, 5, 4, 3, 2, 1, 0>, overload_auto); +} + +KFR_PUBLIC void test_shuffle_old9(vec<f32, 3>& x, const vec<f32, 15>& y) +{ + x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<3, 2, 1>, + overload_auto); +} + +KFR_PUBLIC void test_shuffle_new1(f32x1& x, const f32x4& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<2>); +} + +KFR_PUBLIC void test_shuffle_new2(f32x4& x, const f32x4& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<3, 2, 1, 0>); +} + +KFR_PUBLIC void test_shuffle_new3(f32x4& x, const f32x4& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<0, 1, 2, 3>); +} + +KFR_PUBLIC void test_shuffle_new4(f32x2& x, const f32x4& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<2, 3>); +} + +KFR_PUBLIC void test_shuffle_new5(f32x8& x, const f32x4& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, + csizes<3, 2, 1, 0, 0, 1, 2, 3>); +} + +KFR_PUBLIC void test_shuffle_new6(f32x8& x, const f32x4& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, + csizes<7, 6, 5, 4, 3, 2, 1, 0>); +} + +KFR_PUBLIC void test_shuffle_new7(f32x1& x, const f32x32& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 32>{}, (y + 1.f).v, csizes<19>); +} + +KFR_PUBLIC void test_shuffle_new8(f32x8& x, const f32x8& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 8>{}, y.v, + csizes<3, 2, 1, 0, 3, 2, 1, 0>); +} + +KFR_PUBLIC void test_shuffle_new9(vec<f32, 3>& x, const vec<f32, 15>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<3, 2, 1>); +} + +KFR_PUBLIC void test_shuffle_new9a(vec<f32, 3>& x, const vec<f32, 15>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<5, 6, 7>); +} + +KFR_PUBLIC void test_shuffle_new9b(vec<f32, 3>& x, const vec<f32, 15>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<11, 11, 11>); +} + +KFR_PUBLIC void test_shuffle_new9c(vec<f32, 3>& x, const vec<f32, 15>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<3, 4, 5>); +} + +KFR_PUBLIC void test_shuffle_new10(vec<f32, 15>& x) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 1>{}, 0.f, + csizes<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>); +} +KFR_PUBLIC void test_shuffle_new11(vec<f32, 15>& x, float y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 1>{}, y, + csizes<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>); +} +KFR_PUBLIC void test_shuffle_new12(vec<f32, 32>& x, const vec<f32, 32>& y) +{ + x.v = + kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 32>{}, y.v, csizeseq<32> ^ csize<1>); +} +KFR_PUBLIC void test_shuffle_new13(vec<f32, 8>& x, const vec<f32, 8>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 8>{}, y.v, + csizes<0, 2, 4, 6, 1, 3, 5, 7>); +} +KFR_PUBLIC void test_shuffle_new14(vec<f32, 8>& x, const vec<f32, 8>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 8>{}, y.v, + csizes<0, 4, 1, 5, 2, 6, 3, 7>); +} +KFR_PUBLIC void test_shuffle_new15(vec<f32, 4>& x, const vec<f32, 8>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 8>{}, y.v, csizes<0, 5, 2, 7>); +} +KFR_PUBLIC void test_shuffle_new16(vec<f32, 2>& x, const vec<f32, 2>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 2>{}, y.v, csizes<1, 0>); +} +KFR_PUBLIC void test_shuffle_new17(vec<f32, 16>& x, const vec<f32, 16>& y) +{ + x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 16>{}, y.v, + csizes<0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15>); +} + +KFR_PUBLIC float tuple_assign() +{ + auto [x, y, z, w] = f32x4(1.f, 2.f, 3.f, 4.f); + return x + y * y + z * z * z + w * w * w * w; +} + int main() { println(library_version()); }