commit 10faf3feba5ec538d51e2072844e4745fa6a1f59
parent e06725ed1fd70a142acdcd52cbe9cdeb7a2731cf
Author: [email protected] <[email protected]>
Date: Tue, 19 Nov 2019 15:18:46 +0000
KFR 4.0
Diffstat:
13 files changed, 921 insertions(+), 44 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -17,15 +17,11 @@
cmake_minimum_required(VERSION 3.1)
-message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS})
-
set(CMAKE_CXX_FLAGS " ${CMAKE_CXX_FLAGS}" CACHE STRING "compile flags" FORCE)
-message(STATUS CMAKE_CXX_FLAGS = ${CMAKE_CXX_FLAGS})
-
project(kfr CXX)
-set(CMAKE_CXX_STANDARD 14)
+set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
@@ -56,11 +52,15 @@ endif()
include(sources.cmake)
option(ENABLE_TESTS "Enable tests and examples" OFF)
-option(ENABLE_DFT "Enable DFT and related algorithms." ON)
if (CLANG)
+ option(ENABLE_DFT "Enable DFT and related algorithms." ON)
+ option(ENABLE_DFT_NP "Enable Non-power of 2 DFT" ON)
if (X86)
option(ENABLE_DFT_MULTIARCH "Build DFT static libraries for various architectures. Requires Clang" OFF)
endif ()
+else ()
+ option(ENABLE_DFT "Enable DFT and related algorithms." OFF)
+ option(ENABLE_DFT_NP "Enable Non-power of 2 DFT" OFF)
endif()
option(ENABLE_ASMTEST "Enable writing disassembly" OFF)
option(REGENERATE_TESTS "Regenerate auto tests" OFF)
@@ -81,7 +81,7 @@ if (CPU_ARCH STREQUAL "detect" AND X86)
RUN_RESULT COMPILE_RESULT
"${CMAKE_CURRENT_BINARY_DIR}/tmpdir"
${CMAKE_CURRENT_SOURCE_DIR}/cmake/detect_cpu.cpp
- CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CMAKE_CURRENT_SOURCE_DIR}/include" -DCMAKE_CXX_STANDARD=14
+ CMAKE_FLAGS "-DINCLUDE_DIRECTORIES=${CMAKE_CURRENT_SOURCE_DIR}/include" -DCMAKE_CXX_STANDARD=17
-DCMAKE_CXX_STANDARD_REQUIRED=ON -DCMAKE_CXX_EXTENSIONS=ON
COMPILE_OUTPUT_VARIABLE COMPILE_OUT
RUN_OUTPUT_VARIABLE RUN_OUT
@@ -177,6 +177,12 @@ if (ENABLE_DFT)
target_compile_options(kfr_dft PRIVATE -ffast-math)
endif()
+ if (ENABLE_DFT_NP)
+ target_compile_definitions(kfr_dft PUBLIC -DKFR_DFT_NPo2)
+ else ()
+ target_compile_definitions(kfr_dft PUBLIC -DKFR_DFT_NO_NPo2)
+ endif ()
+
if (ENABLE_DFT_MULTIARCH)
add_subdirectory(dft)
endif ()
diff --git a/include/kfr/cident.h b/include/kfr/cident.h
@@ -625,7 +625,7 @@ extern char* gets(char* __s);
"clang-msvc-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \
__clang_patchlevel__)
#else
-#define CMT_COMPILER_NAME "clang"
+#define CMT_COMPILER_NAME "clang-mingw"
#define CMT_COMPILER_FULL_NAME \
"clang-" CMT_STRINGIFY(__clang_major__) "." CMT_STRINGIFY(__clang_minor__) "." CMT_STRINGIFY( \
__clang_patchlevel__)
diff --git a/include/kfr/cometa.hpp b/include/kfr/cometa.hpp
@@ -428,6 +428,19 @@ struct cvals_t : ops::empty
{
return {};
}
+
+ constexpr bool operator==(cvals_t<T, values...>) const noexcept { return true; }
+ template <T... values2>
+ constexpr bool operator==(cvals_t<T, values2...>) const noexcept
+ {
+ return false;
+ }
+
+ template <T... values2>
+ constexpr bool operator!=(cvals_t<T, values...> ind) const noexcept
+ {
+ return !operator==(ind);
+ }
};
template <typename T>
@@ -1849,6 +1862,29 @@ constexpr conditional<std::is_scalar<T>::value, T, const T&> const_min(const T&
return x < y ? x : y;
}
+template <typename T>
+constexpr T cminof(cvals_t<T>)
+{
+ return std::numeric_limits<T>::max();
+}
+template <typename T, T val, T... vals>
+constexpr T cminof(cvals_t<T, val, vals...>)
+{
+ T m = cminof(cvals<T, vals...>);
+ return val < m ? val : m;
+}
+template <typename T>
+constexpr T cmaxof(cvals_t<T>)
+{
+ return std::numeric_limits<T>::min();
+}
+template <typename T, T val, T... vals>
+constexpr T cmaxof(cvals_t<T, val, vals...>)
+{
+ T m = cmaxof(cvals<T, vals...>);
+ return val > m ? val : m;
+}
+
template <int n = 10>
struct overload_priority : overload_priority<n - 1>
{
diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp
@@ -39,6 +39,15 @@ namespace kfr
{
#define DFT_STAGE_FN \
+ KFR_MEM_INTRINSIC void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) final \
+ { \
+ return do_execute<false>(out, in, temp); \
+ } \
+ KFR_MEM_INTRINSIC void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) final \
+ { \
+ return do_execute<true>(out, in, temp); \
+ }
+#define DFT_STAGE_FN_NONFINAL \
void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \
{ \
return do_execute<false>(out, in, temp); \
diff --git a/include/kfr/dft/impl/fft-impl.hpp b/include/kfr/dft/impl/fft-impl.hpp
@@ -329,7 +329,7 @@ KFR_INTRINSIC ctrue_t radix4_pass(csize_t<32>, size_t blocks, csize_t<width>, cf
transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3);
- cwrite<32, aligned>(out, bitreverse<2>(concat(z0, z1, z2, z3)));
+ cwrite<32, aligned>(out, bitreverse<2>(concat(concat(z0, z1), concat(z2, z3))));
out += 32;
}
return {};
@@ -430,7 +430,6 @@ struct fft_stage_impl : dft_stage<T>
align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
}
-protected:
constexpr static bool prefetch = true;
constexpr static bool aligned = false;
constexpr static size_t width = fft_vector_width<T>;
@@ -470,7 +469,6 @@ struct fft_final_stage_impl : dft_stage<T>
this->data_size = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment);
}
-protected:
constexpr static size_t width = fft_vector_width<T>;
constexpr static bool is_even = cometa::is_even(ilog2(size));
constexpr static bool use_br2 = !is_even;
@@ -496,7 +494,7 @@ protected:
init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle);
}
- DFT_STAGE_FN
+ DFT_STAGE_FN_NONFINAL
template <bool inverse>
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
{
@@ -563,7 +561,6 @@ struct fft_reorder_stage_impl : dft_stage<T>
this->data_size = 0;
}
-protected:
virtual void do_initialize(size_t) override final {}
DFT_STAGE_FN
@@ -582,7 +579,6 @@ struct fft_specialization<T, 1> : dft_stage<T>
{
fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-protected:
constexpr static bool aligned = false;
DFT_STAGE_FN
@@ -600,7 +596,6 @@ struct fft_specialization<T, 2> : dft_stage<T>
{
fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-protected:
constexpr static bool aligned = false;
DFT_STAGE_FN
template <bool inverse>
@@ -609,7 +604,7 @@ protected:
cvec<T, 1> a0, a1, a2, a3;
split(cread<4>(in), a0, a1, a2, a3);
butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3);
- cwrite<4>(out, concat(a0, a1, a2, a3));
+ cwrite<4>(out, concat(concat(a0, a1), concat(a2, a3)));
}
};
@@ -618,7 +613,6 @@ struct fft_specialization<T, 3> : dft_stage<T>
{
fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-protected:
constexpr static bool aligned = false;
DFT_STAGE_FN
template <bool inverse>
@@ -635,7 +629,6 @@ struct fft_specialization<T, 4> : dft_stage<T>
{
fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-protected:
constexpr static bool aligned = false;
DFT_STAGE_FN
template <bool inverse>
@@ -652,7 +645,6 @@ struct fft_specialization<T, 5> : dft_stage<T>
{
fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-protected:
constexpr static bool aligned = false;
DFT_STAGE_FN
template <bool inverse>
@@ -669,7 +661,6 @@ struct fft_specialization<T, 6> : dft_stage<T>
{
fft_specialization(size_t) { this->name = type_name<decltype(*this)>(); }
-protected:
constexpr static bool aligned = false;
DFT_STAGE_FN
template <bool inverse>
@@ -689,7 +680,6 @@ struct fft_specialization<T, 7> : dft_stage<T>
this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
}
-protected:
constexpr static bool aligned = false;
constexpr static size_t width = vector_width<T>;
constexpr static bool use_br2 = true;
@@ -748,7 +738,6 @@ struct fft_specialization<float, 8> : dft_stage<float>
this->temp_size = sizeof(complex<float>) * 256;
}
-protected:
using T = float;
DFT_STAGE_FN
template <bool inverse>
diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp
@@ -121,7 +121,7 @@ KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src)
template <size_t N, bool A = false, typename T>
KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value)
{
- value.write(ptr_cast<T>(dest));
+ value.write(ptr_cast<T>(dest), cbool_t<A>());
}
template <size_t count, size_t N, size_t stride, bool A, typename T, size_t... indices>
diff --git a/include/kfr/kfr.h b/include/kfr/kfr.h
@@ -8,9 +8,9 @@
#include "cident.h"
-#define KFR_VERSION_MAJOR 3
+#define KFR_VERSION_MAJOR 4
#define KFR_VERSION_MINOR 0
-#define KFR_VERSION_PATCH 9
+#define KFR_VERSION_PATCH 0
#define KFR_VERSION_LABEL ""
#define KFR_VERSION_STRING \
diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp
@@ -49,10 +49,12 @@ namespace intrinsics
{
template <typename T, size_t N>
-using simd = typename simd_type<unwrap_bit<T>, N>::type;
+using simd = typename simd_type<unwrap_bit<T>, next_poweroftwo(static_cast<size_t>(N))>::type;
template <typename T, size_t N, typename U>
-union simd_small_array {
+struct simd_small_array
+{
+ static_assert(is_poweroftwo(N), "");
static_assert(sizeof(T) * N == sizeof(U), "");
U whole;
@@ -60,7 +62,11 @@ union simd_small_array {
constexpr static size_t size = N;
using packed_type = U;
+#ifdef _MSC_VER
KFR_INTRINSIC constexpr simd_small_array() CMT_NOEXCEPT = default;
+#else
+ KFR_INTRINSIC simd_small_array() CMT_NOEXCEPT {}
+#endif
KFR_INTRINSIC constexpr simd_small_array(U whole) CMT_NOEXCEPT : whole(whole) {}
@@ -73,6 +79,57 @@ union simd_small_array {
KFR_INTRINSIC static constexpr simd_small_array from(U whole) CMT_NOEXCEPT { return { whole }; }
};
+template <>
+struct simd_small_array<f32, 2, f64>
+{
+ f64 whole;
+
+ using value_type = f32;
+ constexpr static size_t size = 2;
+ using packed_type = f64;
+
+#ifdef _MSC_VER
+ KFR_INTRINSIC constexpr simd_small_array() CMT_NOEXCEPT = default;
+#else
+ KFR_INTRINSIC simd_small_array() CMT_NOEXCEPT {}
+#endif
+
+ KFR_INTRINSIC constexpr simd_small_array(f64 whole) CMT_NOEXCEPT : whole(whole) {}
+
+ KFR_INTRINSIC simd_small_array(f32 x, f32 y) CMT_NOEXCEPT
+ {
+#ifdef _MSC_VER
+#ifdef CMT_ARCH_SSE2
+ whole = _mm_cvtsd_f64(_mm_castps_pd(_mm_setr_ps(x, y, x, y)));
+#else
+ union {
+ struct
+ {
+ f32 x;
+ f32 y;
+ };
+ f64 r;
+ } u;
+ u.x = x;
+ u.y = y;
+ whole = u.r;
+#endif
+#else
+ union {
+ struct
+ {
+ f32 x;
+ f32 y;
+ };
+ f64 r;
+ } u{ { x, y } };
+ whole = u.r;
+#endif
+ }
+
+ KFR_INTRINSIC static constexpr simd_small_array from(f64 whole) CMT_NOEXCEPT { return { whole }; }
+};
+
template <typename T>
struct is_simd_small_array : cfalse_t
{
@@ -124,6 +181,10 @@ KFR_SIMD_SMALL_TYPE(i16, 4, u64)
KFR_SIMD_SMALL_TYPE(i32, 2, u64)
#ifdef CMT_ARCH_SSE
+#ifndef KFR_f32x2_array
+KFR_SIMD_SMALL_TYPE(f32, 2, f64)
+#endif
+
KFR_SIMD_TYPE(f32, 4, __m128)
KFR_SIMD_TYPE(f64, 2, __m128d)
#endif // CMT_ARCH_SSE
@@ -206,6 +267,9 @@ KFR_SIMD_TYPE(f64, 2, float64x2_t);
// specializations
+template <typename T, size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<T, Nout> universal_shuffle(simd_t<T, N>, const simd<T, N>& x, csizes_t<indices...>);
+
#ifdef KFR_NATIVE_INTRINSICS
#define KFR_GEN_ty(n, ty) ty(n)
@@ -219,8 +283,14 @@ KFR_SIMD_TYPE(f64, 2, float64x2_t);
}
#ifdef CMT_ARCH_SSE2
-inline __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT { return _mm_set_epi64x(q1, q0); }
-inline __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT
+
+KFR_INTRINSIC double take_hi_sd(__m128d x) { return _mm_cvtsd_f64(_mm_unpackhi_pd(x, x)); }
+
+KFR_INTRINSIC __m128i KFR_mm_setr_epi64x(int64_t q0, int64_t q1) CMT_NOEXCEPT
+{
+ return _mm_set_epi64x(q1, q0);
+}
+KFR_INTRINSIC __m128i KFR_mm_setr_epi32(int32_t q0, int32_t q1, int32_t q2, int32_t q3) CMT_NOEXCEPT
{
return _mm_set_epi32(q3, q2, q1, q0);
}
@@ -263,6 +333,36 @@ KFR_INTRIN_BROADCAST(i32, 2, simd<i32, 2>(value, value))
KFR_INTRIN_BROADCAST(u32, 2, simd<u32, 2>(value, value))
KFR_INTRIN_BROADCAST(f32, 2, simd<f32, 2>{ value, value })
+template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<float, Nout> simd_shuffle(simd_t<float, N>, const simd<float, N>& x,
+ csizes_t<indices...> ind, overload_priority<2>) CMT_NOEXCEPT
+{
+ return universal_shuffle(simd_t<float, N>{}, x, ind);
+}
+
+template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<double, Nout> simd_shuffle(simd_t<double, N>, const simd<double, N>& x,
+ csizes_t<indices...> ind, overload_priority<2>) CMT_NOEXCEPT
+{
+ return universal_shuffle(simd_t<double, N>{}, x, ind);
+}
+
+template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<float, Nout> simd_shuffle(simd2_t<float, N, N>, const simd<float, N>& x,
+ const simd<float, N>& y, csizes_t<indices...> ind,
+ overload_priority<2>) CMT_NOEXCEPT
+{
+ return universal_shuffle(simd_t<float, 2 * N>{}, simd_from_halves(simd_t<float, 2 * N>{}, x, y), ind);
+}
+
+template <size_t N, size_t... indices, size_t Nout = sizeof...(indices)>
+KFR_INTRINSIC simd<double, Nout> simd_shuffle(simd2_t<double, N, N>, const simd<double, N>& x,
+ const simd<double, N>& y, csizes_t<indices...> ind,
+ overload_priority<2>) CMT_NOEXCEPT
+{
+ return universal_shuffle(simd_t<double, 2 * N>{}, simd_from_halves(simd_t<double, 2 * N>{}, x, y), ind);
+}
+
#define KFR_INTRIN_SHUFFLE_DUPHALVES(T, N, ...) \
KFR_INTRINSIC simd<T, N * 2> simd_shuffle(simd_t<T, N>, const simd<T, N>& x, \
decltype(csizeseq<N * 2> % csize<N>), overload_priority<9>) \
@@ -300,6 +400,16 @@ KFR_INTRIN_BROADCAST(f32, 2, simd<f32, 2>{ value, value })
return __VA_ARGS__; \
}
+KFR_INTRINSIC __m128 KFR_swap_ps(__m128 x) { return _mm_shuffle_ps(x, x, _MM_SHUFFLE(2, 3, 0, 1)); }
+
+#ifndef KFR_f32x2_array
+// KFR_INTRIN_SHUFFLE_CONCAT(f32, 2, _mm_castpd_ps(_mm_setr_pd(x.whole, y.whole)))
+KFR_INTRIN_SHUFFLE_SWAP(f32, 2, _mm_cvtsd_f64(_mm_castps_pd(KFR_swap_ps(_mm_castpd_ps(_mm_set_sd(x.whole))))))
+#else
+KFR_INTRIN_SHUFFLE_CONCAT(f32, 2, _mm_setr_ps(x.low, x.high, y.low, y.high))
+KFR_INTRIN_SHUFFLE_SWAP(f32, 2, simd<f32, 2>(x.high, x.low))
+#endif
+
#if defined CMT_COMPILER_MSVC && !defined CMT_COMPILER_CLANG && defined CMT_ARCH_X32
KFR_INTRINSIC __m128i _mm_cvtsi64_si128(int64_t u)
{
@@ -364,7 +474,11 @@ KFR_INTRIN_SHUFFLE_LINEAR(i64, 1, 2, _mm_cvtsi128_si64(x))
KFR_INTRIN_SHUFFLE_LINEAR(u64, 1, 2, _mm_cvtsi128_si64(x))
KFR_INTRIN_SHUFFLE_LINEAR(f32, 1, 4, _mm_cvtss_f32(x))
KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 4, bitcast_anything<simd<float, 2>>(_mm_cvtsd_f64(_mm_castps_pd(x))))
+#ifndef KFR_f32x2_array
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_castpd_ps(_mm_set_sd(x.whole)))
+#else
KFR_INTRIN_SHUFFLE_LINEAR(f32, 4, 2, _mm_unpacklo_ps(_mm_set_ss(x.low), _mm_set_ss(x.high)))
+#endif
KFR_INTRIN_SHUFFLE_LINEAR(f64, 1, 2, _mm_cvtsd_f64(x))
KFR_INTRIN_SHUFFLE_LINEAR(i8, 2, 16, simd<i8, 2>::from(u16(_mm_cvtsi128_si32(x))))
@@ -390,6 +504,13 @@ KFR_INTRIN_SHUFFLE_LINEAR_START(i16, 4, 8, 4, simd<i16, 4>::from(KFR_u64sse_INDE
KFR_INTRIN_SHUFFLE_LINEAR_START(u32, 2, 4, 2, simd<u32, 2>::from(KFR_u64sse_INDEX(x, 1)))
KFR_INTRIN_SHUFFLE_LINEAR_START(i32, 2, 4, 2, simd<i32, 2>::from(KFR_u64sse_INDEX(x, 1)))
+#ifndef KFR_f32x2_array
+KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 4, 2, simd<f32, 2>::from(take_hi_sd(_mm_castps_pd(x))))
+#else
+KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 4, 2,
+ simd_halves<f32, 2>{ KFR_f32sse_INDEX(x, 2), KFR_f32sse_INDEX(x, 3) })
+#endif
+
#define KFR_INTRIN_CONVERT(Tout, Tin, N, ...) \
KFR_INTRINSIC simd<Tout, N> simd_convert(simd_cvt_t<Tout, Tin, N>, const simd<Tin, N>& x) CMT_NOEXCEPT \
{ \
@@ -492,14 +613,14 @@ KFR_INTRIN_BITCAST(f64, i64, 4, _mm256_castsi256_pd(x))
KFR_INTRIN_BITCAST(i64, f64, 4, _mm256_castpd_si256(x))
#ifndef CMT_ARCH_AVX2
-KFR_INTRIN_SHUFFLE_DUPHALVES(i8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) )
-KFR_INTRIN_SHUFFLE_DUPHALVES(u8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) )
-KFR_INTRIN_SHUFFLE_DUPHALVES(i16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) )
-KFR_INTRIN_SHUFFLE_DUPHALVES(u16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) )
-KFR_INTRIN_SHUFFLE_DUPHALVES(i32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) )
-KFR_INTRIN_SHUFFLE_DUPHALVES(u32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) )
-KFR_INTRIN_SHUFFLE_DUPHALVES(i64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) )
-KFR_INTRIN_SHUFFLE_DUPHALVES(u64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1) )
+KFR_INTRIN_SHUFFLE_DUPHALVES(i8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1))
+KFR_INTRIN_SHUFFLE_DUPHALVES(u8, 16, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1))
+KFR_INTRIN_SHUFFLE_DUPHALVES(i16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1))
+KFR_INTRIN_SHUFFLE_DUPHALVES(u16, 8, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1))
+KFR_INTRIN_SHUFFLE_DUPHALVES(i32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1))
+KFR_INTRIN_SHUFFLE_DUPHALVES(u32, 4, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1))
+KFR_INTRIN_SHUFFLE_DUPHALVES(i64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1))
+KFR_INTRIN_SHUFFLE_DUPHALVES(u64, 2, _mm256_insertf128_si256(_mm256_castsi128_si256(x), x, 1))
#endif
KFR_INTRINSIC __m256 KFR_mm256_setr_m128(__m128 x, __m128 y)
@@ -562,9 +683,15 @@ KFR_INTRIN_SHUFFLE_LINEAR(u16, 8, 16, _mm256_castsi256_si128(x))
KFR_INTRIN_SHUFFLE_LINEAR(u32, 4, 8, _mm256_castsi256_si128(x))
KFR_INTRIN_SHUFFLE_LINEAR(u64, 2, 4, _mm256_castsi256_si128(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 2, 8, _mm_cvtsd_f64(_mm_castps_pd(_mm256_castps256_ps128(x))))
+KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 2, take_hi_sd(_mm_castps_pd(_mm256_castps256_ps128(x))))
+KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 4, _mm_cvtsd_f64(_mm_castps_pd(_mm256_extractf128_ps(x, 1))))
+KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 2, 8, 6, take_hi_sd(_mm_castps_pd(_mm256_extractf128_ps(x, 1))))
+
// extend
KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 4, _mm256_castps128_ps256(x))
KFR_INTRIN_SHUFFLE_LINEAR(f64, 2 * 2, 2, _mm256_castpd128_pd256(x))
+KFR_INTRIN_SHUFFLE_LINEAR(f32, 4 * 2, 2, _mm256_castps128_ps256(_mm_castpd_ps(_mm_set_sd(x.whole))))
// high
KFR_INTRIN_SHUFFLE_LINEAR_START(f32, 4, 8, 4, _mm256_extractf128_ps(x, 1))
@@ -1222,6 +1349,483 @@ KFR_INTRINSIC simd<T, N> simd_set_element(const simd<T, N>& value, size_t index,
arr.val[index] = x;
return from_simd_array(arr);
}
+
+#define SIMD_TYPE_INTRIN(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO) \
+ KFR_INTRINSIC T simd_to_scalar(simd_t<T, N>, const simd<T, N>& x) { return TO_SCALAR; } \
+ KFR_INTRINSIC simd<T, N> simd_from_scalar(simd_t<T, N>, T x) { return FROM_SCALAR; } \
+ KFR_INTRINSIC simd<T, N> simd_from_broadcast(simd_t<T, N>, T x) { return FROM_BROADCAST; } \
+ KFR_INTRINSIC simd<T, N> simd_from_zero(simd_t<T, N>) { return FROM_ZERO; }
+
+#define SIMD_TYPE_INTRIN_EX(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO, GET_LOW, GET_HIGH, \
+ FROM_HALVES) \
+ SIMD_TYPE_INTRIN(T, N, TO_SCALAR, FROM_SCALAR, FROM_BROADCAST, FROM_ZERO) \
+ KFR_INTRINSIC simd<T, N / 2> simd_get_low(simd_t<T, N>, const simd<T, N>& x) { return GET_LOW; } \
+ KFR_INTRINSIC simd<T, N / 2> simd_get_high(simd_t<T, N>, const simd<T, N>& x) { return GET_HIGH; } \
+ KFR_INTRINSIC simd<T, N> simd_from_halves(simd_t<T, N>, const simd<T, N / 2>& x, \
+ const simd<T, N / 2>& y) \
+ { \
+ return FROM_HALVES; \
+ }
+
+template <typename T, size_t Nout, size_t Nin>
+KFR_INTRINSIC simd<T, Nout> simd_from_partial(simd2_t<T, Nout, Nin>, const simd<T, Nin>& x)
+{
+#ifdef _MSC_VER
+ union {
+ simd<T, Nin> in;
+ simd<T, Nout> out;
+ } u;
+ u.in = x;
+ return u.out;
+#else
+ union {
+ simd<T, Nin> in;
+ simd<T, Nout> out;
+ } u{ x };
+ return u.out;
+#endif
+}
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N / 2> simd_get_low(simd_t<T, N>, const simd<T, N>& x)
+{
+ return x.low;
+}
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N / 2> simd_get_high(simd_t<T, N>, const simd<T, N>& x)
+{
+ return x.high;
+}
+template <typename T, size_t N>
+KFR_INTRINSIC simd<T, N> simd_from_halves(simd_t<T, N>, const simd<T, N / 2>& x, const simd<T, N / 2>& y)
+{
+ return { x, y };
+}
+
+KFR_INTRINSIC simd<float, 4> simd_from_halves(simd_t<float, 4>, const simd<float, 2>& x,
+ const simd<float, 2>& y)
+{
+ return _mm_castpd_ps(_mm_setr_pd(x.whole, y.whole));
+}
+
+KFR_INTRINSIC simd<double, 2> simd_from_halves(simd_t<double, 2>, const simd<double, 1>& x,
+ const simd<double, 1>& y)
+{
+ return _mm_setr_pd(x, y);
+}
+
+SIMD_TYPE_INTRIN(f32, 4, _mm_cvtss_f32(x), _mm_set_ss(x), _mm_set1_ps(x), _mm_setzero_ps())
+SIMD_TYPE_INTRIN(f64, 2, _mm_cvtsd_f64(x), _mm_set_sd(x), _mm_set1_pd(x), _mm_setzero_pd())
+
+#ifdef CMT_ARCH_AVX
+SIMD_TYPE_INTRIN_EX(f32, 8, _mm256_cvtss_f32(x), _mm256_castps128_ps256(_mm_set_ss(x)), _mm256_set1_ps(x),
+ _mm256_setzero_ps(), _mm256_castps256_ps128(x), _mm256_extractf128_ps(x, 1),
+ KFR_mm256_setr_m128(x, y))
+SIMD_TYPE_INTRIN_EX(f64, 4, _mm256_cvtsd_f64(x), _mm256_castpd128_pd256(_mm_set_sd(x)), _mm256_set1_pd(x),
+ _mm256_setzero_pd(), _mm256_castpd256_pd128(x), _mm256_extractf128_pd(x, 1),
+ KFR_mm256_setr_m128d(x, y))
+#endif
+
+#ifdef CMT_ARCH_AVX512
+SIMD_TYPE_INTRIN_EX(f32, 16, _mm512_cvtss_f32(x), _mm512_castps128_ps512(_mm_set_ss(x)), _mm512_set1_ps(x),
+ _mm512_setzero_ps(), _mm512_castps512_ps256(x), _mm512_extractf32x8_ps(x, 1),
+ KFR_mm512_setr_m256(x, y))
+SIMD_TYPE_INTRIN_EX(f64, 8, _mm512_cvtsd_f64(x), _mm512_castpd128_pd512(_mm_set_sd(x)), _mm512_set1_pd(x),
+ _mm512_setzero_pd(), _mm512_castpd512_pd256(x), _mm512_extractf64x4_pd(x, 1),
+ KFR_mm512_setr_m256d(x, y))
+#endif
+
+template <size_t bits, size_t...>
+struct shuffle_mask;
+
+template <size_t i0, size_t i1, size_t i2, size_t i3, size_t i4, size_t i5, size_t i6, size_t i7>
+struct shuffle_mask<8, i0, i1, i2, i3, i4, i5, i6, i7>
+{
+ constexpr static inline size_t Nmax = 1;
+ constexpr static inline size_t value = (const_min(i7, Nmax) << 7) | (const_min(i6, Nmax) << 6) |
+ (const_min(i5, Nmax) << 5) | (const_min(i4, Nmax) << 4) |
+ (const_min(i3, Nmax) << 3) | (const_min(i2, Nmax) << 2) |
+ (const_min(i1, Nmax) << 1) | const_min(i0, Nmax);
+};
+
+template <size_t i0, size_t i1, size_t i2, size_t i3>
+struct shuffle_mask<8, i0, i1, i2, i3>
+{
+ constexpr static inline size_t Nmax = 3;
+ constexpr static inline size_t value = (const_min(i3, Nmax) << 6) | (const_min(i2, Nmax) << 4) |
+ (const_min(i1, Nmax) << 2) | const_min(i0, Nmax);
+};
+
+template <size_t i0, size_t i1, size_t i2, size_t i3>
+struct shuffle_mask<4, i0, i1, i2, i3>
+{
+ constexpr static inline size_t Nmax = 1;
+ constexpr static inline size_t value = (const_min(i3, Nmax) << 3) | (const_min(i2, Nmax) << 2) |
+ (const_min(i1, Nmax) << 1) | const_min(i0, Nmax);
+};
+
+template <size_t i0, size_t i1>
+struct shuffle_mask<2, i0, i1>
+{
+ constexpr static inline size_t Nmax = 1;
+ constexpr static inline size_t value = (const_min(i1, Nmax) << 1) | const_min(i0, Nmax);
+};
+
+#ifdef CMT_ARCH_SSE2
+
+template <size_t I0, size_t I1, size_t I2, size_t I3>
+KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 4>, const simd<float, 4>& x,
+ csizes_t<I0, I1, I2, I3>)
+{
+ // SSE -> SSE
+ return _mm_shuffle_ps(x, x, shuffle_mask<8, I0, I1, I2, I3>::value);
+}
+
+template <size_t I0, size_t I1>
+KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x, csizes_t<I0, I1>)
+{
+ // SSE -> SSE
+ return _mm_shuffle_pd(x, x, shuffle_mask<2, I0, I1>::value);
+}
+#endif
+
+#ifdef CMT_ARCH_AVX512
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8,
+ size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15>
+KFR_INTRINSIC simd<float, 16> simd_vec_shuffle(
+ simd_t<float, 16>, const simd<float, 16>& x,
+ csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>)
+{
+ // AVX512 -> AVX512
+ return _mm512_permutexvar_ps(
+ _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), x);
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7>
+KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x,
+ csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>)
+{
+ // AVX512 -> AVX512
+ return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7), x);
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7>
+KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 16>, const simd<float, 16>& x,
+ csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>)
+{
+ // AVX512 -> AVX
+ return _mm512_castps512_ps256(_mm512_permutexvar_ps(
+ _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I0, I1, I2, I3, I4, I5, I6, I7), x));
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3>
+KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 16>, const simd<float, 16>& x,
+ csizes_t<I0, I1, I2, I3>)
+{
+ // AVX512 -> SSE
+ return _mm512_castps512_ps128(_mm512_permutexvar_ps(
+ _mm512_setr_epi32(I0, I1, I2, I3, I0, I1, I2, I3, I0, I1, I2, I3, I0, I1, I2, I3), x));
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3>
+KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x,
+ csizes_t<I0, I1, I2, I3>)
+{
+ // AVX512 -> AVX
+ return _mm512_castpd512_pd256(
+ _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I0, I1, I2, I3), x));
+}
+
+template <size_t I0, size_t I1>
+KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x, csizes_t<I0, I1>)
+{
+ // AVX512 -> SSE
+ return _mm512_castpd512_pd128(
+ _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I0, I1, I0, I1, I0, I1), x));
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8,
+ size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15>
+KFR_INTRINSIC simd<float, 16> simd_vec_shuffle(
+ simd_t<float, 8>, const simd<float, 8>& x,
+ csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>)
+{
+ // AVX -> AVX512
+ return _mm512_permutexvar_ps(
+ _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15),
+ _mm512_castps256_ps512(x));
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8,
+ size_t I9, size_t I10, size_t I11, size_t I12, size_t I13, size_t I14, size_t I15>
+KFR_INTRINSIC simd<float, 16> simd_vec_shuffle(
+ simd_t<float, 4>, const simd<float, 4>& x,
+ csizes_t<I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15>)
+{
+ // SSE -> AVX512
+ return _mm512_permutexvar_ps(
+ _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15),
+ _mm512_castps128_ps512(x));
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7>
+KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x,
+ csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>)
+{
+ // AVX -> AVX512
+ return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7),
+ _mm512_castpd256_pd512(x));
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7>
+KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x,
+ csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>)
+{
+ // SSE -> AVX512
+ return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7),
+ _mm512_castpd128_pd512(x));
+}
+
+#endif
+
+#ifdef CMT_ARCH_AVX
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7>
+KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 8>, const simd<float, 8>& x,
+ csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>)
+{
+ // AVX -> AVX
+ if constexpr (cmaxof(csizes<I0, I1, I2, I3>) < 4 && csizes<I0, I1, I2, I3> == csizes<I4, I5, I6, I7>)
+ {
+ const simd<float, 4> tmp = universal_shuffle(simd_t<float, 4>{}, simd_get_low(simd_t<float, 8>{}, x),
+ csizes<I0, I1, I2, I3>);
+ return simd_from_halves(simd_t<float, 8>{}, tmp, tmp);
+ }
+ else if constexpr (cmaxof(csizes<I0, I1, I2, I3>) < 4 && cminof(csizes<I4, I5, I6, I7>) >= 4)
+ {
+ if constexpr (csizes<I0, I1, I2, I3, I4, I5, I6, I7> ==
+ csizes<I0, I1, I2, I3, I0 + 4, I1 + 4, I2 + 4, I3 + 4>)
+ {
+ return _mm256_shuffle_ps(x, x, shuffle_mask<8, I0, I1, I2, I3>::value);
+ }
+ else
+ {
+ return simd_from_halves(simd_t<float, 8>{},
+ universal_shuffle(simd_t<float, 4>{}, simd_get_low(simd_t<float, 8>{}, x),
+ shuffle_mask<8, I0, I1, I2, I3>::value),
+ universal_shuffle(simd_t<float, 4>{},
+ simd_get_high(simd_t<float, 8>{}, x),
+ shuffle_mask<8, I4, I5, I6, I7>::value));
+ }
+ }
+ else
+ {
+ const __m256 sw = _mm256_permute2f128_ps(x, x, 1); // swap lanes
+ const __m256 t1 = _mm256_permutevar_ps(
+ x, _mm256_setr_epi32(I0 % 4, I1 % 4, I2 % 4, I3 % 4, I4 % 4, I5 % 4, I6 % 4, I7 % 4));
+ const __m256 t2 = _mm256_permutevar_ps(
+ sw, _mm256_setr_epi32(I0 % 4, I1 % 4, I2 % 4, I3 % 4, I4 % 4, I5 % 4, I6 % 4, I7 % 4));
+ return _mm256_blend_ps(t1, t2,
+ shuffle_mask<8, I0 / 4, I1 / 4, I2 / 4, I3 / 4, 1 - I4 / 4, 1 - I5 / 4,
+ 1 - I6 / 4, 1 - I7 / 4>::value);
+ }
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3>
+KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x,
+ csizes_t<I0, I1, I2, I3>)
+{
+ // AVX -> AVX
+ if constexpr (cmaxof(csizes<I0, I1>) < 2 && csizes<I0, I1> == csizes<I2, I3>)
+ {
+ const simd<double, 2> tmp =
+ universal_shuffle(simd_t<double, 2>{}, simd_get_low(simd_t<double, 4>{}, x), csizes<I0, I1>);
+ return simd_from_halves(simd_t<double, 4>{}, tmp, tmp);
+ }
+ else if constexpr (cmaxof(csizes<I0, I1>) < 4 && cminof(csizes<I2, I3>) >= 4)
+ {
+ if constexpr (csizes<I0, I1, I2, I3> == csizes<I0, I1, I2 + 2, I3 + 2>)
+ {
+ return _mm256_shuffle_ps(x, x, shuffle_mask<2, I0, I1>::value);
+ }
+ else
+ {
+ return simd_from_halves(
+ simd_t<double, 4>{},
+ universal_shuffle(simd_t<double, 2>{}, simd_get_low(simd_t<double, 4>{}, x),
+ shuffle_mask<2, I0, I1>::value),
+ universal_shuffle(simd_t<double, 2>{}, simd_get_high(simd_t<double, 4>{}, x),
+ shuffle_mask<2, I2, I3>::value));
+ }
+ }
+ else
+ {
+ const __m256d sw = _mm256_permute2f128_pd(x, x, 1); // swap lanes
+ const __m256d t1 = _mm256_permutevar_pd(
+ x, _mm256_setr_epi64x((I0 % 2) << 1, (I1 % 2) << 1, (I2 % 2) << 1, (I3 % 2) << 1));
+ const __m256d t2 = _mm256_permutevar_pd(
+ sw, _mm256_setr_epi64x((I0 % 2) << 1, (I1 % 2) << 1, (I2 % 2) << 1, (I3 % 2) << 1));
+ return _mm256_blend_pd(t1, t2, shuffle_mask<4, I0 / 2, I1 / 2, 1 - I2 / 2, 1 - I3 / 2>::value);
+ }
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3>
+KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 8>, const simd<float, 8>& x,
+ csizes_t<I0, I1, I2, I3>)
+{
+ // AVX -> SSE
+ if constexpr (I0 % 4 == 0 && I1 % 4 == 1 && I2 % 4 == 2 && I3 % 4 == 3)
+ {
+ __m128 t1 = simd_get_low(simd_t<float, 8>{}, x);
+ __m128 t2 = simd_get_high(simd_t<float, 8>{}, x);
+ return _mm_blend_ps(t1, t2, shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value);
+ }
+ else
+ {
+ __m128 t1 = simd_get_low(simd_t<float, 8>{}, x);
+ __m128 t2 = simd_get_high(simd_t<float, 8>{}, x);
+ t1 = _mm_permute_ps(t1, shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value);
+ t2 = _mm_permute_ps(t2, shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value);
+ return _mm_blend_ps(t1, t2, shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value);
+ }
+}
+
+template <size_t I0, size_t I1>
+KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 4>, const simd<double, 4>& x, csizes_t<I0, I1>)
+{
+    // AVX -> SSE: select two doubles (indices I0, I1 into the 4-double source) into one __m128d
+    if constexpr (I0 % 2 == 0 && I1 % 2 == 1) // each slot already in position within its half: one blend suffices
+    {
+        __m128d t1 = simd_get_low(simd_t<double, 4>{}, x);
+        __m128d t2 = simd_get_high(simd_t<double, 4>{}, x);
+        return _mm_blend_pd(t1, t2, shuffle_mask<2, I0 / 2, I1 / 2>::value); // mask bit n set -> slot n taken from the high half
+    }
+    else
+    {
+        __m128d t1 = simd_get_low(simd_t<double, 4>{}, x);
+        __m128d t2 = simd_get_high(simd_t<double, 4>{}, x);
+        t1 = _mm_permute_pd(t1, shuffle_mask<2, I0 % 2, I1 % 2>::value); // pre-permute each half so slot n holds element In % 2
+        t2 = _mm_permute_pd(t2, shuffle_mask<2, I0 % 2, I1 % 2>::value);
+        return _mm_blend_pd(t1, t2, shuffle_mask<2, I0 / 2, I1 / 2>::value); // then pick each slot from low/high half
+    }
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7>
+KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 4>, const simd<float, 4>& x,
+                                              csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>)
+{
+    // SSE -> AVX: widen by shuffling the 4-float source independently into each 128-bit lane
+    return KFR_mm256_setr_m128(_mm_shuffle_ps(x, x, shuffle_mask<8, I0, I1, I2, I3>::value),
+                               _mm_shuffle_ps(x, x, shuffle_mask<8, I4, I5, I6, I7>::value)); // low lane from I0..I3, high lane from I4..I7
+}
+
+template <size_t I0, size_t I1, size_t I2, size_t I3>
+KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x,
+                                               csizes_t<I0, I1, I2, I3>)
+{
+    // SSE -> AVX: widen by shuffling the 2-double source independently into each 128-bit lane
+    return KFR_mm256_setr_m128d(_mm_shuffle_pd(x, x, shuffle_mask<2, I0, I1>::value),
+                                _mm_shuffle_pd(x, x, shuffle_mask<2, I2, I3>::value)); // low lane from I0,I1, high lane from I2,I3
+}
+
+#endif
+
+template <typename T, size_t Nin, size_t... indices, size_t Nout> // NOTE(review): Nout is not deducible from the arguments unless it is defaulted to sizeof...(indices) -- confirm against the full declaration
+KFR_INTRINSIC simd<T, Nout> universal_shuffle(simd_t<T, Nin>, const simd<T, Nin>& x, csizes_t<indices...>) // generic shuffle dispatcher: recursively reduces any Nin->Nout index shuffle to native simd_vec_shuffle calls or the scalar fallback at the bottom
+{
+    using Indices = csizes_t<indices...>;
+
+    constexpr bool floating = typeclass<T> == datatype::f; // NOTE(review): appears unused in this body -- confirm or remove
+
+    constexpr size_t minwidth = minimum_vector_width<T>; // smallest native vector width for T
+    constexpr size_t maxwidth = vector_width<T>; // widest native vector width for T
+    constexpr size_t minindex = cminof(Indices{});
+    constexpr size_t maxindex = cmaxof(csizes<(indices >= Nin ? 0 : indices)...>); // largest in-range index; out-of-range slots counted as 0
+
+    if constexpr (Nin == 1 && Nout == 1) // scalar -> scalar: nothing to do
+    {
+        return x;
+    }
+    else if constexpr (next_poweroftwo(Nin) == next_poweroftwo(Nout) && Indices{} == csizeseq<Nout>) // identity shuffle within the same storage width
+    {
+        return x;
+    }
+    else if constexpr (!is_poweroftwo(Nin) || !is_poweroftwo(Nout))
+    {
+        // Fix if not power of two
+        return universal_shuffle(
+            simd_t<T, next_poweroftwo(Nin)>{}, x,
+            cconcat(Indices{}, csizeseq<next_poweroftwo(Nout) - Nout, index_undefined, 0>)); // pad output with index_undefined slots
+    }
+    else if constexpr (Nout < minwidth)
+    {
+        // Expand indices if less than vector
+        const simd<T, minwidth> tmp = universal_shuffle(
+            simd_t<T, Nin>{}, x, cconcat(Indices{}, csizeseq<minwidth - Nout, index_undefined, 0>));
+
+        if constexpr (Nout == 1)
+        {
+            return simd_to_scalar(simd_t<T, minwidth>{}, tmp);
+        }
+        else
+        {
+            union {
+                simd<T, minwidth> tmp;
+                simd<T, Nout> r;
+            } u{ tmp }; // reinterpret the low Nout lanes of the minwidth result
+            return u.r;
+        }
+    }
+    else if constexpr (Nout > maxwidth) // wider than native: build result from two recursive halves
+    {
+        auto lowi = Indices{}[csizeseq<Nout / 2, 0>];
+        auto highi = Indices{}[csizeseq<Nout / 2, Nout / 2>];
+        if constexpr (lowi == highi) // both halves use identical indices: shuffle once and duplicate
+        {
+            auto tmp = universal_shuffle(simd_t<T, Nin>{}, x, lowi);
+            return { tmp, tmp };
+        }
+        else
+        {
+            return { universal_shuffle(simd_t<T, Nin>{}, x, lowi),
+                     universal_shuffle(simd_t<T, Nin>{}, x, highi) };
+        }
+    }
+    else if constexpr (minindex >= Nin) // every index out of range: result is all zeros
+    {
+        return simd_from_zero(simd_t<T, Nout>{});
+    }
+    else if constexpr (Nin == 1) // single source element: broadcast it
+    {
+        return simd_from_broadcast(simd_t<T, Nout>{}, x);
+    }
+    else if constexpr (Nin < minwidth) // widen the input to minimum native width first
+    {
+        return universal_shuffle(simd_t<T, minwidth>{}, simd_from_partial(simd2_t<T, minwidth, Nin>{}, x),
+                                 Indices{});
+    }
+    else if constexpr (Nin > Nout && maxindex < Nin / 2) // only the low half of x is referenced
+    {
+        return universal_shuffle(simd_t<T, Nin / 2>{}, simd_get_low(simd_t<T, Nin>{}, x), Indices{});
+    }
+    else if constexpr (Nin > Nout && minindex >= Nin / 2) // only the high half of x is referenced
+    {
+        return universal_shuffle(simd_t<T, Nin / 2>{}, simd_get_high(simd_t<T, Nin>{}, x),
+                                 csizes<(indices < Nin ? indices - csize<Nin / 2> : indices)...>); // rebase in-range indices; keep undefined ones as-is
+    }
+    else if constexpr (Nin >= minwidth && Nin <= maxwidth && Nout >= minwidth && Nout <= maxwidth) // native widths: dispatch to the intrinsic-specific overloads
+    {
+        return simd_vec_shuffle(simd_t<T, Nin>{}, x, Indices{});
+    }
+    else
+    {
+        not_optimized(CMT_FUNC_SIGNATURE); // reports the fallback when KFR_SHOW_NOT_OPTIMIZED is defined
+        const simd_array<T, Nin> xx = to_simd_array<T, Nin>(x);
+        constexpr static unsigned indices_array[] = { static_cast<unsigned>(indices)... };
+        return from_simd_array<T, Nout>(simd_shuffle_generic<T, Nout, Nin>(xx, indices_array));
+    }
+}
+
} // namespace intrinsics
} // namespace CMT_ARCH_NAME
} // namespace kfr
diff --git a/include/kfr/simd/impl/read_write.hpp b/include/kfr/simd/impl/read_write.hpp
@@ -147,8 +147,12 @@ KFR_INTRINSIC void write(cunaligned_t, T* ptr, const vec<T, 1>& x)
}
KFR_INTRINSIC void write(cunaligned_t, f32* ptr, const f32x2& x)
{
+#ifndef KFR_f32x2_array
+ *reinterpret_cast<f64*>(ptr) = x.v.whole;
+#else
ptr[0] = x.v.low;
ptr[1] = x.v.high;
+#endif
}
KFR_INTRINSIC void write(cunaligned_t, u8* ptr, const u8x2& x) { *reinterpret_cast<u16*>(ptr) = x.v.whole; }
diff --git a/include/kfr/simd/platform.hpp b/include/kfr/simd/platform.hpp
@@ -260,6 +260,11 @@ constexpr static size_t vector_width =
(const_max(size_t(1), typeclass<T> == datatype::f ? platform<>::native_float_vector_size / sizeof(T)
: platform<>::native_int_vector_size / sizeof(T)));
+template <typename T, cpu_t cpu>
+constexpr static size_t vector_width_for =
+ (const_max(size_t(1), typeclass<T> == datatype::f ? platform<cpu>::native_float_vector_size / sizeof(T)
+ : platform<cpu>::native_int_vector_size / sizeof(T)));
+
template <typename T>
constexpr static size_t minimum_vector_width =
(const_max(size_t(1), typeclass<T> == datatype::f ? platform<>::minimum_float_vector_size / sizeof(T)
diff --git a/include/kfr/simd/vec.hpp b/include/kfr/simd/vec.hpp
@@ -207,7 +207,7 @@ struct alignas(force_compiletime_size_t<
// from SIMD
KFR_MEM_INTRINSIC vec(const simd_type& simd) CMT_NOEXCEPT : v(simd) {}
// default
- KFR_MEM_INTRINSIC constexpr vec() CMT_NOEXCEPT = default;
+ KFR_MEM_INTRINSIC vec() CMT_NOEXCEPT {}
// copy
KFR_MEM_INTRINSIC constexpr vec(const vec& value) CMT_NOEXCEPT = default;
// move
@@ -345,6 +345,12 @@ struct alignas(force_compiletime_size_t<
intrinsics::simd_t<unwrap_bit<ST>, SN>{}, v, csizeseq<SW, SW * index>, overload_auto));
}
+ template <size_t index>
+ KFR_MEM_INTRINSIC constexpr value_type get() const CMT_NOEXCEPT
+ {
+ return this->get(csize_t<index>{});
+ }
+
template <int dummy = 0, KFR_ENABLE_IF(dummy == 0 && compound_type_traits<T>::is_scalar)>
KFR_MEM_INTRINSIC constexpr void set(size_t index, const value_type& s) CMT_NOEXCEPT
{
@@ -1318,3 +1324,19 @@ struct flt_type_impl<kfr::vec<T, N>>
CMT_PRAGMA_GNU(GCC diagnostic pop)
CMT_PRAGMA_MSVC(warning(pop))
+
+namespace std
+{
+
+template <typename T, size_t N> // structured-binding support for kfr::vec: enables `auto [x, y, ...] = v`
+struct tuple_size<kfr::vec<T, N>> : public integral_constant<size_t, N> // `struct` matches the std declaration (avoids -Wmismatched-tags / MSVC C4099)
+{
+};
+
+template <size_t I, class T, size_t N>
+struct tuple_element<I, kfr::vec<T, N>> // every element of vec<T, N> has type T
+{
+    using type = T;
+};
+
+} // namespace std
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -33,13 +33,12 @@ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_SOURCE_DIR}/tests/cmake/")
if (ENABLE_ASMTEST)
add_executable(asm_test asm_test.cpp)
target_link_libraries(asm_test kfr)
- target_set_arch(asm_test PRIVATE avx)
+ target_set_arch(asm_test PRIVATE avx2)
target_compile_definitions(asm_test PRIVATE KFR_SHOW_NOT_OPTIMIZED)
target_compile_definitions(asm_test PRIVATE KFR_FUNCTION_IS_INTRINSIC)
if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU")
target_compile_options(asm_test PRIVATE -fno-stack-protector)
endif ()
- message("CMAKE_CXX_COMPILER_ID = ${CMAKE_CXX_COMPILER_ID}")
if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
target_compile_options(asm_test PRIVATE -GS-)
target_compile_options(asm_test PRIVATE -Gs16384)
diff --git a/tests/asm_test.cpp b/tests/asm_test.cpp
@@ -7,6 +7,7 @@
#define KFR_EXTENDED_TESTS
#include <kfr/base.hpp>
+#include <kfr/dft/impl/fft-impl.hpp>
#include <kfr/io.hpp>
#include <kfr/testo/console_colors.hpp>
@@ -119,6 +120,13 @@ using namespace kfr;
{ \
r = kfr::fn(x, y); \
}
+#define TEST_ASM_QUAD4(fn, ty, n) \
+ KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 4>& r, const vec<ty, n>& x, \
+ const vec<ty, n>& y, const vec<ty, n>& z, \
+ const vec<ty, n>& w) \
+ { \
+ r = kfr::fn(x, y, z, w); \
+ }
#define TEST_ASM_DOUBLE1(fn, ty, n) \
KFR_PUBLIC void asm__test__##fn##__##ty##__##n(vec<ty, n * 2>& r, const vec<ty, n>& x) { r = kfr::fn(x); }
@@ -167,6 +175,8 @@ using namespace kfr;
#define TEST_ASM_IF(fn, MACRO) TEST_ASM_I(fn, MACRO) TEST_ASM_F(fn, MACRO)
+#if 1
+
TEST_ASM_UIF(add, TEST_ASM_VTY2)
TEST_ASM_UIF(sub, TEST_ASM_VTY2)
@@ -207,6 +217,14 @@ TEST_ASM_UIF(high, TEST_ASM_HALF1)
TEST_ASM_UIF(concat, TEST_ASM_DOUBLE2)
+template <typename... Args>
+KFR_INTRINSIC decltype(auto) concat4(const Args&... args)
+{
+ return concat(args...);
+}
+
+TEST_ASM_UIF(concat4, TEST_ASM_QUAD4)
+
TEST_ASM_UIF(shl, TEST_ASM_SHIFT)
TEST_ASM_UIF(shr, TEST_ASM_SHIFT)
@@ -217,10 +235,6 @@ TEST_ASM_UIF(shr, TEST_ASM_SHIFT_SCALAR)
TEST_ASM_UIF(duphalfs, TEST_ASM_DOUBLE1)
-TEST_ASM_F(sin, TEST_ASM_VTY1_F)
-
-TEST_ASM_F(cos, TEST_ASM_VTY1_F)
-
TEST_ASM_UIF(sqr, TEST_ASM_VTY1)
TEST_ASM_UIF(make_vector, TEST_ASM_MAKE_VECTOR)
@@ -231,11 +245,199 @@ TEST_ASM_UIF(read, TEST_READ)
TEST_ASM_UIF(write, TEST_WRITE)
+#define TEST_FFT_SPEC(ty, size) \
+ static intrinsics::fft_specialization<ty, size> fft__##ty##__##size(static_cast<size_t>(1 << size)); \
+ KFR_PUBLIC void asm__test__fft__##ty##__##size(complex<ty>* out, const complex<ty>* in, u8* temp) \
+ { \
+ fft__##ty##__##size.do_execute<false>(out, in, temp); \
+ } \
+ KFR_PUBLIC void asm__test__ifft__##ty##__##size(complex<ty>* out, const complex<ty>* in, u8* temp) \
+ { \
+ fft__##ty##__##size.do_execute<true>(out, in, temp); \
+ }
+#define TEST_FFT_GEN(ty) /* generic FFT stage at size 65536 (2^16); exports fwd/inv entry points for disassembly */ \
+    static intrinsics::fft_stage_impl<ty, true, true> fft__##ty##__gen(static_cast<size_t>(65536));      \
+    KFR_PUBLIC void asm__test__fft__##ty##__gen(complex<ty>* out, const complex<ty>* in, u8* temp)       \
+    {                                                                                                    \
+        fft__##ty##__gen.do_execute<false>(out, in, temp);                                               \
+    }                                                                                                    \
+    KFR_PUBLIC void asm__test__ifft__##ty##__gen(complex<ty>* out, const complex<ty>* in, u8* temp)      \
+    {                                                                                                    \
+        fft__##ty##__gen.do_execute<true>(out, in, temp);                                                \
+    }
+
+TEST_FFT_SPEC(f32, 1)
+TEST_FFT_SPEC(f32, 2)
+TEST_FFT_SPEC(f32, 3)
+TEST_FFT_SPEC(f32, 4)
+TEST_FFT_SPEC(f64, 1)
+TEST_FFT_SPEC(f64, 2)
+TEST_FFT_SPEC(f64, 3)
+TEST_FFT_SPEC(f64, 4)
+
+TEST_FFT_GEN(f32)
+TEST_FFT_GEN(f64)
+
+#endif
+
+TEST_ASM_F(sin, TEST_ASM_VTY1_F)
+
+TEST_ASM_F(cos, TEST_ASM_VTY1_F)
+
namespace kfr
{
+
#ifdef KFR_SHOW_NOT_OPTIMIZED
KFR_PUBLIC void not_optimized(const char* fn) CMT_NOEXCEPT { puts(fn); }
#endif
} // namespace kfr
+KFR_PUBLIC void test_shuffle_old1(f32x1& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<2>, overload_auto);
+}
+
+KFR_PUBLIC void test_shuffle_old2(f32x4& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<3, 2, 1, 0>,
+ overload_auto);
+}
+
+KFR_PUBLIC void test_shuffle_old3(f32x4& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<0, 1, 2, 3>,
+ overload_auto);
+}
+
+KFR_PUBLIC void test_shuffle_old4(f32x2& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<2, 3>, overload_auto);
+}
+
+KFR_PUBLIC void test_shuffle_old5(f32x8& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v,
+ csizes<3, 2, 1, 0, 0, 1, 2, 3>, overload_auto);
+}
+
+KFR_PUBLIC void test_shuffle_old6(f32x8& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v,
+ csizes<7, 6, 5, 4, 3, 2, 1, 0>, overload_auto);
+}
+
+KFR_PUBLIC void test_shuffle_old9(vec<f32, 3>& x, const vec<f32, 15>& y)
+{
+ x.v = kfr::intrinsics::simd_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<3, 2, 1>,
+ overload_auto);
+}
+
+KFR_PUBLIC void test_shuffle_new1(f32x1& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<2>);
+}
+
+KFR_PUBLIC void test_shuffle_new2(f32x4& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<3, 2, 1, 0>);
+}
+
+KFR_PUBLIC void test_shuffle_new3(f32x4& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<0, 1, 2, 3>);
+}
+
+KFR_PUBLIC void test_shuffle_new4(f32x2& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v, csizes<2, 3>);
+}
+
+KFR_PUBLIC void test_shuffle_new5(f32x8& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v,
+ csizes<3, 2, 1, 0, 0, 1, 2, 3>);
+}
+
+KFR_PUBLIC void test_shuffle_new6(f32x8& x, const f32x4& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 4>{}, y.v,
+ csizes<7, 6, 5, 4, 3, 2, 1, 0>);
+}
+
+KFR_PUBLIC void test_shuffle_new7(f32x1& x, const f32x32& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 32>{}, (y + 1.f).v, csizes<19>);
+}
+
+KFR_PUBLIC void test_shuffle_new8(f32x8& x, const f32x8& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 8>{}, y.v,
+ csizes<3, 2, 1, 0, 3, 2, 1, 0>);
+}
+
+KFR_PUBLIC void test_shuffle_new9(vec<f32, 3>& x, const vec<f32, 15>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<3, 2, 1>);
+}
+
+KFR_PUBLIC void test_shuffle_new9a(vec<f32, 3>& x, const vec<f32, 15>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<5, 6, 7>);
+}
+
+KFR_PUBLIC void test_shuffle_new9b(vec<f32, 3>& x, const vec<f32, 15>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<11, 11, 11>);
+}
+
+KFR_PUBLIC void test_shuffle_new9c(vec<f32, 3>& x, const vec<f32, 15>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 15>{}, y.v, csizes<3, 4, 5>);
+}
+
+KFR_PUBLIC void test_shuffle_new10(vec<f32, 15>& x)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 1>{}, 0.f,
+ csizes<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>);
+}
+KFR_PUBLIC void test_shuffle_new11(vec<f32, 15>& x, float y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 1>{}, y,
+ csizes<0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0>);
+}
+KFR_PUBLIC void test_shuffle_new12(vec<f32, 32>& x, const vec<f32, 32>& y)
+{
+ x.v =
+ kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 32>{}, y.v, csizeseq<32> ^ csize<1>);
+}
+KFR_PUBLIC void test_shuffle_new13(vec<f32, 8>& x, const vec<f32, 8>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 8>{}, y.v,
+ csizes<0, 2, 4, 6, 1, 3, 5, 7>);
+}
+KFR_PUBLIC void test_shuffle_new14(vec<f32, 8>& x, const vec<f32, 8>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 8>{}, y.v,
+ csizes<0, 4, 1, 5, 2, 6, 3, 7>);
+}
+KFR_PUBLIC void test_shuffle_new15(vec<f32, 4>& x, const vec<f32, 8>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 8>{}, y.v, csizes<0, 5, 2, 7>);
+}
+KFR_PUBLIC void test_shuffle_new16(vec<f32, 2>& x, const vec<f32, 2>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 2>{}, y.v, csizes<1, 0>);
+}
+KFR_PUBLIC void test_shuffle_new17(vec<f32, 16>& x, const vec<f32, 16>& y)
+{
+ x.v = kfr::intrinsics::universal_shuffle(kfr::intrinsics::simd_t<f32, 16>{}, y.v,
+ csizes<0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15>);
+}
+
+KFR_PUBLIC float tuple_assign() // exercises structured bindings on vec via the std::tuple_size/tuple_element specializations
+{
+    auto [x, y, z, w] = f32x4(1.f, 2.f, 3.f, 4.f);
+    return x + y * y + z * z * z + w * w * w * w; // 1 + 4 + 27 + 256 = 288
+}
+
int main() { println(library_version()); }