kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit bd8ccf749266b671ed17b63d9113abf3599b7cde
parent d56223d3102b96fca1cf99cefa0455dbc6f7317b
Author: [email protected] <[email protected]>
Date:   Mon, 18 Jul 2022 18:09:02 +0100

Fix for recent GCC

Diffstat:
Minclude/kfr/simd/impl/backend_generic.hpp | 134+++++++++++++++++++++++++++++++++++++++++++++++++++----------------------------
1 file changed, 86 insertions(+), 48 deletions(-)

diff --git a/include/kfr/simd/impl/backend_generic.hpp b/include/kfr/simd/impl/backend_generic.hpp @@ -137,7 +137,8 @@ struct simd_small_array<f32, 2, f64> #ifdef CMT_ARCH_SSE2 whole = _mm_cvtsd_f64(_mm_castps_pd(_mm_setr_ps(x, y, x, y))); #else - union { + union + { struct { f32 x; @@ -150,7 +151,8 @@ struct simd_small_array<f32, 2, f64> whole = u.r; #endif #else - union { + union + { struct { f32 x; @@ -650,7 +652,7 @@ KFR_INTRINSIC simd<float, 8> simd_shuffle(simd_t<float, 16>, const simd<float, 1 { const __m256 t1 = _mm256_permute2f128_ps(x.low, x.high, (0 << 0) | (2 << 4)); const __m256 t2 = _mm256_permute2f128_ps(x.low, x.high, (1 << 0) | (3 << 4)); - return _mm256_shuffle_ps(t1, t2, shuffle_mask<8, 2, 3, 2, 3>::value); + return _mm256_shuffle_ps(t1, t2, (shuffle_mask<8, 2, 3, 2, 3>::value)); } KFR_INTRINSIC simd<float, 8> simd_shuffle(simd_t<float, 16>, const simd<float, 16>& x, @@ -658,7 +660,7 @@ KFR_INTRINSIC simd<float, 8> simd_shuffle(simd_t<float, 16>, const simd<float, 1 { const __m256 t1 = _mm256_permute2f128_ps(x.low, x.high, (0 << 0) | (2 << 4)); const __m256 t2 = _mm256_permute2f128_ps(x.low, x.high, (1 << 0) | (3 << 4)); - return _mm256_shuffle_ps(t1, t2, shuffle_mask<8, 0, 1, 0, 1>::value); + return _mm256_shuffle_ps(t1, t2, (shuffle_mask<8, 0, 1, 0, 1>::value)); } #endif @@ -1282,9 +1284,9 @@ simd_array<T, Nout> simd_shuffle2_generic(const simd_array<T, N1>& x, const simd for (size_t i = 0; i < Nout; ++i) { const size_t index = indices[i]; - result.val[i] = index >= N1 + N2 - ? T() - : index >= N1 ? static_cast<T>(y.val[index - N1]) : static_cast<T>(x.val[index]); + result.val[i] = index >= N1 + N2 ? T() + : index >= N1 ? static_cast<T>(y.val[index - N1]) + : static_cast<T>(x.val[index]); } return result; } @@ -1318,8 +1320,8 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N, N>, const simd<T, N>& x, #else return from_simd_array<T, Nout>( { (indices >= N * 2 ? T() - : indices >= N ? static_cast<T>(to_simd_array<T, N>(y).val[indices - N]) - : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... }); + : indices >= N ? static_cast<T>(to_simd_array<T, N>(y).val[indices - N]) + : static_cast<T>(to_simd_array<T, N>(x).val[indices]))... }); #endif } @@ -1339,8 +1341,8 @@ KFR_INTRINSIC simd<T, Nout> simd_shuffle(simd2_t<T, N1, N2>, const simd<T, N1>& return from_simd_array<T, Nout>( { (indices > N1 + N2 ? T() - : indices >= N1 ? static_cast<T>(to_simd_array<T, N2>(y).val[indices - N1]) - : static_cast<T>(to_simd_array<T, N1>(x).val[indices]))... }); + : indices >= N1 ? static_cast<T>(to_simd_array<T, N2>(y).val[indices - N1]) + : static_cast<T>(to_simd_array<T, N1>(x).val[indices]))... }); #endif } @@ -1451,14 +1453,16 @@ template <typename T, size_t Nout, size_t Nin> KFR_INTRINSIC simd<T, Nout> simd_from_partial(simd2_t<T, Nout, Nin>, const simd<T, Nin>& x) { #ifdef _MSC_VER - union { + union + { simd<T, Nin> in; simd<T, Nout> out; } u; u.in = x; return u.out; #else - union { + union + { simd<T, Nin> in; simd<T, Nout> out; } u{ x }; @@ -1521,19 +1525,23 @@ KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 4>, const simd<float csizes_t<I0, I1, I2, I3>) { // SSE -> SSE - constexpr size_t mask = shuffle_mask<8, I0, I1, I2, I3>::value; - return _mm_shuffle_ps(x, x, mask); + return _mm_shuffle_ps(x, x, (shuffle_mask<8, I0, I1, I2, I3>::value)); } template <size_t I0, size_t I1> KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 2>, const simd<double, 2>& x, csizes_t<I0, I1>) { // SSE -> SSE - constexpr size_t mask = shuffle_mask<2, I0, I1>::value; - return _mm_shuffle_pd(x, x, mask); + return _mm_shuffle_pd(x, x, (shuffle_mask<2, I0, I1>::value)); } #endif +template <uint8_t max> +KFR_INTRINSIC constexpr uint8_t vec_idx(size_t value) +{ + return value >= max ? 0 : value; +} + #ifdef CMT_ARCH_AVX512 template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8, @@ -1544,7 +1552,11 @@ KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( { // AVX512 -> AVX512 return _mm512_permutexvar_ps( - _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), x); + _mm512_setr_epi32(vec_idx<16>(I0), vec_idx<16>(I1), vec_idx<16>(I2), vec_idx<16>(I3), vec_idx<16>(I4), + vec_idx<16>(I5), vec_idx<16>(I6), vec_idx<16>(I7), vec_idx<16>(I8), vec_idx<16>(I9), + vec_idx<16>(I10), vec_idx<16>(I11), vec_idx<16>(I12), vec_idx<16>(I13), + vec_idx<16>(I14), vec_idx<16>(I15)), + x); } template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> @@ -1552,7 +1564,10 @@ KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 8>, const simd<dou csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // AVX512 -> AVX512 - return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7), x); + return _mm512_permutexvar_pd(_mm512_setr_epi64(vec_idx<8>(I0), vec_idx<8>(I1), vec_idx<8>(I2), + vec_idx<8>(I3), vec_idx<8>(I4), vec_idx<8>(I5), + vec_idx<8>(I6), vec_idx<8>(I7)), + x); } template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7> @@ -1561,7 +1576,11 @@ KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 16>, const simd<floa { // AVX512 -> AVX return _mm512_castps512_ps256(_mm512_permutexvar_ps( - _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I0, I1, I2, I3, I4, I5, I6, I7), x)); + _mm512_setr_epi32(vec_idx<16>(I0), vec_idx<16>(I1), vec_idx<16>(I2), vec_idx<16>(I3), vec_idx<16>(I4), + vec_idx<16>(I5), vec_idx<16>(I6), vec_idx<16>(I7), vec_idx<16>(I0), vec_idx<16>(I1), + vec_idx<16>(I2), vec_idx<16>(I3), vec_idx<16>(I4), vec_idx<16>(I5), vec_idx<16>(I6), + vec_idx<16>(I7)), + x)); } template <size_t I0, size_t I1, size_t I2, size_t I3> @@ -1570,7 +1589,11 @@ KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 16>, const simd<floa { // AVX512 -> SSE return _mm512_castps512_ps128(_mm512_permutexvar_ps( - _mm512_setr_epi32(I0, I1, I2, I3, I0, I1, I2, I3, I0, I1, I2, I3, I0, I1, I2, I3), x)); + _mm512_setr_epi32(vec_idx<16>(I0), vec_idx<16>(I1), vec_idx<16>(I2), vec_idx<16>(I3), vec_idx<16>(I0), + vec_idx<16>(I1), vec_idx<16>(I2), vec_idx<16>(I3), vec_idx<16>(I0), vec_idx<16>(I1), + vec_idx<16>(I2), vec_idx<16>(I3), vec_idx<16>(I0), vec_idx<16>(I1), vec_idx<16>(I2), + vec_idx<16>(I3)), + x)); } template <size_t I0, size_t I1, size_t I2, size_t I3> @@ -1578,16 +1601,20 @@ KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 8>, const simd<dou csizes_t<I0, I1, I2, I3>) { // AVX512 -> AVX - return _mm512_castpd512_pd256( - _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I0, I1, I2, I3), x)); + return _mm512_castpd512_pd256(_mm512_permutexvar_pd( + _mm512_setr_epi64(vec_idx<8>(I0), vec_idx<8>(I1), vec_idx<8>(I2), vec_idx<8>(I3), vec_idx<8>(I0), + vec_idx<8>(I1), vec_idx<8>(I2), vec_idx<8>(I3)), + x)); } template <size_t I0, size_t I1> KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 8>, const simd<double, 8>& x, csizes_t<I0, I1>) { // AVX512 -> SSE - return _mm512_castpd512_pd128( - _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I0, I1, I0, I1, I0, I1), x)); + return _mm512_castpd512_pd128(_mm512_permutexvar_pd( + _mm512_setr_epi64(vec_idx<8>(I0), vec_idx<8>(I1), vec_idx<8>(I0), vec_idx<8>(I1), vec_idx<8>(I0), + vec_idx<8>(I1), vec_idx<8>(I0), vec_idx<8>(I1)), + x)); } template <size_t I0, size_t I1, size_t I2, size_t I3, size_t I4, size_t I5, size_t I6, size_t I7, size_t I8, @@ -1598,7 +1625,10 @@ KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( { // AVX -> AVX512 return _mm512_permutexvar_ps( - _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), + _mm512_setr_epi32(vec_idx<8>(I0), vec_idx<8>(I1), vec_idx<8>(I2), vec_idx<8>(I3), vec_idx<8>(I4), + vec_idx<8>(I5), vec_idx<8>(I6), vec_idx<8>(I7), vec_idx<8>(I8), vec_idx<8>(I9), + vec_idx<8>(I10), vec_idx<8>(I11), vec_idx<8>(I12), vec_idx<8>(I13), vec_idx<8>(I14), + vec_idx<8>(I15)), _mm512_castps256_ps512(x)); } @@ -1610,7 +1640,10 @@ KFR_INTRINSIC simd<float, 16> simd_vec_shuffle( { // SSE -> AVX512 return _mm512_permutexvar_ps( - _mm512_setr_epi32(I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15), + _mm512_setr_epi32(vec_idx<4>(I0), vec_idx<4>(I1), vec_idx<4>(I2), vec_idx<4>(I3), vec_idx<4>(I4), + vec_idx<4>(I5), vec_idx<4>(I6), vec_idx<4>(I7), vec_idx<4>(I8), vec_idx<4>(I9), + vec_idx<4>(I10), vec_idx<4>(I11), vec_idx<4>(I12), vec_idx<4>(I13), vec_idx<4>(I14), + vec_idx<4>(I15)), _mm512_castps128_ps512(x)); } @@ -1619,7 +1652,9 @@ KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 4>, const simd<dou csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // AVX -> AVX512 - return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7), + return _mm512_permutexvar_pd(_mm512_setr_epi64(vec_idx<4>(I0), vec_idx<4>(I1), vec_idx<4>(I2), + vec_idx<4>(I3), vec_idx<4>(I4), vec_idx<4>(I5), + vec_idx<4>(I6), vec_idx<4>(I7)), _mm512_castpd256_pd512(x)); } @@ -1628,7 +1663,9 @@ KFR_INTRINSIC simd<double, 8> simd_vec_shuffle(simd_t<double, 2>, const simd<dou csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // SSE -> AVX512 - return _mm512_permutexvar_pd(_mm512_setr_epi64(I0, I1, I2, I3, I4, I5, I6, I7), + return _mm512_permutexvar_pd(_mm512_setr_epi64(vec_idx<2>(I0), vec_idx<2>(I1), vec_idx<2>(I2), + vec_idx<2>(I3), vec_idx<2>(I4), vec_idx<2>(I5), + vec_idx<2>(I6), vec_idx<2>(I7)), _mm512_castpd128_pd512(x)); } @@ -1650,9 +1687,9 @@ KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 8>, const simd<float else if constexpr (cmaxof(csizes<I0, I1, I2, I3>) < 4 && cminof(csizes<I4, I5, I6, I7>) >= 4) { if constexpr (csizes<I0, I1, I2, I3, I4, I5, I6, I7>.equal( - csizes<I0, I1, I2, I3, I0 + 4, I1 + 4, I2 + 4, I3 + 4>)) + csizes<I0, I1, I2, I3, I0 + 4, I1 + 4, I2 + 4, I3 + 4>)) { - return _mm256_shuffle_ps(x, x, shuffle_mask<8, I0, I1, I2, I3>::value); + return _mm256_shuffle_ps(x, x, (shuffle_mask<8, I0, I1, I2, I3>::value)); } else { @@ -1672,8 +1709,8 @@ KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 8>, const simd<float const __m256 t2 = _mm256_permutevar_ps( sw, _mm256_setr_epi32(I0 % 4, I1 % 4, I2 % 4, I3 % 4, I4 % 4, I5 % 4, I6 % 4, I7 % 4)); return _mm256_blend_ps(t1, t2, - shuffle_mask<8, I0 / 4, I1 / 4, I2 / 4, I3 / 4, 1 - I4 / 4, 1 - I5 / 4, - 1 - I6 / 4, 1 - I7 / 4>::value); + (shuffle_mask<8, I0 / 4, I1 / 4, I2 / 4, I3 / 4, 1 - I4 / 4, 1 - I5 / 4, + 1 - I6 / 4, 1 - I7 / 4>::value)); } } @@ -1692,7 +1729,7 @@ KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 4>, const simd<dou { if constexpr (csizes<I0, I1, I2, I3>.equal(csizes<I0, I1, I2 + 2, I3 + 2>)) { - return _mm256_shuffle_ps(x, x, shuffle_mask<2, I0, I1>::value); + return _mm256_shuffle_pd(x, x, (shuffle_mask<2, I0, I1>::value)); } else { @@ -1711,7 +1748,7 @@ KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 4>, const simd<dou x, _mm256_setr_epi64x((I0 % 2) << 1, (I1 % 2) << 1, (I2 % 2) << 1, (I3 % 2) << 1)); const __m256d t2 = _mm256_permutevar_pd( sw, _mm256_setr_epi64x((I0 % 2) << 1, (I1 % 2) << 1, (I2 % 2) << 1, (I3 % 2) << 1)); - return _mm256_blend_pd(t1, t2, shuffle_mask<4, I0 / 2, I1 / 2, 1 - I2 / 2, 1 - I3 / 2>::value); + return _mm256_blend_pd(t1, t2, (shuffle_mask<4, I0 / 2, I1 / 2, 1 - I2 / 2, 1 - I3 / 2>::value)); } } @@ -1724,15 +1761,15 @@ KFR_INTRINSIC simd<float, 4> simd_vec_shuffle(simd_t<float, 8>, const simd<float { __m128 t1 = simd_get_low(simd_t<float, 8>{}, x); __m128 t2 = simd_get_high(simd_t<float, 8>{}, x); - return _mm_blend_ps(t1, t2, shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value); + return _mm_blend_ps(t1, t2, (shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value)); } else { __m128 t1 = simd_get_low(simd_t<float, 8>{}, x); __m128 t2 = simd_get_high(simd_t<float, 8>{}, x); - t1 = _mm_permute_ps(t1, shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value); - t2 = _mm_permute_ps(t2, shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value); - return _mm_blend_ps(t1, t2, shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value); + t1 = _mm_permute_ps(t1, (shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value)); + t2 = _mm_permute_ps(t2, (shuffle_mask<8, I0 % 4, I1 % 4, I2 % 4, I3 % 4>::value)); + return _mm_blend_ps(t1, t2, (shuffle_mask<4, I0 / 4, I1 / 4, I2 / 4, I3 / 4>::value)); } } @@ -1744,15 +1781,15 @@ KFR_INTRINSIC simd<double, 2> simd_vec_shuffle(simd_t<double, 4>, const simd<dou { __m128d t1 = simd_get_low(simd_t<double, 4>{}, x); __m128d t2 = simd_get_high(simd_t<double, 4>{}, x); - return _mm_blend_pd(t1, t2, shuffle_mask<2, I0 / 2, I1 / 2>::value); + return _mm_blend_pd(t1, t2, (shuffle_mask<2, I0 / 2, I1 / 2>::value)); } else { __m128d t1 = simd_get_low(simd_t<double, 4>{}, x); __m128d t2 = simd_get_high(simd_t<double, 4>{}, x); - t1 = _mm_permute_pd(t1, shuffle_mask<2, I0 % 2, I1 % 2>::value); - t2 = _mm_permute_pd(t2, shuffle_mask<2, I0 % 2, I1 % 2>::value); - return _mm_blend_pd(t1, t2, shuffle_mask<2, I0 / 2, I1 / 2>::value); + t1 = _mm_permute_pd(t1, (shuffle_mask<2, I0 % 2, I1 % 2>::value)); + t2 = _mm_permute_pd(t2, (shuffle_mask<2, I0 % 2, I1 % 2>::value)); + return _mm_blend_pd(t1, t2, (shuffle_mask<2, I0 / 2, I1 / 2>::value)); } } @@ -1761,8 +1798,8 @@ KFR_INTRINSIC simd<float, 8> simd_vec_shuffle(simd_t<float, 4>, const simd<float csizes_t<I0, I1, I2, I3, I4, I5, I6, I7>) { // SSE -> AVX - return KFR_mm256_setr_m128(_mm_shuffle_ps(x, x, shuffle_mask<8, I0, I1, I2, I3>::value), - _mm_shuffle_ps(x, x, shuffle_mask<8, I4, I5, I6, I7>::value)); + return KFR_mm256_setr_m128(_mm_shuffle_ps(x, x, (shuffle_mask<8, I0, I1, I2, I3>::value)), + _mm_shuffle_ps(x, x, (shuffle_mask<8, I4, I5, I6, I7>::value))); } template <size_t I0, size_t I1, size_t I2, size_t I3> @@ -1770,8 +1807,8 @@ KFR_INTRINSIC simd<double, 4> simd_vec_shuffle(simd_t<double, 2>, const simd<dou csizes_t<I0, I1, I2, I3>) { // SSE -> AVX - return KFR_mm256_setr_m128d(_mm_shuffle_pd(x, x, shuffle_mask<2, I0, I1>::value), - _mm_shuffle_pd(x, x, shuffle_mask<2, I2, I3>::value)); + return KFR_mm256_setr_m128d(_mm_shuffle_pd(x, x, (shuffle_mask<2, I0, I1>::value)), + _mm_shuffle_pd(x, x, (shuffle_mask<2, I2, I3>::value))); } #endif @@ -1815,7 +1852,8 @@ KFR_INTRINSIC simd<T, Nout> universal_shuffle(simd_t<T, Nin>, const simd<T, Nin> } else { - union { + union + { simd<T, minwidth> tmp; simd<T, Nout> r; } u{ tmp };