kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

ft.hpp (69618B)


      1 /** @addtogroup dft
      2  *  @{
      3  */
      4 /*
      5   Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com)
      6   This file is part of KFR
      7 
      8   KFR is free software: you can redistribute it and/or modify
      9   it under the terms of the GNU General Public License as published by
     10   the Free Software Foundation, either version 2 of the License, or
     11   (at your option) any later version.
     12 
     13   KFR is distributed in the hope that it will be useful,
     14   but WITHOUT ANY WARRANTY; without even the implied warranty of
     15   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
     16   GNU General Public License for more details.
     17 
     18   You should have received a copy of the GNU General Public License
     19   along with KFR.
     20 
     21   If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
     22   Buying a commercial license is mandatory as soon as you develop commercial activities without
     23   disclosing the source code of your own applications.
     24   See https://www.kfrlib.com for details.
     25  */
     26 #pragma once
     27 
     28 #include <kfr/base/univector.hpp>
     29 #include <kfr/math/sin_cos.hpp>
     30 #include <kfr/simd/complex.hpp>
     31 #include <kfr/simd/constants.hpp>
     32 #include <kfr/simd/digitreverse.hpp>
     33 #include <kfr/simd/read_write.hpp>
     34 #include <kfr/simd/vec.hpp>
     35 
     36 #include <kfr/base/memory.hpp>
     37 #include "data/sincos.hpp"
     38 
     39 CMT_PRAGMA_GNU(GCC diagnostic push)
     40 #if CMT_HAS_WARNING("-Wpass-failed")
     41 CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed")
     42 #endif
     43 
     44 CMT_PRAGMA_MSVC(warning(push))
     45 CMT_PRAGMA_MSVC(warning(disable : 4127))
     46 
     47 namespace kfr
     48 {
     49 inline namespace CMT_ARCH_NAME
     50 {
     51 
     52 template <typename T, size_t N>
     53 using cvec = vec<T, N * 2>;
     54 
     55 namespace intrinsics
     56 {
     57 
     58 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
     59 KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y)
     60 {
     61     return subadd(x * dupeven(y), swap<2>(x) * dupodd(y));
     62 }
     63 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
     64 KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y)
     65 {
     66     vec<T, N> yy = resize<N>(y);
     67     return cmul_impl(x, yy);
     68 }
     69 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
     70 KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y)
     71 {
     72     vec<T, N> xx = resize<N>(x);
     73     return cmul_impl(xx, y);
     74 }
     75 
     76 /// Complex Multiplication
     77 template <typename T, size_t N1, size_t N2>
     78 KFR_INTRINSIC vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y)
     79 {
     80     return intrinsics::cmul_impl(x, y);
     81 }
     82 
     83 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
     84 KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y)
     85 {
     86     return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y)));
     87 }
     88 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
     89 KFR_INTRINSIC vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw)
     90 {
     91     return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw);
     92 }
     93 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)>
     94 KFR_INTRINSIC void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1,
     95                               const vec<T, N>& tw)
     96 {
     97     const vec<T, N> twr   = dupeven(tw);
     98     const vec<T, N> twi   = dupodd(tw);
     99     const vec<T, 2> sum   = (in0 + in1);
    100     const vec<T, 2> dif   = swap<2>(negodd(in0 - in1));
    101     const vec<T, N> sumtw = resize<N>(sum) * twr;
    102     const vec<T, N> diftw = resize<N>(dif) * twi;
    103     out0 += sumtw + diftw;
    104     out1 += sumtw - diftw;
    105 }
    106 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
    107 KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y)
    108 {
    109     vec<T, N> yy = resize<N>(y);
    110     return cmul_conj(x, yy);
    111 }
    112 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)>
    113 KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y)
    114 {
    115     vec<T, N> xx = resize<N>(x);
    116     return cmul_conj(xx, y);
    117 }
    118 
    119 template <size_t N, bool A = false, typename T>
    120 KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src)
    121 {
    122     return cvec<T, N>(ptr_cast<T>(src), cbool_t<A>());
    123 }
    124 
    125 template <size_t N, bool A = false, typename T>
    126 KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value)
    127 {
    128     value.write(ptr_cast<T>(dest), cbool_t<A>());
    129 }
    130 
    131 template <size_t count, size_t N, bool A = false, typename T>
    132 KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src, size_t stride)
    133 {
    134     return internal::read_group_impl<2, count, N, A>(ptr_cast<T>(src), stride, csizeseq_t<count>());
    135 }
    136 
    137 template <size_t count, size_t N, bool A = false, typename T>
    138 KFR_INTRINSIC void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value)
    139 {
    140     return internal::write_group_impl<2, count, N, A>(ptr_cast<T>(dest), stride, value, csizeseq_t<count>());
    141 }
    142 
    143 template <size_t N, bool A = false, bool split = false, typename T>
    144 KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src)
    145 {
    146     cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>());
    147     if constexpr (split)
    148         temp = splitpairs(temp);
    149     return temp;
    150 }
    151 
    152 template <size_t N, bool A = false, bool split = false, typename T>
    153 KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value)
    154 {
    155     cvec<T, N> v = value;
    156     if constexpr (split)
    157         v = interleavehalves(v);
    158     v.write(ptr_cast<T>(dest), cbool_t<A>());
    159 }
    160 
    161 template <>
    162 inline cvec<f32, 8> cread_split<8, false, true, f32>(const complex<f32>* src)
    163 {
    164     const cvec<f32, 4> l = concat(cread<2>(src), cread<2>(src + 4));
    165     const cvec<f32, 4> h = concat(cread<2>(src + 2), cread<2>(src + 6));
    166 
    167     return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h));
    168 }
    169 template <>
    170 inline cvec<f32, 8> cread_split<8, true, true, f32>(const complex<f32>* src)
    171 {
    172     const cvec<f32, 4> l = concat(cread<2, true>(src), cread<2, true>(src + 4));
    173     const cvec<f32, 4> h = concat(cread<2, true>(src + 2), cread<2, true>(src + 6));
    174 
    175     return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h));
    176 }
    177 
    178 template <>
    179 inline cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* src)
    180 {
    181     const cvec<f64, 2> l = concat(cread<1>(src), cread<1>(src + 2));
    182     const cvec<f64, 2> h = concat(cread<1>(src + 1), cread<1>(src + 3));
    183 
    184     return concat(shuffle<0, 4, 2, 6>(l, h), shuffle<1, 5, 3, 7>(l, h));
    185 }
    186 
    187 template <>
    188 inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x)
    189 {
    190     const cvec<f32, 8> xx =
    191         concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
    192 
    193     cvec<f32, 2> a, b, c, d;
    194     split<f32, 16>(xx, a, b, c, d);
    195     cwrite<2>(dest, a);
    196     cwrite<2>(dest + 4, b);
    197     cwrite<2>(dest + 2, c);
    198     cwrite<2>(dest + 6, d);
    199 }
    200 template <>
    201 inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x)
    202 {
    203     const cvec<f32, 8> xx =
    204         concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x)));
    205 
    206     cvec<f32, 2> a, b, c, d;
    207     split<f32, 16>(xx, a, b, c, d);
    208     cwrite<2, true>(dest + 0, a);
    209     cwrite<2, true>(dest + 4, b);
    210     cwrite<2, true>(dest + 2, c);
    211     cwrite<2, true>(dest + 6, d);
    212 }
    213 
    214 template <>
    215 inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x)
    216 {
    217     const cvec<f64, 4> xx =
    218         concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x)));
    219     cwrite<1>(dest, part<4, 0>(xx));
    220     cwrite<1>(dest + 2, part<4, 1>(xx));
    221     cwrite<1>(dest + 1, part<4, 2>(xx));
    222     cwrite<1>(dest + 3, part<4, 3>(xx));
    223 }
    224 template <>
    225 inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x)
    226 {
    227     const cvec<f64, 4> xx =
    228         concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x)));
    229     cwrite<1, true>(dest + 0, part<4, 0>(xx));
    230     cwrite<1, true>(dest + 2, part<4, 1>(xx));
    231     cwrite<1, true>(dest + 1, part<4, 2>(xx));
    232     cwrite<1, true>(dest + 3, part<4, 3>(xx));
    233 }
    234 
    235 template <size_t N, size_t stride, typename T, size_t... Indices>
    236 KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>)
    237 {
    238     return concat(ref_cast<cvec<T, 1>>(base[Indices * stride])...);
    239 }
    240 
    241 template <size_t N, size_t stride, typename T>
    242 KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base)
    243 {
    244     if constexpr (stride == 1)
    245     {
    246         return ref_cast<cvec<T, N>>(*base);
    247     }
    248     else
    249     {
    250         return cgather_helper<N, stride, T>(base, csizeseq_t<N>());
    251     }
    252 }
    253 
    254 KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t)
    255 {
    256     size_t temp = index;
    257     index += stride;
    258     if (index >= size)
    259         index -= size;
    260     return temp;
    261 }
    262 KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t)
    263 {
    264     size_t temp = index;
    265     index += stride;
    266     return temp;
    267 }
    268 
    269 template <size_t N, typename T, size_t... Indices>
    270 KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride,
    271                                         csizes_t<Indices...>)
    272 {
    273     return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, Indices)])...);
    274 }
    275 
    276 template <size_t N, typename T>
    277 KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride)
    278 {
    279     return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>());
    280 }
    281 template <size_t N, typename T>
    282 KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t stride)
    283 {
    284     size_t index = 0;
    285     return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>());
    286 }
    287 
    288 template <size_t N, typename T, size_t... Indices>
    289 KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size,
    290                                         csizes_t<Indices...>)
    291 {
    292     return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, size, Indices)])...);
    293 }
    294 
    295 template <size_t N, typename T>
    296 KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size)
    297 {
    298     return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>());
    299 }
    300 
    301 template <size_t N, size_t stride, typename T, size_t... Indices>
    302 KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>)
    303 {
    304     swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... };
    305 }
    306 
    307 template <size_t N, size_t stride, typename T>
    308 KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value)
    309 {
    310     if constexpr (stride == 1)
    311     {
    312         cwrite<N>(base, value);
    313     }
    314     else
    315     {
    316         return cscatter_helper<N, stride, T>(base, value, csizeseq_t<N>());
    317     }
    318 }
    319 
    320 template <size_t N, typename T, size_t... Indices>
    321 KFR_INTRINSIC void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value,
    322                                    csizes_t<Indices...>)
    323 {
    324     swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... };
    325 }
    326 
    327 template <size_t N, typename T>
    328 KFR_INTRINSIC void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value)
    329 {
    330     return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>());
    331 }
    332 
    333 template <size_t groupsize = 1, typename T, size_t N, typename IT>
    334 KFR_INTRINSIC vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset)
    335 {
    336     return internal::gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>());
    337 }
    338 
    339 template <size_t groupsize = 1, typename T, size_t N, typename IT>
    340 KFR_INTRINSIC void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value)
    341 {
    342     return internal::scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>());
    343 }
    344 
    345 template <typename T>
    346 KFR_INTRINSIC void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2,
    347                                 const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2,
    348                                 cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6,
    349                                 cvec<T, 4>& w7)
    350 {
    351     cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3));
    352     cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3));
    353     a             = digitreverse4<2>(a);
    354     b             = digitreverse4<2>(b);
    355     w0            = part<4, 0>(a);
    356     w1            = part<4, 1>(a);
    357     w2            = part<4, 2>(a);
    358     w3            = part<4, 3>(a);
    359     w4            = part<4, 0>(b);
    360     w5            = part<4, 1>(b);
    361     w6            = part<4, 2>(b);
    362     w7            = part<4, 3>(b);
    363 }
    364 
    365 template <typename T>
    366 KFR_INTRINSIC void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2,
    367                                 const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5,
    368                                 const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1,
    369                                 cvec<T, 8>& z2, cvec<T, 8>& z3)
    370 {
    371     cvec<T, 16> a = concat(w0, w1, w2, w3);
    372     cvec<T, 16> b = concat(w4, w5, w6, w7);
    373     a             = digitreverse4<2>(a);
    374     b             = digitreverse4<2>(b);
    375     z0            = concat(part<4, 0>(a), part<4, 0>(b));
    376     z1            = concat(part<4, 1>(a), part<4, 1>(b));
    377     z2            = concat(part<4, 2>(a), part<4, 2>(b));
    378     z3            = concat(part<4, 3>(a), part<4, 3>(b));
    379 }
    380 
    381 template <typename T>
    382 KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d)
    383 {
    384     cvec<T, 4> a0, a1, a2, a3;
    385     cvec<T, 4> b0, b1, b2, b3;
    386     cvec<T, 4> c0, c1, c2, c3;
    387     cvec<T, 4> d0, d1, d2, d3;
    388 
    389     split<T, 32>(a, a0, a1, a2, a3);
    390     split<T, 32>(b, b0, b1, b2, b3);
    391     split<T, 32>(c, c0, c1, c2, c3);
    392     split<T, 32>(d, d0, d1, d2, d3);
    393 
    394     a = concat(a0, b0, c0, d0);
    395     b = concat(a1, b1, c1, d1);
    396     c = concat(a2, b2, c2, d2);
    397     d = concat(a3, b3, c3, d3);
    398 }
    399 template <typename T>
    400 KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa,
    401                               cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd)
    402 {
    403     cvec<T, 4> a0, a1, a2, a3;
    404     cvec<T, 4> b0, b1, b2, b3;
    405     cvec<T, 4> c0, c1, c2, c3;
    406     cvec<T, 4> d0, d1, d2, d3;
    407 
    408     split<T, 32>(a, a0, a1, a2, a3);
    409     split<T, 32>(b, b0, b1, b2, b3);
    410     split<T, 32>(c, c0, c1, c2, c3);
    411     split<T, 32>(d, d0, d1, d2, d3);
    412 
    413     aa = concat(a0, b0, c0, d0);
    414     bb = concat(a1, b1, c1, d1);
    415     cc = concat(a2, b2, c2, d2);
    416     dd = concat(a3, b3, c3, d3);
    417 }
    418 
    419 template <bool b, typename T>
    420 constexpr KFR_INTRINSIC T chsign(T x)
    421 {
    422     return b ? -x : x;
    423 }
    424 
    425 template <typename T, size_t N, size_t size, size_t start, size_t step, bool inverse = false,
    426           size_t... indices>
    427 constexpr KFR_INTRINSIC cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>)
    428 {
    429     return make_vector((indices & 1 ? chsign<inverse>(-sin_using_table<T>(size, (indices / 2 * step + start)))
    430                                     : cos_using_table<T>(size, (indices / 2 * step + start)))...);
    431 }
    432 
    433 template <typename T, size_t width, size_t... indices>
    434 constexpr KFR_INTRINSIC cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size,
    435                                                                 size_t start, size_t step)
    436 {
    437     return make_vector((indices & 1 ? -sin_using_table<T>(size, indices / 2 * step + start)
    438                                     : cos_using_table<T>(size, indices / 2 * step + start))...);
    439 }
    440 
    441 template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false>
    442 constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle()
    443 {
    444     return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>());
    445 }
    446 
    447 template <typename T, size_t width>
    448 constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0)
    449 {
    450     return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size);
    451 }
    452 
    453 // template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false>
    454 // constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>();
    455 
    456 template <typename T, size_t N, bool inverse>
    457 constexpr static inline cvec<T, N> twiddleimagmask()
    458 {
    459     return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1);
    460 }
    461 
    462 CMT_PRAGMA_GNU(GCC diagnostic push)
    463 CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wconversion")
    464 
    465 CMT_PRAGMA_GNU(GCC diagnostic pop)
    466 
    467 template <typename T, size_t N>
    468 CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x)
    469 {
    470     return negodd(cossin(x));
    471 }
    472 
    473 template <size_t k, size_t size, bool inverse = false, typename T, size_t width,
    474           size_t kk = (inverse ? size - k : k) % size>
    475 KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x)
    476 {
    477     constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485);
    478     if constexpr (kk == 0)
    479     {
    480         return x;
    481     }
    482     else if constexpr (kk == size * 1 / 8)
    483     {
    484         return swap<2>(subadd(swap<2>(x), x)) * isqrt2;
    485     }
    486     else if constexpr (kk == size * 2 / 8)
    487     {
    488         return negodd(swap<2>(x));
    489     }
    490     else if constexpr (kk == size * 3 / 8)
    491     {
    492         return subadd(x, swap<2>(x)) * -isqrt2;
    493     }
    494     else if constexpr (kk == size * 4 / 8)
    495     {
    496         return -x;
    497     }
    498     else if constexpr (kk == size * 5 / 8)
    499     {
    500         return swap<2>(subadd(swap<2>(x), x)) * -isqrt2;
    501     }
    502     else if constexpr (kk == size * 6 / 8)
    503     {
    504         return swap<2>(negodd(x));
    505     }
    506     else if constexpr (kk == size * 7 / 8)
    507     {
    508         return subadd(x, swap<2>(x)) * isqrt2;
    509     }
    510     else
    511     {
    512         return cmul(x, resize<width>(fixed_twiddle<T, 1, size, kk>()));
    513     }
    514 }
    515 
    516 template <size_t N, typename T>
    517 KFR_INTRINSIC void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1)
    518 {
    519     const cvec<T, N> sum = a0 + a1;
    520     const cvec<T, N> dif = a0 - a1;
    521     w0                   = sum;
    522     w1                   = dif;
    523 }
    524 
    525 template <size_t N, typename T>
    526 KFR_INTRINSIC void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1)
    527 {
    528     butterfly2<N>(a0, a1, a0, a1);
    529 }
    530 
    531 template <size_t N, bool inverse = false, typename T>
    532 KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1,
    533                               const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1,
    534                               cvec<T, N>& w2, cvec<T, N>& w3)
    535 {
    536     cvec<T, N> sum02, sum13, diff02, diff13;
    537     cvec<T, N * 2> a01, a23, sum0213, diff0213;
    538 
    539     a01      = concat(a0, a1);
    540     a23      = concat(a2, a3);
    541     sum0213  = a01 + a23;
    542     diff0213 = a01 - a23;
    543 
    544     sum02  = low(sum0213);
    545     sum13  = high(sum0213);
    546     diff02 = low(diff0213);
    547     diff13 = high(diff0213);
    548     w0     = sum02 + sum13;
    549     w2     = sum02 - sum13;
    550     if constexpr (inverse)
    551     {
    552         diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T()));
    553         diff13 = swap<2>(diff13);
    554     }
    555     else
    556     {
    557         diff13 = swap<2>(diff13);
    558         diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T()));
    559     }
    560 
    561     w1 = diff02 + diff13;
    562     w3 = diff02 - diff13;
    563 }
    564 
    565 template <size_t N, bool inverse = false, typename T>
    566 KFR_INTRINSIC void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1,
    567                               const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1,
    568                               cvec<T, N>& w2, cvec<T, N>& w3)
    569 {
    570     vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3;
    571     vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3;
    572 
    573     cvec<T, N> sum02, sum13, diff02, diff13;
    574     vec<T, N> sum02re, sum13re, diff02re, diff13re;
    575     vec<T, N> sum02im, sum13im, diff02im, diff13im;
    576 
    577     sum02 = a0 + a2;
    578     sum13 = a1 + a3;
    579 
    580     w0 = sum02 + sum13;
    581     w2 = sum02 - sum13;
    582 
    583     diff02 = a0 - a2;
    584     diff13 = a1 - a3;
    585     split(diff02, diff02re, diff02im);
    586     split(diff13, diff13re, diff13im);
    587 
    588     (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
    589     (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
    590 }
    591 
    592 template <size_t N, bool inverse = false, typename T>
    593 KFR_INTRINSIC void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
    594                               const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
    595                               const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1,
    596                               cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6,
    597                               cvec<T, N>& w7)
    598 {
    599     cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6;
    600     butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6);
    601     cvec<T, N> b1 = a1, b3 = a3, b5 = a5, b7 = a7;
    602     butterfly4<N, inverse>(cfalse, b1, b3, b5, b7, b1, b3, b5, b7);
    603     w0 = b0 + b1;
    604     w4 = b0 - b1;
    605 
    606     b3 = cmul_by_twiddle<1, 8, inverse>(b3);
    607     b5 = cmul_by_twiddle<2, 8, inverse>(b5);
    608     b7 = cmul_by_twiddle<3, 8, inverse>(b7);
    609 
    610     w1 = b2 + b3;
    611     w5 = b2 - b3;
    612     w2 = b4 + b5;
    613     w6 = b4 - b5;
    614     w3 = b6 + b7;
    615     w7 = b6 - b7;
    616 }
    617 
    618 template <size_t N, bool inverse = false, typename T>
    619 KFR_INTRINSIC void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
    620                               cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7)
    621 {
    622     butterfly8<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a0, a1, a2, a3, a4, a5, a6, a7);
    623 }
    624 
    625 template <bool inverse = false, typename T>
    626 KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67)
    627 {
    628     cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67;
    629 
    630     butterfly4<2, inverse>(cfalse, b01, b23, b45, b67, b01, b23, b45, b67);
    631 
    632     cvec<T, 2> b02, b13, b46, b57;
    633 
    634     cvec<T, 8> b01234567 = concat(b01, b23, b45, b67);
    635     cvec<T, 8> b02461357 = concat(even<2>(b01234567), odd<2>(b01234567));
    636     split<T, 16>(b02461357, b02, b46, b13, b57);
    637 
    638     b13 = cmul(b13, fixed_twiddle<T, 2, 8, 0, 1, inverse>());
    639     b57 = cmul(b57, fixed_twiddle<T, 2, 8, 2, 1, inverse>());
    640     a01 = b02 + b13;
    641     a23 = b46 + b57;
    642     a45 = b02 - b13;
    643     a67 = b46 - b57;
    644 }
    645 
    646 template <bool inverse = false, typename T>
    647 KFR_INTRINSIC void butterfly8_packed(cvec<T, 8>& v8)
    648 {
    649     cvec<T, 2> w0, w1, w2, w3;
    650     split<T, 16>(v8, w0, w1, w2, w3);
    651     butterfly8<inverse>(w0, w1, w2, w3);
    652     v8 = concat(w0, w1, w2, w3);
    653 }
    654 
    655 template <bool inverse = false, typename T>
    656 KFR_INTRINSIC void butterfly32_packed(cvec<T, 32>& v32)
    657 {
    658     cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7;
    659     split(v32, w0, w1, w2, w3, w4, w5, w6, w7);
    660     butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7);
    661 
    662     w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>());
    663     w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>());
    664     w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>());
    665     w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>());
    666     w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>());
    667     w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>());
    668     w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>());
    669 
    670     cvec<T, 8> z0, z1, z2, z3;
    671     transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3);
    672 
    673     butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3);
    674     v32 = concat(z0, z1, z2, z3);
    675 }
    676 
    677 template <size_t N, bool inverse = false, typename T>
    678 KFR_INTRINSIC void butterfly4_packed(cvec<T, N * 4>& a0123)
    679 {
    680     cvec<T, N> a0;
    681     cvec<T, N> a1;
    682     cvec<T, N> a2;
    683     cvec<T, N> a3;
    684     split<T, N * 4 * 2>(a0123, a0, a1, a2, a3);
    685     butterfly4<N, inverse>(cfalse, a0, a1, a2, a3, a0, a1, a2, a3);
    686     a0123 = concat(a0, a1, a2, a3);
    687 }
    688 
    689 template <size_t N, typename T>
    690 KFR_INTRINSIC void butterfly2_packed(cvec<T, N * 2>& a01)
    691 {
    692     cvec<T, N> a0;
    693     cvec<T, N> a1;
    694     split(a01, a0, a1);
    695     butterfly2<N>(a0, a1);
    696     a01 = concat(a0, a1);
    697 }
    698 
    699 template <size_t N, bool inverse = false, bool split_format = false, typename T>
    700 KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1)
    701 {
    702     if constexpr (split_format)
    703     {
    704         vec<T, N> re1, im1, tw1re, tw1im;
    705         split<T, 2 * N>(a1, re1, im1);
    706         split<T, 2 * N>(tw1, tw1re, tw1im);
    707         vec<T, N> b1re = re1 * tw1re;
    708         vec<T, N> b1im = im1 * tw1re;
    709         if constexpr (inverse)
    710             w1 = concat(b1re + im1 * tw1im, b1im - re1 * tw1im);
    711         else
    712             w1 = concat(b1re - im1 * tw1im, b1im + re1 * tw1im);
    713     }
    714     else
    715     {
    716         const cvec<T, N> b1  = a1 * dupeven(tw1);
    717         const cvec<T, N> a1_ = swap<2>(a1);
    718 
    719         cvec<T, N> tw1_ = tw1;
    720         if constexpr (inverse)
    721             tw1_ = -(tw1_);
    722         w1 = subadd(b1, a1_ * dupodd(tw1_));
    723     }
    724 }
    725 
    726 template <size_t N, bool inverse = false, bool split_format = false, typename T>
    727 KFR_INTRINSIC void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3,
    728                                    const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3,
    729                                    cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
    730 {
    731     apply_twiddle<N, inverse, split_format>(a1, tw1, w1);
    732     apply_twiddle<N, inverse, split_format>(a2, tw2, w2);
    733     apply_twiddle<N, inverse, split_format>(a3, tw3, w3);
    734 }
    735 
    736 template <size_t N, bool inverse = false, typename T>
    737 KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
    738                                    cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2,
    739                                    const cvec<T, N>& tw3)
    740 {
    741     apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3);
    742 }
    743 
    744 template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]>
    745 KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
    746                                    cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2,
    747                                    const cvec<T, 1>& tw3)
    748 {
    749     apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3));
    750 }
    751 
    752 template <size_t N, bool inverse = false, typename T, typename = u8[N - 2]>
    753 KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
    754                                    cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2,
    755                                    cvec<T, N / 2> tw3)
    756 {
    757     apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3));
    758 }
    759 
    760 template <size_t N, bool inverse = false, typename T>
    761 KFR_INTRINSIC void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d)
    762 {
    763     cvec<T, 4> b0, b1, b2, b3;
    764     cvec<T, 4> c0, c1, c2, c3;
    765     cvec<T, 4> d0, d1, d2, d3;
    766 
    767     split(b, b0, b1, b2, b3);
    768     split(c, c0, c1, c2, c3);
    769     split(d, d0, d1, d2, d3);
    770 
    771     b1 = cmul_by_twiddle<4, 64, inverse>(b1);
    772     b2 = cmul_by_twiddle<8, 64, inverse>(b2);
    773     b3 = cmul_by_twiddle<12, 64, inverse>(b3);
    774 
    775     c1 = cmul_by_twiddle<8, 64, inverse>(c1);
    776     c2 = cmul_by_twiddle<16, 64, inverse>(c2);
    777     c3 = cmul_by_twiddle<24, 64, inverse>(c3);
    778 
    779     d1 = cmul_by_twiddle<12, 64, inverse>(d1);
    780     d2 = cmul_by_twiddle<24, 64, inverse>(d2);
    781     d3 = cmul_by_twiddle<36, 64, inverse>(d3);
    782 
    783     b = concat(b0, b1, b2, b3);
    784     c = concat(c0, c1, c2, c3);
    785     d = concat(d0, d1, d2, d3);
    786 }
    787 
    788 template <size_t n2, size_t nnstep, size_t N, bool inverse = false, typename T>
    789 KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123)
    790 {
    791     cvec<T, N> a0;
    792     cvec<T, N> a1;
    793     cvec<T, N> a2;
    794     cvec<T, N> a3;
    795     split<T, 2 * N * 4>(a0123, a0, a1, a2, a3);
    796 
    797     cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1, inverse>(),
    798                tw2 = fixed_twiddle<T, N, 64, n2 * nnstep * 2, nnstep * 2, inverse>(),
    799                tw3 = fixed_twiddle<T, N, 64, n2 * nnstep * 3, nnstep * 3, inverse>();
    800 
    801     apply_twiddles4<N>(a1, a2, a3, tw1, tw2, tw3);
    802 
    803     a0123 = concat(a0, a1, a2, a3);
    804 }
    805 
    806 template <bool inverse, bool aligned, typename T>
    807 KFR_INTRINSIC void butterfly64_memory(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out,
    808                                       const complex<T>* in)
    809 {
    810     cvec<T, 16> w0, w1, w2, w3;
    811 
    812     w0 = cread_group<4, 4, aligned>(
    813         in, 16); // concat(cread<4>(in + 0), cread<4>(in + 16), cread<4>(in + 32), cread<4>(in + 48));
    814     butterfly4_packed<4, inverse>(w0);
    815     apply_twiddles4<0, 1, 4, inverse>(w0);
    816 
    817     w1 = cread_group<4, 4, aligned>(
    818         in + 4, 16); // concat(cread<4>(in + 4), cread<4>(in + 20), cread<4>(in + 36), cread<4>(in + 52));
    819     butterfly4_packed<4, inverse>(w1);
    820     apply_twiddles4<4, 1, 4, inverse>(w1);
    821 
    822     w2 = cread_group<4, 4, aligned>(
    823         in + 8, 16); // concat(cread<4>(in + 8), cread<4>(in + 24), cread<4>(in + 40), cread<4>(in + 56));
    824     butterfly4_packed<4, inverse>(w2);
    825     apply_twiddles4<8, 1, 4, inverse>(w2);
    826 
    827     w3 = cread_group<4, 4, aligned>(
    828         in + 12, 16); // concat(cread<4>(in + 12), cread<4>(in + 28), cread<4>(in + 44), cread<4>(in + 60));
    829     butterfly4_packed<4, inverse>(w3);
    830     apply_twiddles4<12, 1, 4, inverse>(w3);
    831 
    832     transpose4(w0, w1, w2, w3);
    833     // pass 2:
    834 
    835     butterfly4_packed<4, inverse>(w0);
    836     butterfly4_packed<4, inverse>(w1);
    837     butterfly4_packed<4, inverse>(w2);
    838     butterfly4_packed<4, inverse>(w3);
    839 
    840     transpose4(w0, w1, w2, w3);
    841 
    842     w0 = digitreverse4<2>(w0);
    843     w1 = digitreverse4<2>(w1);
    844     w2 = digitreverse4<2>(w2);
    845     w3 = digitreverse4<2>(w3);
    846 
    847     apply_vertical_twiddles4<4, inverse>(w1, w2, w3);
    848 
    849     // pass 3:
    850     butterfly4_packed<4, inverse>(w3);
    851     cwrite_group<4, 4, aligned>(out + 12, 16, w3); //        split(w3, out[3], out[7], out[11], out[15]);
    852 
    853     butterfly4_packed<4, inverse>(w2);
    854     cwrite_group<4, 4, aligned>(out + 8, 16, w2); //        split(w2, out[2], out[6], out[10], out[14]);
    855 
    856     butterfly4_packed<4, inverse>(w1);
    857     cwrite_group<4, 4, aligned>(out + 4, 16, w1); //     split(w1, out[1], out[5], out[9], out[13]);
    858 
    859     butterfly4_packed<4, inverse>(w0);
    860     cwrite_group<4, 4, aligned>(out, 16, w0); //     split(w0, out[0], out[4], out[8], out[12]);
    861 }
    862 
    863 template <bool inverse = false, typename T>
    864 KFR_INTRINSIC void butterfly16_packed(cvec<T, 16>& v16)
    865 {
    866     butterfly4_packed<4, inverse>(v16);
    867     apply_twiddles4<0, 4, 4, inverse>(v16);
    868     v16 = digitreverse4<2>(v16);
    869     butterfly4_packed<4, inverse>(v16);
    870 }
    871 
    872 template <size_t index, bool inverse = false, typename T>
    873 KFR_INTRINSIC void butterfly16_multi_natural(complex<T>* out, const complex<T>* in)
    874 {
    875     constexpr size_t N = 4;
    876 
    877     cvec<T, 4> a1  = cread<4>(in + index * 4 + 16 * 1);
    878     cvec<T, 4> a5  = cread<4>(in + index * 4 + 16 * 5);
    879     cvec<T, 4> a9  = cread<4>(in + index * 4 + 16 * 9);
    880     cvec<T, 4> a13 = cread<4>(in + index * 4 + 16 * 13);
    881     butterfly4<N, inverse>(cfalse, a1, a5, a9, a13, a1, a5, a9, a13);
    882     a5  = cmul_by_twiddle<1, 16, inverse>(a5);
    883     a9  = cmul_by_twiddle<2, 16, inverse>(a9);
    884     a13 = cmul_by_twiddle<3, 16, inverse>(a13);
    885 
    886     cvec<T, 4> a2  = cread<4>(in + index * 4 + 16 * 2);
    887     cvec<T, 4> a6  = cread<4>(in + index * 4 + 16 * 6);
    888     cvec<T, 4> a10 = cread<4>(in + index * 4 + 16 * 10);
    889     cvec<T, 4> a14 = cread<4>(in + index * 4 + 16 * 14);
    890     butterfly4<N, inverse>(cfalse, a2, a6, a10, a14, a2, a6, a10, a14);
    891     a6  = cmul_by_twiddle<2, 16, inverse>(a6);
    892     a10 = cmul_by_twiddle<4, 16, inverse>(a10);
    893     a14 = cmul_by_twiddle<6, 16, inverse>(a14);
    894 
    895     cvec<T, 4> a3  = cread<4>(in + index * 4 + 16 * 3);
    896     cvec<T, 4> a7  = cread<4>(in + index * 4 + 16 * 7);
    897     cvec<T, 4> a11 = cread<4>(in + index * 4 + 16 * 11);
    898     cvec<T, 4> a15 = cread<4>(in + index * 4 + 16 * 15);
    899     butterfly4<N, inverse>(cfalse, a3, a7, a11, a15, a3, a7, a11, a15);
    900     a7  = cmul_by_twiddle<3, 16, inverse>(a7);
    901     a11 = cmul_by_twiddle<6, 16, inverse>(a11);
    902     a15 = cmul_by_twiddle<9, 16, inverse>(a15);
    903 
    904     cvec<T, 4> a0  = cread<4>(in + index * 4 + 16 * 0);
    905     cvec<T, 4> a4  = cread<4>(in + index * 4 + 16 * 4);
    906     cvec<T, 4> a8  = cread<4>(in + index * 4 + 16 * 8);
    907     cvec<T, 4> a12 = cread<4>(in + index * 4 + 16 * 12);
    908     butterfly4<N, inverse>(cfalse, a0, a4, a8, a12, a0, a4, a8, a12);
    909     butterfly4<N, inverse>(cfalse, a0, a1, a2, a3, a0, a1, a2, a3);
    910     cwrite<4>(out + index * 4 + 16 * 0, a0);
    911     cwrite<4>(out + index * 4 + 16 * 4, a1);
    912     cwrite<4>(out + index * 4 + 16 * 8, a2);
    913     cwrite<4>(out + index * 4 + 16 * 12, a3);
    914     butterfly4<N, inverse>(cfalse, a4, a5, a6, a7, a4, a5, a6, a7);
    915     cwrite<4>(out + index * 4 + 16 * 1, a4);
    916     cwrite<4>(out + index * 4 + 16 * 5, a5);
    917     cwrite<4>(out + index * 4 + 16 * 9, a6);
    918     cwrite<4>(out + index * 4 + 16 * 13, a7);
    919     butterfly4<N, inverse>(cfalse, a8, a9, a10, a11, a8, a9, a10, a11);
    920     cwrite<4>(out + index * 4 + 16 * 2, a8);
    921     cwrite<4>(out + index * 4 + 16 * 6, a9);
    922     cwrite<4>(out + index * 4 + 16 * 10, a10);
    923     cwrite<4>(out + index * 4 + 16 * 14, a11);
    924     butterfly4<N, inverse>(cfalse, a12, a13, a14, a15, a12, a13, a14, a15);
    925     cwrite<4>(out + index * 4 + 16 * 3, a12);
    926     cwrite<4>(out + index * 4 + 16 * 7, a13);
    927     cwrite<4>(out + index * 4 + 16 * 11, a14);
    928     cwrite<4>(out + index * 4 + 16 * 15, a15);
    929 }
    930 
    931 template <size_t index, bool inverse = false, typename T>
    932 KFR_INTRINSIC void butterfly16_multi_flip(complex<T>* out, const complex<T>* in)
    933 {
    934     constexpr size_t N = 4;
    935 
    936     cvec<T, 4> a1  = cread<4>(in + index * 4 + 16 * 1);
    937     cvec<T, 4> a5  = cread<4>(in + index * 4 + 16 * 5);
    938     cvec<T, 4> a9  = cread<4>(in + index * 4 + 16 * 9);
    939     cvec<T, 4> a13 = cread<4>(in + index * 4 + 16 * 13);
    940     butterfly4<N, inverse>(cfalse, a1, a5, a9, a13, a1, a5, a9, a13);
    941     a5  = cmul_by_twiddle<1, 16, inverse>(a5);
    942     a9  = cmul_by_twiddle<2, 16, inverse>(a9);
    943     a13 = cmul_by_twiddle<3, 16, inverse>(a13);
    944 
    945     cvec<T, 4> a2  = cread<4>(in + index * 4 + 16 * 2);
    946     cvec<T, 4> a6  = cread<4>(in + index * 4 + 16 * 6);
    947     cvec<T, 4> a10 = cread<4>(in + index * 4 + 16 * 10);
    948     cvec<T, 4> a14 = cread<4>(in + index * 4 + 16 * 14);
    949     butterfly4<N, inverse>(cfalse, a2, a6, a10, a14, a2, a6, a10, a14);
    950     a6  = cmul_by_twiddle<2, 16, inverse>(a6);
    951     a10 = cmul_by_twiddle<4, 16, inverse>(a10);
    952     a14 = cmul_by_twiddle<6, 16, inverse>(a14);
    953 
    954     cvec<T, 4> a3  = cread<4>(in + index * 4 + 16 * 3);
    955     cvec<T, 4> a7  = cread<4>(in + index * 4 + 16 * 7);
    956     cvec<T, 4> a11 = cread<4>(in + index * 4 + 16 * 11);
    957     cvec<T, 4> a15 = cread<4>(in + index * 4 + 16 * 15);
    958     butterfly4<N, inverse>(cfalse, a3, a7, a11, a15, a3, a7, a11, a15);
    959     a7  = cmul_by_twiddle<3, 16, inverse>(a7);
    960     a11 = cmul_by_twiddle<6, 16, inverse>(a11);
    961     a15 = cmul_by_twiddle<9, 16, inverse>(a15);
    962 
    963     cvec<T, 16> w1 = concat(a1, a5, a9, a13);
    964     cvec<T, 16> w2 = concat(a2, a6, a10, a14);
    965     cvec<T, 16> w3 = concat(a3, a7, a11, a15);
    966 
    967     cvec<T, 4> a0  = cread<4>(in + index * 4 + 16 * 0);
    968     cvec<T, 4> a4  = cread<4>(in + index * 4 + 16 * 4);
    969     cvec<T, 4> a8  = cread<4>(in + index * 4 + 16 * 8);
    970     cvec<T, 4> a12 = cread<4>(in + index * 4 + 16 * 12);
    971     butterfly4<N, inverse>(cfalse, a0, a4, a8, a12, a0, a4, a8, a12);
    972     cvec<T, 16> w0 = concat(a0, a4, a8, a12);
    973 
    974     butterfly4<N * 4, inverse>(cfalse, w0, w1, w2, w3, w0, w1, w2, w3);
    975 
    976     w0 = digitreverse4<2>(w0);
    977     w1 = digitreverse4<2>(w1);
    978     w2 = digitreverse4<2>(w2);
    979     w3 = digitreverse4<2>(w3);
    980 
    981     transpose4(w0, w1, w2, w3);
    982     cwrite<16>(out + index * 64 + 16 * 0, cmul(w0, fixed_twiddle<T, 16, 256, 0, index * 4 + 0, inverse>()));
    983     cwrite<16>(out + index * 64 + 16 * 1, cmul(w1, fixed_twiddle<T, 16, 256, 0, index * 4 + 1, inverse>()));
    984     cwrite<16>(out + index * 64 + 16 * 2, cmul(w2, fixed_twiddle<T, 16, 256, 0, index * 4 + 2, inverse>()));
    985     cwrite<16>(out + index * 64 + 16 * 3, cmul(w3, fixed_twiddle<T, 16, 256, 0, index * 4 + 3, inverse>()));
    986 }
    987 
    988 template <size_t n2, size_t nnstep, size_t N, typename T>
    989 KFR_INTRINSIC void apply_twiddles2(cvec<T, N>& a1)
    990 {
    991     cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>();
    992 
    993     a1 = cmul(a1, tw1);
    994 }
    995 
    996 template <typename T, size_t N, bool inverse>
    997 static constexpr KFR_INTRINSIC cvec<T, N> tw3r1()
    998 {
    999     return static_cast<T>(-0.5 - 1.0);
   1000 }
   1001 
   1002 template <typename T, size_t N, bool inverse>
   1003 static constexpr KFR_INTRINSIC cvec<T, N> tw3i1()
   1004 {
   1005     return static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>();
   1006 }
   1007 
   1008 template <size_t N, bool inverse = false, typename T>
   1009 KFR_INTRINSIC void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00,
   1010                               cvec<T, N>& w01, cvec<T, N>& w02)
   1011 {
   1012 
   1013     const cvec<T, N> sum1 = a01 + a02;
   1014     const cvec<T, N> dif1 = swap<2>(a01 - a02);
   1015     w00                   = a00 + sum1;
   1016 
   1017     const cvec<T, N> s1 = w00 + sum1 * tw3r1<T, N, inverse>();
   1018 
   1019     const cvec<T, N> d1 = dif1 * tw3i1<T, N, inverse>();
   1020 
   1021     w01 = s1 + d1;
   1022     w02 = s1 - d1;
   1023 }
   1024 
   1025 template <size_t N, bool inverse = false, typename T>
   1026 KFR_INTRINSIC void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2)
   1027 {
   1028     butterfly3<N, inverse>(a0, a1, a2, a0, a1, a2);
   1029 }
   1030 
   1031 template <size_t N, bool inverse = false, typename T>
   1032 KFR_INTRINSIC void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
   1033                               const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
   1034                               cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4,
   1035                               cvec<T, N>& w5)
   1036 {
   1037     cvec<T, N * 2> a03 = concat(a0, a3);
   1038     cvec<T, N * 2> a25 = concat(a2, a5);
   1039     cvec<T, N * 2> a41 = concat(a4, a1);
   1040     butterfly3<N * 2, inverse>(a03, a25, a41, a03, a25, a41);
   1041     cvec<T, N> t0, t1, t2, t3, t4, t5;
   1042     split(a03, t0, t1);
   1043     split(a25, t2, t3);
   1044     split(a41, t4, t5);
   1045     t3                 = -t3;
   1046     cvec<T, N * 2> a04 = concat(t0, t4);
   1047     cvec<T, N * 2> a15 = concat(t1, t5);
   1048     cvec<T, N * 2> w02, w35;
   1049     butterfly2<N * 2>(a04, a15, w02, w35);
   1050     split(w02, w0, w2);
   1051     split(w35, w3, w5);
   1052 
   1053     butterfly2<N>(t2, t3, w1, w4);
   1054 }
   1055 
   1056 template <size_t N, bool inverse = false, typename T>
   1057 KFR_INTRINSIC void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
   1058                               cvec<T, N>& a5)
   1059 {
   1060     butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5);
   1061 }
   1062 
   1063 template <typename T, bool inverse = false>
   1064 static constexpr KFR_INTRINSIC cvec<T, 1> tw9_1()
   1065 {
   1066     return { T(0.76604444311897803520239265055541),
   1067              (inverse ? -1 : 1) * T(-0.64278760968653932632264340990727) };
   1068 }
   1069 template <typename T, bool inverse = false>
   1070 static constexpr KFR_INTRINSIC cvec<T, 1> tw9_2()
   1071 {
   1072     return { T(0.17364817766693034885171662676931),
   1073              (inverse ? -1 : 1) * T(-0.98480775301220805936674302458952) };
   1074 }
   1075 template <typename T, bool inverse = false>
   1076 static constexpr KFR_INTRINSIC cvec<T, 1> tw9_4()
   1077 {
   1078     return { T(-0.93969262078590838405410927732473),
   1079              (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) };
   1080 }
   1081 
   1082 template <size_t N, bool inverse = false, typename T>
   1083 KFR_INTRINSIC void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
   1084                               const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
   1085                               const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8,
   1086                               cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4,
   1087                               cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8)
   1088 {
   1089     cvec<T, N * 3> a012 = concat(a0, a1, a2);
   1090     cvec<T, N * 3> a345 = concat(a3, a4, a5);
   1091     cvec<T, N * 3> a678 = concat(a6, a7, a8);
   1092     butterfly3<N * 3, inverse>(a012, a345, a678, a012, a345, a678);
   1093     cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8;
   1094     split(a012, t0, t1, t2);
   1095     split(a345, t3, t4, t5);
   1096     split(a678, t6, t7, t8);
   1097 
   1098     t4 = cmul(t4, tw9_1<T, inverse>());
   1099     t5 = cmul(t5, tw9_2<T, inverse>());
   1100     t7 = cmul(t7, tw9_2<T, inverse>());
   1101     t8 = cmul(t8, tw9_4<T, inverse>());
   1102 
   1103     cvec<T, N * 3> t036 = concat(t0, t3, t6);
   1104     cvec<T, N * 3> t147 = concat(t1, t4, t7);
   1105     cvec<T, N * 3> t258 = concat(t2, t5, t8);
   1106 
   1107     butterfly3<N * 3, inverse>(t036, t147, t258, t036, t147, t258);
   1108     split(t036, w0, w1, w2);
   1109     split(t147, w3, w4, w5);
   1110     split(t258, w6, w7, w8);
   1111 }
   1112 
   1113 template <size_t N, bool inverse = false, typename T>
   1114 KFR_INTRINSIC void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
   1115                               cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8)
   1116 {
   1117     butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8);
   1118 }
   1119 
   1120 template <typename T, size_t N, bool inverse>
   1121 static constexpr KFR_INTRINSIC cvec<T, N> tw7r1()
   1122 {
   1123     return static_cast<T>(0.623489801858733530525004884 - 1.0);
   1124 }
   1125 
   1126 template <typename T, size_t N, bool inverse>
   1127 static constexpr KFR_INTRINSIC cvec<T, N> tw7i1()
   1128 {
   1129     return static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>();
   1130 }
   1131 
   1132 template <typename T, size_t N, bool inverse>
   1133 static constexpr KFR_INTRINSIC cvec<T, N> tw7r2()
   1134 {
   1135     return static_cast<T>(-0.2225209339563144042889025645 - 1.0);
   1136 }
   1137 
   1138 template <typename T, size_t N, bool inverse>
   1139 static constexpr KFR_INTRINSIC cvec<T, N> tw7i2()
   1140 {
   1141     return static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>();
   1142 }
   1143 
   1144 template <typename T, size_t N, bool inverse>
   1145 static constexpr KFR_INTRINSIC cvec<T, N> tw7r3()
   1146 {
   1147     return static_cast<T>(-0.90096886790241912623610231951 - 1.0);
   1148 }
   1149 
   1150 template <typename T, size_t N, bool inverse>
   1151 static constexpr KFR_INTRINSIC cvec<T, N> tw7i3()
   1152 {
   1153     return static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>();
   1154 }
   1155 
   1156 template <size_t N, bool inverse = false, typename T>
   1157 KFR_INTRINSIC void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
   1158                               cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01,
   1159                               cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05,
   1160                               cvec<T, N>& w06)
   1161 {
   1162     const cvec<T, N> sum1 = a01 + a06;
   1163     const cvec<T, N> dif1 = swap<2>(a01 - a06);
   1164     const cvec<T, N> sum2 = a02 + a05;
   1165     const cvec<T, N> dif2 = swap<2>(a02 - a05);
   1166     const cvec<T, N> sum3 = a03 + a04;
   1167     const cvec<T, N> dif3 = swap<2>(a03 - a04);
   1168     w00                   = a00 + sum1 + sum2 + sum3;
   1169 
   1170     const cvec<T, N> s1 =
   1171         w00 + sum1 * tw7r1<T, N, inverse>() + sum2 * tw7r2<T, N, inverse>() + sum3 * tw7r3<T, N, inverse>();
   1172     const cvec<T, N> s2 =
   1173         w00 + sum1 * tw7r2<T, N, inverse>() + sum2 * tw7r3<T, N, inverse>() + sum3 * tw7r1<T, N, inverse>();
   1174     const cvec<T, N> s3 =
   1175         w00 + sum1 * tw7r3<T, N, inverse>() + sum2 * tw7r1<T, N, inverse>() + sum3 * tw7r2<T, N, inverse>();
   1176 
   1177     const cvec<T, N> d1 =
   1178         dif1 * tw7i1<T, N, inverse>() + dif2 * tw7i2<T, N, inverse>() + dif3 * tw7i3<T, N, inverse>();
   1179     const cvec<T, N> d2 =
   1180         dif1 * tw7i2<T, N, inverse>() - dif2 * tw7i3<T, N, inverse>() - dif3 * tw7i1<T, N, inverse>();
   1181     const cvec<T, N> d3 =
   1182         dif1 * tw7i3<T, N, inverse>() - dif2 * tw7i1<T, N, inverse>() + dif3 * tw7i2<T, N, inverse>();
   1183 
   1184     w01 = s1 + d1;
   1185     w06 = s1 - d1;
   1186     w02 = s2 + d2;
   1187     w05 = s2 - d2;
   1188     w03 = s3 + d3;
   1189     w04 = s3 - d3;
   1190 }
   1191 
   1192 template <size_t N, bool inverse = false, typename T>
   1193 KFR_INTRINSIC void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
   1194                               cvec<T, N>& a5, cvec<T, N>& a6)
   1195 {
   1196     butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6);
   1197 }
   1198 
   1199 template <typename T, size_t N, bool inverse>
   1200 static const cvec<T, N> tw11r1 = static_cast<T>(0.84125353283118116886181164892 - 1.0);
   1201 
   1202 template <typename T, size_t N, bool inverse>
   1203 static const cvec<T, N> tw11i1 =
   1204     static_cast<T>(0.54064081745559758210763595432) * twiddleimagmask<T, N, inverse>();
   1205 
   1206 template <typename T, size_t N, bool inverse>
   1207 static const cvec<T, N> tw11r2 = static_cast<T>(0.41541501300188642552927414923 - 1.0);
   1208 
   1209 template <typename T, size_t N, bool inverse>
   1210 static const cvec<T, N> tw11i2 =
   1211     static_cast<T>(0.90963199535451837141171538308) * twiddleimagmask<T, N, inverse>();
   1212 
   1213 template <typename T, size_t N, bool inverse>
   1214 static const cvec<T, N> tw11r3 = static_cast<T>(-0.14231483827328514044379266862 - 1.0);
   1215 
   1216 template <typename T, size_t N, bool inverse>
   1217 static const cvec<T, N> tw11i3 =
   1218     static_cast<T>(0.98982144188093273237609203778) * twiddleimagmask<T, N, inverse>();
   1219 
   1220 template <typename T, size_t N, bool inverse>
   1221 static const cvec<T, N> tw11r4 = static_cast<T>(-0.65486073394528506405692507247 - 1.0);
   1222 
   1223 template <typename T, size_t N, bool inverse>
   1224 static const cvec<T, N> tw11i4 =
   1225     static_cast<T>(0.75574957435425828377403584397) * twiddleimagmask<T, N, inverse>();
   1226 
   1227 template <typename T, size_t N, bool inverse>
   1228 static const cvec<T, N> tw11r5 = static_cast<T>(-0.95949297361449738989036805707 - 1.0);
   1229 
   1230 template <typename T, size_t N, bool inverse>
   1231 static const cvec<T, N> tw11i5 =
   1232     static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>();
   1233 
   1234 template <size_t N, bool inverse = false, typename T>
   1235 KFR_INTRINSIC void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
   1236                                cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09,
   1237                                cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
   1238                                cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06,
   1239                                cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10)
   1240 {
   1241     const cvec<T, N> sum1 = a01 + a10;
   1242     const cvec<T, N> dif1 = swap<2>(a01 - a10);
   1243     const cvec<T, N> sum2 = a02 + a09;
   1244     const cvec<T, N> dif2 = swap<2>(a02 - a09);
   1245     const cvec<T, N> sum3 = a03 + a08;
   1246     const cvec<T, N> dif3 = swap<2>(a03 - a08);
   1247     const cvec<T, N> sum4 = a04 + a07;
   1248     const cvec<T, N> dif4 = swap<2>(a04 - a07);
   1249     const cvec<T, N> sum5 = a05 + a06;
   1250     const cvec<T, N> dif5 = swap<2>(a05 - a06);
   1251     w00                   = a00 + sum1 + sum2 + sum3 + sum4 + sum5;
   1252 
   1253     const cvec<T, N> s1 = w00 + sum1 * tw11r1<T, N, inverse> + sum2 * tw11r2<T, N, inverse> +
   1254                           sum3 * tw11r3<T, N, inverse> + sum4 * tw11r4<T, N, inverse> +
   1255                           sum5 * tw11r5<T, N, inverse>;
   1256     const cvec<T, N> s2 = w00 + sum1 * tw11r2<T, N, inverse> + sum2 * tw11r3<T, N, inverse> +
   1257                           sum3 * tw11r4<T, N, inverse> + sum4 * tw11r5<T, N, inverse> +
   1258                           sum5 * tw11r1<T, N, inverse>;
   1259     const cvec<T, N> s3 = w00 + sum1 * tw11r3<T, N, inverse> + sum2 * tw11r4<T, N, inverse> +
   1260                           sum3 * tw11r5<T, N, inverse> + sum4 * tw11r1<T, N, inverse> +
   1261                           sum5 * tw11r2<T, N, inverse>;
   1262     const cvec<T, N> s4 = w00 + sum1 * tw11r4<T, N, inverse> + sum2 * tw11r5<T, N, inverse> +
   1263                           sum3 * tw11r1<T, N, inverse> + sum4 * tw11r2<T, N, inverse> +
   1264                           sum5 * tw11r3<T, N, inverse>;
   1265     const cvec<T, N> s5 = w00 + sum1 * tw11r5<T, N, inverse> + sum2 * tw11r1<T, N, inverse> +
   1266                           sum3 * tw11r2<T, N, inverse> + sum4 * tw11r3<T, N, inverse> +
   1267                           sum5 * tw11r4<T, N, inverse>;
   1268 
   1269     const cvec<T, N> d1 = dif1 * tw11i1<T, N, inverse> + dif2 * tw11i2<T, N, inverse> +
   1270                           dif3 * tw11i3<T, N, inverse> + dif4 * tw11i4<T, N, inverse> +
   1271                           dif5 * tw11i5<T, N, inverse>;
   1272     const cvec<T, N> d2 = dif1 * tw11i2<T, N, inverse> - dif2 * tw11i3<T, N, inverse> -
   1273                           dif3 * tw11i4<T, N, inverse> - dif4 * tw11i5<T, N, inverse> -
   1274                           dif5 * tw11i1<T, N, inverse>;
   1275     const cvec<T, N> d3 = dif1 * tw11i3<T, N, inverse> - dif2 * tw11i4<T, N, inverse> +
   1276                           dif3 * tw11i5<T, N, inverse> + dif4 * tw11i1<T, N, inverse> +
   1277                           dif5 * tw11i2<T, N, inverse>;
   1278     const cvec<T, N> d4 = dif1 * tw11i4<T, N, inverse> - dif2 * tw11i5<T, N, inverse> +
   1279                           dif3 * tw11i1<T, N, inverse> - dif4 * tw11i2<T, N, inverse> -
   1280                           dif5 * tw11i3<T, N, inverse>;
   1281     const cvec<T, N> d5 = dif1 * tw11i5<T, N, inverse> - dif2 * tw11i1<T, N, inverse> +
   1282                           dif3 * tw11i2<T, N, inverse> - dif4 * tw11i3<T, N, inverse> +
   1283                           dif5 * tw11i4<T, N, inverse>;
   1284 
   1285     w01 = s1 + d1;
   1286     w10 = s1 - d1;
   1287     w02 = s2 + d2;
   1288     w09 = s2 - d2;
   1289     w03 = s3 + d3;
   1290     w08 = s3 - d3;
   1291     w04 = s4 + d4;
   1292     w07 = s4 - d4;
   1293     w05 = s5 + d5;
   1294     w06 = s5 - d5;
   1295 }
   1296 
   1297 template <typename T, size_t N, bool inverse>
   1298 static constexpr KFR_INTRINSIC cvec<T, N> tw5r1()
   1299 {
   1300     return static_cast<T>(0.30901699437494742410229341718 - 1.0);
   1301 }
   1302 template <typename T, size_t N, bool inverse>
   1303 static constexpr KFR_INTRINSIC cvec<T, N> tw5i1()
   1304 {
   1305     return static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>();
   1306 }
   1307 template <typename T, size_t N, bool inverse>
   1308 static constexpr KFR_INTRINSIC cvec<T, N> tw5r2()
   1309 {
   1310     return static_cast<T>(-0.80901699437494742410229341718 - 1.0);
   1311 }
   1312 template <typename T, size_t N, bool inverse>
   1313 static constexpr KFR_INTRINSIC cvec<T, N> tw5i2()
   1314 {
   1315     return static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>();
   1316 }
   1317 
   1318 template <size_t N, bool inverse = false, typename T>
   1319 KFR_INTRINSIC void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
   1320                               const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01,
   1321                               cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04)
   1322 {
   1323     const cvec<T, N> sum1 = a01 + a04;
   1324     const cvec<T, N> dif1 = swap<2>(a01 - a04);
   1325     const cvec<T, N> sum2 = a02 + a03;
   1326     const cvec<T, N> dif2 = swap<2>(a02 - a03);
   1327     w00                   = a00 + sum1 + sum2;
   1328 
   1329     const cvec<T, N> s1 = w00 + sum1 * tw5r1<T, N, inverse>() + sum2 * tw5r2<T, N, inverse>();
   1330     const cvec<T, N> s2 = w00 + sum1 * tw5r2<T, N, inverse>() + sum2 * tw5r1<T, N, inverse>();
   1331 
   1332     const cvec<T, N> d1 = dif1 * tw5i1<T, N, inverse>() + dif2 * tw5i2<T, N, inverse>();
   1333     const cvec<T, N> d2 = dif1 * tw5i2<T, N, inverse>() - dif2 * tw5i1<T, N, inverse>();
   1334 
   1335     w01 = s1 + d1;
   1336     w04 = s1 - d1;
   1337     w02 = s2 + d2;
   1338     w03 = s2 - d2;
   1339 }
   1340 
   1341 template <size_t N, bool inverse = false, typename T>
   1342 KFR_INTRINSIC void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
   1343                                const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
   1344                                const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8,
   1345                                const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2,
   1346                                cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7,
   1347                                cvec<T, N>& w8, cvec<T, N>& w9)
   1348 {
   1349     cvec<T, N * 2> a05 = concat(a0, a5);
   1350     cvec<T, N * 2> a27 = concat(a2, a7);
   1351     cvec<T, N * 2> a49 = concat(a4, a9);
   1352     cvec<T, N * 2> a61 = concat(a6, a1);
   1353     cvec<T, N * 2> a83 = concat(a8, a3);
   1354     butterfly5<N * 2, inverse>(a05, a27, a49, a61, a83, a05, a27, a49, a61, a83);
   1355     cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
   1356     split(a05, t0, t1);
   1357     split(a27, t2, t3);
   1358     split(a49, t4, t5);
   1359     split(a61, t6, t7);
   1360     split(a83, t8, t9);
   1361     t5 = -t5;
   1362 
   1363     cvec<T, N * 2> t02, t13;
   1364     cvec<T, N * 2> w06, w51;
   1365     t02 = concat(t0, t2);
   1366     t13 = concat(t1, t3);
   1367     butterfly2<N * 2>(t02, t13, w06, w51);
   1368     split(w06, w0, w6);
   1369     split(w51, w5, w1);
   1370 
   1371     cvec<T, N * 2> t68, t79;
   1372     cvec<T, N * 2> w84, w39;
   1373     t68 = concat(t6, t8);
   1374     t79 = concat(t7, t9);
   1375     butterfly2<N * 2>(t68, t79, w84, w39);
   1376     split(w84, w8, w4);
   1377     split(w39, w3, w9);
   1378     butterfly2<N>(t4, t5, w7, w2);
   1379 }
   1380 
   1381 template <bool inverse, typename T, size_t N>
   1382 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0,
   1383                              vec<T, N>& out1)
   1384 {
   1385     butterfly2<N / 2>(in0, in1, out0, out1);
   1386 }
   1387 template <bool inverse, typename T, size_t N>
   1388 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1389                              const vec<T, N>& in2, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2)
   1390 {
   1391     butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2);
   1392 }
   1393 
   1394 template <bool inverse, typename T, size_t N>
   1395 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1396                              const vec<T, N>& in2, const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1,
   1397                              vec<T, N>& out2, vec<T, N>& out3)
   1398 {
   1399     butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3);
   1400 }
   1401 template <bool inverse, typename T, size_t N>
   1402 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1403                              const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
   1404                              vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
   1405                              vec<T, N>& out4)
   1406 {
   1407     butterfly5<N / 2, inverse>(in0, in1, in2, in3, in4, out0, out1, out2, out3, out4);
   1408 }
   1409 template <bool inverse, typename T, size_t N>
   1410 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1411                              const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
   1412                              const vec<T, N>& in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
   1413                              vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5)
   1414 {
   1415     butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5);
   1416 }
   1417 template <bool inverse, typename T, size_t N>
   1418 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1419                              const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
   1420                              const vec<T, N>& in5, const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1,
   1421                              vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
   1422                              vec<T, N>& out6)
   1423 {
   1424     butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6);
   1425 }
   1426 template <bool inverse, typename T, size_t N>
   1427 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1428                              const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
   1429                              const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
   1430                              vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
   1431                              vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7)
   1432 {
   1433     butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5,
   1434                                out6, out7);
   1435 }
   1436 template <bool inverse, typename T, size_t N>
   1437 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1438                              const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
   1439                              const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
   1440                              const vec<T, N>& in8, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
   1441                              vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6,
   1442                              vec<T, N>& out7, vec<T, N>& out8)
   1443 {
   1444     butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, out2, out3, out4,
   1445                                out5, out6, out7, out8);
   1446 }
   1447 template <bool inverse, typename T, size_t N>
   1448 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1449                              const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
   1450                              const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
   1451                              const vec<T, N>& in8, const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1,
   1452                              vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
   1453                              vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9)
   1454 {
   1455     butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3,
   1456                                 out4, out5, out6, out7, out8, out9);
   1457 }
   1458 template <bool inverse, typename T, size_t N>
   1459 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
   1460                              const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
   1461                              const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
   1462                              const vec<T, N>& in8, const vec<T, N>& in9, const vec<T, N>& in10,
   1463                              vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
   1464                              vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7,
   1465                              vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10)
   1466 {
   1467     butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2,
   1468                                 out3, out4, out5, out6, out7, out8, out9, out10);
   1469 }
   1470 template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
   1471 KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
   1472 {
   1473     vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr));
   1474     if constexpr (transposed)
   1475         temp = ctranspose<sizeof...(N)>(temp);
   1476     split(temp, w...);
   1477 }
   1478 
   1479 // Warning: Reads past the end. Use with care
   1480 KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0,
   1481                                     cvec<f32, 4>& w1, cvec<f32, 4>& w2)
   1482 {
   1483     cvec<f32, 4> w3;
   1484     cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 3), cread<4>(ptr + 6), cread<4>(ptr + 9));
   1485     v16               = digitreverse4<2>(v16);
   1486     split<f32, 32>(v16, w0, w1, w2, w3);
   1487 }
   1488 
   1489 KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0,
   1490                                     cvec<f32, 4>& w1, cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4)
   1491 {
   1492     cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 5), cread<4>(ptr + 10), cread<4>(ptr + 15));
   1493     v16               = digitreverse4<2>(v16);
   1494     split<f32, 32>(v16, w0, w1, w2, w3);
   1495     w4 = cgather<4, 5>(ptr + 4);
   1496 }
   1497 
   1498 template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
   1499 KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args)
   1500 {
   1501     auto temp = concat(args...);
   1502     if constexpr (transposed)
   1503         temp = ctransposeinverse<sizeof...(N)>(temp);
   1504     write(ptr_cast<T>(ptr), temp);
   1505 }
   1506 
   1507 template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2>
   1508 KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle)
   1509 {
   1510     return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1)));
   1511 }
   1512 template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2>
   1513 KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle)
   1514 {
   1515     return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1)));
   1516 }
   1517 
   1518 // Non-final
   1519 template <typename T, size_t width, size_t radix, bool inverse, size_t... I>
   1520 KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>,
   1521                                     cbool_t<inverse>, complex<T>* out, const complex<T>* in,
   1522                                     const complex<T>* tw, size_t stride)
   1523 {
   1524     carray<cvec<T, width>, radix> inout;
   1525 
   1526     swallow{ (inout.get(csize_t<I>()) = cread<width>(in + i + stride * I))... };
   1527 
   1528     butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...);
   1529 
   1530     swallow{ (
   1531         cwrite<width>(out + i + stride * I,
   1532                       mul_tw<I, radix>(cbool_t<inverse>(), inout.template get<I>(), tw + i * (radix - 1))),
   1533         0)... };
   1534 }
   1535 
   1536 // Final
   1537 template <typename T, size_t width, size_t radix, bool inverse, size_t... I>
   1538 KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>,
   1539                                     cbool_t<inverse>, complex<T>* out, const complex<T>* in, size_t stride)
   1540 {
   1541     carray<cvec<T, width>, radix> inout;
   1542 
   1543     //        swallow{ ( inout.get( csize<I> ) = infn( i, I, cvec<T, width>( ) ) )... };
   1544     cread_transposed(ctrue, in + i * radix, inout.template get<I>()...);
   1545 
   1546     butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...);
   1547 
   1548     swallow{ (cwrite<width>(out + i + stride * I, inout.get(csize_t<I>())), 0)... };
   1549 }
   1550 
   1551 template <size_t width, size_t radix, typename... Args>
   1552 KFR_INTRINSIC void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args)
   1553 {
   1554     butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...);
   1555 }
   1556 
   1557 template <typename... Args>
   1558 KFR_INTRINSIC void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...)
   1559 {
   1560 }
   1561 template <size_t width, typename... Args>
   1562 KFR_INTRINSIC void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args)
   1563 {
   1564     CMT_LOOP_NOUNROLL
   1565     for (; i < count / width * width; i += width)
   1566         butterfly(i, csize_t<width>(), std::forward<Args>(args)...);
   1567     butterfly_cycle(i, count, csize_t<width / 2>(), std::forward<Args>(args)...);
   1568 }
   1569 
   1570 template <size_t width, typename... Args>
   1571 KFR_INTRINSIC void butterflies(size_t count, csize_t<width>, Args&&... args)
   1572 {
   1573     CMT_ASSUME(count > 0);
   1574     size_t i = 0;
   1575     butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...);
   1576 }
   1577 
   1578 template <typename T, bool inverse, typename Tradix, typename Tstride>
   1579 KFR_INTRINSIC void generic_butterfly_cycle(csize_t<0>, Tradix, cbool_t<inverse>, complex<T>*,
   1580                                            const complex<T>*, Tstride, size_t, size_t, const complex<T>*,
   1581                                            size_t)
   1582 {
   1583 }
   1584 
   1585 template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix,
   1586           typename Thalfradixsqr, typename Tstride>
   1587 KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out,
   1588                                            const complex<T>* in, Tstride ostride, Thalfradix halfradix,
   1589                                            Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i)
   1590 {
   1591     CMT_LOOP_NOUNROLL
   1592     for (; i < halfradix / width * width; i += width)
   1593     {
   1594         const cvec<T, 1> in0 = cread<1>(in);
   1595         cvec<T, width> sum0  = resize<2 * width>(in0);
   1596         cvec<T, width> sum1  = sum0;
   1597 
   1598         for (size_t j = 0; j < halfradix; j++)
   1599         {
   1600             const cvec<T, 1> ina = cread<1>(in + (1 + j));
   1601             const cvec<T, 1> inb = cread<1>(in + radix - (j + 1));
   1602             cvec<T, width> tw    = cread<width>(twiddle);
   1603             if constexpr (inverse)
   1604                 tw = negodd /*cconj*/ (tw);
   1605 
   1606             cmul_2conj(sum0, sum1, ina, inb, tw);
   1607             twiddle += halfradix;
   1608         }
   1609         twiddle = twiddle - halfradix_sqr + width;
   1610 
   1611         if (is_constant_val(ostride))
   1612         {
   1613             cwrite<width>(out + (1 + i), sum0);
   1614             cwrite<width>(out + (radix - (i + 1)) - (width - 1), reverse<2>(sum1));
   1615         }
   1616         else
   1617         {
   1618             cscatter<width>(out + (i + 1) * ostride, ostride, sum0);
   1619             cscatter<width>(out + (radix - (i + 1)) * ostride - (width - 1) * ostride, ostride,
   1620                             reverse<2>(sum1));
   1621         }
   1622     }
   1623     generic_butterfly_cycle(csize_t<width / 2>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix,
   1624                             halfradix_sqr, twiddle, i);
   1625 }
   1626 
   1627 template <typename T>
   1628 KFR_INTRINSIC vec<T, 2> hcadd(vec<T, 2> value)
   1629 {
   1630     return value;
   1631 }
   1632 template <typename T, size_t N, KFR_ENABLE_IF(N >= 4)>
   1633 KFR_INTRINSIC vec<T, 2> hcadd(vec<T, N> value)
   1634 {
   1635     return hcadd(low(value) + high(value));
   1636 }
   1637 
   1638 template <size_t width, typename T, bool inverse, typename Tstride = csize_t<1>>
   1639 KFR_INTRINSIC void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
   1640                                        const complex<T>* twiddle, Tstride ostride = Tstride{})
   1641 {
   1642     CMT_ASSUME(radix > 0);
   1643     {
   1644         cvec<T, width> sum = T();
   1645         size_t j           = 0;
   1646         CMT_LOOP_NOUNROLL
   1647         for (; j < radix / width * width; j += width)
   1648         {
   1649             sum += cread<width>(in + j);
   1650         }
   1651         cvec<T, 1> sums = T();
   1652         CMT_LOOP_NOUNROLL
   1653         for (; j < radix; j++)
   1654         {
   1655             sums += cread<1>(in + j);
   1656         }
   1657         cwrite<1>(out, hcadd(sum) + sums);
   1658     }
   1659     const auto halfradix = radix / 2;
   1660     CMT_ASSUME(halfradix > 0);
   1661     size_t i = 0;
   1662 
   1663     generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix,
   1664                             halfradix * halfradix, twiddle, i);
   1665 }
   1666 
   1667 template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>>
   1668 KFR_INTRINSIC void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out,
   1669                                             const complex<T>* in, const complex<T>* twiddle,
   1670                                             Tstride ostride = Tstride{})
   1671 {
   1672     {
   1673         cvec<T, width> sum = T();
   1674         size_t j           = 0;
   1675         CMT_LOOP_UNROLL
   1676         for (; j < radix / width * width; j += width)
   1677         {
   1678             sum += cread<width>(in + j);
   1679         }
   1680         cvec<T, 1> sums = T();
   1681         CMT_LOOP_UNROLL
   1682         for (; j < radix; j++)
   1683         {
   1684             sums += cread<1>(in + j);
   1685         }
   1686         cwrite<1>(out, hcadd(sum) + sums);
   1687     }
   1688     const size_t halfradix     = radix / 2;
   1689     const size_t halfradix_sqr = halfradix * halfradix;
   1690     CMT_ASSUME(halfradix > 0);
   1691     size_t i = 0;
   1692 
   1693     generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix,
   1694                             halfradix_sqr, twiddle, i);
   1695 }
   1696 
   1697 template <typename T, bool inverse, typename Tstride = csize_t<1>>
   1698 KFR_INTRINSIC void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
   1699                                      complex<T>*, const complex<T>* twiddle, Tstride ostride = {})
   1700 {
   1701     cswitch(
   1702         csizes_t<11, 13>(), radix,
   1703         [&](auto radix_) CMT_INLINE_LAMBDA
   1704         {
   1705             constexpr size_t width = vector_width<T>;
   1706             spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride);
   1707         },
   1708         [&]() CMT_INLINE_LAMBDA
   1709         {
   1710             constexpr size_t width = vector_width<T>;
   1711             generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride);
   1712         });
   1713 }
   1714 
   1715 template <typename T, size_t N>
   1716 constexpr cvec<T, N> cmask08 = broadcast<N * 2, T>(T(), -T());
   1717 
   1718 template <typename T, size_t N>
   1719 constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T());
   1720 
   1721 template <bool A = false, typename T, size_t N>
   1722 KFR_INTRINSIC void cbitreverse_write(complex<T>* dest, const vec<T, N>& x)
   1723 {
   1724     cwrite<N / 2, A>(dest, bitreverse<2>(x));
   1725 }
   1726 
   1727 template <bool A = false, typename T, size_t N>
   1728 KFR_INTRINSIC void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x)
   1729 {
   1730     cwrite<N / 2, A>(dest, digitreverse4<2>(x));
   1731 }
   1732 
   1733 template <size_t N, bool A = false, typename T>
   1734 KFR_INTRINSIC cvec<T, N> cbitreverse_read(const complex<T>* src)
   1735 {
   1736     return bitreverse<2>(cread<N, A>(src));
   1737 }
   1738 
   1739 template <size_t N, bool A = false, typename T>
   1740 KFR_INTRINSIC cvec<T, N> cdigitreverse4_read(const complex<T>* src)
   1741 {
   1742     return digitreverse4<2>(cread<N, A>(src));
   1743 }
   1744 
   1745 #if 1
   1746 
   1747 template <>
   1748 KFR_INTRINSIC cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src)
   1749 {
   1750     return concat(cread<1>(src + 0), cread<1>(src + 4), cread<1>(src + 8), cread<1>(src + 12),
   1751                   cread<1>(src + 1), cread<1>(src + 5), cread<1>(src + 9), cread<1>(src + 13),
   1752                   cread<1>(src + 2), cread<1>(src + 6), cread<1>(src + 10), cread<1>(src + 14),
   1753                   cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15));
   1754 }
   1755 template <>
   1756 KFR_INTRINSIC void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x)
   1757 {
   1758     cwrite<1>(dest, part<16, 0>(x));
   1759     cwrite<1>(dest + 4, part<16, 1>(x));
   1760     cwrite<1>(dest + 8, part<16, 2>(x));
   1761     cwrite<1>(dest + 12, part<16, 3>(x));
   1762 
   1763     cwrite<1>(dest + 1, part<16, 4>(x));
   1764     cwrite<1>(dest + 5, part<16, 5>(x));
   1765     cwrite<1>(dest + 9, part<16, 6>(x));
   1766     cwrite<1>(dest + 13, part<16, 7>(x));
   1767 
   1768     cwrite<1>(dest + 2, part<16, 8>(x));
   1769     cwrite<1>(dest + 6, part<16, 9>(x));
   1770     cwrite<1>(dest + 10, part<16, 10>(x));
   1771     cwrite<1>(dest + 14, part<16, 11>(x));
   1772 
   1773     cwrite<1>(dest + 3, part<16, 12>(x));
   1774     cwrite<1>(dest + 7, part<16, 13>(x));
   1775     cwrite<1>(dest + 11, part<16, 14>(x));
   1776     cwrite<1>(dest + 15, part<16, 15>(x));
   1777 }
   1778 #endif
   1779 } // namespace intrinsics
   1780 } // namespace CMT_ARCH_NAME
   1781 } // namespace kfr
   1782 
   1783 CMT_PRAGMA_MSVC(warning(pop))
   1784 
   1785 CMT_PRAGMA_GNU(GCC diagnostic pop)