kfr

Fast, modern C++ DSP framework: FFT, sample rate conversion, FIR/IIR/biquad filters (SSE, AVX, AVX-512, ARM NEON)

fft-impl.hpp (80406B)
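
For orientation: the stages in this file are normally reached through KFR's public DFT interface rather than used directly. A minimal sketch of that documented usage (kfr::dft_plan; the size 1024 and the buffer names are just for illustration):

#include <kfr/dft.hpp>

using namespace kfr;

int main()
{
    // Plan once; execute() dispatches into the stage chain built from this file.
    dft_plan<float> plan(1024);
    univector<complex<float>> in(1024), out(1024);
    univector<u8> temp(plan.temp_size); // scratch buffer required by some stages
    plan.execute(out, in, temp);        // forward FFT
    plan.execute(in, out, temp, true);  // inverse FFT (unscaled)
}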


/** @addtogroup dft
 *  @{
 */
/*
  Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 2 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "dft-fft.hpp"

CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wshadow")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
#endif
#if CMT_HAS_WARNING("-Wunused-lambda-capture")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture")
#endif
#if CMT_HAS_WARNING("-Wpass-failed")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed")
#endif

CMT_PRAGMA_MSVC(warning(push))
CMT_PRAGMA_MSVC(warning(disable : 4100))

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

constexpr bool inline always_br2 = true;

template <typename T>
inline std::bitset<DFT_MAX_STAGES> fft_algorithm_selection;

template <>
inline std::bitset<DFT_MAX_STAGES> fft_algorithm_selection<float>{
#ifdef CMT_ARCH_NEON
    0
#else
    (1ull << 15) - 1
#endif
};

template <>
inline std::bitset<DFT_MAX_STAGES> fft_algorithm_selection<double>{ 0 };

template <typename T>
inline bool use_autosort(size_t log2n)
{
    return fft_algorithm_selection<T>[log2n];
}
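
// Note: each bit of fft_algorithm_selection<T> selects, for transforms of
// size 2^log2n, the self-sorting (autosort) FFT instead of the bit-reversal
// variant; with the values above, float transforms up to 2^14 use autosort
// on x86, while NEON builds and double default to the bit-reversal path.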

#ifndef CMT_ARCH_NEON
#define KFR_AUTOSORT_FOR_2048
#define KFR_AUTOSORT_FOR_128D
#define KFR_AUTOSORT_FOR_256D
#define KFR_AUTOSORT_FOR_512
#define KFR_AUTOSORT_FOR_1024
#endif

#ifdef CMT_ARCH_AVX
template <>
KFR_INTRINSIC vec<float, 32> ctranspose<4, float, 32>(const vec<float, 32>& v16)
{
    cvec<float, 4> r0, r1, r2, r3;
    split(v16, r0, r1, r2, r3);
    const __m256d t0 = _mm256_unpacklo_pd(_mm256_castps_pd(r0.v), _mm256_castps_pd(r1.v));
    const __m256d t1 = _mm256_unpacklo_pd(_mm256_castps_pd(r2.v), _mm256_castps_pd(r3.v));
    const __m256d t2 = _mm256_unpackhi_pd(_mm256_castps_pd(r0.v), _mm256_castps_pd(r1.v));
    const __m256d t3 = _mm256_unpackhi_pd(_mm256_castps_pd(r2.v), _mm256_castps_pd(r3.v));
    r0.v             = _mm256_castpd_ps(_mm256_permute2f128_pd(t0, t1, 0x20));
    r1.v             = _mm256_castpd_ps(_mm256_permute2f128_pd(t2, t3, 0x20));
    r2.v             = _mm256_castpd_ps(_mm256_permute2f128_pd(t0, t1, 0x31));
    r3.v             = _mm256_castpd_ps(_mm256_permute2f128_pd(t2, t3, 0x31));
    return concat(r0, r1, r2, r3);
}
#endif
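
// Note: the two specializations below build the wider complex transposes used
// by the autosort passes from 4x4 complex transposes of the vector's halves,
// recombining the results in the required interleaved order.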

template <>
KFR_INTRINSIC vec<float, 64> ctranspose<8, float, 64>(const vec<float, 64>& v32)
{
    cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
    split(v32, a0, a1, a2, a3, a4, a5, a6, a7);
    cvec<float, 16> even = concat(a0, a2, a4, a6);
    cvec<float, 16> odd  = concat(a1, a3, a5, a7);
    even                 = ctranspose<4>(even);
    odd                  = ctranspose<4>(odd);
    return concat(even, odd);
}

template <>
KFR_INTRINSIC vec<float, 64> ctranspose<4, float, 64>(const vec<float, 64>& v32)
{
    cvec<float, 16> lo, hi;
    split(v32, lo, hi);
    lo = ctranspose<4>(lo);
    hi = ctranspose<4>(hi);
    cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
    split(lo, a0, a1, a2, a3);
    split(hi, a4, a5, a6, a7);
    return concat(a0, a4, a1, a5, a2, a6, a3, a7);
}

namespace intrinsics
{

template <size_t width, bool inverse, typename T>
KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>,
                                                  const cvec<T, width>& w, const cvec<T, width>& tw)
{
    cvec<T, width> ww  = w;
    cvec<T, width> tw_ = tw;
    cvec<T, width> b1  = ww * dupeven(tw_);
    ww                 = swap<2>(ww);

    if constexpr (inverse)
    {
        ww = addsub(b1, ww * dupodd(tw_));
    }
    else
    {
        ww = subadd(b1, ww * dupodd(tw_));
    }
    return ww;
}
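
// Note: interleaved (re, im) twiddle multiply: dupeven/dupodd broadcast the
// real/imaginary parts of tw, so the subadd/addsub pair above computes
// w * tw for the forward transform and w * conj(tw) for the inverse.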

template <size_t width, bool use_br2, bool inverse, bool aligned, typename T>
KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfalse_t, cbool_t<use_br2>,
                               cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in,
                               const complex<T>* twiddle)
{
    const size_t N4 = N / 4;
    cvec<T, width> w1, w2, w3;

    cvec<T, width> sum02, sum13, diff02, diff13;

    cvec<T, width> a0, a1, a2, a3;
    a0    = cread<width, aligned>(in + 0);
    a2    = cread<width, aligned>(in + N4 * 2);
    sum02 = a0 + a2;

    a1    = cread<width, aligned>(in + N4);
    a3    = cread<width, aligned>(in + N4 * 3);
    sum13 = a1 + a3;

    cwrite<width, aligned>(out, sum02 + sum13);
    w2 = sum02 - sum13;
    cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2),
                           radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2,
                                                cread<width, true>(twiddle + width)));
    diff02 = a0 - a2;
    diff13 = a1 - a3;
    if constexpr (inverse)
    {
        diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T()));
        diff13 = swap<2>(diff13);
    }
    else
    {
        diff13 = swap<2>(diff13);
        diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T()));
    }

    w1 = diff02 + diff13;

    cwrite<width, aligned>(out + N4 * (use_br2 ? 2 : 1),
                           radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1,
                                                cread<width, true>(twiddle + 0)));
    w3 = diff02 - diff13;
    cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(),
                                                              w3, cread<width, true>(twiddle + width * 2)));
}

template <size_t width, bool inverse, typename T>
KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>,
                                                  const cvec<T, width>& w, const cvec<T, width>& tw)
{
    vec<T, width> re1, im1, twre, twim;
    split<T, 2 * width>(w, re1, im1);
    split<T, 2 * width>(tw, twre, twim);

    const vec<T, width> b1re = re1 * twre;
    const vec<T, width> b1im = im1 * twre;
    if constexpr (inverse)
        return concat(b1re + im1 * twim, b1im - re1 * twim);
    else
        return concat(b1re - im1 * twim, b1im + re1 * twim);
}
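
// Note: split-format variant of the twiddle multiply; with the real and
// imaginary halves stored separately, the complex product reduces to four
// real multiplies plus two add/sub vectors, conjugating tw for the inverse.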

template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T>
KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout>, cbool_t<splitin>,
                               cbool_t<use_br2>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out,
                               const complex<T>* in, const complex<T>* twiddle)
{
    const size_t N4 = N / 4;
    cvec<T, width> w1, w2, w3;
    constexpr bool read_split  = !splitin;
    constexpr bool write_split = !splitout;

    vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3;

    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0);
    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2);
    const vec<T, width> sum02re = re0 + re2;
    const vec<T, width> sum02im = im0 + im2;

    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1);
    split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3);
    const vec<T, width> sum13re = re1 + re3;
    const vec<T, width> sum13im = im1 + im3;

    cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im));
    w2 = concat(sum02re - sum13re, sum02im - sum13im);
    cwrite_split<width, aligned, write_split>(
        out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2,
                                                           cread<width, true>(twiddle + width)));

    const vec<T, width> diff02re = re0 - re2;
    const vec<T, width> diff02im = im0 - im2;
    const vec<T, width> diff13re = re1 - re3;
    const vec<T, width> diff13im = im1 - im3;

    (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
    (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);

    cwrite_split<width, aligned, write_split>(
        out + N4 * (use_br2 ? 2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1,
                                                           cread<width, true>(twiddle + 0)));
    cwrite_split<width, aligned, write_split>(
        out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3,
                                           cread<width, true>(twiddle + width * 2)));
}

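// Returns the twiddle factor w = exp(-2*pi*i*n/size) as an interleaved
// (re, im) pair; the four quadrant boundaries are returned exactly rather
// than via cos/sin to avoid rounding error at those points.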
template <typename T>
CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size)
{
    if (n == 0)
    {
        return make_vector(static_cast<T>(1), static_cast<T>(0));
    }
    else if (n == size / 4)
    {
        return make_vector(static_cast<T>(0), static_cast<T>(-1));
    }
    else if (n == size / 2)
    {
        return make_vector(static_cast<T>(-1), static_cast<T>(0));
    }
    else if (n == size * 3 / 4)
    {
        return make_vector(static_cast<T>(0), static_cast<T>(1));
    }
    else
    {
        fbase kth  = c_pi<fbase, 2> * (n / static_cast<fbase>(size));
        fbase tcos = +kfr::cos(kth);
        fbase tsin = -kfr::sin(kth);
        return make_vector(static_cast<T>(tcos), static_cast<T>(tsin));
    }
}

template <typename T, size_t width>
KFR_INTRINSIC void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_t nnstep, size_t size,
                                            bool split_format)
{
    static_assert(width > 0, "width cannot be zero");
    vec<T, 2 * width> result = T();
    CMT_LOOP_UNROLL
    for (size_t i = 0; i < width; i++)
    {
        const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size);
        result[i * 2]      = r[0];
        result[i * 2 + 1]  = r[1];
    }
    if (split_format)
        ref_cast<cvec<T, width>>(twiddle[0]) = splitpairs(result);
    else
        ref_cast<cvec<T, width>>(twiddle[0]) = result;
    twiddle += width;
}

template <typename T, size_t width>
CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format)
{
    static_assert(width > 0, "width cannot be zero");
    const size_t count = stage_size / 4;
    size_t nnstep      = size / stage_size;
    DFT_ASSERT(width <= count);
    CMT_LOOP_NOUNROLL
    for (size_t n = 0; n < count; n += width)
    {
        initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format);
        initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 2, nnstep * 2, size, split_format);
        initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 3, nnstep * 3, size, split_format);
    }
}
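
// Note: for each group of `width` radix-4 butterflies the table is laid out
// as three width-wide blocks holding w^1, w^2 and w^3, matching the
// twiddle + 0 / twiddle + width / twiddle + width * 2 reads in radix4_body.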

#ifdef KFR_NO_PREFETCH
#define KFR_PREFETCH(addr)                                                                                   \
    do                                                                                                       \
    {                                                                                                        \
        (void)(addr);                                                                                        \
    } while (0)
#else

#if defined CMT_ARCH_SSE
#ifdef CMT_COMPILER_GNU
#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0);
#else
#define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0);
#endif
#else
#define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr));
#endif
#endif
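
// Note: KFR_PREFETCH is a best-effort cache hint: a no-op when
// KFR_NO_PREFETCH is defined, __builtin_prefetch on GCC-compatible
// compilers, and _mm_prefetch otherwise on SSE targets.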

template <size_t size = 1, typename T>
KFR_INTRINSIC void prefetch_one(const complex<T>* in)
{
    KFR_PREFETCH(in);
    if constexpr (sizeof(complex<T>) * size > 64)
        KFR_PREFETCH(in + 64);
    if constexpr (sizeof(complex<T>) * size > 128)
        KFR_PREFETCH(in + 128);
    if constexpr (sizeof(complex<T>) * size > 192)
        KFR_PREFETCH(in + 192);
}

template <size_t size = 1, typename T>
KFR_INTRINSIC void prefetch_four(size_t stride, const complex<T>* in)
{
    prefetch_one<size>(in);
    prefetch_one<size>(in + stride);
    prefetch_one<size>(in + stride * 2);
    prefetch_one<size>(in + stride * 3);
}

template <size_t size = 1, typename T>
KFR_INTRINSIC void prefetch_eight(size_t stride, const complex<T>* in)
{
    prefetch_one<size>(in);
    prefetch_one<size>(in + stride);
    prefetch_one<size>(in + stride * 2);
    prefetch_one<size>(in + stride * 3);
    prefetch_one<size>(in + stride * 4);
    prefetch_one<size>(in + stride * 5);
    prefetch_one<size>(in + stride * 6);
    prefetch_one<size>(in + stride * 7);
}

template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2,
          bool inverse, bool aligned, typename T>
KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t<splitout>,
                                   cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>,
                                   cbool_t<aligned>, complex<T>* out, const complex<T>* in,
                                   const complex<T>*& twiddle)
{
    static_assert(width > 0, "width cannot be zero");
    constexpr static size_t prefetch_cycles = 8;
    const auto N4                           = N / csize_t<4>();
    const auto N43                          = N4 * csize_t<3>();
    CMT_ASSUME(blocks > 0);
    CMT_ASSUME(N > 0);
    CMT_ASSUME(N4 > 0);
    DFT_ASSERT(width <= N4);
    CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++)
    {
        CMT_LOOP_NOUNROLL
        for (size_t n2 = 0; n2 < N4;)
        {
            if constexpr (prefetch)
                prefetch_four<width>(N4, in + width * prefetch_cycles);
            radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(),
                        cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out,
                        in, twiddle + n2 * 3);
            in += width;
            out += width;
            n2 += width;
        }
        in += N43;
        out += N43;
    }
    twiddle += N43;
    return {};
}
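
// Note: general radix-4 pass over `blocks` contiguous blocks of N elements;
// `width` butterflies are processed per iteration, and the shared twiddle
// pointer advances by 3*N/4 entries so the next pass reads its own table.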

template <size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, bool inverse, typename T>
KFR_INTRINSIC void radix2_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>,
                                        cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>,
                                        cbool_t<inverse>, complex<T>* out, const complex<T>* in,
                                        const complex<T>*& twiddle)

{
    for (size_t n = 0; n < stride; n++)
    {
        const cvec<T, 1> a = cread<1>(in + n);
        const cvec<T, 1> b = cread<1>(in + n + stride);
        cwrite<1>(out + n, a + b);
        cwrite<1>(out + n + stride, a - b);
    }
}

template <size_t N, bool inverse, bool split_format, typename T>
KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3,
                                    cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3)
{
    if constexpr (split_format)
    {
        const cvec<T, N> sum02  = a0 + a2;
        const cvec<T, N> diff02 = a0 - a2;
        const cvec<T, N> sum13  = a1 + a3;
        cvec<T, N> diff13       = a1 - a3;
        vec<T, N> diff13re, diff13im, diff02re, diff02im;
        split(diff02, diff02re, diff02im);
        split(diff13, diff13re, diff13im);
        w0 = sum02 + sum13;
        w2 = sum02 - sum13;

        (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
        (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);
    }
    else
    {
        const cvec<T, N> sum02  = a0 + a2;
        const cvec<T, N> diff02 = a0 - a2;
        const cvec<T, N> sum13  = a1 + a3;
        cvec<T, N> diff13       = swap<2>(a1 - a3);
        if constexpr (inverse)
            diff13 = negodd(diff13);
        else
            diff13 = negeven(diff13);
        w0 = sum02 + sum13;
        w2 = sum02 - sum13;
        w1 = diff02 - diff13;
        w3 = diff02 + diff13;
    }
}
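
// Note: textbook radix-4 DIT kernel. For the forward transform:
//   w0 = (a0 + a2) + (a1 + a3)
//   w1 = (a0 - a2) - i*(a1 - a3)
//   w2 = (a0 + a2) - (a1 + a3)
//   w3 = (a0 - a2) + i*(a1 - a3)
// with the sign of i flipped for the inverse transform.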

template <size_t N, bool inverse, bool split_format, typename T>
KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3,
                                    cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3,
                                    cvec<T, N> tw1, cvec<T, N> tw2, cvec<T, N> tw3)
{
    if constexpr (split_format)
    {
        const cvec<T, N> sum02  = a0 + a2;
        const cvec<T, N> diff02 = a0 - a2;
        const cvec<T, N> sum13  = a1 + a3;
        cvec<T, N> diff13       = a1 - a3;
        vec<T, N> diff13re, diff13im, diff02re, diff02im;
        split(diff02, diff02re, diff02im);
        split(diff13, diff13re, diff13im);

        w0                  = sum02 + sum13;
        w2                  = sum02 - sum13;
        (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re);
        (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re);

        w2 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w2, tw2);
        w1 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w1, tw1);
        w3 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w3, tw3);
    }
    else
    {
        const cvec<T, N> sum02  = a0 + a2;
        const cvec<T, N> diff02 = a0 - a2;
        const cvec<T, N> sum13  = a1 + a3;
        cvec<T, N> diff13       = swap<2>(a1 - a3);
        if constexpr (inverse)
            diff13 = negodd(diff13);
        else
            diff13 = negeven(diff13);

        w0 = sum02 + sum13;
        w2 = sum02 - sum13;
        w1 = diff02 - diff13;
        w3 = diff02 + diff13;
        w2 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w2, tw2);
        w1 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w1, tw1);
        w3 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w3, tw3);
    }
}

template <size_t N, bool split_format, typename T>
KFR_INTRINSIC cvec<T, N> read_twiddle(const complex<T>* tw)
{
    if constexpr (split_format)
    {
        return concat(repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw))),
                      repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw) + 1)));
    }
    else
    {
        return repeat<N>(cread<1>(tw));
    }
}

template <size_t width_, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
KFR_INTRINSIC void radix4_autosort_pass_first(size_t N, csize_t<width_>, cbool_t<splitout>, cbool_t<splitin>,
                                              cbool_t<prefetch>, cbool_t<inverse>, complex<T>* out,
                                              const complex<T>* in, const complex<T>*& twiddle)
{
    static_assert(width_ > 0, "width cannot be zero");
    const size_t N4        = N / 4;
    const size_t Nstride   = N4;
    constexpr size_t width = width_;
    constexpr bool split   = splitin || splitout;
    static_assert(!split);
    constexpr static size_t prefetch_cycles = 8;

    // CMT_LOOP_NOUNROLL
    for (size_t b = 0; b < N4; b += width)
    {
        if constexpr (prefetch)
            prefetch_four<width>(Nstride, in + prefetch_cycles * width);
        cvec<T, width> tw1 = cread<width>(twiddle);
        cvec<T, width> tw2 = cread<width>(twiddle + width);
        cvec<T, width> tw3 = cread<width>(twiddle + 2 * width);

        const cvec<T, width> a0 = cread<width>(in + 0 * Nstride);
        const cvec<T, width> a1 = cread<width>(in + 1 * Nstride);
        const cvec<T, width> a2 = cread<width>(in + 2 * Nstride);
        const cvec<T, width> a3 = cread<width>(in + 3 * Nstride);
        cvec<T, width> w0, w1, w2, w3;
        radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3);
        cvec<T, 4 * width> w0123 = concat(w0, w1, w2, w3);
        w0123                    = ctranspose<width>(w0123);
        cwrite<4 * width>(out, w0123);
        twiddle += 3 * width;
        in += width;
        out += 4 * width;
    }
}
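
// Note: first autosort pass; the four quarters of the input are read at
// stride N/4, passed through the twiddled butterfly, and each group of
// 4*width results is transposed before the contiguous store.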

template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
KFR_INTRINSIC void radix4_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>,
                                             cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
                                             complex<T>* out, const complex<T>* in,
                                             const complex<T>*& twiddle)
{
    static_assert(width > 0, "width cannot be zero");
    constexpr static size_t prefetch_cycles = 8;
    constexpr bool split                    = splitin || splitout;
    constexpr bool read_split               = !splitin && split;
    constexpr bool write_split              = !splitout && split;
    static_assert(!splitout);

    CMT_PRAGMA_CLANG(clang loop unroll_count(4))
    for (size_t n = 0; n < stride; n += width)
    {
        if constexpr (prefetch)
            prefetch_four<width>(stride, in + prefetch_cycles * width);
        const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride);
        const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride);
        const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride);
        const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride);
        cvec<T, width> w0, w1, w2, w3;
        radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3);
        cwrite_split<width, false, write_split>(out + 0 * stride, w0);
        cwrite_split<width, false, write_split>(out + 1 * stride, w1);
        cwrite_split<width, false, write_split>(out + 2 * stride, w2);
        cwrite_split<width, false, write_split>(out + 3 * stride, w3);
        in += width;
        out += width;
    }
}

template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
KFR_INTRINSIC void radix8_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>,
                                             cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
                                             complex<T>* out, const complex<T>* in,
                                             const complex<T>*& twiddle)
{
    static_assert(width > 0, "width cannot be zero");
    constexpr static size_t prefetch_cycles = 4;
    constexpr bool split                    = splitin || splitout;
    constexpr bool read_split               = !splitin && split;
    constexpr bool write_split              = !splitout && split;
    static_assert(!splitout);

    CMT_PRAGMA_CLANG(clang loop unroll_count(4))
    for (size_t n = 0; n < stride; n += width)
    {
        if constexpr (prefetch)
            prefetch_eight<width>(stride, in + prefetch_cycles * width);
        const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride);
        const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride);
        const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride);
        const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride);
        const cvec<T, width> a4 = cread_split<width, false, read_split>(in + 4 * stride);
        const cvec<T, width> a5 = cread_split<width, false, read_split>(in + 5 * stride);
        const cvec<T, width> a6 = cread_split<width, false, read_split>(in + 6 * stride);
        const cvec<T, width> a7 = cread_split<width, false, read_split>(in + 7 * stride);
        cvec<T, width> w0, w1, w2, w3, w4, w5, w6, w7;
        butterfly8<width, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, w0, w1, w2, w3, w4, w5, w6, w7);
        cwrite_split<width, false, write_split>(out + 0 * stride, w0);
        cwrite_split<width, false, write_split>(out + 1 * stride, w1);
        cwrite_split<width, false, write_split>(out + 2 * stride, w2);
        cwrite_split<width, false, write_split>(out + 3 * stride, w3);
        cwrite_split<width, false, write_split>(out + 4 * stride, w4);
        cwrite_split<width, false, write_split>(out + 5 * stride, w5);
        cwrite_split<width, false, write_split>(out + 6 * stride, w6);
        cwrite_split<width, false, write_split>(out + 7 * stride, w7);
        in += width;
        out += width;
    }
}

template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T>
KFR_INTRINSIC void radix4_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>,
                                        cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>,
                                        complex<T>* out, const complex<T>* in, const complex<T>*& twiddle)
{
    static_assert(width > 0, "width cannot be zero");
    constexpr static size_t prefetch_cycles = 8;
    const size_t N4                         = N / 4;
    const size_t Nstride                    = stride * N4;
    const size_t stride3                    = 3 * stride;
    constexpr bool split                    = splitin || splitout;
    constexpr bool read_split               = !splitin && split;
    constexpr bool write_split              = !splitout && split;

    {
        for (size_t n = 0; n < stride; n += width)
        {
            if constexpr (prefetch)
                prefetch_four<width>(Nstride, in + prefetch_cycles * width);
            const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride);
            const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride);
            const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride);
            const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride);
            cvec<T, width> w0, w1, w2, w3;
            radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3);
            cwrite_split<width, false, write_split>(out + 0 * stride, w0);
            cwrite_split<width, false, write_split>(out + 1 * stride, w1);
            cwrite_split<width, false, write_split>(out + 2 * stride, w2);
            cwrite_split<width, false, write_split>(out + 3 * stride, w3);
            in += width;
            out += width;
        }
        twiddle += 3;
        out += stride3;
    }

    // CMT_LOOP_NOUNROLL
    for (size_t b = 1; b < N4; b++)
    {
        cvec<T, width> tw1 = read_twiddle<width, split>(twiddle);
        cvec<T, width> tw2 = read_twiddle<width, split>(twiddle + 1);
        cvec<T, width> tw3 = read_twiddle<width, split>(twiddle + 2);
        for (size_t n = 0; n < stride; n += width)
        {
            if constexpr (prefetch)
                prefetch_four<width>(Nstride, in + prefetch_cycles * width);
            const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride);
            const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride);
            const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride);
            const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride);
            cvec<T, width> w0, w1, w2, w3;
            radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3);
            cwrite_split<width, false, write_split>(out + 0 * stride, w0);
            cwrite_split<width, false, write_split>(out + 1 * stride, w1);
            cwrite_split<width, false, write_split>(out + 2 * stride, w2);
            cwrite_split<width, false, write_split>(out + 3 * stride, w3);
            in += width;
            out += width;
        }
        twiddle += 3;
        out += stride3;
    }
}

template <typename T>
static void initialize_twiddle_autosort(size_t N, size_t w, complex<T>*& twiddle)
{
    for (size_t b = 0; b < N / 4; ++b)
    {
        cwrite<1>(twiddle + b / w * 3 * w + b % w + 0 * w, calculate_twiddle<T>(b, N));
        cwrite<1>(twiddle + b / w * 3 * w + b % w + 1 * w, calculate_twiddle<T>(2 * b, N));
        cwrite<1>(twiddle + b / w * 3 * w + b % w + 2 * w, calculate_twiddle<T>(3 * b, N));
    }
    twiddle += N / 4 * 3;
}
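
// Note: autosort twiddle layout. For butterfly index b this stores w(b),
// w(2b) and w(3b) in three w-wide runs per group of w butterflies, matching
// the reads in radix4_autosort_pass_first (w = width) and
// radix4_autosort_pass (w = 1).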

template <typename T>
KFR_INTRINSIC void interleavehalves32(cvec<T, 32>& v0)
{
    cvec<T, 8> t0, t1, t2, t3;
    split(v0, t0, t1, t2, t3);
    t0 = interleavehalves(t0);
    t1 = interleavehalves(t1);
    t2 = interleavehalves(t2);
    t3 = interleavehalves(t3);
    v0 = concat(t0, t1, t2, t3);
}

template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
KFR_INTRINSIC ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
                                  cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
                                  complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
{
    CMT_ASSUME(blocks > 0);
    DFT_ASSERT(4 <= blocks);
    constexpr static size_t prefetch_cycles = 8;
    if constexpr (vector_capacity<T> >= 128)
    {
        CMT_PRAGMA_CLANG(clang loop unroll(disable))
        for (size_t b = 0; b < blocks; b += 4)
        {
            if constexpr (prefetch)
                prefetch_one<32>(out + prefetch_cycles * 32);

            cvec<T, 32> v32 = cread<32, aligned>(out);
            cvec<T, 4> v0, v1, v2, v3, v4, v5, v6, v7;
            v32 = ctranspose<8>(v32);
            split(v32, v0, v1, v2, v3, v4, v5, v6, v7);
            butterfly8<4, inverse>(v0, v1, v2, v3, v4, v5, v6, v7);
            v32 = concat(v0, v4, v2, v6, v1, v5, v3, v7);
            v32 = ctranspose<4>(v32);
            cwrite<32, aligned>(out, v32);

            out += 32;
        }
    }
    else
    {
        CMT_PRAGMA_CLANG(clang loop unroll(disable))
        for (size_t b = 0; b < blocks; b += 2)
        {
            if constexpr (prefetch)
                prefetch_one<16>(out + prefetch_cycles * 16);

            cvec<T, 16> v16 = cread<16, aligned>(out);
            cvec<T, 2> v0, v1, v2, v3, v4, v5, v6, v7;
            v16 = ctranspose<8>(v16);
            split(v16, v0, v1, v2, v3, v4, v5, v6, v7);
            butterfly8<2, inverse>(v0, v1, v2, v3, v4, v5, v6, v7);
            v16 = concat(v0, v4, v2, v6, v1, v5, v3, v7);
            v16 = ctranspose<2>(v16);
            cwrite<16, aligned>(out, v16);

            out += 16;
        }
    }
    return {};
}

template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
KFR_INTRINSIC ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
                                  cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
                                  complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
{
    CMT_ASSUME(blocks > 0);
    constexpr static size_t prefetch_cycles = 4;
    DFT_ASSERT(4 <= blocks);
    if constexpr (vector_capacity<T> >= 128)
    {
        CMT_PRAGMA_CLANG(clang loop unroll(disable))
        for (size_t b = 0; b < blocks; b += 4)
        {
            if constexpr (prefetch)
                prefetch_one<64>(out + prefetch_cycles * 64);

            cvec<T, 16> v0 = cread<16, aligned>(out);
            cvec<T, 16> v1 = cread<16, aligned>(out + 16);
            cvec<T, 16> v2 = cread<16, aligned>(out + 32);
            cvec<T, 16> v3 = cread<16, aligned>(out + 48);
            butterfly4_packed<4, inverse>(v0);
            butterfly4_packed<4, inverse>(v1);
            butterfly4_packed<4, inverse>(v2);
            butterfly4_packed<4, inverse>(v3);
            apply_twiddles4<0, 4, 4, inverse>(v0);
            apply_twiddles4<0, 4, 4, inverse>(v1);
            apply_twiddles4<0, 4, 4, inverse>(v2);
            apply_twiddles4<0, 4, 4, inverse>(v3);
            v0 = digitreverse4<2>(v0);
            v1 = digitreverse4<2>(v1);
            v2 = digitreverse4<2>(v2);
            v3 = digitreverse4<2>(v3);
            butterfly4_packed<4, inverse>(v0);
            butterfly4_packed<4, inverse>(v1);
            butterfly4_packed<4, inverse>(v2);
            butterfly4_packed<4, inverse>(v3);

            use_br2 ? cbitreverse_write(out, v0) : cdigitreverse4_write(out, v0);
            use_br2 ? cbitreverse_write(out + 16, v1) : cdigitreverse4_write(out + 16, v1);
            use_br2 ? cbitreverse_write(out + 32, v2) : cdigitreverse4_write(out + 32, v2);
            use_br2 ? cbitreverse_write(out + 48, v3) : cdigitreverse4_write(out + 48, v3);
            out += 64;
        }
    }
    else
    {
        CMT_PRAGMA_CLANG(clang loop unroll(disable))
        for (size_t b = 0; b < blocks; b += 2)
        {
            if constexpr (prefetch)
                prefetch_one<32>(out + prefetch_cycles * 32);

            cvec<T, 16> vlo = cread<16, aligned>(out);
            cvec<T, 16> vhi = cread<16, aligned>(out + 16);
            butterfly4_packed<4, inverse>(vlo);
            butterfly4_packed<4, inverse>(vhi);
            apply_twiddles4<0, 4, 4, inverse>(vlo);
            apply_twiddles4<0, 4, 4, inverse>(vhi);
            vlo = digitreverse4<2>(vlo);
            vhi = digitreverse4<2>(vhi);
            butterfly4_packed<4, inverse>(vlo);
            butterfly4_packed<4, inverse>(vhi);

            use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo);
            use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi);
            out += 32;
        }
    }
    return {};
}

template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T>
KFR_INTRINSIC ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t,
                                  cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>,
                                  complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/)
{
    constexpr static size_t prefetch_cycles = 8;
    CMT_ASSUME(blocks > 8);
    DFT_ASSERT(8 <= blocks);
    for (size_t b = 0; b < blocks; b += 4)
    {
        if constexpr (prefetch)
            prefetch_one<16>(out + prefetch_cycles * 16);

        cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out);
        butterfly4_packed<4, inverse>(v16);
        if constexpr (use_br2)
            v16 = permutegroups<(8), 0, 2, 1, 3>(v16);
        cdigitreverse4_write<aligned>(out, v16);

        out += 4 * 4;
    }
    return {};
}
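
// Note: the csize<4>, csize<8> and csize<16> overloads of radix4_pass above
// are the final in-register passes of the non-autosort path: they finish the
// remaining butterflies with packed kernels and apply the closing bit- or
// digit-reversed ordering where required.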

template <typename T>
struct fft_config
{
    constexpr static inline const bool recursion = true;
#ifdef CMT_ARCH_NEON
    constexpr static inline const bool prefetch = false;
#else
    constexpr static inline const bool prefetch = true;
#endif
    constexpr static inline const size_t process_width =
        const_max(static_cast<size_t>(1), vector_capacity<T> / 16);
};
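
// Note: tuning knobs for the generic path. process_width scales with
// vector_capacity<T> (but is at least 1), and software prefetch is compiled
// out on ARM NEON targets.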

template <typename T, bool splitin, bool is_even>
struct fft_stage_impl : dft_stage<T>
{
    fft_stage_impl(size_t stage_size)
    {
        this->name       = dft_name(this);
        this->radix      = 4;
        this->stage_size = stage_size;
        this->repeats    = 4;
        this->recursion  = fft_config<T>::recursion;
        this->data_size =
            align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
    }

    constexpr static bool prefetch = fft_config<T>::prefetch;
    constexpr static bool aligned  = false;
    constexpr static size_t width  = fft_config<T>::process_width;
    constexpr static bool use_br2  = !is_even || always_br2;

    virtual void do_initialize(size_t size) override final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddles<T, width>(twiddle, this->stage_size, size, true);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        if constexpr (splitin)
            in = out;
        const size_t stg_size = this->stage_size;
        CMT_ASSUME(stg_size >= 2048);
        CMT_ASSUME(stg_size % 2048 == 0);
        radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
    }
};

template <typename T, bool splitin, size_t size>
struct fft_final_stage_impl : dft_stage<T>
{
    fft_final_stage_impl(size_t)
    {
        this->name       = dft_name(this);
        this->radix      = size;
        this->stage_size = size;
        this->out_offset = size;
        this->repeats    = 4;
        this->recursion  = fft_config<T>::recursion;
        this->data_size  = align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment);
    }

    constexpr static size_t width  = fft_config<T>::process_width;
    constexpr static bool is_even  = cometa::is_even(ilog2(size));
    constexpr static bool use_br2  = !is_even || always_br2;
    constexpr static bool aligned  = false;
    constexpr static bool prefetch = fft_config<T>::prefetch && splitin;

    template <bool pass_splitin>
    KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cbool_t<pass_splitin>, complex<T>*&)
    {
    }
    template <bool pass_splitin>
    KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cbool_t<pass_splitin>, complex<T>*&)
    {
    }

    static constexpr bool get_pass_splitout(size_t N) { return N / 4 > 8 && N / 4 / 4 >= width; }

    template <size_t N, bool pass_splitin>
    KFR_MEM_INTRINSIC void init_twiddles(csize_t<N>, size_t total_size, cbool_t<pass_splitin>,
                                         complex<T>*& twiddle)
    {
        constexpr bool pass_splitout = get_pass_splitout(N);
        constexpr size_t pass_width  = const_min(width, N / 4);
        initialize_twiddles<T, pass_width>(twiddle, N, total_size, pass_splitout || pass_splitin);
        init_twiddles(csize<N / 4>, total_size, cbool<pass_splitout>, twiddle);
    }

    virtual void do_initialize(size_t total_size) override final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle);
    }

    DFT_STAGE_FN_NONFINAL
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle);
    }

    template <bool inverse, bool pass_splitin, typename U = T, KFR_ENABLE_IF(vector_capacity<U> >= 128)>
    KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                       const complex<T>*, const complex<T>*& twiddle)
    {
        radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
    }

    template <bool inverse, bool pass_splitin>
    KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                       const complex<T>*, const complex<T>*& twiddle)
    {
        radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
    }

    template <bool inverse, bool pass_splitin>
    KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                       const complex<T>*, const complex<T>*& twiddle)
    {
        radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
    }

    template <bool inverse, size_t N, bool pass_splitin>
    KFR_MEM_INTRINSIC void final_stage(csize_t<N>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                       const complex<T>* in, const complex<T>*& twiddle)
    {
        static_assert(N > 8, "");
        constexpr bool pass_splitout = get_pass_splitout(N);
        constexpr size_t pass_width  = const_min(width, N / 4);
        static_assert(pass_width == width || !pass_splitin, "");
        static_assert(pass_width <= N / 4, "");
        radix4_pass(N, invN, csize_t<pass_width>(), cbool<pass_splitout>, cbool_t<pass_splitin>(),
                    cbool_t<use_br2>(), cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in,
                    twiddle);
        final_stage<inverse>(csize<N / 4>, invN * 4, cbool<pass_splitout>, out, out, twiddle);
    }
};

template <typename T, bool is_even>
struct fft_reorder_stage_impl : dft_stage<T>
{
    fft_reorder_stage_impl(size_t stage_size)
    {
        this->name       = dft_name(this);
        this->stage_size = stage_size;
        this->user       = ilog2(stage_size);
        this->data_size  = 0;
    }

    virtual void do_initialize(size_t) override final {}

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*)
    {
        fft_reorder(out, this->user, cbool_t<(!is_even || always_br2)>());
    }
};
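
// Note: fft_reorder_stage_impl is the standalone permutation stage of the
// non-autosort path; it applies the bit-reversal (or base-4 digit-reversal)
// permutation in place after all butterfly stages have run.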

template <typename T, bool is_first, bool is_last, bool radix8>
struct fft_autosort_stage_impl : dft_stage<T>
{
    fft_autosort_stage_impl(size_t stage_size, size_t stride)
    {
        this->name         = dft_name(this);
        this->radix        = radix8 ? 8 : 4;
        this->stage_size   = stage_size * stride * this->radix;
        this->blocks       = stage_size;
        this->recursion    = false;
        this->can_inplace  = is_last;
        this->need_reorder = false;
        this->user         = stride;
        if constexpr (!is_last)
        {
            this->data_size =
                align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
        }
    }

    constexpr static bool prefetch = fft_config<T>::prefetch;
    constexpr static bool aligned  = false;

    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));

    void do_initialize(size_t total_size) final
    {
        if constexpr (!is_last)
        {
            complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
            if constexpr (is_first)
                initialize_twiddle_autosort(this->blocks, width, twiddle);
            else
                initialize_twiddle_autosort(this->blocks, 1, twiddle);
        }
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        const size_t stg_size     = this->blocks;
        const size_t stride       = this->user;
        if constexpr (is_first)
        {
            radix4_autosort_pass_first(stg_size, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
                                       cbool_t<inverse>(), out, in, twiddle);
        }
        else if constexpr (is_last)
        {
            if constexpr (radix8)
                radix8_autosort_pass_last(stride, csize_t<width / 2>(), cfalse, cfalse, cbool_t<prefetch>(),
                                          cbool_t<inverse>(), out, in, twiddle);
            else
                radix4_autosort_pass_last(stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
                                          cbool_t<inverse>(), out, in, twiddle);
        }
        else
        {
            if (stride == 4)
                radix4_autosort_pass(stg_size, stride, csize_t<4>(), cfalse, cfalse, cbool_t<prefetch>(),
                                     cbool_t<inverse>(), out, in, twiddle);
            else
                radix4_autosort_pass(stg_size, stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
                                     cbool_t<inverse>(), out, in, twiddle);
        }
    }
};

template <typename T, size_t log2n>
struct fft_specialization;

template <typename T>
struct fft_specialization<T, 0> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 1;
        this->name       = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN

    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        out[0] = in[0];
    }
};

template <typename T>
struct fft_specialization<T, 1> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 2;
        this->name       = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN

    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 1> a0, a1;
        split<T, 4>(cread<2, aligned>(in), a0, a1);
        cwrite<2, aligned>(out, concat(a0 + a1, a0 - a1));
    }
};

template <typename T>
struct fft_specialization<T, 2> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 4;
        this->name       = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 1> a0, a1, a2, a3;
        split<T, 8>(cread<4>(in), a0, a1, a2, a3);
        butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3);
        cwrite<4>(out, concat(concat(a0, a1), concat(a2, a3)));
    }
};

template <typename T>
struct fft_specialization<T, 3> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 8;
        this->name       = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 8> v8 = cread<8, aligned>(in);
        butterfly8_packed<inverse>(v8);
        cwrite<8, aligned>(out, v8);
    }
};

template <typename T>
struct fft_specialization<T, 4> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 16;
        this->name       = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 16> v16 = cread<16, aligned>(in);
        butterfly16_packed<inverse>(v16);
        cwrite<16, aligned>(out, v16);
    }
};

template <typename T>
struct fft_specialization<T, 5> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 32;
        this->name       = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 32> v32 = cread<32, aligned>(in);
        butterfly32_packed<inverse>(v32);
        cwrite<32, aligned>(out, v32);
    }
};
   1224 
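        // N = 64: two strategies. With KFR_AUTOSORT_FOR_64 defined, the transform
        // runs as three self-sorting radix-4 passes (64 = 4 * 4 * 4) through a
        // scratch buffer, so no bit-reverse reorder is needed. Otherwise a fully
        // unrolled 64-point butterfly kernel is used.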
   1225 #ifdef KFR_AUTOSORT_FOR_64
   1226 template <typename T>
   1227 struct fft_specialization<T, 6> : dft_stage<T>
   1228 {
   1229     fft_specialization(size_t stage_size)
   1230     {
   1231         this->stage_size = 64;
   1232         this->name       = dft_name(this);
   1233         this->temp_size  = 64 * sizeof(complex<T>);
   1234         this->data_size  = 64 * sizeof(complex<T>);
   1235     }
   1236 
   1237     constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
   1238 
   1239     void do_initialize(size_t) final
   1240     {
   1241         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1242         initialize_twiddle_autosort(64, width, twiddle);
   1243         initialize_twiddle_autosort(16, 1, twiddle);
   1244     }
   1245 
   1246     DFT_STAGE_FN
   1247     template <bool inverse>
   1248     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
   1249     {
   1250         auto no              = cfalse;
   1251         const complex<T>* tw = ptr_cast<complex<T>>(this->data);
   1252         complex<T>* scratch  = ptr_cast<complex<T>>(temp);
   1253         radix4_autosort_pass_first(64, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
   1254         radix4_autosort_pass(16, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
   1255         radix4_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
   1256     }
   1257 };
   1258 #else
   1259 template <typename T>
   1260 struct fft_specialization<T, 6> : dft_stage<T>
   1261 {
   1262     fft_specialization(size_t)
   1263     {
   1264         this->stage_size = 64;
   1265         this->name       = dft_name(this);
   1266     }
   1267 
   1268     constexpr static bool aligned = false;
   1269     DFT_STAGE_FN
   1270     template <bool inverse>
   1271     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
   1272     {
   1273         butterfly64_memory(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
   1274     }
   1275 };
   1276 #endif
   1277 
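        // N = 128 (double): either three autosort passes (radix-4, radix-4,
        // radix-8; 128 = 4 * 4 * 8) through a scratch buffer, or, with autosort
        // disabled, three in-place DIT radix passes (the last of size 8) followed
        // by a bit-reverse reorder.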
   1278 #ifdef KFR_AUTOSORT_FOR_128D
   1279 template <>
   1280 struct fft_specialization<double, 7> : dft_stage<double>
   1281 {
   1282     using T = double;
   1283     fft_specialization(size_t stage_size)
   1284     {
   1285         this->stage_size = 128;
   1286         this->name       = dft_name(this);
   1287         this->temp_size  = 128 * sizeof(complex<T>);
   1288         this->data_size  = 128 * sizeof(complex<T>);
   1289     }
   1290 
   1291     constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
   1292 
   1293     void do_initialize(size_t) final
   1294     {
   1295         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1296         initialize_twiddle_autosort(128, width, twiddle);
   1297         initialize_twiddle_autosort(32, 1, twiddle);
   1298         initialize_twiddle_autosort(8, 1, twiddle);
   1299     }
   1300 
   1301     DFT_STAGE_FN
   1302     template <bool inverse>
   1303     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
   1304     {
   1305         auto no              = cfalse;
   1306         const complex<T>* tw = ptr_cast<complex<T>>(this->data);
   1307         complex<T>* scratch  = ptr_cast<complex<T>>(temp);
   1308         radix4_autosort_pass_first(128, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
   1309         radix4_autosort_pass(32, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
   1310         radix8_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
   1311     }
   1312 };
   1313 #else
   1314 template <>
   1315 struct fft_specialization<double, 7> : dft_stage<double>
   1316 {
   1317     using T = double;
   1318     fft_specialization(size_t)
   1319     {
   1320         this->name       = dft_name(this);
   1321         this->stage_size = 128;
   1322         this->data_size  = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
   1323     }
   1324 
   1325     constexpr static bool aligned        = false;
   1326     constexpr static size_t width        = const_min(fft_config<T>::process_width, size_t(8));
   1327     constexpr static bool use_br2        = true;
   1328     constexpr static bool prefetch       = false;
   1329     constexpr static bool split_format   = true;
   1330 
   1331     virtual void do_initialize(size_t total_size) override final
   1332     {
   1333         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1334         initialize_twiddles<T, width>(twiddle, 128, total_size, split_format);
   1335         initialize_twiddles<T, width>(twiddle, 32, total_size, split_format);
   1336         initialize_twiddles<T, width>(twiddle, 8, total_size, split_format);
   1337     }
   1338 
   1339     DFT_STAGE_FN
   1340     template <bool inverse>
   1341     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
   1342     {
   1343         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1344         radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
   1345                     cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
   1346         radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(),
   1347                     cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
   1348         radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
   1349                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
   1350         if (this->need_reorder)
   1351             fft_reorder(out, csize_t<7>());
   1352     }
   1353 };
   1354 #endif
   1355 
   1356 template <>
   1357 struct fft_specialization<float, 7> : dft_stage<float>
   1358 {
   1359     using T = float;
   1360     fft_specialization(size_t)
   1361     {
   1362         this->name       = dft_name(this);
   1363         this->stage_size = 128;
   1364         this->data_size  = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
   1365     }
   1366 
   1367     constexpr static bool aligned        = false;
   1368     constexpr static size_t width1       = fft_config<T>::process_width;
   1369     constexpr static size_t width2       = const_min(width1, size_t(8));
   1370     constexpr static bool use_br2        = true;
   1371     constexpr static bool prefetch       = false;
   1372     constexpr static size_t final_size   = 32;
   1373     constexpr static bool split_format   = false;
   1374 
   1375     virtual void do_initialize(size_t total_size) override final
   1376     {
   1377         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1378         initialize_twiddles<T, width1>(twiddle, 128, total_size, split_format);
   1379         initialize_twiddles<T, width2>(twiddle, 32, total_size, split_format);
   1380     }
   1381 
   1382     DFT_STAGE_FN
   1383     template <bool inverse>
   1384     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
   1385     {
   1386         const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1387         radix4_pass(128, 1, csize_t<width1>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
   1388                     cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
   1389         radix4_pass(32, 4, csize_t<width2>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(),
   1390                     cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
   1391         radix4_pass(csize_t<8>(), 16, csize_t<width2>(), cfalse, cfalse, cbool_t<use_br2>(),
   1392                     cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
   1393         if (this->need_reorder)
   1394             fft_reorder(out, csize_t<7>());
   1395     }
   1396 };
   1397 
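        // N = 256 (float): a 16 x 16 decomposition. Judging by the names, the four
        // butterfly16_multi_flip calls transform 16-point columns and transpose the
        // result, and the four butterfly16_multi_natural calls then transform the
        // rows in natural order. When out == in, the first half is routed through
        // the scratch buffer so later column reads do not see clobbered data.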
   1398 template <>
   1399 struct fft_specialization<float, 8> : dft_stage<float>
   1400 {
   1401     fft_specialization(size_t)
   1402     {
   1403         this->stage_size = 256;
   1404         this->name       = dft_name(this);
   1405         this->temp_size  = sizeof(complex<float>) * 256;
   1406     }
   1407 
   1408     using T = float;
   1409     DFT_STAGE_FN
   1410     template <bool inverse>
   1411     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
   1412     {
   1413         complex<float>* scratch = ptr_cast<complex<float>>(temp);
   1414         if (out == in)
   1415         {
   1416             butterfly16_multi_flip<0, inverse>(scratch, out);
   1417             butterfly16_multi_flip<1, inverse>(scratch, out);
   1418             butterfly16_multi_flip<2, inverse>(scratch, out);
   1419             butterfly16_multi_flip<3, inverse>(scratch, out);
   1420 
   1421             butterfly16_multi_natural<0, inverse>(out, scratch);
   1422             butterfly16_multi_natural<1, inverse>(out, scratch);
   1423             butterfly16_multi_natural<2, inverse>(out, scratch);
   1424             butterfly16_multi_natural<3, inverse>(out, scratch);
   1425         }
   1426         else
   1427         {
   1428             butterfly16_multi_flip<0, inverse>(out, in);
   1429             butterfly16_multi_flip<1, inverse>(out, in);
   1430             butterfly16_multi_flip<2, inverse>(out, in);
   1431             butterfly16_multi_flip<3, inverse>(out, in);
   1432 
   1433             butterfly16_multi_natural<0, inverse>(out, out);
   1434             butterfly16_multi_natural<1, inverse>(out, out);
   1435             butterfly16_multi_natural<2, inverse>(out, out);
   1436             butterfly16_multi_natural<3, inverse>(out, out);
   1437         }
   1438     }
   1439 };
   1440 
   1441 #ifdef KFR_AUTOSORT_FOR_256D
   1442 
   1443 template <>
   1444 struct fft_specialization<double, 8> : dft_stage<double>
   1445 {
   1446     using T = double;
   1447     fft_specialization(size_t stage_size)
   1448     {
   1449         this->stage_size = 256;
   1450         this->name       = dft_name(this);
   1451         this->temp_size  = 256 * sizeof(complex<T>);
   1452         this->data_size  = 256 * sizeof(complex<T>);
   1453     }
   1454 
   1455     constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
   1456 
   1457     void do_initialize(size_t) final
   1458     {
   1459         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1460         initialize_twiddle_autosort(256, width, twiddle);
   1461         initialize_twiddle_autosort(64, 1, twiddle);
   1462         initialize_twiddle_autosort(16, 1, twiddle);
   1463     }
   1464 
   1465     DFT_STAGE_FN
   1466     template <bool inverse>
   1467     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
   1468     {
   1469         auto no              = cfalse;
   1470         const complex<T>* tw = ptr_cast<complex<T>>(this->data);
   1471         complex<T>* scratch  = ptr_cast<complex<T>>(temp);
   1472         if (in != out)
   1473         {
   1474             radix4_autosort_pass_first(256, csize<width>, no, no, no, cbool<inverse>, out, in, tw);
   1475             radix4_autosort_pass(64, 4, csize<4>, no, no, no, cbool<inverse>, scratch, out, tw);
   1476             radix4_autosort_pass(16, 16, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
   1477             radix4_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
   1478         }
   1479         else
   1480         {
   1481             radix4_autosort_pass_first(256, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
   1482             radix4_autosort_pass(64, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
   1483             radix4_autosort_pass(16, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw);
   1484             radix4_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
   1485         }
   1486     }
   1487 };
   1488 #else
   1489 template <>
   1490 struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256>
   1491 {
   1492     using T = double;
   1493     fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size)
   1494     {
   1495         this->stage_size = 256;
   1496         this->name       = dft_name(this);
   1497     }
   1498 
   1499     DFT_STAGE_FN
   1500     template <bool inverse>
   1501     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
   1502     {
   1503         fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr);
   1504         if (this->need_reorder)
   1505             fft_reorder(out, csize_t<8>(), cbool<always_br2>);
   1506     }
   1507 };
   1508 #endif
   1509 
   1510 #ifdef KFR_AUTOSORT_FOR_512
   1511 
   1512 template <typename T>
   1513 struct fft_specialization<T, 9> : dft_stage<T>
   1514 {
   1515     fft_specialization(size_t stage_size)
   1516     {
   1517         this->stage_size = 512;
   1518         this->name       = dft_name(this);
   1519         this->temp_size  = 512 * sizeof(complex<T>);
   1520         this->data_size  = 512 * sizeof(complex<T>);
   1521     }
   1522 
   1523     constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
   1524 
   1525     void do_initialize(size_t) final
   1526     {
   1527         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1528         initialize_twiddle_autosort(512, width, twiddle);
   1529         initialize_twiddle_autosort(128, 1, twiddle);
   1530         initialize_twiddle_autosort(32, 1, twiddle);
   1531         initialize_twiddle_autosort(8, 1, twiddle);
   1532     }
   1533 
   1534     DFT_STAGE_FN
   1535     template <bool inverse>
   1536     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
   1537     {
   1538         auto no              = cfalse;
   1539         const complex<T>* tw = ptr_cast<complex<T>>(this->data);
   1540         complex<T>* scratch  = ptr_cast<complex<T>>(temp);
   1541         radix4_autosort_pass_first(512, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
   1542         radix4_autosort_pass(128, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
   1543         radix4_autosort_pass(32, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw);
   1544         radix8_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
   1545     }
   1546 };
   1547 #else
   1548 template <typename T>
   1549 struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512>
   1550 {
   1551     fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size)
   1552     {
   1553         this->stage_size = 512;
   1554         this->name       = dft_name(this);
   1555     }
   1556 
   1557     DFT_STAGE_FN
   1558     template <bool inverse>
   1559     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
   1560     {
   1561         fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr);
   1562         if (this->need_reorder)
   1563             fft_reorder(out, csize_t<9>());
   1564     }
   1565 };
   1566 #endif
   1567 
   1568 #ifdef KFR_AUTOSORT_FOR_1024
   1569 template <typename T>
   1570 struct fft_specialization<T, 10> : dft_stage<T>
   1571 {
   1572     fft_specialization(size_t stage_size)
   1573     {
   1574         this->stage_size = 1024;
   1575         this->name       = dft_name(this);
   1576         this->temp_size  = 1024 * sizeof(complex<T>);
   1577         this->data_size  = 1024 * sizeof(complex<T>);
   1578     }
   1579 
   1580     constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
   1581 
   1582     void do_initialize(size_t) final
   1583     {
   1584         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1585         initialize_twiddle_autosort(1024, width, twiddle);
   1586         initialize_twiddle_autosort(256, 1, twiddle);
   1587         initialize_twiddle_autosort(64, 1, twiddle);
   1588         initialize_twiddle_autosort(16, 1, twiddle);
   1589     }
   1590 
   1591     DFT_STAGE_FN
   1592     template <bool inverse>
   1593     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
   1594     {
   1595         auto no              = cfalse;
   1596         auto split           = cfalse;
   1597         const complex<T>* tw = ptr_cast<complex<T>>(this->data);
   1598         complex<T>* scratch  = ptr_cast<complex<T>>(temp);
   1599         radix4_autosort_pass_first(1024, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
   1600         radix4_autosort_pass(256, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
   1601         radix4_autosort_pass(64, 16, csize<width>, split, no, no, cbool<inverse>, scratch, out, tw);
   1602         radix4_autosort_pass(16, 64, csize<width>, split, split, no, cbool<inverse>, out, scratch, tw);
   1603         radix4_autosort_pass_last(256, csize<width>, no, split, no, cbool<inverse>, out, out, tw);
   1604     }
   1605 };
   1606 #else
   1607 template <typename T>
   1608 struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
   1609 {
   1610     fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size)
   1611     {
   1612         this->name = dft_name(this);
   1613     }
   1614 
   1615     DFT_STAGE_FN
   1616     template <bool inverse>
   1617     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
   1618     {
   1619         fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr);
   1620         if (this->need_reorder)
   1621             fft_reorder(out, csize_t<10>{}, cbool_t<always_br2>{});
   1622     }
   1623 };
   1624 #endif
   1625 
   1626 #ifdef KFR_AUTOSORT_FOR_2048
   1627 template <typename T>
   1628 struct fft_specialization<T, 11> : dft_stage<T>
   1629 {
   1630     fft_specialization(size_t stage_size)
   1631     {
   1632         this->stage_size = 2048;
   1633         this->name       = dft_name(this);
   1634         this->temp_size  = 2048 * sizeof(complex<T>);
   1635         this->data_size  = 2048 * sizeof(complex<T>);
   1636     }
   1637 
   1638     constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));
   1639 
   1640     void do_initialize(size_t) final
   1641     {
   1642         complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
   1643         initialize_twiddle_autosort(2048, width, twiddle);
   1644         initialize_twiddle_autosort(512, 1, twiddle);
   1645         initialize_twiddle_autosort(128, 1, twiddle);
   1646         initialize_twiddle_autosort(32, 1, twiddle);
   1647         initialize_twiddle_autosort(8, 1, twiddle);
   1648     }
   1649 
   1650     DFT_STAGE_FN
   1651     template <bool inverse>
   1652     KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
   1653     {
   1654         auto no              = cfalse;
   1655         const complex<T>* tw = ptr_cast<complex<T>>(this->data);
   1656         complex<T>* scratch  = ptr_cast<complex<T>>(temp);
   1657         radix4_autosort_pass_first(2048, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
   1658         radix4_autosort_pass(512, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
   1659         radix4_autosort_pass(128, 16, csize<4>, no, no, no, cbool<inverse>, scratch, out, tw);
   1660         radix4_autosort_pass(32, 64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
   1661         radix8_autosort_pass_last(256, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
   1662     }
   1663 };
   1664 #endif
   1665 
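        // Generic stage builder for sizes with no dedicated specialization.
        // Autosort: peel one self-sorting radix-4 stage while stage_size >= 16,
        // then finish with a radix-8 (stage_size == 8) or radix-4 stage; the
        // passes leave the output in natural order, so no reorder stage is added.
        // DIT: peel radix-4 stages down to final_size (1024 when log2(N) is even,
        // 512 when odd), add one combined final stage, then a bit-reverse reorder.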
   1666 template <bool is_even, bool first, typename T, bool autosort>
   1667 void make_fft_stages(dft_plan<T>* self, cbool_t<autosort>, size_t stage_size, cbool_t<is_even>,
   1668                      cbool_t<first>)
   1669 {
   1670     if constexpr (autosort)
   1671     {
   1672         if (stage_size >= 16)
   1673         {
   1674             add_stage<fft_autosort_stage_impl<T, first, false, false>>(self, stage_size,
   1675                                                                        self->size / stage_size);
   1676             make_fft_stages(self, ctrue, stage_size / 4, cbool_t<is_even>(), cfalse);
   1677         }
   1678         else
   1679         {
   1680             if (stage_size == 8)
   1681                 add_stage<fft_autosort_stage_impl<T, false, true, true>>(self, stage_size,
   1682                                                                          self->size / stage_size);
   1683             else
   1684                 add_stage<fft_autosort_stage_impl<T, false, true, false>>(self, stage_size,
   1685                                                                           self->size / stage_size);
   1686         }
   1687     }
   1688     else
   1689     {
   1690         constexpr size_t final_size = is_even ? 1024 : 512;
   1691 
   1692         if (stage_size >= 2048)
   1693         {
   1694             add_stage<fft_stage_impl<T, !first, is_even>>(self, stage_size);
   1695 
   1696             make_fft_stages(self, cfalse, stage_size / 4, cbool_t<is_even>(), cfalse);
   1697         }
   1698         else
   1699         {
   1700             add_stage<fft_final_stage_impl<T, !first, final_size>>(self, final_size);
   1701             add_stage<fft_reorder_stage_impl<T, is_even>>(self, self->size);
   1702         }
   1703     }
   1704 }
   1705 
   1706 } // namespace intrinsics
   1707 
   1708 template <bool is_even, typename T>
   1709 void make_fft(dft_plan<T>* self, size_t stage_size, cbool_t<is_even>)
   1710 {
   1711     if (use_autosort<T>(ilog2(stage_size)))
   1712     {
   1713         intrinsics::make_fft_stages(self, ctrue, stage_size, cbool<is_even>, ctrue);
   1714     }
   1715     else
   1716     {
   1717         intrinsics::make_fft_stages(self, cfalse, stage_size, cbool<is_even>, ctrue);
   1718     }
   1719 }
   1720 
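        // Minimal adapter so a range-based for loop can walk a container in
        // reverse, e.g. (illustrative) for (auto& s : reversed(plan->all_stages)).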
   1721 template <typename T>
   1722 struct reverse_wrapper
   1723 {
   1724     T& iterable;
   1725 };
   1726 
   1727 template <typename T>
   1728 KFR_INTRINSIC auto begin(reverse_wrapper<T> w)
   1729 {
   1730     return std::rbegin(w.iterable);
   1731 }
   1732 
   1733 template <typename T>
   1734 KFR_INTRINSIC auto end(reverse_wrapper<T> w)
   1735 {
   1736     return std::rend(w.iterable);
   1737 }
   1738 
   1739 template <typename T>
   1740 KFR_INTRINSIC reverse_wrapper<T> reversed(T&& iterable)
   1741 {
   1742     return { iterable };
   1743 }
   1744 
   1745 template <typename T>
   1746 KFR_INTRINSIC void initialize_data_stage(dft_plan<T>* self, const dft_stage_ptr<T>& stage, size_t& offset)
   1747 {
   1748     stage->data = self->data.data() + offset;
   1749     stage->initialize(self->size);
   1750     offset += stage->data_size;
   1751 }
   1752 
   1753 template <typename T>
   1754 KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self)
   1755 {
   1756     self->data    = autofree<u8>(self->data_size);
   1757     size_t offset = 0;
   1758     for (dft_stage_ptr<T>& stage : self->all_stages)
   1759     {
   1760         initialize_data_stage(self, stage, offset);
   1761     }
   1762     return offset;
   1763 }
   1764 
   1765 template <typename T>
   1766 KFR_INTRINSIC void initialize_order(dft_plan<T>* self)
   1767 {
   1768     self->calc_disposition();
   1769     typename dft_plan<T>::bitset ored = self->disposition_inplace[0] | self->disposition_inplace[1] |
   1770                                         self->disposition_outofplace[0] | self->disposition_outofplace[1];
   1771     if (ored.any()) // if scratch needed
   1772         self->temp_size +=
   1773             align_up(sizeof(complex<T>) * (self->size + 1), platform<>::native_cache_alignment);
   1774 }
   1775 
   1776 template <typename T>
   1777 KFR_INTRINSIC void init_fft(dft_plan<T>* self, size_t size, dft_order)
   1778 {
   1779     const size_t log2n = ilog2(size);
   1780     cswitch(
   1781         csizes_t<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
   1782 #ifdef KFR_AUTOSORT_FOR_2048
   1783                  ,
   1784                  11
   1785 #endif
   1786                  >(),
   1787         log2n,
   1788         [&](auto log2n)
   1789         {
   1790             (void)log2n;
   1791             constexpr size_t log2nv = val_of(decltype(log2n)());
   1792             add_stage<intrinsics::fft_specialization<T, log2nv>>(self, size);
   1793         },
   1794         [&]()
   1795         { cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { make_fft(self, size, is_even); }); });
   1796 }
   1797 
   1798 template <typename T>
   1799 KFR_INTRINSIC void generate_real_twiddles(dft_plan_real<T>* self, size_t size)
   1800 {
   1801     using namespace intrinsics;
   1802     constexpr size_t width = vector_width<T> * 2;
   1803     block_process(size / 4, csizes_t<width, 1>(),
   1804                   [=](size_t i, auto w)
   1805                   {
   1806                       constexpr size_t width = val_of(decltype(w)());
   1807                       cwrite<width>(self->rtwiddle.data() + i,
   1808                                     cossin(dup(-constants<T>::pi *
   1809                                                ((enumerate<T, width>() + i + size / 4) / (size / 2)))));
   1810                   });
   1811 }
   1812 
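        // Real-FFT postprocessing: converts the output Z of a length-M complex FFT
        // (M = N/2, run on the even/odd-interleaved real input) into the spectrum X
        // of the original length-N real signal, via the standard split
        //   E[k] = (Z[k] + conj(Z[M-k])) / 2     (spectrum of even samples)
        //   O[k] = (Z[k] - conj(Z[M-k])) / (2i)  (spectrum of odd samples)
        //   X[k] = E[k] + exp(-2*pi*i*k/N) * O[k]
        // rtwiddle[k] stores -i * exp(-2*pi*i*k/N), so one cmul plus the 0.5 scale
        // below evaluates the 1/(2i) factor and the twiddle together. negodd
        // negates the odd (imaginary) lanes, i.e. forms the complex conjugate.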
   1813 template <typename T>
   1814 #if (defined CMT_ARCH_X32 && defined CMT_ARCH_X86 && defined __clang__) &&                                   \
   1815     ((defined __APPLE__) || (__clang_major__ == 8))
   1816 // Workaround for a Clang 8.0 bug (x32 with FMA instructions). Apple Clang
   1817 // reports different version numbers and 32-bit x86 is rare on modern macOS,
   1818 // so inlining and FMA are disabled for x32 macOS builds and for Clang 8.x.
   1819 __attribute__((target("no-fma"), flatten, noinline))
   1820 #else
   1821 KFR_INTRINSIC
   1822 #endif
   1823 void
   1824 to_fmt(size_t real_size, const complex<T>* rtwiddle, complex<T>* out, const complex<T>* in,
   1825        dft_pack_format fmt)
   1826 {
   1827     using namespace intrinsics;
   1828     size_t csize = real_size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2
   1829 
   1830     constexpr size_t width = vector_width<T> * 2;
   1831     const cvec<T, 1> dc    = cread<1>(in);
   1832     cvec<T, 1> inmid       = cread<1>(in + csize / 2);
   1833     const size_t count     = (csize + 1) / 2;
   1834 
   1835     block_process(count - 1, csizes_t<width, 1>(),
   1836                   [&](size_t i, auto w)
   1837                   {
   1838                       i++;
   1839                       constexpr size_t width    = val_of(decltype(w)());
   1840                       constexpr size_t widthm1  = width - 1;
   1841                       const cvec<T, width> tw   = cread<width>(rtwiddle + i);
   1842                       const cvec<T, width> fpk  = cread<width>(in + i);
   1843                       const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1)));
   1844 
   1845                       const cvec<T, width> f1k = fpk + fpnk;
   1846                       const cvec<T, width> f2k = fpk - fpnk;
   1847                       const cvec<T, width> t   = cmul(f2k, tw);
   1848                       cwrite<width>(out + i, T(0.5) * (f1k + t));
   1849                       cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(T(0.5) * (f1k - t))));
   1850                   });
   1851 
   1852     if (is_even(csize))
   1853     {
   1854         cwrite<1>(out + csize / 2, negodd(inmid));
   1855     }
   1856     if (fmt == dft_pack_format::CCs)
   1857     {
   1858         cwrite<1>(out, pack(dc[0] + dc[1], 0));
   1859         cwrite<1>(out + csize, pack(dc[0] - dc[1], 0));
   1860     }
   1861     else
   1862     {
   1863         cwrite<1>(out, pack(dc[0] + dc[1], dc[0] - dc[1]));
   1864     }
   1865 }
   1866 
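        // Inverse of to_fmt: rebuilds the half-size complex spectrum from the
        // packed real spectrum so an inverse complex FFT of length N/2 can finish
        // the job. Same butterfly as to_fmt, but with the conjugated twiddle
        // (cmul_conj) and without the 0.5 scale; a factor of 2 appears at the
        // midpoint element instead.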
   1867 template <typename T>
   1868 #if (defined CMT_ARCH_X32 && defined CMT_ARCH_X86 && defined __clang__) &&                                   \
   1869     ((defined __APPLE__) || (__clang_major__ == 8))
   1870 // Workaround for a Clang 8.0 bug (x32 with FMA instructions). Apple Clang
   1871 // reports different version numbers and 32-bit x86 is rare on modern macOS,
   1872 // so inlining and FMA are disabled for x32 macOS builds and for Clang 8.x.
   1873 __attribute__((target("no-fma"), flatten, noinline))
   1874 #else
   1875 KFR_INTRINSIC
   1876 #endif
   1877 void from_fmt(size_t real_size, const complex<T>* rtwiddle, complex<T>* out, const complex<T>* in,
   1878               dft_pack_format fmt)
   1879 {
   1880     using namespace intrinsics;
   1881 
   1882     const size_t csize = real_size / 2;
   1883 
   1884     cvec<T, 1> dc;
   1885 
   1886     if (fmt == dft_pack_format::CCs)
   1887     {
   1888         dc = pack(in[0].real() + in[csize].real(), in[0].real() - in[csize].real());
   1889     }
   1890     else
   1891     {
   1892         dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag());
   1893     }
   1894     cvec<T, 1> inmid = cread<1>(in + csize / 2);
   1895 
   1896     constexpr size_t width = vector_width<T> * 2;
   1897     const size_t count     = (csize + 1) / 2;
   1898 
   1899     block_process(count - 1, csizes_t<width, 1>(),
   1900                   [&](size_t i, auto w)
   1901                   {
   1902                       i++;
   1903                       constexpr size_t width    = val_of(decltype(w)());
   1904                       constexpr size_t widthm1  = width - 1;
   1905                       const cvec<T, width> tw   = cread<width>(rtwiddle + i);
   1906                       const cvec<T, width> fpk  = cread<width>(in + i);
   1907                       const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1)));
   1908 
   1909                       const cvec<T, width> f1k = fpk + fpnk;
   1910                       const cvec<T, width> f2k = fpk - fpnk;
   1911                       const cvec<T, width> t   = cmul_conj(f2k, tw);
   1912                       cwrite<width>(out + i, f1k + t);
   1913                       cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t)));
   1914                   });
   1915     if (is_even(csize))
   1916     {
   1917         cwrite<1>(out + csize / 2, 2 * negodd(inmid));
   1918     }
   1919     cwrite<1>(out, dc);
   1920 }
   1921 
   1922 #ifndef KFR_DFT_NO_NPo2
   1923 template <typename T>
   1924 void init_dft(dft_plan<T>* self, size_t size, dft_order);
   1925 #endif
   1926 
   1927 template <typename T>
   1928 KFR_INTRINSIC void initialize_stages(dft_plan<T>* self)
   1929 {
   1930     if (is_poweroftwo(self->size))
   1931     {
   1932         init_fft(self, self->size, dft_order::normal);
   1933     }
   1934     else
   1935     {
   1936 #ifndef KFR_DFT_NO_NPo2
   1937         init_dft(self, self->size, dft_order::normal);
   1938 #else
   1939         KFR_REPORT_LOGIC_ERROR("Non-power-of-2 FFT support is disabled, but size ", self->size, " was requested");
   1940 #endif
   1941     }
   1942 }
   1943 
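        // Typical call sequence from user code (illustrative sketch only; the
        // public dft_plan interface is declared in the dft headers, not here):
        //   kfr::dft_plan<float> plan(1024);      // init_fft builds the stage list
        //   kfr::univector<kfr::complex<float>> in(1024), out(1024);
        //   kfr::univector<kfr::u8> temp(plan.temp_size);
        //   plan.execute(out, in, temp);          // dispatches to impl::dft_execute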
   1944 namespace impl
   1945 {
   1946 template <typename T>
   1947 void dft_initialize(dft_plan<T>& plan)
   1948 {
   1949     if (plan.size == 0)
   1950         return;
   1951     initialize_stages(&plan);
   1952     initialize_data(&plan);
   1953     initialize_order(&plan);
   1954 }
   1955 
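        // Buffer routing: stage i reads from scratch when bit i of the disposition
        // bitset is set; otherwise stage 0 reads the user input and later stages
        // read the previous output. select_out mirrors this by testing the next
        // stage's bit, except that the last stage always writes the user output.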
   1956 template <typename T>
   1957 const complex<T>* select_in(const dft_plan<T>& plan, typename dft_plan<T>::bitset disposition, size_t stage,
   1958                             const complex<T>* out, const complex<T>* in, const complex<T>* scratch)
   1959 {
   1960     return disposition.test(stage) ? scratch : stage == 0 ? in : out;
   1961 }
   1962 template <typename T>
   1963 complex<T>* select_out(const dft_plan<T>& plan, typename dft_plan<T>::bitset disposition, size_t stage,
   1964                        size_t total_stages, complex<T>* out, complex<T>* scratch)
   1965 {
   1966     return stage == total_stages - 1 ? out : disposition.test(stage + 1) ? scratch : out;
   1967 }
   1968 
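        // Plan execution. If the caller passed no temp buffer, allocate one and
        // recurse. A single-stage plan runs directly when it supports the requested
        // in-place/out-of-place mode. Otherwise the stages are walked with an
        // explicit stack: consecutive stages marked `recursion` are interleaved
        // depth-first (each executed `repeats` times at increasing offsets), while
        // ordinary stages sweep the whole buffer in stage_size-sized steps.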
   1969 template <typename T, bool inverse>
   1970 void dft_execute(const dft_plan<T>& plan, cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp)
   1971 {
   1972     if (temp == nullptr && plan.temp_size > 0)
   1973     {
   1974         return call_with_temp(plan.temp_size, std::bind(&impl::dft_execute<T, inverse>, std::cref(plan),
   1975                                                         cbool_t<inverse>{}, out, in, std::placeholders::_1));
   1976     }
   1977     auto&& stages = plan.stages[inverse];
   1978     if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
   1979     {
   1980         return stages[0]->execute(cbool<inverse>, out, in, temp);
   1981     }
   1982     size_t stack[DFT_MAX_STAGES] = { 0 };
   1983 
   1984     typename dft_plan<T>::bitset disposition =
   1985         in == out ? plan.disposition_inplace[inverse] : plan.disposition_outofplace[inverse];
   1986 
   1987     complex<T>* scratch = ptr_cast<complex<T>>(
   1988         temp + plan.temp_size -
   1989         align_up(sizeof(complex<T>) * (plan.size + 1), platform<>::native_cache_alignment));
   1990 
   1991     bool in_scratch = disposition.test(0);
   1992     if (in_scratch)
   1993     {
   1994         stages[0]->copy_input(inverse, scratch, in, plan.size);
   1995     }
   1996 
   1997     const size_t count = stages.size();
   1998 
   1999     for (size_t depth = 0; depth < count;)
   2000     {
   2001         if (stages[depth]->recursion)
   2002         {
   2003             size_t offset   = 0;
   2004             size_t rdepth   = depth;
   2005             size_t maxdepth = depth;
   2006             do
   2007             {
   2008                 if (stack[rdepth] == stages[rdepth]->repeats)
   2009                 {
   2010                     stack[rdepth] = 0;
   2011                     rdepth--;
   2012                 }
   2013                 else
   2014                 {
   2015                     complex<T>* rout = select_out(plan, disposition, rdepth, stages.size(), out, scratch);
   2016                     const complex<T>* rin = select_in(plan, disposition, rdepth, out, in, scratch);
   2017                     stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
   2018                     offset += stages[rdepth]->out_offset;
   2019                     stack[rdepth]++;
   2020                     if (rdepth < count - 1 && stages[rdepth + 1]->recursion)
   2021                         rdepth++;
   2022                     else
   2023                         maxdepth = rdepth;
   2024                 }
   2025             } while (rdepth != depth);
   2026             depth = maxdepth + 1;
   2027         }
   2028         else
   2029         {
   2030             size_t offset = 0;
   2031             while (offset < plan.size)
   2032             {
   2033                 stages[depth]->execute(cbool<inverse>,
   2034                                        select_out(plan, disposition, depth, stages.size(), out, scratch) +
   2035                                            offset,
   2036                                        select_in(plan, disposition, depth, out, in, scratch) + offset, temp);
   2037                 offset += stages[depth]->stage_size;
   2038             }
   2039             depth++;
   2040         }
   2041     }
   2042 }
   2043 template <typename T>
   2044 void dft_initialize_transpose(internal_generic::fn_transpose<T>& transpose)
   2045 {
   2046     transpose = &kfr::CMT_ARCH_NAME::matrix_transpose;
   2047 }
   2048 } // namespace impl
   2049 
   2050 namespace intrinsics
   2051 {
   2052 
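        // dft_stage wrapper around to_fmt / from_fmt. dft_real_initialize (below)
        // appends it after the half-size complex FFT in the forward direction and
        // prepends it in the inverse direction; copy_input copies one extra complex
        // element when the CCs packing is inverted, since that format carries the
        // Nyquist bin separately.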
   2053 template <typename T>
   2054 struct dft_stage_real_repack : dft_stage<T>
   2055 {
   2056 public:
   2057     dft_stage_real_repack(size_t real_size, dft_pack_format fmt)
   2058     {
   2059         this->user         = static_cast<int>(fmt);
   2060         this->stage_size   = real_size;
   2061         this->can_inplace  = true;
   2062         this->name         = dft_name(this);
   2063         const size_t count = (real_size / 2 + 1) / 2;
   2064         this->data_size    = align_up(sizeof(complex<T>) * count, platform<>::native_cache_alignment);
   2065     }
   2066     void do_initialize(size_t) final
   2067     {
   2068         using namespace intrinsics;
   2069         constexpr size_t width = vector_width<T> * 2;
   2070         size_t real_size       = this->stage_size;
   2071         complex<T>* rtwiddle   = ptr_cast<complex<T>>(this->data);
   2072         const size_t count     = (real_size / 2 + 1) / 2;
   2073         block_process(count, csizes_t<width, 1>(),
   2074                       [=](size_t i, auto w)
   2075                       {
   2076                           constexpr size_t width = val_of(decltype(w)());
   2077                           cwrite<width>(
   2078                               rtwiddle + i,
   2079                               cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + real_size / T(4)) /
   2080                                                               (real_size / 2)))));
   2081                       });
   2082     }
   2083     void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) final
   2084     {
   2085         to_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in,
   2086                static_cast<dft_pack_format>(this->user));
   2087     }
   2088     void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) final
   2089     {
   2090         from_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in,
   2091                  static_cast<dft_pack_format>(this->user));
   2092     }
   2093     void copy_input(bool invert, complex<T>* out, const complex<T>* in, size_t size) final
   2094     {
   2095         size_t extra = invert && static_cast<dft_pack_format>(this->user) == dft_pack_format::CCs ? 1 : 0;
   2096         builtin_memcpy(out, in, sizeof(complex<T>) * (size + extra));
   2097     }
   2098 };
   2099 } // namespace intrinsics
   2100 
   2101 namespace impl
   2102 {
   2103 template <typename T>
   2104 void dft_real_initialize(dft_plan_real<T>& plan)
   2105 {
   2106     if (plan.size == 0)
   2107         return;
   2108     initialize_stages(&plan);
   2109     add_stage<intrinsics::dft_stage_real_repack<T>, false>(&plan, plan.size, plan.fmt);
   2110     plan.stages[0].push_back(plan.all_stages.back().get());
   2111     plan.stages[1].insert(plan.stages[1].begin(), plan.all_stages.back().get());
   2112     initialize_data(&plan);
   2113     initialize_order(&plan);
   2114 }
   2115 } // namespace impl
   2116 
   2117 } // namespace CMT_ARCH_NAME
   2118 
   2119 } // namespace kfr
   2120 
   2121 CMT_PRAGMA_GNU(GCC diagnostic pop)
   2122 
   2123 CMT_PRAGMA_MSVC(warning(pop))