fft-impl.hpp (80406B)
/** @addtogroup dft
 *  @{
 */
/*
  Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com)
  This file is part of KFR

  KFR is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 2 of the License, or
  (at your option) any later version.

  KFR is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with KFR.

  If GPL is not suitable for your project, you must purchase a commercial license to use KFR.
  Buying a commercial license is mandatory as soon as you develop commercial activities without
  disclosing the source code of your own applications.
  See https://www.kfrlib.com for details.
 */
#pragma once

#include "dft-fft.hpp"

CMT_PRAGMA_GNU(GCC diagnostic push)
#if CMT_HAS_WARNING("-Wshadow")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow")
#endif
#if CMT_HAS_WARNING("-Wunused-lambda-capture")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture")
#endif
#if CMT_HAS_WARNING("-Wpass-failed")
CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed")
#endif

CMT_PRAGMA_MSVC(warning(push))
CMT_PRAGMA_MSVC(warning(disable : 4100))

namespace kfr
{
inline namespace CMT_ARCH_NAME
{

// When true, even-log2 sizes also use bit-reversed (br2) output ordering in the
// recursive (non-autosort) algorithm.
constexpr bool inline always_br2 = true;

// Per-type bitmask selecting, for each log2(size), whether the autosort FFT
// algorithm is used instead of the recursive one (bit set = autosort).
template <typename T>
inline std::bitset<DFT_MAX_STAGES> fft_algorithm_selection;

template <>
inline std::bitset<DFT_MAX_STAGES> fft_algorithm_selection<float>{
#ifdef CMT_ARCH_NEON
    // On NEON the recursive algorithm is used for all float sizes.
    0
#else
    // Autosort for float FFTs up to 2^14 (bits 0..14 set).
    (1ull << 15) - 1
#endif
};

template <>
inline std::bitset<DFT_MAX_STAGES> fft_algorithm_selection<double>{ 0 };

// Returns true if the autosort algorithm is selected for a transform of size 2^log2n.
template <typename T>
inline bool use_autosort(size_t log2n)
{
    return fft_algorithm_selection<T>[log2n];
}

#ifndef CMT_ARCH_NEON
#define KFR_AUTOSORT_FOR_2048
#define KFR_AUTOSORT_FOR_128D
#define KFR_AUTOSORT_FOR_256D
#define KFR_AUTOSORT_FOR_512
#define KFR_AUTOSORT_FOR_1024
#endif

#ifdef CMT_ARCH_AVX
// 4x4 transpose of complex<float> elements (32 floats = 16 complex values),
// specialized with AVX unpack/permute intrinsics.
template <>
KFR_INTRINSIC vec<float, 32> ctranspose<4, float, 32>(const vec<float, 32>& v16)
{
    cvec<float, 4> r0, r1, r2, r3;
    split(v16, r0, r1, r2, r3);
    // Treat each complex<float> (two floats) as one 64-bit lane and transpose
    // the 4x4 matrix of lanes with unpack + 128-bit permutes.
    const __m256d t0 = _mm256_unpacklo_pd(_mm256_castps_pd(r0.v), _mm256_castps_pd(r1.v));
    const __m256d t1 = _mm256_unpacklo_pd(_mm256_castps_pd(r2.v), _mm256_castps_pd(r3.v));
    const __m256d t2 = _mm256_unpackhi_pd(_mm256_castps_pd(r0.v), _mm256_castps_pd(r1.v));
    const __m256d t3 = _mm256_unpackhi_pd(_mm256_castps_pd(r2.v), _mm256_castps_pd(r3.v));
    r0.v = _mm256_castpd_ps(_mm256_permute2f128_pd(t0, t1, 0x20));
    r1.v = _mm256_castpd_ps(_mm256_permute2f128_pd(t2, t3, 0x20));
    r2.v = _mm256_castpd_ps(_mm256_permute2f128_pd(t0, t1, 0x31));
    r3.v = _mm256_castpd_ps(_mm256_permute2f128_pd(t2, t3, 0x31));
    return concat(r0, r1, r2, r3);
}
#endif

// 8x4 transpose of complex<float> (64 floats = 32 complex values): transpose
// even- and odd-position groups separately via two 4x4 transposes.
template <>
KFR_INTRINSIC vec<float, 64> ctranspose<8, float, 64>(const vec<float, 64>& v32)
{
    cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
    split(v32, a0, a1, a2, a3, a4, a5, a6, a7);
    cvec<float, 16> even = concat(a0, a2, a4, a6);
    cvec<float, 16> odd  = concat(a1, a3, a5, a7);
    even                 = ctranspose<4>(even);
    odd                  = ctranspose<4>(odd);
    return concat(even, odd);
}

// 4x8 transpose of complex<float>: transpose both halves, then interleave the
// resulting rows.
template <>
KFR_INTRINSIC vec<float, 64> ctranspose<4, float, 64>(const vec<float, 64>& v32)
{
    cvec<float, 16> lo, hi;
    split(v32, lo, hi);
    lo = ctranspose<4>(lo);
    hi = ctranspose<4>(hi);
    cvec<float, 4> a0, a1, a2, a3, a4, a5, a6, a7;
    split(lo, a0, a1, a2, a3);
    split(hi, a4, a5, a6, a7);
    return concat(a0, a4, a1, a5, a2, a6, a3, a7);
}

namespace intrinsics
{
template <size_t width, bool inverse, typename T> 127 KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, cfalse_t /*split_format*/, cbool_t<inverse>, 128 const cvec<T, width>& w, const cvec<T, width>& tw) 129 { 130 cvec<T, width> ww = w; 131 cvec<T, width> tw_ = tw; 132 cvec<T, width> b1 = ww * dupeven(tw_); 133 ww = swap<2>(ww); 134 135 if constexpr (inverse) 136 { 137 ww = addsub(b1, ww * dupodd(tw_)); 138 } 139 else 140 { 141 ww = subadd(b1, ww * dupodd(tw_)); 142 } 143 return ww; 144 } 145 146 template <size_t width, bool use_br2, bool inverse, bool aligned, typename T> 147 KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, cfalse_t, cfalse_t, cfalse_t, cbool_t<use_br2>, 148 cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, const complex<T>* in, 149 const complex<T>* twiddle) 150 { 151 const size_t N4 = N / 4; 152 cvec<T, width> w1, w2, w3; 153 154 cvec<T, width> sum02, sum13, diff02, diff13; 155 156 cvec<T, width> a0, a1, a2, a3; 157 a0 = cread<width, aligned>(in + 0); 158 a2 = cread<width, aligned>(in + N4 * 2); 159 sum02 = a0 + a2; 160 161 a1 = cread<width, aligned>(in + N4); 162 a3 = cread<width, aligned>(in + N4 * 3); 163 sum13 = a1 + a3; 164 165 cwrite<width, aligned>(out, sum02 + sum13); 166 w2 = sum02 - sum13; 167 cwrite<width, aligned>(out + N4 * (use_br2 ? 1 : 2), 168 radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w2, 169 cread<width, true>(twiddle + width))); 170 diff02 = a0 - a2; 171 diff13 = a1 - a3; 172 if constexpr (inverse) 173 { 174 diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); 175 diff13 = swap<2>(diff13); 176 } 177 else 178 { 179 diff13 = swap<2>(diff13); 180 diff13 = (diff13 ^ broadcast<width * 2, T>(T(), -T())); 181 } 182 183 w1 = diff02 + diff13; 184 185 cwrite<width, aligned>(out + N4 * (use_br2 ? 
2 : 1), 186 radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), w1, 187 cread<width, true>(twiddle + 0))); 188 w3 = diff02 - diff13; 189 cwrite<width, aligned>(out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), cfalse, cbool_t<inverse>(), 190 w3, cread<width, true>(twiddle + width * 2))); 191 } 192 193 template <size_t width, bool inverse, typename T> 194 KFR_INTRINSIC cvec<T, width> radix4_apply_twiddle(csize_t<width>, ctrue_t /*split_format*/, cbool_t<inverse>, 195 const cvec<T, width>& w, const cvec<T, width>& tw) 196 { 197 vec<T, width> re1, im1, twre, twim; 198 split<T, 2 * width>(w, re1, im1); 199 split<T, 2 * width>(tw, twre, twim); 200 201 const vec<T, width> b1re = re1 * twre; 202 const vec<T, width> b1im = im1 * twre; 203 if constexpr (inverse) 204 return concat(b1re + im1 * twim, b1im - re1 * twim); 205 else 206 return concat(b1re - im1 * twim, b1im + re1 * twim); 207 } 208 209 template <size_t width, bool splitout, bool splitin, bool use_br2, bool inverse, bool aligned, typename T> 210 KFR_INTRINSIC void radix4_body(size_t N, csize_t<width>, ctrue_t, cbool_t<splitout>, cbool_t<splitin>, 211 cbool_t<use_br2>, cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, 212 const complex<T>* in, const complex<T>* twiddle) 213 { 214 const size_t N4 = N / 4; 215 cvec<T, width> w1, w2, w3; 216 constexpr bool read_split = !splitin; 217 constexpr bool write_split = !splitout; 218 219 vec<T, width> re0, im0, re1, im1, re2, im2, re3, im3; 220 221 split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 0), re0, im0); 222 split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 2), re2, im2); 223 const vec<T, width> sum02re = re0 + re2; 224 const vec<T, width> sum02im = im0 + im2; 225 226 split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 1), re1, im1); 227 split<T, 2 * width>(cread_split<width, aligned, read_split>(in + N4 * 3), re3, im3); 228 const vec<T, width> sum13re = re1 + re3; 229 const vec<T, width> 
sum13im = im1 + im3; 230 231 cwrite_split<width, aligned, write_split>(out, concat(sum02re + sum13re, sum02im + sum13im)); 232 w2 = concat(sum02re - sum13re, sum02im - sum13im); 233 cwrite_split<width, aligned, write_split>( 234 out + N4 * (use_br2 ? 1 : 2), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w2, 235 cread<width, true>(twiddle + width))); 236 237 const vec<T, width> diff02re = re0 - re2; 238 const vec<T, width> diff02im = im0 - im2; 239 const vec<T, width> diff13re = re1 - re3; 240 const vec<T, width> diff13im = im1 - im3; 241 242 (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); 243 (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); 244 245 cwrite_split<width, aligned, write_split>( 246 out + N4 * (use_br2 ? 2 : 1), radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w1, 247 cread<width, true>(twiddle + 0))); 248 cwrite_split<width, aligned, write_split>( 249 out + N4 * 3, radix4_apply_twiddle(csize_t<width>(), ctrue, cbool_t<inverse>(), w3, 250 cread<width, true>(twiddle + width * 2))); 251 } 252 253 template <typename T> 254 CMT_NOINLINE cvec<T, 1> calculate_twiddle(size_t n, size_t size) 255 { 256 if (n == 0) 257 { 258 return make_vector(static_cast<T>(1), static_cast<T>(0)); 259 } 260 else if (n == size / 4) 261 { 262 return make_vector(static_cast<T>(0), static_cast<T>(-1)); 263 } 264 else if (n == size / 2) 265 { 266 return make_vector(static_cast<T>(-1), static_cast<T>(0)); 267 } 268 else if (n == size * 3 / 4) 269 { 270 return make_vector(static_cast<T>(0), static_cast<T>(1)); 271 } 272 else 273 { 274 fbase kth = c_pi<fbase, 2> * (n / static_cast<fbase>(size)); 275 fbase tcos = +kfr::cos(kth); 276 fbase tsin = -kfr::sin(kth); 277 return make_vector(static_cast<T>(tcos), static_cast<T>(tsin)); 278 } 279 } 280 281 template <typename T, size_t width> 282 KFR_INTRINSIC void initialize_twiddles_impl(complex<T>*& twiddle, size_t nn, size_t nnstep, size_t size, 283 bool 
split_format) 284 { 285 static_assert(width > 0, "width cannot be zero"); 286 vec<T, 2 * width> result = T(); 287 CMT_LOOP_UNROLL 288 for (size_t i = 0; i < width; i++) 289 { 290 const cvec<T, 1> r = calculate_twiddle<T>(nn + nnstep * i, size); 291 result[i * 2] = r[0]; 292 result[i * 2 + 1] = r[1]; 293 } 294 if (split_format) 295 ref_cast<cvec<T, width>>(twiddle[0]) = splitpairs(result); 296 else 297 ref_cast<cvec<T, width>>(twiddle[0]) = result; 298 twiddle += width; 299 } 300 301 template <typename T, size_t width> 302 CMT_NOINLINE void initialize_twiddles(complex<T>*& twiddle, size_t stage_size, size_t size, bool split_format) 303 { 304 static_assert(width > 0, "width cannot be zero"); 305 const size_t count = stage_size / 4; 306 size_t nnstep = size / stage_size; 307 DFT_ASSERT(width <= count); 308 CMT_LOOP_NOUNROLL 309 for (size_t n = 0; n < count; n += width) 310 { 311 initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 1, nnstep * 1, size, split_format); 312 initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 2, nnstep * 2, size, split_format); 313 initialize_twiddles_impl<T, width>(twiddle, n * nnstep * 3, nnstep * 3, size, split_format); 314 } 315 } 316 317 #ifdef KFR_NO_PREFETCH 318 #define KFR_PREFETCH(addr) \ 319 do \ 320 { \ 321 (void)(addr); \ 322 } while (0) 323 #else 324 325 #if defined CMT_ARCH_SSE 326 #ifdef CMT_COMPILER_GNU 327 #define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr), 0, _MM_HINT_T0); 328 #else 329 #define KFR_PREFETCH(addr) _mm_prefetch(::kfr::ptr_cast<char>(addr), _MM_HINT_T0); 330 #endif 331 #else 332 #define KFR_PREFETCH(addr) __builtin_prefetch(::kfr::ptr_cast<void>(addr)); 333 #endif 334 #endif 335 336 template <size_t size = 1, typename T> 337 KFR_INTRINSIC void prefetch_one(const complex<T>* in) 338 { 339 KFR_PREFETCH(in); 340 if constexpr (sizeof(complex<T>) * size > 64) 341 KFR_PREFETCH(in + 64); 342 if constexpr (sizeof(complex<T>) * size > 128) 343 KFR_PREFETCH(in + 128); 344 if constexpr 
(sizeof(complex<T>) * size > 192) 345 KFR_PREFETCH(in + 192); 346 } 347 348 template <size_t size = 1, typename T> 349 KFR_INTRINSIC void prefetch_four(size_t stride, const complex<T>* in) 350 { 351 prefetch_one<size>(in); 352 prefetch_one<size>(in + stride); 353 prefetch_one<size>(in + stride * 2); 354 prefetch_one<size>(in + stride * 3); 355 } 356 357 template <size_t size = 1, typename T> 358 KFR_INTRINSIC void prefetch_eight(size_t stride, const complex<T>* in) 359 { 360 prefetch_one<size>(in); 361 prefetch_one<size>(in + stride); 362 prefetch_one<size>(in + stride * 2); 363 prefetch_one<size>(in + stride * 3); 364 prefetch_one<size>(in + stride * 4); 365 prefetch_one<size>(in + stride * 5); 366 prefetch_one<size>(in + stride * 6); 367 prefetch_one<size>(in + stride * 7); 368 } 369 370 template <typename Ntype, size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, 371 bool inverse, bool aligned, typename T> 372 KFR_INTRINSIC cfalse_t radix4_pass(Ntype N, size_t blocks, csize_t<width>, cbool_t<splitout>, 373 cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, 374 cbool_t<aligned>, complex<T>* out, const complex<T>* in, 375 const complex<T>*& twiddle) 376 { 377 static_assert(width > 0, "width cannot be zero"); 378 constexpr static size_t prefetch_cycles = 8; 379 const auto N4 = N / csize_t<4>(); 380 const auto N43 = N4 * csize_t<3>(); 381 CMT_ASSUME(blocks > 0); 382 CMT_ASSUME(N > 0); 383 CMT_ASSUME(N4 > 0); 384 DFT_ASSERT(width <= N4); 385 CMT_LOOP_NOUNROLL for (size_t b = 0; b < blocks; b++) 386 { 387 CMT_LOOP_NOUNROLL 388 for (size_t n2 = 0; n2 < N4;) 389 { 390 if constexpr (prefetch) 391 prefetch_four<width>(N4, in + width * prefetch_cycles); 392 radix4_body(N, csize_t<width>(), cbool_t<(splitout || splitin)>(), cbool_t<splitout>(), 393 cbool_t<splitin>(), cbool_t<use_br2>(), cbool_t<inverse>(), cbool_t<aligned>(), out, 394 in, twiddle + n2 * 3); 395 in += width; 396 out += width; 397 n2 += width; 398 } 399 in += N43; 
400 out += N43; 401 } 402 twiddle += N43; 403 return {}; 404 } 405 406 template <size_t width, bool splitout, bool splitin, bool prefetch, bool use_br2, bool inverse, typename T> 407 KFR_INTRINSIC void radix2_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>, 408 cbool_t<splitin>, cbool_t<use_br2>, cbool_t<prefetch>, 409 cbool_t<inverse>, complex<T>* out, const complex<T>* in, 410 const complex<T>*& twiddle) 411 412 { 413 for (size_t n = 0; n < stride; n++) 414 { 415 const cvec<T, 1> a = cread<1>(in + n); 416 const cvec<T, 1> b = cread<1>(in + n + stride); 417 cwrite<1>(out + n, a + b); 418 cwrite<1>(out + n + stride, a - b); 419 } 420 } 421 422 template <size_t N, bool inverse, bool split_format, typename T> 423 KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, 424 cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) 425 { 426 if constexpr (split_format) 427 { 428 const cvec<T, N> sum02 = a0 + a2; 429 const cvec<T, N> diff02 = a0 - a2; 430 const cvec<T, N> sum13 = a1 + a3; 431 cvec<T, N> diff13 = a1 - a3; 432 vec<T, N> diff13re, diff13im, diff02re, diff02im; 433 split(diff02, diff02re, diff02im); 434 split(diff13, diff13re, diff13im); 435 w0 = sum02 + sum13; 436 w2 = sum02 - sum13; 437 438 (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); 439 (inverse ? 
w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); 440 } 441 else 442 { 443 const cvec<T, N> sum02 = a0 + a2; 444 const cvec<T, N> diff02 = a0 - a2; 445 const cvec<T, N> sum13 = a1 + a3; 446 cvec<T, N> diff13 = swap<2>(a1 - a3); 447 if constexpr (inverse) 448 diff13 = negodd(diff13); 449 else 450 diff13 = negeven(diff13); 451 w0 = sum02 + sum13; 452 w2 = sum02 - sum13; 453 w1 = diff02 - diff13; 454 w3 = diff02 + diff13; 455 } 456 } 457 458 template <size_t N, bool inverse, bool split_format, typename T> 459 KFR_INTRINSIC void radix4_butterfly(cvec<T, N> a0, cvec<T, N> a1, cvec<T, N> a2, cvec<T, N> a3, 460 cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, 461 cvec<T, N> tw1, cvec<T, N> tw2, cvec<T, N> tw3) 462 { 463 if constexpr (split_format) 464 { 465 const cvec<T, N> sum02 = a0 + a2; 466 const cvec<T, N> diff02 = a0 - a2; 467 const cvec<T, N> sum13 = a1 + a3; 468 cvec<T, N> diff13 = a1 - a3; 469 vec<T, N> diff13re, diff13im, diff02re, diff02im; 470 split(diff02, diff02re, diff02im); 471 split(diff13, diff13re, diff13im); 472 473 w0 = sum02 + sum13; 474 w2 = sum02 - sum13; 475 (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); 476 (inverse ? 
w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); 477 478 w2 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w2, tw2); 479 w1 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w1, tw1); 480 w3 = radix4_apply_twiddle(csize<N>, ctrue, cbool<inverse>, w3, tw3); 481 } 482 else 483 { 484 const cvec<T, N> sum02 = a0 + a2; 485 const cvec<T, N> diff02 = a0 - a2; 486 const cvec<T, N> sum13 = a1 + a3; 487 cvec<T, N> diff13 = swap<2>(a1 - a3); 488 if constexpr (inverse) 489 diff13 = negodd(diff13); 490 else 491 diff13 = negeven(diff13); 492 493 w0 = sum02 + sum13; 494 w2 = sum02 - sum13; 495 w1 = diff02 - diff13; 496 w3 = diff02 + diff13; 497 w2 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w2, tw2); 498 w1 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w1, tw1); 499 w3 = radix4_apply_twiddle(csize<N>, cfalse, cbool<inverse>, w3, tw3); 500 } 501 } 502 503 template <size_t N, bool split_format, typename T> 504 KFR_INTRINSIC cvec<T, N> read_twiddle(const complex<T>* tw) 505 { 506 if constexpr (split_format) 507 { 508 return concat(repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw))), 509 repeat<N>(read(cunaligned, csize<1>, ptr_cast<T>(tw) + 1))); 510 } 511 else 512 { 513 return repeat<N>(cread<1>(tw)); 514 } 515 } 516 517 template <size_t width_, bool splitout, bool splitin, bool prefetch, bool inverse, typename T> 518 KFR_INTRINSIC void radix4_autosort_pass_first(size_t N, csize_t<width_>, cbool_t<splitout>, cbool_t<splitin>, 519 cbool_t<prefetch>, cbool_t<inverse>, complex<T>* out, 520 const complex<T>* in, const complex<T>*& twiddle) 521 { 522 static_assert(width_ > 0, "width cannot be zero"); 523 const size_t N4 = N / 4; 524 const size_t Nstride = N4; 525 constexpr size_t width = width_; 526 constexpr bool split = splitin || splitout; 527 static_assert(!split); 528 constexpr static size_t prefetch_cycles = 8; 529 530 // CMT_LOOP_NOUNROLL 531 for (size_t b = 0; b < N4; b += width) 532 { 533 if constexpr (prefetch) 534 
prefetch_four<width>(Nstride, in + prefetch_cycles * width); 535 cvec<T, width> tw1 = cread<width>(twiddle); 536 cvec<T, width> tw2 = cread<width>(twiddle + width); 537 cvec<T, width> tw3 = cread<width>(twiddle + 2 * width); 538 539 const cvec<T, width> a0 = cread<width>(in + 0 * Nstride); 540 const cvec<T, width> a1 = cread<width>(in + 1 * Nstride); 541 const cvec<T, width> a2 = cread<width>(in + 2 * Nstride); 542 const cvec<T, width> a3 = cread<width>(in + 3 * Nstride); 543 cvec<T, width> w0, w1, w2, w3; 544 radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3); 545 cvec<T, 4 * width> w0123 = concat(w0, w1, w2, w3); 546 w0123 = ctranspose<width>(w0123); 547 cwrite<4 * width>(out, w0123); 548 twiddle += 3 * width; 549 in += width; 550 out += 4 * width; 551 } 552 } 553 554 template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T> 555 KFR_INTRINSIC void radix4_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>, 556 cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>, 557 complex<T>* out, const complex<T>* in, 558 const complex<T>*& twiddle) 559 { 560 static_assert(width > 0, "width cannot be zero"); 561 constexpr static size_t prefetch_cycles = 8; 562 constexpr bool split = splitin || splitout; 563 constexpr bool read_split = !splitin && split; 564 constexpr bool write_split = !splitout && split; 565 static_assert(!splitout); 566 567 CMT_PRAGMA_CLANG(clang loop unroll_count(4)) 568 for (size_t n = 0; n < stride; n += width) 569 { 570 if constexpr (prefetch) 571 prefetch_four<width>(stride, in + prefetch_cycles * width); 572 const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride); 573 const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride); 574 const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride); 575 const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride); 576 cvec<T, width> w0, 
w1, w2, w3; 577 radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3); 578 cwrite_split<width, false, write_split>(out + 0 * stride, w0); 579 cwrite_split<width, false, write_split>(out + 1 * stride, w1); 580 cwrite_split<width, false, write_split>(out + 2 * stride, w2); 581 cwrite_split<width, false, write_split>(out + 3 * stride, w3); 582 in += width; 583 out += width; 584 } 585 } 586 587 template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T> 588 KFR_INTRINSIC void radix8_autosort_pass_last(size_t stride, csize_t<width>, cbool_t<splitout>, 589 cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>, 590 complex<T>* out, const complex<T>* in, 591 const complex<T>*& twiddle) 592 { 593 static_assert(width > 0, "width cannot be zero"); 594 constexpr static size_t prefetch_cycles = 4; 595 constexpr bool split = splitin || splitout; 596 constexpr bool read_split = !splitin && split; 597 constexpr bool write_split = !splitout && split; 598 static_assert(!splitout); 599 600 CMT_PRAGMA_CLANG(clang loop unroll_count(4)) 601 for (size_t n = 0; n < stride; n += width) 602 { 603 if constexpr (prefetch) 604 prefetch_eight<width>(stride, in + prefetch_cycles * width); 605 const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * stride); 606 const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * stride); 607 const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * stride); 608 const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * stride); 609 const cvec<T, width> a4 = cread_split<width, false, read_split>(in + 4 * stride); 610 const cvec<T, width> a5 = cread_split<width, false, read_split>(in + 5 * stride); 611 const cvec<T, width> a6 = cread_split<width, false, read_split>(in + 6 * stride); 612 const cvec<T, width> a7 = cread_split<width, false, read_split>(in + 7 * stride); 613 cvec<T, width> w0, w1, w2, w3, w4, w5, w6, w7; 614 butterfly8<width, 
inverse>(a0, a1, a2, a3, a4, a5, a6, a7, w0, w1, w2, w3, w4, w5, w6, w7); 615 cwrite_split<width, false, write_split>(out + 0 * stride, w0); 616 cwrite_split<width, false, write_split>(out + 1 * stride, w1); 617 cwrite_split<width, false, write_split>(out + 2 * stride, w2); 618 cwrite_split<width, false, write_split>(out + 3 * stride, w3); 619 cwrite_split<width, false, write_split>(out + 4 * stride, w4); 620 cwrite_split<width, false, write_split>(out + 5 * stride, w5); 621 cwrite_split<width, false, write_split>(out + 6 * stride, w6); 622 cwrite_split<width, false, write_split>(out + 7 * stride, w7); 623 in += width; 624 out += width; 625 } 626 } 627 628 template <size_t width, bool splitout, bool splitin, bool prefetch, bool inverse, typename T> 629 KFR_INTRINSIC void radix4_autosort_pass(size_t N, size_t stride, csize_t<width>, cbool_t<splitout>, 630 cbool_t<splitin>, cbool_t<prefetch>, cbool_t<inverse>, 631 complex<T>* out, const complex<T>* in, const complex<T>*& twiddle) 632 { 633 static_assert(width > 0, "width cannot be zero"); 634 constexpr static size_t prefetch_cycles = 8; 635 const size_t N4 = N / 4; 636 const size_t Nstride = stride * N4; 637 const size_t stride3 = 3 * stride; 638 constexpr bool split = splitin || splitout; 639 constexpr bool read_split = !splitin && split; 640 constexpr bool write_split = !splitout && split; 641 642 { 643 for (size_t n = 0; n < stride; n += width) 644 { 645 if constexpr (prefetch) 646 prefetch_four<width>(Nstride, in + prefetch_cycles * width); 647 const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride); 648 const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride); 649 const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride); 650 const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride); 651 cvec<T, width> w0, w1, w2, w3; 652 radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3); 653 
cwrite_split<width, false, write_split>(out + 0 * stride, w0); 654 cwrite_split<width, false, write_split>(out + 1 * stride, w1); 655 cwrite_split<width, false, write_split>(out + 2 * stride, w2); 656 cwrite_split<width, false, write_split>(out + 3 * stride, w3); 657 in += width; 658 out += width; 659 } 660 twiddle += 3; 661 out += stride3; 662 } 663 664 // CMT_LOOP_NOUNROLL 665 for (size_t b = 1; b < N4; b++) 666 { 667 cvec<T, width> tw1 = read_twiddle<width, split>(twiddle); 668 cvec<T, width> tw2 = read_twiddle<width, split>(twiddle + 1); 669 cvec<T, width> tw3 = read_twiddle<width, split>(twiddle + 2); 670 for (size_t n = 0; n < stride; n += width) 671 { 672 if constexpr (prefetch) 673 prefetch_four<width>(Nstride, in + prefetch_cycles * width); 674 const cvec<T, width> a0 = cread_split<width, false, read_split>(in + 0 * Nstride); 675 const cvec<T, width> a1 = cread_split<width, false, read_split>(in + 1 * Nstride); 676 const cvec<T, width> a2 = cread_split<width, false, read_split>(in + 2 * Nstride); 677 const cvec<T, width> a3 = cread_split<width, false, read_split>(in + 3 * Nstride); 678 cvec<T, width> w0, w1, w2, w3; 679 radix4_butterfly<width, inverse, split>(a0, a1, a2, a3, w0, w1, w2, w3, tw1, tw2, tw3); 680 cwrite_split<width, false, write_split>(out + 0 * stride, w0); 681 cwrite_split<width, false, write_split>(out + 1 * stride, w1); 682 cwrite_split<width, false, write_split>(out + 2 * stride, w2); 683 cwrite_split<width, false, write_split>(out + 3 * stride, w3); 684 in += width; 685 out += width; 686 } 687 twiddle += 3; 688 out += stride3; 689 } 690 } 691 692 template <typename T> 693 static void initialize_twiddle_autosort(size_t N, size_t w, complex<T>*& twiddle) 694 { 695 for (size_t b = 0; b < N / 4; ++b) 696 { 697 cwrite<1>(twiddle + b / w * 3 * w + b % w + 0 * w, calculate_twiddle<T>(b, N)); 698 cwrite<1>(twiddle + b / w * 3 * w + b % w + 1 * w, calculate_twiddle<T>(2 * b, N)); 699 cwrite<1>(twiddle + b / w * 3 * w + b % w + 2 * w, 
calculate_twiddle<T>(3 * b, N)); 700 } 701 twiddle += N / 4 * 3; 702 } 703 704 template <typename T> 705 KFR_INTRINSIC void interleavehalves32(cvec<T, 32>& v0) 706 { 707 cvec<T, 8> t0, t1, t2, t3; 708 split(v0, t0, t1, t2, t3); 709 t0 = interleavehalves(t0); 710 t1 = interleavehalves(t1); 711 t2 = interleavehalves(t2); 712 t3 = interleavehalves(t3); 713 v0 = concat(t0, t1, t2, t3); 714 } 715 716 template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> 717 KFR_INTRINSIC ctrue_t radix4_pass(csize_t<8>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, 718 cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, 719 complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) 720 { 721 CMT_ASSUME(blocks > 0); 722 DFT_ASSERT(4 <= blocks); 723 constexpr static size_t prefetch_cycles = 8; 724 if constexpr (vector_capacity<T> >= 128) 725 { 726 CMT_PRAGMA_CLANG(clang loop unroll(disable)) 727 for (size_t b = 0; b < blocks; b += 4) 728 { 729 if constexpr (prefetch) 730 prefetch_one<32>(out + prefetch_cycles * 32); 731 732 cvec<T, 32> v32 = cread<32, aligned>(out); 733 cvec<T, 4> v0, v1, v2, v3, v4, v5, v6, v7; 734 v32 = ctranspose<8>(v32); 735 split(v32, v0, v1, v2, v3, v4, v5, v6, v7); 736 butterfly8<4, inverse>(v0, v1, v2, v3, v4, v5, v6, v7); 737 v32 = concat(v0, v4, v2, v6, v1, v5, v3, v7); 738 v32 = ctranspose<4>(v32); 739 cwrite<32, aligned>(out, v32); 740 741 out += 32; 742 } 743 } 744 else 745 { 746 CMT_PRAGMA_CLANG(clang loop unroll(disable)) 747 for (size_t b = 0; b < blocks; b += 2) 748 { 749 if constexpr (prefetch) 750 prefetch_one<16>(out + prefetch_cycles * 16); 751 752 cvec<T, 16> v16 = cread<16, aligned>(out); 753 cvec<T, 2> v0, v1, v2, v3, v4, v5, v6, v7; 754 v16 = ctranspose<8>(v16); 755 split(v16, v0, v1, v2, v3, v4, v5, v6, v7); 756 butterfly8<2, inverse>(v0, v1, v2, v3, v4, v5, v6, v7); 757 v16 = concat(v0, v4, v2, v6, v1, v5, v3, v7); 758 v16 = ctranspose<2>(v16); 759 cwrite<16, aligned>(out, 
v16); 760 761 out += 16; 762 } 763 } 764 return {}; 765 } 766 767 template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> 768 KFR_INTRINSIC ctrue_t radix4_pass(csize_t<16>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, 769 cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, 770 complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) 771 { 772 CMT_ASSUME(blocks > 0); 773 constexpr static size_t prefetch_cycles = 4; 774 DFT_ASSERT(4 <= blocks); 775 if constexpr (vector_capacity<T> >= 128) 776 { 777 CMT_PRAGMA_CLANG(clang loop unroll(disable)) 778 for (size_t b = 0; b < blocks; b += 4) 779 { 780 if constexpr (prefetch) 781 prefetch_one<64>(out + prefetch_cycles * 64); 782 783 cvec<T, 16> v0 = cread<16, aligned>(out); 784 cvec<T, 16> v1 = cread<16, aligned>(out + 16); 785 cvec<T, 16> v2 = cread<16, aligned>(out + 32); 786 cvec<T, 16> v3 = cread<16, aligned>(out + 48); 787 butterfly4_packed<4, inverse>(v0); 788 butterfly4_packed<4, inverse>(v1); 789 butterfly4_packed<4, inverse>(v2); 790 butterfly4_packed<4, inverse>(v3); 791 apply_twiddles4<0, 4, 4, inverse>(v0); 792 apply_twiddles4<0, 4, 4, inverse>(v1); 793 apply_twiddles4<0, 4, 4, inverse>(v2); 794 apply_twiddles4<0, 4, 4, inverse>(v3); 795 v0 = digitreverse4<2>(v0); 796 v1 = digitreverse4<2>(v1); 797 v2 = digitreverse4<2>(v2); 798 v3 = digitreverse4<2>(v3); 799 butterfly4_packed<4, inverse>(v0); 800 butterfly4_packed<4, inverse>(v1); 801 butterfly4_packed<4, inverse>(v2); 802 butterfly4_packed<4, inverse>(v3); 803 804 use_br2 ? cbitreverse_write(out, v0) : cdigitreverse4_write(out, v0); 805 use_br2 ? cbitreverse_write(out + 16, v1) : cdigitreverse4_write(out + 16, v1); 806 use_br2 ? cbitreverse_write(out + 32, v2) : cdigitreverse4_write(out + 32, v2); 807 use_br2 ? 
cbitreverse_write(out + 48, v3) : cdigitreverse4_write(out + 48, v3); 808 out += 64; 809 } 810 } 811 else 812 { 813 CMT_PRAGMA_CLANG(clang loop unroll(disable)) 814 for (size_t b = 0; b < blocks; b += 2) 815 { 816 if constexpr (prefetch) 817 prefetch_one<32>(out + prefetch_cycles * 32); 818 819 cvec<T, 16> vlo = cread<16, aligned>(out); 820 cvec<T, 16> vhi = cread<16, aligned>(out + 16); 821 butterfly4_packed<4, inverse>(vlo); 822 butterfly4_packed<4, inverse>(vhi); 823 apply_twiddles4<0, 4, 4, inverse>(vlo); 824 apply_twiddles4<0, 4, 4, inverse>(vhi); 825 vlo = digitreverse4<2>(vlo); 826 vhi = digitreverse4<2>(vhi); 827 butterfly4_packed<4, inverse>(vlo); 828 butterfly4_packed<4, inverse>(vhi); 829 830 use_br2 ? cbitreverse_write(out, vlo) : cdigitreverse4_write(out, vlo); 831 use_br2 ? cbitreverse_write(out + 16, vhi) : cdigitreverse4_write(out + 16, vhi); 832 out += 32; 833 } 834 } 835 return {}; 836 } 837 838 template <size_t width, bool prefetch, bool use_br2, bool inverse, bool aligned, typename T> 839 KFR_INTRINSIC ctrue_t radix4_pass(csize_t<4>, size_t blocks, csize_t<width>, cfalse_t, cfalse_t, 840 cbool_t<use_br2>, cbool_t<prefetch>, cbool_t<inverse>, cbool_t<aligned>, 841 complex<T>* out, const complex<T>*, const complex<T>*& /*twiddle*/) 842 { 843 constexpr static size_t prefetch_cycles = 8; 844 CMT_ASSUME(blocks > 8); 845 DFT_ASSERT(8 <= blocks); 846 for (size_t b = 0; b < blocks; b += 4) 847 { 848 if constexpr (prefetch) 849 prefetch_one<16>(out + prefetch_cycles * 16); 850 851 cvec<T, 16> v16 = cdigitreverse4_read<16, aligned>(out); 852 butterfly4_packed<4, inverse>(v16); 853 if constexpr (use_br2) 854 v16 = permutegroups<(8), 0, 2, 1, 3>(v16); 855 cdigitreverse4_write<aligned>(out, v16); 856 857 out += 4 * 4; 858 } 859 return {}; 860 } 861 862 template <typename T> 863 struct fft_config 864 { 865 constexpr static inline const bool recursion = true; 866 #ifdef CMT_ARCH_NEON 867 constexpr static inline const bool prefetch = false; 868 #else 869 
    // Software prefetch pays off on non-NEON targets.
    constexpr static inline const bool prefetch = true;
#endif
    // Number of complex elements processed per pass iteration,
    // derived from the register file capacity for T.
    constexpr static inline const size_t process_width =
        const_max(static_cast<size_t>(1), vector_capacity<T> / 16);
};

/// One generic radix-4 DIF stage for large transforms (stage_size >= 2048).
/// `splitin` selects split-complex input layout produced by the previous stage.
template <typename T, bool splitin, bool is_even>
struct fft_stage_impl : dft_stage<T>
{
    fft_stage_impl(size_t stage_size)
    {
        this->name = dft_name(this);
        this->radix = 4;
        this->stage_size = stage_size;
        this->repeats = 4;
        this->recursion = fft_config<T>::recursion;
        // 3 twiddles per radix-4 butterfly group (stage_size / 4 groups).
        this->data_size =
            align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
    }

    constexpr static bool prefetch = fft_config<T>::prefetch;
    constexpr static bool aligned = false;
    constexpr static size_t width = fft_config<T>::process_width;
    constexpr static bool use_br2 = !is_even || always_br2;

    virtual void do_initialize(size_t size) override final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddles<T, width>(twiddle, this->stage_size, size, true);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        if constexpr (splitin)
            in = out; // split-format input is always in place
        const size_t stg_size = this->stage_size;
        // This stage is only instantiated for sizes >= 2048 (see make_fft_stages).
        CMT_ASSUME(stg_size >= 2048);
        CMT_ASSUME(stg_size % 2048 == 0);
        radix4_pass(stg_size, 1, csize_t<width>(), ctrue, cbool_t<splitin>(), cbool_t<use_br2>(),
                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle);
    }
};

/// Final compound stage: runs the whole remaining transform of compile-time
/// `size` (512 or 1024) as a chain of radix-4 passes unrolled via template
/// recursion (final_stage), ending in the fixed-size 4/8/16 terminal passes.
template <typename T, bool splitin, size_t size>
struct fft_final_stage_impl : dft_stage<T>
{
    fft_final_stage_impl(size_t)
    {
        this->name = dft_name(this);
        this->radix = size;
        this->stage_size = size;
        this->out_offset = size;
        this->repeats = 4;
        this->recursion = fft_config<T>::recursion;
        // Total twiddle storage for the whole chain: size * 3/2 complex values.
        this->data_size =
            align_up(sizeof(complex<T>) * size * 3 / 2, platform<>::native_cache_alignment);
    }

    constexpr static size_t width = fft_config<T>::process_width;
    constexpr static bool is_even = cometa::is_even(ilog2(size));
    constexpr static bool use_br2 = !is_even || always_br2;
    constexpr static bool aligned = false;
    constexpr static bool prefetch = fft_config<T>::prefetch && splitin;

    // Terminal passes (N == 8, N == 4) use fixed twiddles: nothing to initialize.
    template <bool pass_splitin>
    KFR_MEM_INTRINSIC void init_twiddles(csize_t<8>, size_t, cbool_t<pass_splitin>, complex<T>*&)
    {
    }
    template <bool pass_splitin>
    KFR_MEM_INTRINSIC void init_twiddles(csize_t<4>, size_t, cbool_t<pass_splitin>, complex<T>*&)
    {
    }

    // Whether a pass of size N emits split-complex output for the next pass.
    static constexpr bool get_pass_splitout(size_t N) { return N / 4 > 8 && N / 4 / 4 >= width; }

    // Recursively lays out twiddles for each pass of the chain; must mirror
    // the pass sequence in final_stage below exactly.
    template <size_t N, bool pass_splitin>
    KFR_MEM_INTRINSIC void init_twiddles(csize_t<N>, size_t total_size, cbool_t<pass_splitin>,
                                         complex<T>*& twiddle)
    {
        constexpr bool pass_splitout = get_pass_splitout(N);
        constexpr size_t pass_width = const_min(width, N / 4);
        initialize_twiddles<T, pass_width>(twiddle, N, total_size, pass_splitout || pass_splitin);
        init_twiddles(csize<N / 4>, total_size, cbool<pass_splitout>, twiddle);
    }

    virtual void do_initialize(size_t total_size) override final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        init_twiddles(csize<size>, total_size, cbool<splitin>, twiddle);
    }

    DFT_STAGE_FN_NONFINAL
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        final_stage<inverse>(csize<size>, 1, cbool<splitin>, out, in, twiddle);
    }

    // Terminal 16-point pass; only selected when registers are wide enough.
    template <bool inverse, bool pass_splitin, typename U = T, KFR_ENABLE_IF(vector_capacity<U> >= 128)>
    KFR_MEM_INTRINSIC void final_stage(csize_t<16>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                       const complex<T>*, const complex<T>*& twiddle)
    {
        radix4_pass(csize_t<16>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
    }

    // Terminal 8-point pass.
    template <bool inverse, bool pass_splitin>
    KFR_MEM_INTRINSIC void final_stage(csize_t<8>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                       const complex<T>*, const complex<T>*& twiddle)
    {
        radix4_pass(csize_t<8>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
    }

    // Terminal 4-point pass.
    template <bool inverse, bool pass_splitin>
    KFR_MEM_INTRINSIC void final_stage(csize_t<4>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                       const complex<T>*, const complex<T>*& twiddle)
    {
        radix4_pass(csize_t<4>(), invN, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(),
                    cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle);
    }

    // General pass: one radix-4 layer of size N, then recurse on N/4.
    // `twiddle` is advanced by radix4_pass (passed by reference).
    template <bool inverse, size_t N, bool pass_splitin>
    KFR_MEM_INTRINSIC void final_stage(csize_t<N>, size_t invN, cbool_t<pass_splitin>, complex<T>* out,
                                       const complex<T>* in, const complex<T>*& twiddle)
    {
        static_assert(N > 8, "");
        constexpr bool pass_splitout = get_pass_splitout(N);
        constexpr size_t pass_width = const_min(width, N / 4);
        static_assert(pass_width == width || !pass_splitin, "");
        static_assert(pass_width <= N / 4, "");
        radix4_pass(N, invN, csize_t<pass_width>(), cbool<pass_splitout>, cbool_t<pass_splitin>(),
                    cbool_t<use_br2>(), cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, in,
                    twiddle);
        final_stage<inverse>(csize<N / 4>, invN * 4, cbool<pass_splitout>, out, out, twiddle);
    }
};

/// Stand-alone reordering stage (bit/digit-reversal permutation) appended
/// after the butterfly stages of the DIF decomposition.
template <typename T, bool is_even>
struct fft_reorder_stage_impl : dft_stage<T>
{
    fft_reorder_stage_impl(size_t stage_size)
    {
        this->name = dft_name(this);
        this->stage_size = stage_size;
        // `user` stores log2(size); consumed by do_execute below.
        this->user = ilog2(stage_size);
        this->data_size = 0; // no twiddles needed for a pure permutation
    }

    virtual void do_initialize(size_t) override final {}

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>*, u8*)
    {
        fft_reorder(out, this->user, cbool_t<(!is_even || always_br2)>());
    }
};

/// One pass of the autosort (self-sorting Stockham-style) FFT: no separate
/// reorder stage is needed. `is_first`/`is_last` select the specialized first
/// and last pass kernels; `radix8` selects a radix-8 final pass.
template <typename T, bool is_first, bool is_last, bool radix8>
struct fft_autosort_stage_impl : dft_stage<T>
{
    fft_autosort_stage_impl(size_t stage_size, size_t stride)
    {
        this->name = dft_name(this);
        this->radix = radix8 ? 8 : 4;
        this->stage_size = stage_size * stride * this->radix;
        this->blocks = stage_size;
        this->recursion = false;
        this->can_inplace = is_last; // only the last pass may run in place
        this->need_reorder = false;  // autosort output is already in order
        this->user = stride;         // `user` stores the stride between blocks
        if constexpr (!is_last)
        {
            this->data_size =
                align_up(sizeof(complex<T>) * stage_size / 4 * 3, platform<>::native_cache_alignment);
        }
    }

    constexpr static bool prefetch = fft_config<T>::prefetch;
    constexpr static bool aligned = false;

    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));

    void do_initialize(size_t total_size) final
    {
        if constexpr (!is_last)
        {
            complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
            if constexpr (is_first)
                initialize_twiddle_autosort(this->blocks, width, twiddle);
            else
                initialize_twiddle_autosort(this->blocks, 1, twiddle);
        }
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        const complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        const size_t stg_size = this->blocks;
        const size_t stride = this->user;
        if constexpr (is_first)
        {
            radix4_autosort_pass_first(stg_size, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
                                       cbool_t<inverse>(), out, in, twiddle);
        }
        else if constexpr (is_last)
        {
            if constexpr (radix8)
                radix8_autosort_pass_last(stride, csize_t<width / 2>(), cfalse, cfalse, cbool_t<prefetch>(),
                                          cbool_t<inverse>(), out, in, twiddle);
            else
                radix4_autosort_pass_last(stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
                                          cbool_t<inverse>(), out, in, twiddle);
        }
        else
        {
            // Narrow (width 4) kernel for the small stride 4 case.
            if (stride == 4)
                radix4_autosort_pass(stg_size, stride, csize_t<4>(), cfalse, cfalse, cbool_t<prefetch>(),
                                     cbool_t<inverse>(), out, in, twiddle);
            else
                radix4_autosort_pass(stg_size, stride, csize_t<width>(), cfalse, cfalse, cbool_t<prefetch>(),
                                     cbool_t<inverse>(), out, in, twiddle);
        }
    }
};

/// Fully specialized plans for small power-of-two sizes (size == 2^log2n).
template <typename T, size_t log2n>
struct fft_specialization;

/// Size 1: identity copy.
template <typename T>
struct fft_specialization<T, 0> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 1;
        this->name = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN

    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        out[0] = in[0];
    }
};

/// Size 2: single butterfly (a0 + a1, a0 - a1).
template <typename T>
struct fft_specialization<T, 1> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 2;
        this->name = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN

    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 1> a0, a1;
        split<T, 4>(cread<2, aligned>(in), a0, a1);
        cwrite<2, aligned>(out, concat(a0 + a1, a0 - a1));
    }
};

/// Size 4: one radix-4 butterfly.
template <typename T>
struct fft_specialization<T, 2> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 4;
        this->name =
            dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 1> a0, a1, a2, a3;
        split<T, 8>(cread<4>(in), a0, a1, a2, a3);
        butterfly(cbool_t<inverse>(), a0, a1, a2, a3, a0, a1, a2, a3);
        cwrite<4>(out, concat(concat(a0, a1), concat(a2, a3)));
    }
};

/// Size 8: fully unrolled packed kernel.
template <typename T>
struct fft_specialization<T, 3> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 8;
        this->name = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 8> v8 = cread<8, aligned>(in);
        butterfly8_packed<inverse>(v8);
        cwrite<8, aligned>(out, v8);
    }
};

/// Size 16: fully unrolled packed kernel.
template <typename T>
struct fft_specialization<T, 4> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 16;
        this->name = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 16> v16 = cread<16, aligned>(in);
        butterfly16_packed<inverse>(v16);
        cwrite<16, aligned>(out, v16);
    }
};

/// Size 32: fully unrolled packed kernel.
template <typename T>
struct fft_specialization<T, 5> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 32;
        this->name = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        cvec<T, 32> v32 = cread<32, aligned>(in);
        butterfly32_packed<inverse>(v32);
        cwrite<32, aligned>(out, v32);
    }
};

#ifdef KFR_AUTOSORT_FOR_64
/// Size 64 via a three-pass autosort chain (scratch buffer required).
template <typename T>
struct fft_specialization<T, 6> : dft_stage<T>
{
    fft_specialization(size_t stage_size)
    {
        this->stage_size = 64;
        this->name = dft_name(this);
        this->temp_size = 64 * sizeof(complex<T>);
        this->data_size = 64 * sizeof(complex<T>);
    }

    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));

    void do_initialize(size_t) final
    {
        // Twiddles for the two non-final passes, laid out back to back
        // (initialize_twiddle_autosort advances `twiddle`).
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddle_autosort(64, width, twiddle);
        initialize_twiddle_autosort(16, 1, twiddle);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
    {
        auto no = cfalse;
        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
        complex<T>* scratch = ptr_cast<complex<T>>(temp);
        radix4_autosort_pass_first(64, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
        radix4_autosort_pass(16, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
        radix4_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
    }
};
#else
/// Size 64 via a single fully unrolled in-memory kernel.
template <typename T>
struct fft_specialization<T, 6> : dft_stage<T>
{
    fft_specialization(size_t)
    {
        this->stage_size = 64;
        this->name = dft_name(this);
    }

    constexpr static bool aligned = false;
    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        butterfly64_memory(cbool_t<inverse>(), cbool_t<aligned>(), out, in);
    }
};
#endif

#ifdef KFR_AUTOSORT_FOR_128D
/// Size 128, double precision, autosort variant (scratch buffer required).
template <>
struct fft_specialization<double, 7> : dft_stage<double>
{
    using T = double;
    fft_specialization(size_t stage_size)
    {
        this->stage_size = 128;
        this->name = dft_name(this);
        this->temp_size = 128 * sizeof(complex<T>);
        this->data_size = 128 * sizeof(complex<T>);
    }

    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));

    void do_initialize(size_t) final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddle_autosort(128, width, twiddle);
        initialize_twiddle_autosort(32, 1, twiddle);
        initialize_twiddle_autosort(8, 1, twiddle);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
    {
        auto no = cfalse;
        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
        complex<T>* scratch = ptr_cast<complex<T>>(temp);
        radix4_autosort_pass_first(128, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
        radix4_autosort_pass(32, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
        radix8_autosort_pass_last(16, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
    }
};
#else
/// Size 128, double precision: three radix-4/8 passes in split format,
/// followed by an explicit reorder.
template <>
struct fft_specialization<double, 7> : dft_stage<double>
{
    using T = double;
    fft_specialization(size_t)
    {
        this->name = dft_name(this);
        this->stage_size = 128;
        this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment);
    }

    constexpr static bool aligned = false;
    constexpr static size_t width = const_min(fft_config<T>::process_width, size_t(8));
    constexpr static bool use_br2 = true;
    constexpr static bool prefetch = false;
    // NOTE(review): declared size_t but holds a boolean flag; passed as the
    // split-format argument of initialize_twiddles below.
    constexpr static size_t split_format = true;

    virtual void do_initialize(size_t total_size) override final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddles<T, width>(twiddle, 128, total_size, split_format);
        initialize_twiddles<T, width>(twiddle, 32, total_size, split_format);
        initialize_twiddles<T, width>(twiddle, 8,
total_size, split_format); 1337 } 1338 1339 DFT_STAGE_FN 1340 template <bool inverse> 1341 KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) 1342 { 1343 const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); 1344 radix4_pass(128, 1, csize_t<width>(), ctrue, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), 1345 cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); 1346 radix4_pass(32, 4, csize_t<width>(), cfalse, ctrue, cbool_t<use_br2>(), cbool_t<prefetch>(), 1347 cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); 1348 radix4_pass(csize_t<8>(), 16, csize_t<width>(), cfalse, cfalse, cbool_t<use_br2>(), 1349 cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); 1350 if (this->need_reorder) 1351 fft_reorder(out, csize_t<7>()); 1352 } 1353 }; 1354 #endif 1355 1356 template <> 1357 struct fft_specialization<float, 7> : dft_stage<float> 1358 { 1359 using T = float; 1360 fft_specialization(size_t) 1361 { 1362 this->name = dft_name(this); 1363 this->stage_size = 128; 1364 this->data_size = align_up(sizeof(complex<T>) * 128 * 3 / 2, platform<>::native_cache_alignment); 1365 } 1366 1367 constexpr static bool aligned = false; 1368 constexpr static size_t width1 = fft_config<T>::process_width; 1369 constexpr static size_t width2 = const_min(width1, size_t(8)); 1370 constexpr static bool use_br2 = true; 1371 constexpr static bool prefetch = false; 1372 constexpr static size_t final_size = 32; 1373 constexpr static size_t split_format = false; 1374 1375 virtual void do_initialize(size_t total_size) override final 1376 { 1377 complex<T>* twiddle = ptr_cast<complex<T>>(this->data); 1378 initialize_twiddles<T, width1>(twiddle, 128, total_size, split_format); 1379 initialize_twiddles<T, width2>(twiddle, 32, total_size, split_format); 1380 } 1381 1382 DFT_STAGE_FN 1383 template <bool inverse> 1384 KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) 1385 { 1386 const complex<T>* 
twiddle = ptr_cast<complex<T>>(this->data); 1387 radix4_pass(128, 1, csize_t<width1>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), 1388 cbool_t<inverse>(), cbool_t<aligned>(), out, in, twiddle); 1389 radix4_pass(32, 4, csize_t<width2>(), cfalse, cfalse, cbool_t<use_br2>(), cbool_t<prefetch>(), 1390 cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); 1391 radix4_pass(csize_t<8>(), 16, csize_t<width2>(), cfalse, cfalse, cbool_t<use_br2>(), 1392 cbool_t<prefetch>(), cbool_t<inverse>(), cbool_t<aligned>(), out, out, twiddle); 1393 if (this->need_reorder) 1394 fft_reorder(out, csize_t<7>()); 1395 } 1396 }; 1397 1398 template <> 1399 struct fft_specialization<float, 8> : dft_stage<float> 1400 { 1401 fft_specialization(size_t) 1402 { 1403 this->stage_size = 256; 1404 this->name = dft_name(this); 1405 this->temp_size = sizeof(complex<float>) * 256; 1406 } 1407 1408 using T = float; 1409 DFT_STAGE_FN 1410 template <bool inverse> 1411 KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) 1412 { 1413 complex<float>* scratch = ptr_cast<complex<float>>(temp); 1414 if (out == in) 1415 { 1416 butterfly16_multi_flip<0, inverse>(scratch, out); 1417 butterfly16_multi_flip<1, inverse>(scratch, out); 1418 butterfly16_multi_flip<2, inverse>(scratch, out); 1419 butterfly16_multi_flip<3, inverse>(scratch, out); 1420 1421 butterfly16_multi_natural<0, inverse>(out, scratch); 1422 butterfly16_multi_natural<1, inverse>(out, scratch); 1423 butterfly16_multi_natural<2, inverse>(out, scratch); 1424 butterfly16_multi_natural<3, inverse>(out, scratch); 1425 } 1426 else 1427 { 1428 butterfly16_multi_flip<0, inverse>(out, in); 1429 butterfly16_multi_flip<1, inverse>(out, in); 1430 butterfly16_multi_flip<2, inverse>(out, in); 1431 butterfly16_multi_flip<3, inverse>(out, in); 1432 1433 butterfly16_multi_natural<0, inverse>(out, out); 1434 butterfly16_multi_natural<1, inverse>(out, out); 1435 butterfly16_multi_natural<2, inverse>(out, out); 1436 
            butterfly16_multi_natural<3, inverse>(out, out);
        }
    }
};

#ifdef KFR_AUTOSORT_FOR_256D
/// Size 256, double precision, autosort variant: four passes, ping-ponging
/// between `out` and the scratch buffer so the final result lands in `out`.
template <>
struct fft_specialization<double, 8> : dft_stage<double>
{
    using T = double;
    fft_specialization(size_t stage_size)
    {
        this->stage_size = 256;
        this->name = dft_name(this);
        this->temp_size = 256 * sizeof(complex<T>);
        this->data_size = 256 * sizeof(complex<T>);
    }

    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));

    void do_initialize(size_t) final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddle_autosort(256, width, twiddle);
        initialize_twiddle_autosort(64, 1, twiddle);
        initialize_twiddle_autosort(16, 1, twiddle);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
    {
        auto no = cfalse;
        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
        complex<T>* scratch = ptr_cast<complex<T>>(temp);
        // Buffer choreography differs for in-place vs out-of-place calls.
        if (in != out)
        {
            radix4_autosort_pass_first(256, csize<width>, no, no, no, cbool<inverse>, out, in, tw);
            radix4_autosort_pass(64, 4, csize<4>, no, no, no, cbool<inverse>, scratch, out, tw);
            radix4_autosort_pass(16, 16, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
            radix4_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
        }
        else
        {
            radix4_autosort_pass_first(256, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
            radix4_autosort_pass(64, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
            radix4_autosort_pass(16, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw);
            radix4_autosort_pass_last(64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
        }
    }
};
#else
/// Size 256, double precision: reuse the generic final-stage chain, then reorder.
template <>
struct fft_specialization<double, 8> : fft_final_stage_impl<double, false, 256>
{
    using T = double;
    fft_specialization(size_t stage_size) : fft_final_stage_impl<double, false, 256>(stage_size)
    {
        this->stage_size = 256;
        this->name = dft_name(this);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        fft_final_stage_impl<double, false, 256>::template do_execute<inverse>(out, in, nullptr);
        if (this->need_reorder)
            fft_reorder(out, csize_t<8>(), cbool<always_br2>);
    }
};
#endif

#ifdef KFR_AUTOSORT_FOR_512
/// Size 512 via a four-pass autosort chain ending in a radix-8 pass.
template <typename T>
struct fft_specialization<T, 9> : dft_stage<T>
{
    fft_specialization(size_t stage_size)
    {
        this->stage_size = 512;
        this->name = dft_name(this);
        this->temp_size = 512 * sizeof(complex<T>);
        this->data_size = 512 * sizeof(complex<T>);
    }

    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));

    void do_initialize(size_t) final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddle_autosort(512, width, twiddle);
        initialize_twiddle_autosort(128, 1, twiddle);
        initialize_twiddle_autosort(32, 1, twiddle);
        initialize_twiddle_autosort(8, 1, twiddle);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
    {
        auto no = cfalse;
        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
        complex<T>* scratch = ptr_cast<complex<T>>(temp);
        radix4_autosort_pass_first(512, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
        radix4_autosort_pass(128, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
        radix4_autosort_pass(32, 16, csize<width>, no, no, no, cbool<inverse>, scratch, out, tw);
        radix8_autosort_pass_last(64, csize<width>, no, no, no,
                                  cbool<inverse>, out, scratch, tw);
    }
};
#else
/// Size 512: reuse the generic final-stage chain, then reorder.
template <typename T>
struct fft_specialization<T, 9> : fft_final_stage_impl<T, false, 512>
{
    fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 512>(stage_size)
    {
        this->stage_size = 512;
        this->name = dft_name(this);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        fft_final_stage_impl<T, false, 512>::template do_execute<inverse>(out, in, nullptr);
        if (this->need_reorder)
            fft_reorder(out, csize_t<9>());
    }
};
#endif

#ifdef KFR_AUTOSORT_FOR_1024
/// Size 1024 via a five-pass autosort chain.
template <typename T>
struct fft_specialization<T, 10> : dft_stage<T>
{
    fft_specialization(size_t stage_size)
    {
        this->stage_size = 1024;
        this->name = dft_name(this);
        this->temp_size = 1024 * sizeof(complex<T>);
        this->data_size = 1024 * sizeof(complex<T>);
    }

    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));

    void do_initialize(size_t) final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddle_autosort(1024, width, twiddle);
        initialize_twiddle_autosort(256, 1, twiddle);
        initialize_twiddle_autosort(64, 1, twiddle);
        initialize_twiddle_autosort(16, 1, twiddle);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
    {
        auto no = cfalse;
        auto split = cfalse; // split-complex intermediate format disabled here
        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
        complex<T>* scratch = ptr_cast<complex<T>>(temp);
        radix4_autosort_pass_first(1024, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
        radix4_autosort_pass(256, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
        radix4_autosort_pass(64, 16, csize<width>, split, no, no, cbool<inverse>, scratch, out, tw);
        radix4_autosort_pass(16, 64, csize<width>, split, split, no, cbool<inverse>, out, scratch, tw);
        radix4_autosort_pass_last(256, csize<width>, no, split, no, cbool<inverse>, out, out, tw);
    }
};
#else
/// Size 1024: reuse the generic final-stage chain, then reorder.
template <typename T>
struct fft_specialization<T, 10> : fft_final_stage_impl<T, false, 1024>
{
    fft_specialization(size_t stage_size) : fft_final_stage_impl<T, false, 1024>(stage_size)
    {
        this->name = dft_name(this);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*)
    {
        fft_final_stage_impl<T, false, 1024>::template do_execute<inverse>(out, in, nullptr);
        if (this->need_reorder)
            fft_reorder(out, csize_t<10>{}, cbool_t<always_br2>{});
    }
};
#endif

#ifdef KFR_AUTOSORT_FOR_2048
/// Size 2048 via a five-pass autosort chain ending in a radix-8 pass.
template <typename T>
struct fft_specialization<T, 11> : dft_stage<T>
{
    fft_specialization(size_t stage_size)
    {
        this->stage_size = 2048;
        this->name = dft_name(this);
        this->temp_size = 2048 * sizeof(complex<T>);
        this->data_size = 2048 * sizeof(complex<T>);
    }

    constexpr static size_t width = const_min(16, const_max(4, fft_config<T>::process_width));

    void do_initialize(size_t) final
    {
        complex<T>* twiddle = ptr_cast<complex<T>>(this->data);
        initialize_twiddle_autosort(2048, width, twiddle);
        initialize_twiddle_autosort(512, 1, twiddle);
        initialize_twiddle_autosort(128, 1, twiddle);
        initialize_twiddle_autosort(32, 1, twiddle);
        initialize_twiddle_autosort(8, 1, twiddle);
    }

    DFT_STAGE_FN
    template <bool inverse>
    KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp)
    {
        auto no = cfalse;
        const complex<T>* tw = ptr_cast<complex<T>>(this->data);
        complex<T>* scratch = ptr_cast<complex<T>>(temp);
        radix4_autosort_pass_first(2048, csize<width>, no, no, no, cbool<inverse>, scratch, in, tw);
        radix4_autosort_pass(512, 4, csize<4>, no, no, no, cbool<inverse>, out, scratch, tw);
        radix4_autosort_pass(128, 16, csize<4>, no, no, no, cbool<inverse>, scratch, out, tw);
        radix4_autosort_pass(32, 64, csize<width>, no, no, no, cbool<inverse>, out, scratch, tw);
        radix8_autosort_pass_last(256, csize<width>, no, no, no, cbool<inverse>, out, out, tw);
    }
};
#endif

/// Recursively builds the stage list for a general power-of-two FFT:
/// either a chain of autosort passes, or generic radix-4 stages down to the
/// compound final stage plus a reorder stage.
template <bool is_even, bool first, typename T, bool autosort>
void make_fft_stages(dft_plan<T>* self, cbool_t<autosort>, size_t stage_size, cbool_t<is_even>,
                     cbool_t<first>)
{
    if constexpr (autosort)
    {
        if (stage_size >= 16)
        {
            add_stage<fft_autosort_stage_impl<T, first, false, false>>(self, stage_size,
                                                                      self->size / stage_size);
            make_fft_stages(self, ctrue, stage_size / 4, cbool_t<is_even>(), cfalse);
        }
        else
        {
            // Last pass: radix-8 when 8 points remain, radix-4 otherwise.
            if (stage_size == 8)
                add_stage<fft_autosort_stage_impl<T, false, true, true>>(self, stage_size,
                                                                        self->size / stage_size);
            else
                add_stage<fft_autosort_stage_impl<T, false, true, false>>(self, stage_size,
                                                                         self->size / stage_size);
        }
    }
    else
    {
        constexpr size_t final_size = is_even ?
            1024 : 512;

        if (stage_size >= 2048)
        {
            add_stage<fft_stage_impl<T, !first, is_even>>(self, stage_size);

            make_fft_stages(self, cfalse, stage_size / 4, cbool_t<is_even>(), cfalse);
        }
        else
        {
            // Remaining 512/1024 points handled by the compound final stage,
            // then a single reorder pass over the whole transform.
            add_stage<fft_final_stage_impl<T, !first, final_size>>(self, final_size);
            add_stage<fft_reorder_stage_impl<T, is_even>>(self, self->size);
        }
    }
}

} // namespace intrinsics

/// Chooses between the autosort and classic (reorder-at-end) decompositions
/// and delegates stage construction to intrinsics::make_fft_stages.
template <bool is_even, typename T>
void make_fft(dft_plan<T>* self, size_t stage_size, cbool_t<is_even>)
{
    if (use_autosort<T>(ilog2(stage_size)))
    {
        intrinsics::make_fft_stages(self, ctrue, stage_size, cbool<is_even>, ctrue);
    }
    else
    {
        intrinsics::make_fft_stages(self, cfalse, stage_size, cbool<is_even>, ctrue);
    }
}

/// Minimal adapter enabling range-for iteration in reverse order.
template <typename T>
struct reverse_wrapper
{
    T& iterable;
};

template <typename T>
KFR_INTRINSIC auto begin(reverse_wrapper<T> w)
{
    return std::rbegin(w.iterable);
}

template <typename T>
KFR_INTRINSIC auto end(reverse_wrapper<T> w)
{
    return std::rend(w.iterable);
}

// NOTE(review): the wrapper stores a reference; passing a temporary and
// keeping the result beyond the full expression would dangle — verify all
// call sites use it directly inside a range-for.
template <typename T>
KFR_INTRINSIC reverse_wrapper<T> reversed(T&& iterable)
{
    return { iterable };
}

/// Points one stage at its slice of the plan's shared twiddle arena and lets
/// the stage fill it in; advances `offset` past the slice.
template <typename T>
KFR_INTRINSIC void initialize_data_stage(dft_plan<T>* self, const dft_stage_ptr<T>& stage, size_t& offset)
{
    stage->data = self->data.data() + offset;
    stage->initialize(self->size);
    offset += stage->data_size;
}

/// Allocates the plan-wide data arena and initializes every stage's slice.
/// Returns the total number of bytes consumed.
template <typename T>
KFR_INTRINSIC size_t initialize_data(dft_plan<T>* self)
{
    self->data = autofree<u8>(self->data_size);
    size_t offset = 0;
    for (dft_stage_ptr<T>& stage : self->all_stages)
    {
        initialize_data_stage(self, stage, offset);
    }
    return offset;
}

template <typename T>
KFR_INTRINSIC void initialize_order(dft_plan<T>* self)
{
    self->calc_disposition();
    // If any stage combination requires an intermediate buffer, grow the
    // plan's temp allocation by one full transform (+1 element of slack).
    typename dft_plan<T>::bitset ored = self->disposition_inplace[0] | self->disposition_inplace[1] |
                                        self->disposition_outofplace[0] | self->disposition_outofplace[1];
    if (ored.any()) // if scratch needed
        self->temp_size +=
            align_up(sizeof(complex<T>) * (self->size + 1), platform<>::native_cache_alignment);
}

/// Builds a power-of-two FFT plan: a single specialized stage for small
/// log2 sizes, otherwise the general stage chain via make_fft.
template <typename T>
KFR_INTRINSIC void init_fft(dft_plan<T>* self, size_t size, dft_order)
{
    const size_t log2n = ilog2(size);
    cswitch(
        csizes_t<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10
#ifdef KFR_AUTOSORT_FOR_2048
                 ,
                 11
#endif
                 >(),
        log2n,
        [&](auto log2n)
        {
            (void)log2n;
            constexpr size_t log2nv = val_of(decltype(log2n)());
            add_stage<intrinsics::fft_specialization<T, log2nv>>(self, size);
        },
        [&]()
        { cswitch(cfalse_true, is_even(log2n), [&](auto is_even) { make_fft(self, size, is_even); }); });
}

/// Fills the real-FFT pack/unpack twiddle table: cos/sin pairs at angles
/// -pi * (i + size/4) / (size/2) for i in [0, size/4).
template <typename T>
KFR_INTRINSIC void generate_real_twiddles(dft_plan_real<T>* self, size_t size)
{
    using namespace intrinsics;
    constexpr size_t width = vector_width<T> * 2;
    block_process(size / 4, csizes_t<width, 1>(),
                  [=](size_t i, auto w)
                  {
                      constexpr size_t width = val_of(decltype(w)());
                      cwrite<width>(self->rtwiddle.data() + i,
                                    cossin(dup(-constants<T>::pi *
                                               ((enumerate<T, width>() + i + size / 4) / (size / 2))))); 
                  });
}

/// Post-processes a half-size complex FFT of real input into the spectrum of
/// the real signal (forward real FFT), writing `fmt`-packed output.
/// `out` may alias `in`; the mid element is loaded up front for that reason.
template <typename T>
#if (defined CMT_ARCH_X32 && defined CMT_ARCH_X86 && defined __clang__) &&                                   \
    ((defined __APPLE__) || (__clang_major__ == 8))
// Fix for Clang 8.0 bug (x32 with FMA instructions)
// Xcode has different versions but x86 is very rare on macOS these days,
// so disable inlining and FMA for x32 macOS and Clang 8.x
__attribute__((target("no-fma"), flatten, noinline))
#else
KFR_INTRINSIC
#endif
void
to_fmt(size_t real_size, const complex<T>* rtwiddle, complex<T>* out, const complex<T>* in,
       dft_pack_format fmt)
{
    using namespace intrinsics;
    size_t csize = real_size / 2; // const size_t causes internal compiler error: in tsubst_copy in GCC 5.2

    constexpr size_t width = vector_width<T> * 2;
    const cvec<T, 1> dc = cread<1>(in);
    cvec<T, 1> inmid = cread<1>(in + csize / 2);
    const size_t count = (csize + 1) / 2;

    // Combine symmetric pairs (k, csize - k); index 0 (DC) handled separately.
    block_process(count - 1, csizes_t<width, 1>(),
                  [&](size_t i, auto w)
                  {
                      i++;
                      constexpr size_t width = val_of(decltype(w)());
                      constexpr size_t widthm1 = width - 1;
                      const cvec<T, width> tw = cread<width>(rtwiddle + i);
                      const cvec<T, width> fpk = cread<width>(in + i);
                      const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1)));

                      const cvec<T, width> f1k = fpk + fpnk;
                      const cvec<T, width> f2k = fpk - fpnk;
                      const cvec<T, width> t = cmul(f2k, tw);
                      cwrite<width>(out + i, T(0.5) * (f1k + t));
                      cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(T(0.5) * (f1k - t))));
                  });

    if (is_even(csize))
    {
        cwrite<1>(out + csize / 2, negodd(inmid));
    }
    if (fmt == dft_pack_format::CCs)
    {
        // CCs: DC and Nyquist stored as two separate purely-real bins.
        cwrite<1>(out, pack(dc[0] + dc[1], 0));
        cwrite<1>(out + csize, pack(dc[0] - dc[1], 0));
    }
    else
    {
        // Perm: Nyquist packed into the imaginary part of bin 0.
        cwrite<1>(out, pack(dc[0] + dc[1], dc[0] - dc[1]));
    }
}

/// Inverse of to_fmt: converts a `fmt`-packed real spectrum back into the
/// half-size complex input expected by the inverse complex FFT.
template <typename T>
#if (defined CMT_ARCH_X32 && defined CMT_ARCH_X86 && defined __clang__) &&                                   \
    ((defined __APPLE__) || (__clang_major__ == 8))
// Fix for Clang 8.0 bug (x32 with FMA instructions)
// Xcode has different versions but x86 is very rare on macOS these days,
// so disable inlining and FMA for x32 macOS and Clang 8.x
__attribute__((target("no-fma"), flatten, noinline))
#else
KFR_INTRINSIC
#endif
void from_fmt(size_t real_size, complex<T>* rtwiddle, complex<T>* out, const
complex<T>* in, 1878 dft_pack_format fmt) 1879 { 1880 using namespace intrinsics; 1881 1882 const size_t csize = real_size / 2; 1883 1884 cvec<T, 1> dc; 1885 1886 if (fmt == dft_pack_format::CCs) 1887 { 1888 dc = pack(in[0].real() + in[csize].real(), in[0].real() - in[csize].real()); 1889 } 1890 else 1891 { 1892 dc = pack(in[0].real() + in[0].imag(), in[0].real() - in[0].imag()); 1893 } 1894 cvec<T, 1> inmid = cread<1>(in + csize / 2); 1895 1896 constexpr size_t width = vector_width<T> * 2; 1897 const size_t count = (csize + 1) / 2; 1898 1899 block_process(count - 1, csizes_t<width, 1>(), 1900 [&](size_t i, auto w) 1901 { 1902 i++; 1903 constexpr size_t width = val_of(decltype(w)()); 1904 constexpr size_t widthm1 = width - 1; 1905 const cvec<T, width> tw = cread<width>(rtwiddle + i); 1906 const cvec<T, width> fpk = cread<width>(in + i); 1907 const cvec<T, width> fpnk = reverse<2>(negodd(cread<width>(in + csize - i - widthm1))); 1908 1909 const cvec<T, width> f1k = fpk + fpnk; 1910 const cvec<T, width> f2k = fpk - fpnk; 1911 const cvec<T, width> t = cmul_conj(f2k, tw); 1912 cwrite<width>(out + i, f1k + t); 1913 cwrite<width>(out + csize - i - widthm1, reverse<2>(negodd(f1k - t))); 1914 }); 1915 if (is_even(csize)) 1916 { 1917 cwrite<1>(out + csize / 2, 2 * negodd(inmid)); 1918 } 1919 cwrite<1>(out, dc); 1920 } 1921 1922 #ifndef KFR_DFT_NO_NPo2 1923 template <typename T> 1924 void init_dft(dft_plan<T>* self, size_t size, dft_order); 1925 #endif 1926 1927 template <typename T> 1928 KFR_INTRINSIC void initialize_stages(dft_plan<T>* self) 1929 { 1930 if (is_poweroftwo(self->size)) 1931 { 1932 init_fft(self, self->size, dft_order::normal); 1933 } 1934 else 1935 { 1936 #ifndef KFR_DFT_NO_NPo2 1937 init_dft(self, self->size, dft_order::normal); 1938 #else 1939 KFR_REPORT_LOGIC_ERROR("Non-power of 2 FFT is disabled but ", self->size, " size is requested"); 1940 #endif 1941 } 1942 } 1943 1944 namespace impl 1945 { 1946 template <typename T> 1947 void 
dft_initialize(dft_plan<T>& plan) 1948 { 1949 if (plan.size == 0) 1950 return; 1951 initialize_stages(&plan); 1952 initialize_data(&plan); 1953 initialize_order(&plan); 1954 } 1955 1956 template <typename T> 1957 const complex<T>* select_in(const dft_plan<T>& plan, typename dft_plan<T>::bitset disposition, size_t stage, 1958 const complex<T>* out, const complex<T>* in, const complex<T>* scratch) 1959 { 1960 return disposition.test(stage) ? scratch : stage == 0 ? in : out; 1961 } 1962 template <typename T> 1963 complex<T>* select_out(const dft_plan<T>& plan, typename dft_plan<T>::bitset disposition, size_t stage, 1964 size_t total_stages, complex<T>* out, complex<T>* scratch) 1965 { 1966 return stage == total_stages - 1 ? out : disposition.test(stage + 1) ? scratch : out; 1967 } 1968 1969 template <typename T, bool inverse> 1970 void dft_execute(const dft_plan<T>& plan, cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) 1971 { 1972 if (temp == nullptr && plan.temp_size > 0) 1973 { 1974 return call_with_temp(plan.temp_size, std::bind(&impl::dft_execute<T, inverse>, std::cref(plan), 1975 cbool_t<inverse>{}, out, in, std::placeholders::_1)); 1976 } 1977 auto&& stages = plan.stages[inverse]; 1978 if (stages.size() == 1 && (stages[0]->can_inplace || in != out)) 1979 { 1980 return stages[0]->execute(cbool<inverse>, out, in, temp); 1981 } 1982 size_t stack[DFT_MAX_STAGES] = { 0 }; 1983 1984 typename dft_plan<T>::bitset disposition = 1985 in == out ? 
plan.disposition_inplace[inverse] : plan.disposition_outofplace[inverse]; 1986 1987 complex<T>* scratch = ptr_cast<complex<T>>( 1988 temp + plan.temp_size - 1989 align_up(sizeof(complex<T>) * (plan.size + 1), platform<>::native_cache_alignment)); 1990 1991 bool in_scratch = disposition.test(0); 1992 if (in_scratch) 1993 { 1994 stages[0]->copy_input(inverse, scratch, in, plan.size); 1995 } 1996 1997 const size_t count = stages.size(); 1998 1999 for (size_t depth = 0; depth < count;) 2000 { 2001 if (stages[depth]->recursion) 2002 { 2003 size_t offset = 0; 2004 size_t rdepth = depth; 2005 size_t maxdepth = depth; 2006 do 2007 { 2008 if (stack[rdepth] == stages[rdepth]->repeats) 2009 { 2010 stack[rdepth] = 0; 2011 rdepth--; 2012 } 2013 else 2014 { 2015 complex<T>* rout = select_out(plan, disposition, rdepth, stages.size(), out, scratch); 2016 const complex<T>* rin = select_in(plan, disposition, rdepth, out, in, scratch); 2017 stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp); 2018 offset += stages[rdepth]->out_offset; 2019 stack[rdepth]++; 2020 if (rdepth < count - 1 && stages[rdepth + 1]->recursion) 2021 rdepth++; 2022 else 2023 maxdepth = rdepth; 2024 } 2025 } while (rdepth != depth); 2026 depth = maxdepth + 1; 2027 } 2028 else 2029 { 2030 size_t offset = 0; 2031 while (offset < plan.size) 2032 { 2033 stages[depth]->execute(cbool<inverse>, 2034 select_out(plan, disposition, depth, stages.size(), out, scratch) + 2035 offset, 2036 select_in(plan, disposition, depth, out, in, scratch) + offset, temp); 2037 offset += stages[depth]->stage_size; 2038 } 2039 depth++; 2040 } 2041 } 2042 } 2043 template <typename T> 2044 void dft_initialize_transpose(internal_generic::fn_transpose<T>& transpose) 2045 { 2046 transpose = &kfr::CMT_ARCH_NAME::matrix_transpose; 2047 } 2048 } // namespace impl 2049 2050 namespace intrinsics 2051 { 2052 2053 template <typename T> 2054 struct dft_stage_real_repack : dft_stage<T> 2055 { 2056 public: 2057 
dft_stage_real_repack(size_t real_size, dft_pack_format fmt) 2058 { 2059 this->user = static_cast<int>(fmt); 2060 this->stage_size = real_size; 2061 this->can_inplace = true; 2062 this->name = dft_name(this); 2063 const size_t count = (real_size / 2 + 1) / 2; 2064 this->data_size = align_up(sizeof(complex<T>) * count, platform<>::native_cache_alignment); 2065 } 2066 void do_initialize(size_t) final 2067 { 2068 using namespace intrinsics; 2069 constexpr size_t width = vector_width<T> * 2; 2070 size_t real_size = this->stage_size; 2071 complex<T>* rtwiddle = ptr_cast<complex<T>>(this->data); 2072 const size_t count = (real_size / 2 + 1) / 2; 2073 block_process(count, csizes_t<width, 1>(), 2074 [=](size_t i, auto w) 2075 { 2076 constexpr size_t width = val_of(decltype(w)()); 2077 cwrite<width>( 2078 rtwiddle + i, 2079 cossin(dup(-constants<T>::pi * ((enumerate<T, width>() + i + real_size / T(4)) / 2080 (real_size / 2))))); 2081 }); 2082 } 2083 void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) final 2084 { 2085 to_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in, 2086 static_cast<dft_pack_format>(this->user)); 2087 } 2088 void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) final 2089 { 2090 from_fmt(this->stage_size, ptr_cast<complex<T>>(this->data), out, in, 2091 static_cast<dft_pack_format>(this->user)); 2092 } 2093 void copy_input(bool invert, complex<T>* out, const complex<T>* in, size_t size) final 2094 { 2095 size_t extra = invert && static_cast<dft_pack_format>(this->user) == dft_pack_format::CCs ? 
1 : 0; 2096 builtin_memcpy(out, in, sizeof(complex<T>) * (size + extra)); 2097 } 2098 }; 2099 } // namespace intrinsics 2100 2101 namespace impl 2102 { 2103 template <typename T> 2104 void dft_real_initialize(dft_plan_real<T>& plan) 2105 { 2106 if (plan.size == 0) 2107 return; 2108 initialize_stages(&plan); 2109 add_stage<intrinsics::dft_stage_real_repack<T>, false>(&plan, plan.size, plan.fmt); 2110 plan.stages[0].push_back(plan.all_stages.back().get()); 2111 plan.stages[1].insert(plan.stages[1].begin(), plan.all_stages.back().get()); 2112 initialize_data(&plan); 2113 initialize_order(&plan); 2114 } 2115 } // namespace impl 2116 2117 } // namespace CMT_ARCH_NAME 2118 2119 } // namespace kfr 2120 2121 CMT_PRAGMA_GNU(GCC diagnostic pop) 2122 2123 CMT_PRAGMA_MSVC(warning(pop))