ft.hpp (69618B)
1 /** @addtogroup dft 2 * @{ 3 */ 4 /* 5 Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) 6 This file is part of KFR 7 8 KFR is free software: you can redistribute it and/or modify 9 it under the terms of the GNU General Public License as published by 10 the Free Software Foundation, either version 2 of the License, or 11 (at your option) any later version. 12 13 KFR is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public License 19 along with KFR. 20 21 If GPL is not suitable for your project, you must purchase a commercial license to use KFR. 22 Buying a commercial license is mandatory as soon as you develop commercial activities without 23 disclosing the source code of your own applications. 24 See https://www.kfrlib.com for details. 25 */ 26 #pragma once 27 28 #include <kfr/base/univector.hpp> 29 #include <kfr/math/sin_cos.hpp> 30 #include <kfr/simd/complex.hpp> 31 #include <kfr/simd/constants.hpp> 32 #include <kfr/simd/digitreverse.hpp> 33 #include <kfr/simd/read_write.hpp> 34 #include <kfr/simd/vec.hpp> 35 36 #include <kfr/base/memory.hpp> 37 #include "data/sincos.hpp" 38 39 CMT_PRAGMA_GNU(GCC diagnostic push) 40 #if CMT_HAS_WARNING("-Wpass-failed") 41 CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed") 42 #endif 43 44 CMT_PRAGMA_MSVC(warning(push)) 45 CMT_PRAGMA_MSVC(warning(disable : 4127)) 46 47 namespace kfr 48 { 49 inline namespace CMT_ARCH_NAME 50 { 51 52 template <typename T, size_t N> 53 using cvec = vec<T, N * 2>; 54 55 namespace intrinsics 56 { 57 58 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> 59 KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y) 60 { 61 return subadd(x * dupeven(y), swap<2>(x) * dupodd(y)); 62 } 63 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> 64 KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y) 65 { 66 vec<T, N> yy = resize<N>(y); 67 return cmul_impl(x, yy); 68 } 69 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> 70 KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) 71 { 72 vec<T, N> xx = resize<N>(x); 73 return cmul_impl(xx, y); 74 } 75 76 /// Complex Multiplication 77 template <typename T, size_t N1, size_t N2> 78 KFR_INTRINSIC vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y) 79 { 80 return intrinsics::cmul_impl(x, y); 81 } 82 83 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> 84 KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y) 85 { 86 return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y))); 87 } 88 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> 89 KFR_INTRINSIC vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw) 90 { 91 return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw); 92 } 93 template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> 94 KFR_INTRINSIC void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1, 95 const vec<T, N>& tw) 96 { 97 const vec<T, N> twr = dupeven(tw); 98 const vec<T, N> twi = dupodd(tw); 99 const vec<T, 2> sum = (in0 + in1); 100 const vec<T, 2> dif = swap<2>(negodd(in0 - in1)); 101 const vec<T, N> sumtw = resize<N>(sum) * twr; 102 const vec<T, N> diftw = resize<N>(dif) * twi; 103 out0 += sumtw + diftw; 104 out1 += sumtw - diftw; 105 } 106 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> 107 KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y) 108 { 109 vec<T, N> yy = resize<N>(y); 110 return cmul_conj(x, yy); 111 } 112 template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> 113 KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y) 114 { 115 vec<T, N> xx = resize<N>(x); 116 return cmul_conj(xx, y); 117 } 118 119 template <size_t N, bool A = false, typename T> 120 KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src) 121 { 122 return cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); 123 } 124 125 template <size_t N, bool A = false, typename T> 126 KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value) 127 { 128 value.write(ptr_cast<T>(dest), cbool_t<A>()); 129 } 130 131 template <size_t count, size_t N, bool A = false, typename T> 132 KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) 133 { 134 return internal::read_group_impl<2, count, N, A>(ptr_cast<T>(src), stride, csizeseq_t<count>()); 135 } 136 137 template <size_t count, size_t N, bool A = false, typename T> 138 KFR_INTRINSIC void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value) 139 { 140 return internal::write_group_impl<2, count, N, A>(ptr_cast<T>(dest), stride, value, csizeseq_t<count>()); 141 } 142 143 template <size_t N, bool A = false, bool split = false, typename T> 144 KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src) 145 { 146 cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); 147 if constexpr (split) 148 temp = splitpairs(temp); 149 return temp; 150 } 151 152 template <size_t N, bool A = false, bool split = false, typename T> 153 KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value) 154 { 155 cvec<T, N> v = value; 156 if constexpr (split) 157 v = interleavehalves(v); 158 v.write(ptr_cast<T>(dest), cbool_t<A>()); 159 } 160 161 template <> 162 inline cvec<f32, 8> cread_split<8, false, true, f32>(const complex<f32>* src) 163 { 164 const cvec<f32, 4> l = concat(cread<2>(src), cread<2>(src + 4)); 165 const cvec<f32, 4> h = concat(cread<2>(src + 2), cread<2>(src + 6)); 166 167 return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h)); 168 } 169 template <> 170 inline cvec<f32, 8> cread_split<8, true, true, f32>(const complex<f32>* src) 171 { 172 const cvec<f32, 4> l = concat(cread<2, true>(src), cread<2, true>(src + 4)); 173 const cvec<f32, 4> h = concat(cread<2, true>(src + 2), cread<2, true>(src + 6)); 174 175 return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h)); 176 } 177 178 template <> 179 inline cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* src) 180 { 181 const cvec<f64, 2> l = concat(cread<1>(src), cread<1>(src + 2)); 182 const cvec<f64, 2> h = concat(cread<1>(src + 1), cread<1>(src + 3)); 183 184 return concat(shuffle<0, 4, 2, 6>(l, h), shuffle<1, 5, 3, 7>(l, h)); 185 } 186 187 template <> 188 inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) 189 { 190 const cvec<f32, 8> xx = 191 concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); 192 193 cvec<f32, 2> a, b, c, d; 194 split<f32, 16>(xx, a, b, c, d); 195 cwrite<2>(dest, a); 196 cwrite<2>(dest + 4, b); 197 cwrite<2>(dest + 2, c); 198 cwrite<2>(dest + 6, d); 199 } 200 template <> 201 inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) 202 { 203 const cvec<f32, 8> xx = 204 concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); 205 206 cvec<f32, 2> a, b, c, d; 207 split<f32, 16>(xx, a, b, c, d); 208 cwrite<2, true>(dest + 0, a); 209 cwrite<2, true>(dest + 4, b); 210 cwrite<2, true>(dest + 2, c); 211 cwrite<2, true>(dest + 6, d); 212 } 213 214 template <> 215 inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) 216 { 217 const cvec<f64, 4> xx = 218 concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); 219 cwrite<1>(dest, part<4, 0>(xx)); 220 cwrite<1>(dest + 2, part<4, 1>(xx)); 221 cwrite<1>(dest + 1, part<4, 2>(xx)); 222 cwrite<1>(dest + 3, part<4, 3>(xx)); 223 } 224 template <> 225 inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) 226 { 227 const cvec<f64, 4> xx = 228 concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); 229 cwrite<1, true>(dest + 0, part<4, 0>(xx)); 230 cwrite<1, true>(dest + 2, part<4, 1>(xx)); 231 cwrite<1, true>(dest + 1, part<4, 2>(xx)); 232 cwrite<1, true>(dest + 3, part<4, 3>(xx)); 233 } 234 235 template <size_t N, size_t stride, typename T, size_t... Indices> 236 KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) 237 { 238 return concat(ref_cast<cvec<T, 1>>(base[Indices * stride])...); 239 } 240 241 template <size_t N, size_t stride, typename T> 242 KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base) 243 { 244 if constexpr (stride == 1) 245 { 246 return ref_cast<cvec<T, N>>(*base); 247 } 248 else 249 { 250 return cgather_helper<N, stride, T>(base, csizeseq_t<N>()); 251 } 252 } 253 254 KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) 255 { 256 size_t temp = index; 257 index += stride; 258 if (index >= size) 259 index -= size; 260 return temp; 261 } 262 KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t) 263 { 264 size_t temp = index; 265 index += stride; 266 return temp; 267 } 268 269 template <size_t N, typename T, size_t... Indices> 270 KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, 271 csizes_t<Indices...>) 272 { 273 return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, Indices)])...); 274 } 275 276 template <size_t N, typename T> 277 KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) 278 { 279 return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); 280 } 281 template <size_t N, typename T> 282 KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t stride) 283 { 284 size_t index = 0; 285 return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); 286 } 287 288 template <size_t N, typename T, size_t... Indices> 289 KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, 290 csizes_t<Indices...>) 291 { 292 return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, size, Indices)])...); 293 } 294 295 template <size_t N, typename T> 296 KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) 297 { 298 return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>()); 299 } 300 301 template <size_t N, size_t stride, typename T, size_t... Indices> 302 KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>) 303 { 304 swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; 305 } 306 307 template <size_t N, size_t stride, typename T> 308 KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value) 309 { 310 if constexpr (stride == 1) 311 { 312 cwrite<N>(base, value); 313 } 314 else 315 { 316 return cscatter_helper<N, stride, T>(base, value, csizeseq_t<N>()); 317 } 318 } 319 320 template <size_t N, typename T, size_t... Indices> 321 KFR_INTRINSIC void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value, 322 csizes_t<Indices...>) 323 { 324 swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; 325 } 326 327 template <size_t N, typename T> 328 KFR_INTRINSIC void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value) 329 { 330 return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>()); 331 } 332 333 template <size_t groupsize = 1, typename T, size_t N, typename IT> 334 KFR_INTRINSIC vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset) 335 { 336 return internal::gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>()); 337 } 338 339 template <size_t groupsize = 1, typename T, size_t N, typename IT> 340 KFR_INTRINSIC void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value) 341 { 342 return internal::scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>()); 343 } 344 345 template <typename T> 346 KFR_INTRINSIC void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2, 347 const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, 348 cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, 349 cvec<T, 4>& w7) 350 { 351 cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3)); 352 cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3)); 353 a = digitreverse4<2>(a); 354 b = digitreverse4<2>(b); 355 w0 = part<4, 0>(a); 356 w1 = part<4, 1>(a); 357 w2 = part<4, 2>(a); 358 w3 = part<4, 3>(a); 359 w4 = part<4, 0>(b); 360 w5 = part<4, 1>(b); 361 w6 = part<4, 2>(b); 362 w7 = part<4, 3>(b); 363 } 364 365 template <typename T> 366 KFR_INTRINSIC void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2, 367 const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5, 368 const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1, 369 cvec<T, 8>& z2, cvec<T, 8>& z3) 370 { 371 cvec<T, 16> a = concat(w0, w1, w2, w3); 372 cvec<T, 16> b = concat(w4, w5, w6, w7); 373 a = digitreverse4<2>(a); 374 b = digitreverse4<2>(b); 375 z0 = concat(part<4, 0>(a), part<4, 0>(b)); 376 z1 = concat(part<4, 1>(a), part<4, 1>(b)); 377 z2 = concat(part<4, 2>(a), part<4, 2>(b)); 378 z3 = concat(part<4, 3>(a), part<4, 3>(b)); 379 } 380 381 template <typename T> 382 KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) 383 { 384 cvec<T, 4> a0, a1, a2, a3; 385 cvec<T, 4> b0, b1, b2, b3; 386 cvec<T, 4> c0, c1, c2, c3; 387 cvec<T, 4> d0, d1, d2, d3; 388 389 split<T, 32>(a, a0, a1, a2, a3); 390 split<T, 32>(b, b0, b1, b2, b3); 391 split<T, 32>(c, c0, c1, c2, c3); 392 split<T, 32>(d, d0, d1, d2, d3); 393 394 a = concat(a0, b0, c0, d0); 395 b = concat(a1, b1, c1, d1); 396 c = concat(a2, b2, c2, d2); 397 d = concat(a3, b3, c3, d3); 398 } 399 template <typename T> 400 KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa, 401 cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd) 402 { 403 cvec<T, 4> a0, a1, a2, a3; 404 cvec<T, 4> b0, b1, b2, b3; 405 cvec<T, 4> c0, c1, c2, c3; 406 cvec<T, 4> d0, d1, d2, d3; 407 408 split<T, 32>(a, a0, a1, a2, a3); 409 split<T, 32>(b, b0, b1, b2, b3); 410 split<T, 32>(c, c0, c1, c2, c3); 411 split<T, 32>(d, d0, d1, d2, d3); 412 413 aa = concat(a0, b0, c0, d0); 414 bb = concat(a1, b1, c1, d1); 415 cc = concat(a2, b2, c2, d2); 416 dd = concat(a3, b3, c3, d3); 417 } 418 419 template <bool b, typename T> 420 constexpr KFR_INTRINSIC T chsign(T x) 421 { 422 return b ? -x : x; 423 } 424 425 template <typename T, size_t N, size_t size, size_t start, size_t step, bool inverse = false, 426 size_t... indices> 427 constexpr KFR_INTRINSIC cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>) 428 { 429 return make_vector((indices & 1 ? chsign<inverse>(-sin_using_table<T>(size, (indices / 2 * step + start))) 430 : cos_using_table<T>(size, (indices / 2 * step + start)))...); 431 } 432 433 template <typename T, size_t width, size_t... indices> 434 constexpr KFR_INTRINSIC cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, 435 size_t start, size_t step) 436 { 437 return make_vector((indices & 1 ? -sin_using_table<T>(size, indices / 2 * step + start) 438 : cos_using_table<T>(size, indices / 2 * step + start))...); 439 } 440 441 template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false> 442 constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle() 443 { 444 return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>()); 445 } 446 447 template <typename T, size_t width> 448 constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0) 449 { 450 return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size); 451 } 452 453 // template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false> 454 // constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>(); 455 456 template <typename T, size_t N, bool inverse> 457 constexpr static inline cvec<T, N> twiddleimagmask() 458 { 459 return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1); 460 } 461 462 CMT_PRAGMA_GNU(GCC diagnostic push) 463 CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wconversion") 464 465 CMT_PRAGMA_GNU(GCC diagnostic pop) 466 467 template <typename T, size_t N> 468 CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x) 469 { 470 return negodd(cossin(x)); 471 } 472 473 template <size_t k, size_t size, bool inverse = false, typename T, size_t width, 474 size_t kk = (inverse ? size - k : k) % size> 475 KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x) 476 { 477 constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485); 478 if constexpr (kk == 0) 479 { 480 return x; 481 } 482 else if constexpr (kk == size * 1 / 8) 483 { 484 return swap<2>(subadd(swap<2>(x), x)) * isqrt2; 485 } 486 else if constexpr (kk == size * 2 / 8) 487 { 488 return negodd(swap<2>(x)); 489 } 490 else if constexpr (kk == size * 3 / 8) 491 { 492 return subadd(x, swap<2>(x)) * -isqrt2; 493 } 494 else if constexpr (kk == size * 4 / 8) 495 { 496 return -x; 497 } 498 else if constexpr (kk == size * 5 / 8) 499 { 500 return swap<2>(subadd(swap<2>(x), x)) * -isqrt2; 501 } 502 else if constexpr (kk == size * 6 / 8) 503 { 504 return swap<2>(negodd(x)); 505 } 506 else if constexpr (kk == size * 7 / 8) 507 { 508 return subadd(x, swap<2>(x)) * isqrt2; 509 } 510 else 511 { 512 return cmul(x, resize<width>(fixed_twiddle<T, 1, size, kk>())); 513 } 514 } 515 516 template <size_t N, typename T> 517 KFR_INTRINSIC void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) 518 { 519 const cvec<T, N> sum = a0 + a1; 520 const cvec<T, N> dif = a0 - a1; 521 w0 = sum; 522 w1 = dif; 523 } 524 525 template <size_t N, typename T> 526 KFR_INTRINSIC void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1) 527 { 528 butterfly2<N>(a0, a1, a0, a1); 529 } 530 531 template <size_t N, bool inverse = false, typename T> 532 KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, 533 const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, 534 cvec<T, N>& w2, cvec<T, N>& w3) 535 { 536 cvec<T, N> sum02, sum13, diff02, diff13; 537 cvec<T, N * 2> a01, a23, sum0213, diff0213; 538 539 a01 = concat(a0, a1); 540 a23 = concat(a2, a3); 541 sum0213 = a01 + a23; 542 diff0213 = a01 - a23; 543 544 sum02 = low(sum0213); 545 sum13 = high(sum0213); 546 diff02 = low(diff0213); 547 diff13 = high(diff0213); 548 w0 = sum02 + sum13; 549 w2 = sum02 - sum13; 550 if constexpr (inverse) 551 { 552 diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T())); 553 diff13 = swap<2>(diff13); 554 } 555 else 556 { 557 diff13 = swap<2>(diff13); 558 diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T())); 559 } 560 561 w1 = diff02 + diff13; 562 w3 = diff02 - diff13; 563 } 564 565 template <size_t N, bool inverse = false, typename T> 566 KFR_INTRINSIC void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, 567 const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, 568 cvec<T, N>& w2, cvec<T, N>& w3) 569 { 570 vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3; 571 vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3; 572 573 cvec<T, N> sum02, sum13, diff02, diff13; 574 vec<T, N> sum02re, sum13re, diff02re, diff13re; 575 vec<T, N> sum02im, sum13im, diff02im, diff13im; 576 577 sum02 = a0 + a2; 578 sum13 = a1 + a3; 579 580 w0 = sum02 + sum13; 581 w2 = sum02 - sum13; 582 583 diff02 = a0 - a2; 584 diff13 = a1 - a3; 585 split(diff02, diff02re, diff02im); 586 split(diff13, diff13re, diff13im); 587 588 (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); 589 (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); 590 } 591 592 template <size_t N, bool inverse = false, typename T> 593 KFR_INTRINSIC void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, 594 const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, 595 const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1, 596 cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, 597 cvec<T, N>& w7) 598 { 599 cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6; 600 butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6); 601 cvec<T, N> b1 = a1, b3 = a3, b5 = a5, b7 = a7; 602 butterfly4<N, inverse>(cfalse, b1, b3, b5, b7, b1, b3, b5, b7); 603 w0 = b0 + b1; 604 w4 = b0 - b1; 605 606 b3 = cmul_by_twiddle<1, 8, inverse>(b3); 607 b5 = cmul_by_twiddle<2, 8, inverse>(b5); 608 b7 = cmul_by_twiddle<3, 8, inverse>(b7); 609 610 w1 = b2 + b3; 611 w5 = b2 - b3; 612 w2 = b4 + b5; 613 w6 = b4 - b5; 614 w3 = b6 + b7; 615 w7 = b6 - b7; 616 } 617 618 template <size_t N, bool inverse = false, typename T> 619 KFR_INTRINSIC void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, 620 cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7) 621 { 622 butterfly8<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a0, a1, a2, a3, a4, a5, a6, a7); 623 } 624 625 template <bool inverse = false, typename T> 626 KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67) 627 { 628 cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67; 629 630 butterfly4<2, inverse>(cfalse, b01, b23, b45, b67, b01, b23, b45, b67); 631 632 cvec<T, 2> b02, b13, b46, b57; 633 634 cvec<T, 8> b01234567 = concat(b01, b23, b45, b67); 635 cvec<T, 8> b02461357 = concat(even<2>(b01234567), odd<2>(b01234567)); 636 split<T, 16>(b02461357, b02, b46, b13, b57); 637 638 b13 = cmul(b13, fixed_twiddle<T, 2, 8, 0, 1, inverse>()); 639 b57 = cmul(b57, fixed_twiddle<T, 2, 8, 2, 1, inverse>()); 640 a01 = b02 + b13; 641 a23 = b46 + b57; 642 a45 = b02 - b13; 643 a67 = b46 - b57; 644 } 645 646 template <bool inverse = false, typename T> 647 KFR_INTRINSIC void butterfly8_packed(cvec<T, 8>& v8) 648 { 649 cvec<T, 2> w0, w1, w2, w3; 650 split<T, 16>(v8, w0, w1, w2, w3); 651 butterfly8<inverse>(w0, w1, w2, w3); 652 v8 = concat(w0, w1, w2, w3); 653 } 654 655 template <bool inverse = false, typename T> 656 KFR_INTRINSIC void butterfly32_packed(cvec<T, 32>& v32) 657 { 658 cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; 659 split(v32, w0, w1, w2, w3, w4, w5, w6, w7); 660 butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); 661 662 w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); 663 w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); 664 w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); 665 w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); 666 w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); 667 w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); 668 w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); 669 670 cvec<T, 8> z0, z1, z2, z3; 671 transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); 672 673 butterfly4<8, inverse>(cfalse, z0, z1, z2, z3, z0, z1, z2, z3); 674 v32 = concat(z0, z1, z2, z3); 675 } 676 677 template <size_t N, bool inverse = false, typename T> 678 KFR_INTRINSIC void butterfly4_packed(cvec<T, N * 4>& a0123) 679 { 680 cvec<T, N> a0; 681 cvec<T, N> a1; 682 cvec<T, N> a2; 683 cvec<T, N> a3; 684 split<T, N * 4 * 2>(a0123, a0, a1, a2, a3); 685 butterfly4<N, inverse>(cfalse, a0, a1, a2, a3, a0, a1, a2, a3); 686 a0123 = concat(a0, a1, a2, a3); 687 } 688 689 template <size_t N, typename T> 690 KFR_INTRINSIC void butterfly2_packed(cvec<T, N * 2>& a01) 691 { 692 cvec<T, N> a0; 693 cvec<T, N> a1; 694 split(a01, a0, a1); 695 butterfly2<N>(a0, a1); 696 a01 = concat(a0, a1); 697 } 698 699 template <size_t N, bool inverse = false, bool split_format = false, typename T> 700 KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) 701 { 702 if constexpr (split_format) 703 { 704 vec<T, N> re1, im1, tw1re, tw1im; 705 split<T, 2 * N>(a1, re1, im1); 706 split<T, 2 * N>(tw1, tw1re, tw1im); 707 vec<T, N> b1re = re1 * tw1re; 708 vec<T, N> b1im = im1 * tw1re; 709 if constexpr (inverse) 710 w1 = concat(b1re + im1 * tw1im, b1im - re1 * tw1im); 711 else 712 w1 = concat(b1re - im1 * tw1im, b1im + re1 * tw1im); 713 } 714 else 715 { 716 const cvec<T, N> b1 = a1 * dupeven(tw1); 717 const cvec<T, N> a1_ = swap<2>(a1); 718 719 cvec<T, N> tw1_ = tw1; 720 if constexpr (inverse) 721 tw1_ = -(tw1_); 722 w1 = subadd(b1, a1_ * dupodd(tw1_)); 723 } 724 } 725 726 template <size_t N, bool inverse = false, bool split_format = false, typename T> 727 KFR_INTRINSIC void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3, 728 const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3, 729 cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) 730 { 731 apply_twiddle<N, inverse, split_format>(a1, tw1, w1); 732 apply_twiddle<N, inverse, split_format>(a2, tw2, w2); 733 apply_twiddle<N, inverse, split_format>(a3, tw3, w3); 734 } 735 736 template <size_t N, bool inverse = false, typename T> 737 KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, 738 cvec<T, N>& __restrict a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2, 739 const cvec<T, N>& tw3) 740 { 741 apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3); 742 } 743 744 template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]> 745 KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, 746 cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2, 747 const cvec<T, 1>& tw3) 748 { 749 apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); 750 } 751 752 template <size_t N, bool inverse = false, typename T, typename = u8[N - 2]> 753 KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, 754 cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2, 755 cvec<T, N / 2> tw3) 756 { 757 apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); 758 } 759 760 template <size_t N, bool inverse = false, typename T> 761 KFR_INTRINSIC void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d) 762 { 763 cvec<T, 4> b0, b1, b2, b3; 764 cvec<T, 4> c0, c1, c2, c3; 765 cvec<T, 4> d0, d1, d2, d3; 766 767 split(b, b0, b1, b2, b3); 768 split(c, c0, c1, c2, c3); 769 split(d, d0, d1, d2, d3); 770 771 b1 = cmul_by_twiddle<4, 64, inverse>(b1); 772 b2 = cmul_by_twiddle<8, 64, inverse>(b2); 773 b3 = cmul_by_twiddle<12, 64, inverse>(b3); 774 775 c1 = cmul_by_twiddle<8, 64, inverse>(c1); 776 c2 = cmul_by_twiddle<16, 64, inverse>(c2); 777 c3 = cmul_by_twiddle<24, 64, inverse>(c3); 778 779 d1 = cmul_by_twiddle<12, 64, inverse>(d1); 780 d2 = cmul_by_twiddle<24, 64, inverse>(d2); 781 d3 = cmul_by_twiddle<36, 64, inverse>(d3); 782 783 b = concat(b0, b1, b2, b3); 784 c = concat(c0, c1, c2, c3); 785 d = concat(d0, d1, d2, d3); 786 } 787 788 template <size_t n2, size_t nnstep, size_t N, bool inverse = false, typename T> 789 KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) 790 { 791 cvec<T, N> a0; 792 cvec<T, N> a1; 793 cvec<T, N> a2; 794 cvec<T, N> a3; 795 split<T, 2 * N * 4>(a0123, a0, a1, a2, a3); 796 797 cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1, inverse>(), 798 tw2 = fixed_twiddle<T, N, 64, n2 * nnstep * 2, nnstep * 2, inverse>(), 799 tw3 = fixed_twiddle<T, N, 64, n2 * nnstep * 3, nnstep * 3, inverse>(); 800 801 apply_twiddles4<N>(a1, a2, a3, tw1, tw2, tw3); 802 803 a0123 = concat(a0, a1, a2, a3); 804 } 805 806 template <bool inverse, bool aligned, typename T> 807 KFR_INTRINSIC void butterfly64_memory(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, 808 const complex<T>* in) 809 { 810 cvec<T, 16> w0, w1, w2, w3; 811 812 w0 = cread_group<4, 4, aligned>( 813 in, 16); // concat(cread<4>(in + 0), cread<4>(in + 16), cread<4>(in + 32), cread<4>(in + 48)); 814 butterfly4_packed<4, inverse>(w0); 815 apply_twiddles4<0, 1, 4, inverse>(w0); 816 817 w1 = cread_group<4, 4, aligned>( 818 in + 4, 16); // concat(cread<4>(in + 4), cread<4>(in + 20), cread<4>(in + 36), cread<4>(in + 52)); 819 butterfly4_packed<4, inverse>(w1); 820 apply_twiddles4<4, 1, 4, inverse>(w1); 821 822 w2 = cread_group<4, 4, aligned>( 823 in + 8, 16); // concat(cread<4>(in + 8), cread<4>(in + 24), cread<4>(in + 40), cread<4>(in + 56)); 824 butterfly4_packed<4, inverse>(w2); 825 apply_twiddles4<8, 1, 4, inverse>(w2); 826 827 w3 = cread_group<4, 4, aligned>( 828 in + 12, 16); // concat(cread<4>(in + 12), cread<4>(in + 28), cread<4>(in + 44), cread<4>(in + 60)); 829 butterfly4_packed<4, inverse>(w3); 830 apply_twiddles4<12, 1, 4, inverse>(w3); 831 832 transpose4(w0, w1, w2, w3); 833 // pass 2: 834 835 butterfly4_packed<4, inverse>(w0); 836 butterfly4_packed<4, inverse>(w1); 837 butterfly4_packed<4, inverse>(w2); 838 butterfly4_packed<4, inverse>(w3); 839 840 transpose4(w0, w1, w2, w3); 841 842 w0 = digitreverse4<2>(w0); 843 w1 = digitreverse4<2>(w1); 844 w2 = digitreverse4<2>(w2); 845 w3 = digitreverse4<2>(w3); 846 847 apply_vertical_twiddles4<4, inverse>(w1, w2, w3); 848 849 // pass 3: 850 butterfly4_packed<4, inverse>(w3); 851 cwrite_group<4, 4, aligned>(out + 12, 16, w3); // split(w3, out[3], out[7], out[11], out[15]); 852 853 butterfly4_packed<4, inverse>(w2); 854 cwrite_group<4, 4, aligned>(out + 8, 16, w2); // split(w2, out[2], out[6], out[10], out[14]); 855 856 butterfly4_packed<4, inverse>(w1); 857 cwrite_group<4, 4, aligned>(out + 4, 16, w1); // split(w1, out[1], out[5], out[9], out[13]); 858 859 butterfly4_packed<4, inverse>(w0); 860 cwrite_group<4, 4, aligned>(out, 16, w0); // split(w0, out[0], out[4], out[8], out[12]); 861 } 862 863 template <bool inverse = false, typename T> 864 KFR_INTRINSIC void butterfly16_packed(cvec<T, 16>& v16) 865 { 866 butterfly4_packed<4, inverse>(v16); 867 apply_twiddles4<0, 4, 4, inverse>(v16); 868 v16 = digitreverse4<2>(v16); 869 butterfly4_packed<4, inverse>(v16); 870 } 871 872 template <size_t index, bool inverse = false, typename T> 873 KFR_INTRINSIC void butterfly16_multi_natural(complex<T>* out, const complex<T>* in) 874 { 875 constexpr size_t N = 4; 876 877 cvec<T, 4> a1 = cread<4>(in + index * 4 + 16 * 1); 878 cvec<T, 4> a5 = cread<4>(in + index * 4 + 16 * 5); 879 cvec<T, 4> a9 = cread<4>(in + index * 4 + 16 * 9); 880 cvec<T, 4> a13 = cread<4>(in + index * 4 + 16 * 13); 881 butterfly4<N, inverse>(cfalse, a1, a5, a9, a13, a1, a5, a9, a13); 882 a5 = cmul_by_twiddle<1, 16, inverse>(a5); 883 a9 = cmul_by_twiddle<2, 16, inverse>(a9); 884 a13 = cmul_by_twiddle<3, 16, inverse>(a13); 885 886 cvec<T, 4> a2 = cread<4>(in + index * 4 + 16 * 2); 887 cvec<T, 4> a6 = cread<4>(in + index * 4 + 16 * 6); 888 cvec<T, 4> a10 = cread<4>(in + index * 4 + 16 * 10); 889 cvec<T, 4> a14 = cread<4>(in + index * 4 + 16 * 14); 890 butterfly4<N, inverse>(cfalse, a2, a6, a10, a14, a2, a6, a10, a14); 891 a6 = cmul_by_twiddle<2, 16, inverse>(a6); 892 a10 = cmul_by_twiddle<4, 16, inverse>(a10); 893 a14 = cmul_by_twiddle<6, 16, inverse>(a14); 894 895 cvec<T, 4> a3 = cread<4>(in + index * 4 + 16 * 3); 896 cvec<T, 4> a7 = cread<4>(in + index * 4 + 16 * 7); 897 cvec<T, 4> a11 = cread<4>(in + index * 4 + 16 * 11); 898 cvec<T, 4> a15 = cread<4>(in + index * 4 + 16 * 15); 899 butterfly4<N, inverse>(cfalse, a3, a7, a11, a15, a3, a7, a11, a15); 900 a7 = cmul_by_twiddle<3, 16, inverse>(a7); 901 a11 = cmul_by_twiddle<6, 16, inverse>(a11); 902 a15 = cmul_by_twiddle<9, 16, inverse>(a15); 903 904 cvec<T, 4> a0 = cread<4>(in + index * 4 + 16 * 0); 905 cvec<T, 4> a4 = cread<4>(in + index * 4 + 16 * 4); 906 cvec<T, 4> a8 = cread<4>(in + index * 4 + 16 * 8); 907 cvec<T, 4> a12 = cread<4>(in + index * 4 + 16 * 12); 908 butterfly4<N, inverse>(cfalse, a0, a4, a8, a12, a0, a4, a8, a12); 909 butterfly4<N, inverse>(cfalse, a0, a1, a2, a3, a0, a1, a2, a3); 910 cwrite<4>(out + index * 4 + 16 * 0, a0); 911 cwrite<4>(out + index * 4 + 16 * 4, a1); 912 cwrite<4>(out + index * 4 + 16 * 8, a2); 913 cwrite<4>(out + index * 4 + 16 * 12, a3); 914 butterfly4<N, inverse>(cfalse, a4, a5, a6, a7, a4, a5, a6, a7); 915 cwrite<4>(out + index * 4 + 16 * 1, a4); 916 cwrite<4>(out + index * 4 + 16 * 5, a5); 917 cwrite<4>(out + index * 4 + 16 * 9, a6); 918 cwrite<4>(out + index * 4 + 16 * 13, a7); 919 butterfly4<N, inverse>(cfalse, a8, a9, a10, a11, a8, a9, a10, a11); 920 cwrite<4>(out + index * 4 + 16 * 2, a8); 921 cwrite<4>(out + index * 4 + 16 * 6, a9); 922 cwrite<4>(out + index * 4 + 16 * 10, a10); 923 cwrite<4>(out + index * 4 + 16 * 14, a11); 924 butterfly4<N, inverse>(cfalse, a12, a13, a14, a15, a12, a13, a14, a15); 925 cwrite<4>(out + index * 4 + 16 * 3, a12); 926 cwrite<4>(out + index * 4 + 16 * 7, a13); 927 cwrite<4>(out + index * 4 + 16 * 11, a14); 928 cwrite<4>(out + index * 4 + 16 * 15, a15); 929 } 930 931 template <size_t index, bool inverse = false, typename T> 932 KFR_INTRINSIC void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) 933 { 934 constexpr size_t N = 4; 935 936 cvec<T, 4> a1 = cread<4>(in + index * 4 + 16 * 1); 937 cvec<T, 4> a5 = cread<4>(in + index * 4 + 16 * 5); 938 cvec<T, 4> a9 = cread<4>(in + index * 4 + 16 * 9); 939 cvec<T, 4> a13 = cread<4>(in + index * 4 + 16 * 13); 940 butterfly4<N, inverse>(cfalse, a1, a5, a9, a13, a1, a5, a9, a13); 941 a5 = cmul_by_twiddle<1, 16, inverse>(a5); 942 a9 = cmul_by_twiddle<2, 16, inverse>(a9); 943 a13 = cmul_by_twiddle<3, 16, inverse>(a13); 944 945 cvec<T, 4> a2 = cread<4>(in + index * 4 + 16 * 2); 946 cvec<T, 4> a6 = cread<4>(in + index * 4 + 16 * 6); 947 cvec<T, 4> a10 = cread<4>(in + index * 4 + 16 * 10); 948 cvec<T, 4> a14 = cread<4>(in + index * 4 + 16 * 14); 949 butterfly4<N, inverse>(cfalse, a2, a6, a10, a14, a2, a6, a10, a14); 950 a6 = cmul_by_twiddle<2, 16, inverse>(a6); 951 a10 = cmul_by_twiddle<4, 16, inverse>(a10); 952 a14 = cmul_by_twiddle<6, 16, inverse>(a14); 953 954 cvec<T, 4> a3 = cread<4>(in + index * 4 + 16 * 3); 955 cvec<T, 4> a7 = cread<4>(in + index * 4 + 16 * 7); 956 cvec<T, 4> a11 = cread<4>(in + index * 4 + 16 * 11); 957 cvec<T, 4> a15 = cread<4>(in + index * 4 + 16 * 15); 958 butterfly4<N, inverse>(cfalse, a3, a7, a11, a15, a3, a7, a11, a15); 959 a7 = cmul_by_twiddle<3, 16, inverse>(a7); 960 a11 = cmul_by_twiddle<6, 16, inverse>(a11); 961 a15 = cmul_by_twiddle<9, 16, inverse>(a15); 962 963 cvec<T, 16> w1 = concat(a1, a5, a9, a13); 964 cvec<T, 16> w2 = concat(a2, a6, a10, a14); 965 cvec<T, 16> w3 = concat(a3, a7, a11, a15); 966 967 cvec<T, 4> a0 = cread<4>(in + index * 4 + 16 * 0); 968 cvec<T, 4> a4 = cread<4>(in + index * 4 + 16 * 4); 969 cvec<T, 4> a8 = cread<4>(in + index * 4 + 16 * 8); 970 cvec<T, 4> a12 = cread<4>(in + index * 4 + 16 * 12); 971 butterfly4<N, inverse>(cfalse, a0, a4, a8, a12, a0, a4, a8, a12); 972 cvec<T, 16> w0 = concat(a0, a4, a8, a12); 973 974 butterfly4<N * 4, inverse>(cfalse, w0, w1, w2, w3, w0, w1, w2, w3); 975 976 w0 = digitreverse4<2>(w0); 977 w1 = digitreverse4<2>(w1); 978 w2 = digitreverse4<2>(w2); 979 w3 = digitreverse4<2>(w3); 980 981 transpose4(w0, w1, w2, w3); 982 cwrite<16>(out + index * 64 + 16 * 0, cmul(w0, fixed_twiddle<T, 16, 256, 0, index * 4 + 0, inverse>())); 983 cwrite<16>(out + index * 64 + 16 * 1, cmul(w1, fixed_twiddle<T, 16, 256, 0, index * 4 + 1, inverse>())); 984 cwrite<16>(out + index * 64 + 16 * 2, cmul(w2, fixed_twiddle<T, 16, 256, 0, index * 4 + 2, inverse>())); 985 cwrite<16>(out + index * 64 + 16 * 3, cmul(w3, fixed_twiddle<T, 16, 256, 0, index * 4 + 3, inverse>())); 986 } 987 988 template <size_t n2, size_t nnstep, size_t N, typename T> 989 KFR_INTRINSIC void apply_twiddles2(cvec<T, N>& a1) 990 { 991 cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>(); 992 993 a1 = cmul(a1, tw1); 994 } 995 996 template <typename T, size_t N, bool inverse> 997 static constexpr KFR_INTRINSIC cvec<T, N> tw3r1() 998 { 999 return static_cast<T>(-0.5 - 1.0); 1000 } 1001 1002 template <typename T, size_t N, bool inverse> 1003 static constexpr KFR_INTRINSIC cvec<T, N> tw3i1() 1004 { 1005 return static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>(); 1006 } 1007 1008 template <size_t N, bool inverse = false, typename T> 1009 KFR_INTRINSIC void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, 1010 cvec<T, N>& w01, cvec<T, N>& w02) 1011 { 1012 1013 const cvec<T, N> sum1 = a01 + a02; 1014 const cvec<T, N> dif1 = swap<2>(a01 - a02); 1015 w00 = a00 + sum1; 1016 1017 const cvec<T, N> s1 = w00 + sum1 * tw3r1<T, N, inverse>(); 1018 1019 const cvec<T, N> d1 = dif1 * tw3i1<T, N, inverse>(); 1020 1021 w01 = s1 + d1; 1022 w02 = s1 - d1; 1023 } 1024 1025 template <size_t N, bool inverse = false, typename T> 1026 KFR_INTRINSIC void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2) 1027 { 1028 butterfly3<N, inverse>(a0, a1, a2, a0, a1, a2); 1029 } 1030 1031 template <size_t N, bool inverse = false, typename T> 1032 KFR_INTRINSIC void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, 1033 const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, 1034 cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, 1035 cvec<T, N>& w5) 1036 { 1037 cvec<T, N * 2> a03 = concat(a0, a3); 1038 cvec<T, N * 2> a25 = concat(a2, a5); 1039 cvec<T, N * 2> a41 = concat(a4, a1); 1040 butterfly3<N * 2, inverse>(a03, a25, a41, a03, a25, a41); 1041 cvec<T, N> t0, t1, t2, t3, t4, t5; 1042 split(a03, t0, t1); 1043 split(a25, t2, t3); 1044 split(a41, t4, t5); 1045 t3 = -t3; 1046 cvec<T, N * 2> a04 = concat(t0, t4); 1047 cvec<T, N * 2> a15 = concat(t1, t5); 1048 cvec<T, N * 2> w02, w35; 1049 butterfly2<N * 2>(a04, a15, w02, w35); 1050 split(w02, w0, w2); 1051 split(w35, w3, w5); 1052 1053 butterfly2<N>(t2, t3, w1, w4); 1054 } 1055 1056 template <size_t N, bool inverse = false, typename T> 1057 KFR_INTRINSIC void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, 1058 cvec<T, N>& a5) 1059 { 1060 butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5); 1061 } 1062 1063 template <typename T, bool inverse = false> 1064 static constexpr KFR_INTRINSIC cvec<T, 1> tw9_1() 1065 { 1066 return { T(0.76604444311897803520239265055541), 1067 (inverse ? -1 : 1) * T(-0.64278760968653932632264340990727) }; 1068 } 1069 template <typename T, bool inverse = false> 1070 static constexpr KFR_INTRINSIC cvec<T, 1> tw9_2() 1071 { 1072 return { T(0.17364817766693034885171662676931), 1073 (inverse ? -1 : 1) * T(-0.98480775301220805936674302458952) }; 1074 } 1075 template <typename T, bool inverse = false> 1076 static constexpr KFR_INTRINSIC cvec<T, 1> tw9_4() 1077 { 1078 return { T(-0.93969262078590838405410927732473), 1079 (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) }; 1080 } 1081 1082 template <size_t N, bool inverse = false, typename T> 1083 KFR_INTRINSIC void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, 1084 const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, 1085 const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, 1086 cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, 1087 cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8) 1088 { 1089 cvec<T, N * 3> a012 = concat(a0, a1, a2); 1090 cvec<T, N * 3> a345 = concat(a3, a4, a5); 1091 cvec<T, N * 3> a678 = concat(a6, a7, a8); 1092 butterfly3<N * 3, inverse>(a012, a345, a678, a012, a345, a678); 1093 cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8; 1094 split(a012, t0, t1, t2); 1095 split(a345, t3, t4, t5); 1096 split(a678, t6, t7, t8); 1097 1098 t4 = cmul(t4, tw9_1<T, inverse>()); 1099 t5 = cmul(t5, tw9_2<T, inverse>()); 1100 t7 = cmul(t7, tw9_2<T, inverse>()); 1101 t8 = cmul(t8, tw9_4<T, inverse>()); 1102 1103 cvec<T, N * 3> t036 = concat(t0, t3, t6); 1104 cvec<T, N * 3> t147 = concat(t1, t4, t7); 1105 cvec<T, N * 3> t258 = concat(t2, t5, t8); 1106 1107 butterfly3<N * 3, inverse>(t036, t147, t258, t036, t147, t258); 1108 split(t036, w0, w1, w2); 1109 split(t147, w3, w4, w5); 1110 split(t258, w6, w7, w8); 1111 } 1112 1113 template <size_t N, bool inverse = false, typename T> 1114 KFR_INTRINSIC void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, 1115 cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8) 1116 { 1117 butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8); 1118 } 1119 1120 template <typename T, size_t N, bool inverse> 1121 static constexpr KFR_INTRINSIC cvec<T, N> tw7r1() 1122 { 1123 return static_cast<T>(0.623489801858733530525004884 - 1.0); 1124 } 1125 1126 template <typename T, size_t N, bool inverse> 1127 static constexpr KFR_INTRINSIC cvec<T, N> tw7i1() 1128 { 1129 return static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>(); 1130 } 1131 1132 template <typename T, size_t N, bool inverse> 1133 static constexpr KFR_INTRINSIC cvec<T, N> tw7r2() 1134 { 1135 return static_cast<T>(-0.2225209339563144042889025645 - 1.0); 1136 } 1137 1138 template <typename T, size_t N, bool inverse> 1139 static constexpr KFR_INTRINSIC cvec<T, N> tw7i2() 1140 { 1141 return static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>(); 1142 } 1143 1144 template <typename T, size_t N, bool inverse> 1145 static constexpr KFR_INTRINSIC cvec<T, N> tw7r3() 1146 { 1147 return static_cast<T>(-0.90096886790241912623610231951 - 1.0); 1148 } 1149 1150 template <typename T, size_t N, bool inverse> 1151 static constexpr KFR_INTRINSIC cvec<T, N> tw7i3() 1152 { 1153 return static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>(); 1154 } 1155 1156 template <size_t N, bool inverse = false, typename T> 1157 KFR_INTRINSIC void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, 1158 cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, 1159 cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, 1160 cvec<T, N>& w06) 1161 { 1162 const cvec<T, N> sum1 = a01 + a06; 1163 const cvec<T, N> dif1 = swap<2>(a01 - a06); 1164 const cvec<T, N> sum2 = a02 + a05; 1165 const cvec<T, N> dif2 = swap<2>(a02 - a05); 1166 const cvec<T, N> sum3 = a03 + a04; 1167 const cvec<T, N> dif3 = swap<2>(a03 - a04); 1168 w00 = a00 + sum1 + sum2 + sum3; 1169 1170 const cvec<T, N> s1 = 1171 w00 + sum1 * tw7r1<T, N, inverse>() + sum2 * tw7r2<T, N, inverse>() + sum3 * tw7r3<T, N, inverse>(); 1172 const cvec<T, N> s2 = 1173 w00 + sum1 * tw7r2<T, N, inverse>() + sum2 * tw7r3<T, N, inverse>() + sum3 * tw7r1<T, N, inverse>(); 1174 const cvec<T, N> s3 = 1175 w00 + sum1 * tw7r3<T, N, inverse>() + sum2 * tw7r1<T, N, inverse>() + sum3 * tw7r2<T, N, inverse>(); 1176 1177 const cvec<T, N> d1 = 1178 dif1 * tw7i1<T, N, inverse>() + dif2 * tw7i2<T, N, inverse>() + dif3 * tw7i3<T, N, inverse>(); 1179 const cvec<T, N> d2 = 1180 dif1 * tw7i2<T, N, inverse>() - dif2 * tw7i3<T, N, inverse>() - dif3 * tw7i1<T, N, inverse>(); 1181 const cvec<T, N> d3 = 1182 dif1 * tw7i3<T, N, inverse>() - dif2 * tw7i1<T, N, inverse>() + dif3 * tw7i2<T, N, inverse>(); 1183 1184 w01 = s1 + d1; 1185 w06 = s1 - d1; 1186 w02 = s2 + d2; 1187 w05 = s2 - d2; 1188 w03 = s3 + d3; 1189 w04 = s3 - d3; 1190 } 1191 1192 template <size_t N, bool inverse = false, typename T> 1193 KFR_INTRINSIC void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, 1194 cvec<T, N>& a5, cvec<T, N>& a6) 1195 { 1196 butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6); 1197 } 1198 1199 template <typename T, size_t N, bool inverse> 1200 static const cvec<T, N> tw11r1 = static_cast<T>(0.84125353283118116886181164892 - 1.0); 1201 1202 template <typename T, size_t N, bool inverse> 1203 static const cvec<T, N> tw11i1 = 1204 static_cast<T>(0.54064081745559758210763595432) * twiddleimagmask<T, N, inverse>(); 1205 1206 template <typename T, size_t N, bool inverse> 1207 static const cvec<T, N> tw11r2 = static_cast<T>(0.41541501300188642552927414923 - 1.0); 1208 1209 template <typename T, size_t N, bool inverse> 1210 static const cvec<T, N> tw11i2 = 1211 static_cast<T>(0.90963199535451837141171538308) * twiddleimagmask<T, N, inverse>(); 1212 1213 template <typename T, size_t N, bool inverse> 1214 static const cvec<T, N> tw11r3 = static_cast<T>(-0.14231483827328514044379266862 - 1.0); 1215 1216 template <typename T, size_t N, bool inverse> 1217 static const cvec<T, N> tw11i3 = 1218 static_cast<T>(0.98982144188093273237609203778) * twiddleimagmask<T, N, inverse>(); 1219 1220 template <typename T, size_t N, bool inverse> 1221 static const cvec<T, N> tw11r4 = static_cast<T>(-0.65486073394528506405692507247 - 1.0); 1222 1223 template <typename T, size_t N, bool inverse> 1224 static const cvec<T, N> tw11i4 = 1225 static_cast<T>(0.75574957435425828377403584397) * twiddleimagmask<T, N, inverse>(); 1226 1227 template <typename T, size_t N, bool inverse> 1228 static const cvec<T, N> tw11r5 = static_cast<T>(-0.95949297361449738989036805707 - 1.0); 1229 1230 template <typename T, size_t N, bool inverse> 1231 static const cvec<T, N> tw11i5 = 1232 static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>(); 1233 1234 template <size_t N, bool inverse = false, typename T> 1235 KFR_INTRINSIC void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, 1236 cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09, 1237 cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, 1238 cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06, 1239 cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10) 1240 { 1241 const cvec<T, N> sum1 = a01 + a10; 1242 const cvec<T, N> dif1 = swap<2>(a01 - a10); 1243 const cvec<T, N> sum2 = a02 + a09; 1244 const cvec<T, N> dif2 = swap<2>(a02 - a09); 1245 const cvec<T, N> sum3 = a03 + a08; 1246 const cvec<T, N> dif3 = swap<2>(a03 - a08); 1247 const cvec<T, N> sum4 = a04 + a07; 1248 const cvec<T, N> dif4 = swap<2>(a04 - a07); 1249 const cvec<T, N> sum5 = a05 + a06; 1250 const cvec<T, N> dif5 = swap<2>(a05 - a06); 1251 w00 = a00 + sum1 + sum2 + sum3 + sum4 + sum5; 1252 1253 const cvec<T, N> s1 = w00 + sum1 * tw11r1<T, N, inverse> + sum2 * tw11r2<T, N, inverse> + 1254 sum3 * tw11r3<T, N, inverse> + sum4 * tw11r4<T, N, inverse> + 1255 sum5 * tw11r5<T, N, inverse>; 1256 const cvec<T, N> s2 = w00 + sum1 * tw11r2<T, N, inverse> + sum2 * tw11r3<T, N, inverse> + 1257 sum3 * tw11r4<T, N, inverse> + sum4 * tw11r5<T, N, inverse> + 1258 sum5 * tw11r1<T, N, inverse>; 1259 const cvec<T, N> s3 = w00 + sum1 * tw11r3<T, N, inverse> + sum2 * tw11r4<T, N, inverse> + 1260 sum3 * tw11r5<T, N, inverse> + sum4 * tw11r1<T, N, inverse> + 1261 sum5 * tw11r2<T, N, inverse>; 1262 const cvec<T, N> s4 = w00 + sum1 * tw11r4<T, N, inverse> + sum2 * tw11r5<T, N, inverse> + 1263 sum3 * tw11r1<T, N, inverse> + sum4 * tw11r2<T, N, inverse> + 1264 sum5 * tw11r3<T, N, inverse>; 1265 const cvec<T, N> s5 = w00 + sum1 * tw11r5<T, N, inverse> + sum2 * tw11r1<T, N, inverse> + 1266 sum3 * tw11r2<T, N, inverse> + sum4 * tw11r3<T, N, inverse> + 1267 sum5 * tw11r4<T, N, inverse>; 1268 1269 const cvec<T, N> d1 = dif1 * tw11i1<T, N, inverse> + dif2 * tw11i2<T, N, inverse> + 1270 dif3 * tw11i3<T, N, inverse> + dif4 * tw11i4<T, N, inverse> + 1271 dif5 * tw11i5<T, N, inverse>; 1272 const cvec<T, N> d2 = dif1 * tw11i2<T, N, inverse> - dif2 * tw11i3<T, N, inverse> - 1273 dif3 * tw11i4<T, N, inverse> - dif4 * tw11i5<T, N, inverse> - 1274 dif5 * tw11i1<T, N, inverse>; 1275 const cvec<T, N> d3 = dif1 * tw11i3<T, N, inverse> - dif2 * tw11i4<T, N, inverse> + 1276 dif3 * tw11i5<T, N, inverse> + dif4 * tw11i1<T, N, inverse> + 1277 dif5 * tw11i2<T, N, inverse>; 1278 const cvec<T, N> d4 = dif1 * tw11i4<T, N, inverse> - dif2 * tw11i5<T, N, inverse> + 1279 dif3 * tw11i1<T, N, inverse> - dif4 * tw11i2<T, N, inverse> - 1280 dif5 * tw11i3<T, N, inverse>; 1281 const cvec<T, N> d5 = dif1 * tw11i5<T, N, inverse> - dif2 * tw11i1<T, N, inverse> + 1282 dif3 * tw11i2<T, N, inverse> - dif4 * tw11i3<T, N, inverse> + 1283 dif5 * tw11i4<T, N, inverse>; 1284 1285 w01 = s1 + d1; 1286 w10 = s1 - d1; 1287 w02 = s2 + d2; 1288 w09 = s2 - d2; 1289 w03 = s3 + d3; 1290 w08 = s3 - d3; 1291 w04 = s4 + d4; 1292 w07 = s4 - d4; 1293 w05 = s5 + d5; 1294 w06 = s5 - d5; 1295 } 1296 1297 template <typename T, size_t N, bool inverse> 1298 static constexpr KFR_INTRINSIC cvec<T, N> tw5r1() 1299 { 1300 return static_cast<T>(0.30901699437494742410229341718 - 1.0); 1301 } 1302 template <typename T, size_t N, bool inverse> 1303 static constexpr KFR_INTRINSIC cvec<T, N> tw5i1() 1304 { 1305 return static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>(); 1306 } 1307 template <typename T, size_t N, bool inverse> 1308 static constexpr KFR_INTRINSIC cvec<T, N> tw5r2() 1309 { 1310 return static_cast<T>(-0.80901699437494742410229341718 - 1.0); 1311 } 1312 template <typename T, size_t N, bool inverse> 1313 static constexpr KFR_INTRINSIC cvec<T, N> tw5i2() 1314 { 1315 return static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>(); 1316 } 1317 1318 template <size_t N, bool inverse = false, typename T> 1319 KFR_INTRINSIC void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, 1320 const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01, 1321 cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04) 1322 { 1323 const cvec<T, N> sum1 = a01 + a04; 1324 const cvec<T, N> dif1 = swap<2>(a01 - a04); 1325 const cvec<T, N> sum2 = a02 + a03; 1326 const cvec<T, N> dif2 = swap<2>(a02 - a03); 1327 w00 = a00 + sum1 + sum2; 1328 1329 const cvec<T, N> s1 = w00 + sum1 * tw5r1<T, N, inverse>() + sum2 * tw5r2<T, N, inverse>(); 1330 const cvec<T, N> s2 = w00 + sum1 * tw5r2<T, N, inverse>() + sum2 * tw5r1<T, N, inverse>(); 1331 1332 const cvec<T, N> d1 = dif1 * tw5i1<T, N, inverse>() + dif2 * tw5i2<T, N, inverse>(); 1333 const cvec<T, N> d2 = dif1 * tw5i2<T, N, inverse>() - dif2 * tw5i1<T, N, inverse>(); 1334 1335 w01 = s1 + d1; 1336 w04 = s1 - d1; 1337 w02 = s2 + d2; 1338 w03 = s2 - d2; 1339 } 1340 1341 template <size_t N, bool inverse = false, typename T> 1342 KFR_INTRINSIC void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, 1343 const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, 1344 const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, 1345 const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, 1346 cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, 1347 cvec<T, N>& w8, cvec<T, N>& w9) 1348 { 1349 cvec<T, N * 2> a05 = concat(a0, a5); 1350 cvec<T, N * 2> a27 = concat(a2, a7); 1351 cvec<T, N * 2> a49 = concat(a4, a9); 1352 cvec<T, N * 2> a61 = concat(a6, a1); 1353 cvec<T, N * 2> a83 = concat(a8, a3); 1354 butterfly5<N * 2, inverse>(a05, a27, a49, a61, a83, a05, a27, a49, a61, a83); 1355 cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; 1356 split(a05, t0, t1); 1357 split(a27, t2, t3); 1358 split(a49, t4, t5); 1359 split(a61, t6, t7); 1360 split(a83, t8, t9); 1361 t5 = -t5; 1362 1363 cvec<T, N * 2> t02, t13; 1364 cvec<T, N * 2> w06, w51; 1365 t02 = concat(t0, t2); 1366 t13 = concat(t1, t3); 1367 butterfly2<N * 2>(t02, t13, w06, w51); 1368 split(w06, w0, w6); 1369 split(w51, w5, w1); 1370 1371 cvec<T, N * 2> t68, t79; 1372 cvec<T, N * 2> w84, w39; 1373 t68 = concat(t6, t8); 1374 t79 = concat(t7, t9); 1375 butterfly2<N * 2>(t68, t79, w84, w39); 1376 split(w84, w8, w4); 1377 split(w39, w3, w9); 1378 butterfly2<N>(t4, t5, w7, w2); 1379 } 1380 1381 template <bool inverse, typename T, size_t N> 1382 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0, 1383 vec<T, N>& out1) 1384 { 1385 butterfly2<N / 2>(in0, in1, out0, out1); 1386 } 1387 template <bool inverse, typename T, size_t N> 1388 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1389 const vec<T, N>& in2, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2) 1390 { 1391 butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2); 1392 } 1393 1394 template <bool inverse, typename T, size_t N> 1395 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1396 const vec<T, N>& in2, const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, 1397 vec<T, N>& out2, vec<T, N>& out3) 1398 { 1399 butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3); 1400 } 1401 template <bool inverse, typename T, size_t N> 1402 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1403 const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, 1404 vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, 1405 vec<T, N>& out4) 1406 { 1407 butterfly5<N / 2, inverse>(in0, in1, in2, in3, in4, out0, out1, out2, out3, out4); 1408 } 1409 template <bool inverse, typename T, size_t N> 1410 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1411 const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, 1412 const vec<T, N>& in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, 1413 vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5) 1414 { 1415 butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5); 1416 } 1417 template <bool inverse, typename T, size_t N> 1418 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1419 const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, 1420 const vec<T, N>& in5, const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, 1421 vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, 1422 vec<T, N>& out6) 1423 { 1424 butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6); 1425 } 1426 template <bool inverse, typename T, size_t N> 1427 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1428 const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, 1429 const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, 1430 vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, 1431 vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7) 1432 { 1433 butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, 1434 out6, out7); 1435 } 1436 template <bool inverse, typename T, size_t N> 1437 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1438 const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, 1439 const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, 1440 const vec<T, N>& in8, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, 1441 vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, 1442 vec<T, N>& out7, vec<T, N>& out8) 1443 { 1444 butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, out2, out3, out4, 1445 out5, out6, out7, out8); 1446 } 1447 template <bool inverse, typename T, size_t N> 1448 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1449 const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, 1450 const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, 1451 const vec<T, N>& in8, const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, 1452 vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, 1453 vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9) 1454 { 1455 butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3, 1456 out4, out5, out6, out7, out8, out9); 1457 } 1458 template <bool inverse, typename T, size_t N> 1459 KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, 1460 const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, 1461 const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, 1462 const vec<T, N>& in8, const vec<T, N>& in9, const vec<T, N>& in10, 1463 vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, 1464 vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, 1465 vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10) 1466 { 1467 butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2, 1468 out3, out4, out5, out6, out7, out8, out9, out10); 1469 } 1470 template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> 1471 KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) 1472 { 1473 vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr)); 1474 if constexpr (transposed) 1475 temp = ctranspose<sizeof...(N)>(temp); 1476 split(temp, w...); 1477 } 1478 1479 // Warning: Reads past the end. Use with care 1480 KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, 1481 cvec<f32, 4>& w1, cvec<f32, 4>& w2) 1482 { 1483 cvec<f32, 4> w3; 1484 cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 3), cread<4>(ptr + 6), cread<4>(ptr + 9)); 1485 v16 = digitreverse4<2>(v16); 1486 split<f32, 32>(v16, w0, w1, w2, w3); 1487 } 1488 1489 KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, 1490 cvec<f32, 4>& w1, cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4) 1491 { 1492 cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 5), cread<4>(ptr + 10), cread<4>(ptr + 15)); 1493 v16 = digitreverse4<2>(v16); 1494 split<f32, 32>(v16, w0, w1, w2, w3); 1495 w4 = cgather<4, 5>(ptr + 4); 1496 } 1497 1498 template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> 1499 KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args) 1500 { 1501 auto temp = concat(args...); 1502 if constexpr (transposed) 1503 temp = ctransposeinverse<sizeof...(N)>(temp); 1504 write(ptr_cast<T>(ptr), temp); 1505 } 1506 1507 template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> 1508 KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle) 1509 { 1510 return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1))); 1511 } 1512 template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> 1513 KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle) 1514 { 1515 return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1))); 1516 } 1517 1518 // Non-final 1519 template <typename T, size_t width, size_t radix, bool inverse, size_t... I> 1520 KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, 1521 cbool_t<inverse>, complex<T>* out, const complex<T>* in, 1522 const complex<T>* tw, size_t stride) 1523 { 1524 carray<cvec<T, width>, radix> inout; 1525 1526 swallow{ (inout.get(csize_t<I>()) = cread<width>(in + i + stride * I))... }; 1527 1528 butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...); 1529 1530 swallow{ ( 1531 cwrite<width>(out + i + stride * I, 1532 mul_tw<I, radix>(cbool_t<inverse>(), inout.template get<I>(), tw + i * (radix - 1))), 1533 0)... }; 1534 } 1535 1536 // Final 1537 template <typename T, size_t width, size_t radix, bool inverse, size_t... I> 1538 KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, 1539 cbool_t<inverse>, complex<T>* out, const complex<T>* in, size_t stride) 1540 { 1541 carray<cvec<T, width>, radix> inout; 1542 1543 // swallow{ ( inout.get( csize<I> ) = infn( i, I, cvec<T, width>( ) ) )... }; 1544 cread_transposed(ctrue, in + i * radix, inout.template get<I>()...); 1545 1546 butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...); 1547 1548 swallow{ (cwrite<width>(out + i + stride * I, inout.get(csize_t<I>())), 0)... }; 1549 } 1550 1551 template <size_t width, size_t radix, typename... Args> 1552 KFR_INTRINSIC void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args) 1553 { 1554 butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...); 1555 } 1556 1557 template <typename... Args> 1558 KFR_INTRINSIC void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...) 1559 { 1560 } 1561 template <size_t width, typename... Args> 1562 KFR_INTRINSIC void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args) 1563 { 1564 CMT_LOOP_NOUNROLL 1565 for (; i < count / width * width; i += width) 1566 butterfly(i, csize_t<width>(), std::forward<Args>(args)...); 1567 butterfly_cycle(i, count, csize_t<width / 2>(), std::forward<Args>(args)...); 1568 } 1569 1570 template <size_t width, typename... Args> 1571 KFR_INTRINSIC void butterflies(size_t count, csize_t<width>, Args&&... args) 1572 { 1573 CMT_ASSUME(count > 0); 1574 size_t i = 0; 1575 butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...); 1576 } 1577 1578 template <typename T, bool inverse, typename Tradix, typename Tstride> 1579 KFR_INTRINSIC void generic_butterfly_cycle(csize_t<0>, Tradix, cbool_t<inverse>, complex<T>*, 1580 const complex<T>*, Tstride, size_t, size_t, const complex<T>*, 1581 size_t) 1582 { 1583 } 1584 1585 template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix, 1586 typename Thalfradixsqr, typename Tstride> 1587 KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out, 1588 const complex<T>* in, Tstride ostride, Thalfradix halfradix, 1589 Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i) 1590 { 1591 CMT_LOOP_NOUNROLL 1592 for (; i < halfradix / width * width; i += width) 1593 { 1594 const cvec<T, 1> in0 = cread<1>(in); 1595 cvec<T, width> sum0 = resize<2 * width>(in0); 1596 cvec<T, width> sum1 = sum0; 1597 1598 for (size_t j = 0; j < halfradix; j++) 1599 { 1600 const cvec<T, 1> ina = cread<1>(in + (1 + j)); 1601 const cvec<T, 1> inb = cread<1>(in + radix - (j + 1)); 1602 cvec<T, width> tw = cread<width>(twiddle); 1603 if constexpr (inverse) 1604 tw = negodd /*cconj*/ (tw); 1605 1606 cmul_2conj(sum0, sum1, ina, inb, tw); 1607 twiddle += halfradix; 1608 } 1609 twiddle = twiddle - halfradix_sqr + width; 1610 1611 if (is_constant_val(ostride)) 1612 { 1613 cwrite<width>(out + (1 + i), sum0); 1614 cwrite<width>(out + (radix - (i + 1)) - (width - 1), reverse<2>(sum1)); 1615 } 1616 else 1617 { 1618 cscatter<width>(out + (i + 1) * ostride, ostride, sum0); 1619 cscatter<width>(out + (radix - (i + 1)) * ostride - (width - 1) * ostride, ostride, 1620 reverse<2>(sum1)); 1621 } 1622 } 1623 generic_butterfly_cycle(csize_t<width / 2>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, 1624 halfradix_sqr, twiddle, i); 1625 } 1626 1627 template <typename T> 1628 KFR_INTRINSIC vec<T, 2> hcadd(vec<T, 2> value) 1629 { 1630 return value; 1631 } 1632 template <typename T, size_t N, KFR_ENABLE_IF(N >= 4)> 1633 KFR_INTRINSIC vec<T, 2> hcadd(vec<T, N> value) 1634 { 1635 return hcadd(low(value) + high(value)); 1636 } 1637 1638 template <size_t width, typename T, bool inverse, typename Tstride = csize_t<1>> 1639 KFR_INTRINSIC void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, 1640 const complex<T>* twiddle, Tstride ostride = Tstride{}) 1641 { 1642 CMT_ASSUME(radix > 0); 1643 { 1644 cvec<T, width> sum = T(); 1645 size_t j = 0; 1646 CMT_LOOP_NOUNROLL 1647 for (; j < radix / width * width; j += width) 1648 { 1649 sum += cread<width>(in + j); 1650 } 1651 cvec<T, 1> sums = T(); 1652 CMT_LOOP_NOUNROLL 1653 for (; j < radix; j++) 1654 { 1655 sums += cread<1>(in + j); 1656 } 1657 cwrite<1>(out, hcadd(sum) + sums); 1658 } 1659 const auto halfradix = radix / 2; 1660 CMT_ASSUME(halfradix > 0); 1661 size_t i = 0; 1662 1663 generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, 1664 halfradix * halfradix, twiddle, i); 1665 } 1666 1667 template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>> 1668 KFR_INTRINSIC void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out, 1669 const complex<T>* in, const complex<T>* twiddle, 1670 Tstride ostride = Tstride{}) 1671 { 1672 { 1673 cvec<T, width> sum = T(); 1674 size_t j = 0; 1675 CMT_LOOP_UNROLL 1676 for (; j < radix / width * width; j += width) 1677 { 1678 sum += cread<width>(in + j); 1679 } 1680 cvec<T, 1> sums = T(); 1681 CMT_LOOP_UNROLL 1682 for (; j < radix; j++) 1683 { 1684 sums += cread<1>(in + j); 1685 } 1686 cwrite<1>(out, hcadd(sum) + sums); 1687 } 1688 const size_t halfradix = radix / 2; 1689 const size_t halfradix_sqr = halfradix * halfradix; 1690 CMT_ASSUME(halfradix > 0); 1691 size_t i = 0; 1692 1693 generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, 1694 halfradix_sqr, twiddle, i); 1695 } 1696 1697 template <typename T, bool inverse, typename Tstride = csize_t<1>> 1698 KFR_INTRINSIC void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, 1699 complex<T>*, const complex<T>* twiddle, Tstride ostride = {}) 1700 { 1701 cswitch( 1702 csizes_t<11, 13>(), radix, 1703 [&](auto radix_) CMT_INLINE_LAMBDA 1704 { 1705 constexpr size_t width = vector_width<T>; 1706 spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride); 1707 }, 1708 [&]() CMT_INLINE_LAMBDA 1709 { 1710 constexpr size_t width = vector_width<T>; 1711 generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride); 1712 }); 1713 } 1714 1715 template <typename T, size_t N> 1716 constexpr cvec<T, N> cmask08 = broadcast<N * 2, T>(T(), -T()); 1717 1718 template <typename T, size_t N> 1719 constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T()); 1720 1721 template <bool A = false, typename T, size_t N> 1722 KFR_INTRINSIC void cbitreverse_write(complex<T>* dest, const vec<T, N>& x) 1723 { 1724 cwrite<N / 2, A>(dest, bitreverse<2>(x)); 1725 } 1726 1727 template <bool A = false, typename T, size_t N> 1728 KFR_INTRINSIC void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x) 1729 { 1730 cwrite<N / 2, A>(dest, digitreverse4<2>(x)); 1731 } 1732 1733 template <size_t N, bool A = false, typename T> 1734 KFR_INTRINSIC cvec<T, N> cbitreverse_read(const complex<T>* src) 1735 { 1736 return bitreverse<2>(cread<N, A>(src)); 1737 } 1738 1739 template <size_t N, bool A = false, typename T> 1740 KFR_INTRINSIC cvec<T, N> cdigitreverse4_read(const complex<T>* src) 1741 { 1742 return digitreverse4<2>(cread<N, A>(src)); 1743 } 1744 1745 #if 1 1746 1747 template <> 1748 KFR_INTRINSIC cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src) 1749 { 1750 return concat(cread<1>(src + 0), cread<1>(src + 4), cread<1>(src + 8), cread<1>(src + 12), 1751 cread<1>(src + 1), cread<1>(src + 5), cread<1>(src + 9), cread<1>(src + 13), 1752 cread<1>(src + 2), cread<1>(src + 6), cread<1>(src + 10), cread<1>(src + 14), 1753 cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15)); 1754 } 1755 template <> 1756 KFR_INTRINSIC void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x) 1757 { 1758 cwrite<1>(dest, part<16, 0>(x)); 1759 cwrite<1>(dest + 4, part<16, 1>(x)); 1760 cwrite<1>(dest + 8, part<16, 2>(x)); 1761 cwrite<1>(dest + 12, part<16, 3>(x)); 1762 1763 cwrite<1>(dest + 1, part<16, 4>(x)); 1764 cwrite<1>(dest + 5, part<16, 5>(x)); 1765 cwrite<1>(dest + 9, part<16, 6>(x)); 1766 cwrite<1>(dest + 13, part<16, 7>(x)); 1767 1768 cwrite<1>(dest + 2, part<16, 8>(x)); 1769 cwrite<1>(dest + 6, part<16, 9>(x)); 1770 cwrite<1>(dest + 10, part<16, 10>(x)); 1771 cwrite<1>(dest + 14, part<16, 11>(x)); 1772 1773 cwrite<1>(dest + 3, part<16, 12>(x)); 1774 cwrite<1>(dest + 7, part<16, 13>(x)); 1775 cwrite<1>(dest + 11, part<16, 14>(x)); 1776 cwrite<1>(dest + 15, part<16, 15>(x)); 1777 } 1778 #endif 1779 } // namespace intrinsics 1780 } // namespace CMT_ARCH_NAME 1781 } // namespace kfr 1782 1783 CMT_PRAGMA_MSVC(warning(pop)) 1784 1785 CMT_PRAGMA_GNU(GCC diagnostic pop)