kfr

Fast, modern C++ DSP framework, FFT, Sample Rate Conversion, FIR/IIR/Biquad Filters (SSE, AVX, AVX-512, ARM NEON)
Log | Files | Refs | README

commit 31b1063a50c152548a7dce7965f5c5345aa93e21
parent 0128b80180376f6d44cefc4d6669672f0e3d9fca
Author: [email protected] <[email protected]>
Date:   Sat,  2 Dec 2023 01:00:33 +0000

Move dft and io to src/

Diffstat:
MCMakeLists.txt | 115++++++++++++++++++++++---------------------------------------------------------
Dcapi/CMakeLists.txt | 131-------------------------------------------------------------------------------
Mexamples/CMakeLists.txt | 2+-
Mformat-all.py | 2+-
Dinclude/kfr/dft/data/sincos.hpp | 192-------------------------------------------------------------------------------
Dinclude/kfr/dft/impl/bitrev.hpp | 480-------------------------------------------------------------------------------
Dinclude/kfr/dft/impl/convolution-impl.cpp | 307-------------------------------------------------------------------------------
Dinclude/kfr/dft/impl/dft-fft.hpp | 114-------------------------------------------------------------------------------
Dinclude/kfr/dft/impl/dft-impl.hpp | 568-------------------------------------------------------------------------------
Dinclude/kfr/dft/impl/dft-templates.hpp | 41-----------------------------------------
Dinclude/kfr/dft/impl/fft-templates.hpp | 39---------------------------------------
Dinclude/kfr/dft/impl/ft.hpp | 1785-------------------------------------------------------------------------------
Dinclude/kfr/dsp/impl/dsp-impl.cpp | 29-----------------------------
Minclude/kfr/io/audiofile.hpp | 396++++++++++++++++++-------------------------------------------------------------
Dinclude/kfr/io/impl/audiofile-impl.cpp | 49-------------------------------------------------
Msources.cmake | 45++++++++++++---------------------------------
Asrc/capi/CMakeLists.txt | 127+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rcapi/capi.cpp -> src/capi/capi.cpp | 0
Asrc/capi/dsp.cpp | 28++++++++++++++++++++++++++++
Asrc/dft/CMakeLists.txt | 74++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/dft/bitrev.hpp | 480+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/dft/convolution-impl.cpp | 307+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rinclude/kfr/dft/data/bitrev.hpp -> src/dft/data/bitrev.hpp | 0
Asrc/dft/data/sincos.hpp | 192+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/dft/dft-fft.hpp | 114+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rinclude/kfr/dft/impl/dft-impl-f32.cpp -> src/dft/dft-impl-f32.cpp | 0
Rinclude/kfr/dft/impl/dft-impl-f64.cpp -> src/dft/dft-impl-f64.cpp | 0
Asrc/dft/dft-impl.hpp | 568+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/dft/dft-templates.hpp | 41+++++++++++++++++++++++++++++++++++++++++
Rinclude/kfr/dft/impl/fft-impl-f32.cpp -> src/dft/fft-impl-f32.cpp | 0
Rinclude/kfr/dft/impl/fft-impl-f64.cpp -> src/dft/fft-impl-f64.cpp | 0
Rinclude/kfr/dft/impl/fft-impl.hpp -> src/dft/fft-impl.hpp | 0
Asrc/dft/fft-templates.hpp | 39+++++++++++++++++++++++++++++++++++++++
Asrc/dft/ft.hpp | 1785+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Asrc/io/CMakeLists.txt | 0
Asrc/io/audiofile-impl.cpp | 407+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
Rinclude/kfr/io/dr/README.txt -> src/io/dr/README.txt | 0
Rinclude/kfr/io/dr/dr_flac.h -> src/io/dr/dr_flac.h | 0
Rinclude/kfr/io/dr/dr_mp3.h -> src/io/dr/dr_mp3.h | 0
Rinclude/kfr/io/dr/dr_wav.h -> src/io/dr/dr_wav.h | 0
Mtests/CMakeLists.txt | 2+-
Mtests/dft_test.cpp | 2+-
Mtools/CMakeLists.txt | 2+-
Mupdate-sources.py | 4++--
44 files changed, 4300 insertions(+), 4167 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt @@ -14,7 +14,7 @@ # You should have received a copy of the GNU General Public License # along with KFR. -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.12) project(kfr CXX) @@ -61,6 +61,8 @@ endif () include(sources.cmake) include(CMakeDependentOption) +add_subdirectory(src/io) + option(ENABLE_TESTS "Enable KFR tests" OFF) cmake_dependent_option(ENABLE_EXAMPLES "Enable KFR examples" ON "ENABLE_TESTS" OFF) if (CLANG) @@ -89,6 +91,35 @@ mark_as_advanced(KFR_ENABLE_ASMTEST) mark_as_advanced(KFR_REGENERATE_TESTS) mark_as_advanced(KFR_DISABLE_CLANG_EXTENSIONS) +if (KFR_ENABLE_CAPI_BUILD AND NOT KFR_ENABLE_DFT) + message(FATAL_ERROR "KFR_ENABLE_CAPI_BUILD requires KFR_ENABLE_DFT to be enabled") +endif () +if (KFR_ENABLE_CAPI_BUILD AND NOT KFR_ENABLE_DFT_NP) + message(FATAL_ERROR "KFR_ENABLE_CAPI_BUILD requires KFR_ENABLE_DFT_NP to be enabled") +endif () +if (KFR_ENABLE_CAPI_BUILD AND KFR_ENABLE_DFT_MULTIARCH) + message(FATAL_ERROR "KFR_ENABLE_CAPI_BUILD requires KFR_ENABLE_DFT_MULTIARCH to be disabled") +endif () + +include(cmake/target_set_arch.cmake) + +function (link_as_whole TARGET TYPE LIBRARY) + if (APPLE) + target_link_options(${TARGET} ${TYPE} -Wl,-force_load $<TARGET_FILE:${LIBRARY}>) + elseif (WIN32) + target_link_options(${TARGET} ${TYPE} /WHOLEARCHIVE:$<TARGET_FILE:${LIBRARY}>) + else () + target_link_options(${TARGET} ${TYPE} -Wl,--push-state,--whole-archive $<TARGET_FILE:${LIBRARY}> -Wl,--pop-state) + endif () +endfunction() + +if (KFR_ENABLE_DFT) + add_subdirectory(src/dft) +endif () +if (KFR_ENABLE_CAPI_BUILD) + add_subdirectory(src/capi) +endif () + if (NOT KFR_ARCH) set(KFR_ARCH detect) endif () @@ -124,8 +155,6 @@ else () ) endif () -include(cmake/target_set_arch.cmake) - add_library(use_arch INTERFACE) target_set_arch(use_arch INTERFACE ${KFR_ARCH}) @@ -147,12 +176,6 @@ else () set(PTHREAD_LIB pthread) endif () -if (MSVC AND CLANG) - set(CLANG_ARG_PREFIX "SHELL:-Xclang ") -else () - 
set(CLANG_ARG_PREFIX "") -endif () - # KFR library add_library(kfr INTERFACE) target_sources(kfr INTERFACE ${KFR_SRC}) @@ -201,16 +224,6 @@ if (X86) target_set_arch(detect_cpu PRIVATE generic) endif () -function (link_as_whole TARGET TYPE LIBRARY) - if (APPLE) - target_link_options(${TARGET} ${TYPE} -Wl,-force_load $<TARGET_FILE:${LIBRARY}>) - elseif (WIN32) - target_link_options(${TARGET} ${TYPE} /WHOLEARCHIVE:$<TARGET_FILE:${LIBRARY}>) - else () - target_link_options(${TARGET} ${TYPE} -Wl,--push-state,--whole-archive $<TARGET_FILE:${LIBRARY}> -Wl,--pop-state) - endif () -endfunction() - function (add_arch_library NAME ARCH SRCS DEFS) add_library(${NAME}_${ARCH} ${SRCS}) target_link_libraries(${NAME}_${ARCH} kfr) @@ -219,53 +232,6 @@ function (add_arch_library NAME ARCH SRCS DEFS) target_link_libraries(${NAME}_all INTERFACE ${NAME}_${ARCH}) endfunction () -if (KFR_ENABLE_DFT) - - if (X86) - set(KFR_DFT_DEFS ${CLANG_ARG_PREFIX}-ffp-contract=fast -Xclang -O3 -mllvm -x86-use-vzeroupper=0) - else() - set(KFR_DFT_DEFS ${CLANG_ARG_PREFIX}-ffp-contract=fast -Xclang -O3) - endif () - - if (KFR_ENABLE_DFT_MULTIARCH) - add_library(kfr_dft INTERFACE) - add_library(kfr_dft_all INTERFACE) - target_link_libraries(kfr_dft INTERFACE kfr kfr_dft_all) - target_compile_definitions( - kfr_dft - INTERFACE -DKFR_DFT_MULTI=1 - -DCMT_MULTI=1 - -DCMT_MULTI_ENABLED_SSE2=1 - -DCMT_MULTI_ENABLED_SSE41=1 - -DCMT_MULTI_ENABLED_AVX=1 - -DCMT_MULTI_ENABLED_AVX2=1 - -DCMT_MULTI_ENABLED_AVX512=1) - - add_arch_library(kfr_dft sse2 "${KFR_DFT_SRC}" "${KFR_DFT_DEFS}") - add_arch_library(kfr_dft sse41 "${KFR_DFT_SRC}" "${KFR_DFT_DEFS}") - add_arch_library(kfr_dft avx "${KFR_DFT_SRC}" "${KFR_DFT_DEFS}") - add_arch_library(kfr_dft avx2 "${KFR_DFT_SRC}" "${KFR_DFT_DEFS}") - add_arch_library(kfr_dft avx512 "${KFR_DFT_SRC}" "${KFR_DFT_DEFS}") - - link_as_whole(kfr_dft_all INTERFACE kfr_dft_sse2) - - else () - add_library(kfr_dft ${KFR_DFT_SRC}) - target_link_libraries(kfr_dft kfr use_arch) - 
target_compile_options(kfr_dft PRIVATE ${KFR_DFT_DEFS}) - if (KFR_ENABLE_DFT_NP) - target_compile_definitions(kfr_dft PUBLIC -DKFR_DFT_NPo2) - else () - target_compile_definitions(kfr_dft PUBLIC -DKFR_DFT_NO_NPo2) - endif () - - endif () - - if (KFR_ENABLE_CAPI_BUILD) - add_subdirectory(capi) - endif () -endif () - if (ENABLE_EXAMPLES) add_subdirectory(examples) add_subdirectory(tools) @@ -318,23 +284,6 @@ set(kfr_defines "#define ${kfr_defines}\n") file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/kfr_config.h "${kfr_defines}") -if (KFR_ENABLE_DFT AND KFR_INSTALL_LIBRARIES) - if (KFR_ENABLE_DFT_MULTIARCH) - install( - TARGETS kfr_dft_sse2 kfr_dft_sse41 kfr_dft_avx kfr_dft_avx2 - kfr_dft_avx512 - ARCHIVE DESTINATION lib - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin) - else () - install( - TARGETS kfr_dft - ARCHIVE DESTINATION lib - LIBRARY DESTINATION lib - RUNTIME DESTINATION bin) - endif () -endif () - if (KFR_INSTALL_HEADERS) install(DIRECTORY include/kfr DESTINATION include) diff --git a/capi/CMakeLists.txt b/capi/CMakeLists.txt @@ -1,131 +0,0 @@ -# Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) -# This file is part of KFR -# -# KFR is free software: you can redistribute it and/or modify -# it under the terms of the GNU General Public License as published by -# the Free Software Foundation, either version 2 of the License, or -# (at your option) any later version. -# -# KFR is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU General Public License for more details. -# -# You should have received a copy of the GNU General Public License -# along with KFR. 
- -cmake_minimum_required(VERSION 3.10) - -if (WIN32) - set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) -endif () - -set(CMAKE_CXX_VISIBILITY_PRESET "default") -set(CMAKE_C_VISIBILITY_PRESET "default") - -if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - add_compile_options(-fdiagnostics-absolute-paths) -endif () - -if (MSVC) - set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") - set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") -endif () - -if (APPLE) - add_compile_options(-mmacosx-version-min=10.9) -endif () - -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${PROJECT_BINARY_DIR}/bin) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELWITHDEBINFO ${PROJECT_BINARY_DIR}/lib) -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELWITHDEBINFO ${PROJECT_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib) - -add_library(kfr_capi_all INTERFACE) -target_link_libraries(kfr_capi_all INTERFACE kfr) -if (NOT WIN32) - add_library(kfr_capi_all_pic INTERFACE) - target_link_libraries(kfr_capi_all_pic INTERFACE kfr) -endif () - -function (add_c_library ARCH) - add_library( - kfr_capi_${ARCH} STATIC - ${KFR_DFT_SRC} - ${CMAKE_CURRENT_SOURCE_DIR}/../include/kfr/dsp/impl/dsp-impl.cpp) - target_link_libraries(kfr_capi_${ARCH} kfr) - target_set_arch(kfr_capi_${ARCH} PRIVATE ${ARCH}) - target_compile_options(kfr_capi_${ARCH} PRIVATE "${CLANG_ARG_PREFIX}-ffp-contract=fast") - target_link_libraries(kfr_capi_all INTERFACE kfr_capi_${ARCH}) - - if (NOT WIN32) - add_library( - kfr_capi_${ARCH}_pic STATIC - ${KFR_DFT_SRC} - ${CMAKE_CURRENT_SOURCE_DIR}/../include/kfr/dsp/impl/dsp-impl.cpp) - set_property(TARGET kfr_capi_${ARCH}_pic 
- PROPERTY POSITION_INDEPENDENT_CODE 1) - target_link_libraries(kfr_capi_${ARCH}_pic kfr) - target_set_arch(kfr_capi_${ARCH}_pic PRIVATE ${ARCH}) - target_compile_options(kfr_capi_${ARCH}_pic PRIVATE "${CLANG_ARG_PREFIX}-ffp-contract=fast") - - target_link_libraries(kfr_capi_all_pic INTERFACE kfr_capi_${ARCH}_pic) - endif () -endfunction () - -add_library(kfr_capi SHARED ${PROJECT_SOURCE_DIR}/capi/capi.cpp) - -add_c_library(sse2) -add_c_library(sse41) -add_c_library(avx) -add_c_library(avx2) -add_c_library(avx512) - -link_as_whole(kfr_capi_all INTERFACE kfr_capi_sse2) -if (NOT WIN32) - link_as_whole(kfr_capi_all_pic INTERFACE kfr_capi_sse2_pic) -endif() - -target_compile_definitions( - kfr_capi - PRIVATE -DKFR_DFT_MULTI=1 - -DCMT_MULTI=1 - -DCMT_MULTI_ENABLED_SSE2=1 - -DCMT_MULTI_ENABLED_SSE41=1 - -DCMT_MULTI_ENABLED_AVX=1 - -DCMT_MULTI_ENABLED_AVX2=1 - -DCMT_MULTI_ENABLED_AVX512=1 - -DKFR_BUILDING_DLL=1) - -target_set_arch(kfr_capi PRIVATE sse2) - -if (WIN32) - target_link_libraries(kfr_capi PRIVATE kfr kfr_capi_all) -else () - target_link_libraries(kfr_capi PRIVATE kfr kfr_capi_all_pic) - - if (APPLE) - message( - STATUS - "Minimum macOS version is set to ${CMAKE_OSX_DEPLOYMENT_TARGET}" - ) - message(STATUS "Set CMAKE_OSX_DEPLOYMENT_TARGET variable to change") - else () - set_property( - TARGET kfr_capi APPEND - PROPERTY LINK_LIBRARIES - -nodefaultlibs - -Wl,-Bdynamic - -lm - -lc - -Wl,-Bstatic - -lstdc++ - -lgcc - -s) - endif () -endif () diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt @@ -14,7 +14,7 @@ # You should have received a copy of the GNU General Public License # along with KFR. 
-cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.12) # Binary output directories set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) diff --git a/format-all.py b/format-all.py @@ -10,7 +10,7 @@ import glob path = os.path.dirname(os.path.realpath(__file__)) masks = ['*.hpp', '*.h', '*.cpp', '*.c', '*.cxx'] -ignore = ['build/*', 'build-*', 'cmake-*', '.*', 'include/kfr/io/dr'] +ignore = ['build/*', 'build-*', 'cmake-*', '.*', 'src/io/dr'] filenames = [] for root, dirnames, files in os.walk(path, path): diff --git a/include/kfr/dft/data/sincos.hpp b/include/kfr/dft/data/sincos.hpp @@ -1,192 +0,0 @@ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../../kfr.h" -#include "../../simd/types.hpp" -#include <cstdint> - -namespace kfr -{ - -namespace data -{ - -template <typename T> -constexpr inline T c_sin_table[65] = { - /* sin(2*pi* 0/ 256) */ f32(0.0), - /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547), - /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475), - /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133), - /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114), - /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752), - /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062), - /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663), - /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277), - /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484), - /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321), - /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421), - /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915), - /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934), - /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778), - /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023), - /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613), - /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119), - /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046), - /* sin(2*pi* 19/ 256) */ f32(0.4496113296546066000462945794242270758832), - /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575), - /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397), - /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608), - /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603), - /* sin(2*pi* 24/ 
256) */ f32(0.5555702330196022247428308139485328743749), - /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776), - /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119), - /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594), - /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757), - /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769), - /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288), - /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455), - /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848), - /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831), - /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729), - /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404), - /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971), - /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565), - /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796), - /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471), - /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386), - /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198), - /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425), - /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108), - /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084), - /* sin(2*pi* 45/ 256) */ f32(0.8932243011955153203424164474933979780006), - /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101), - /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447), - /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224), - /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295), - /* sin(2*pi* 50/ 256) */ 
f32(0.9415440651830207784125094025995023571856), - /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522), - /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828), - /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631), - /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569), - /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644), - /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739), - /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601), - /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837), - /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108), - /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755), - /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717), - /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432), - /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501), - /* sin(2*pi* 64/ 256) */ f32(1.0000000000000000000000000000000000000000) -}; - -// data generated by mpfr -template <> -constexpr inline f64 c_sin_table<f64>[65] = { - /* sin(2*pi* 0/ 256) */ f64(0.0), - /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547), - /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475), - /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133), - /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114), - /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752), - /* sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062), - /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663), - /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277), - /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484), - /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321), - /* 
sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421), - /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915), - /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934), - /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778), - /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023), - /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613), - /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119), - /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046), - /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832), - /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575), - /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397), - /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608), - /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603), - /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749), - /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776), - /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119), - /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594), - /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757), - /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769), - /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288), - /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455), - /* sin(2*pi* 32/ 256) */ f64(0.7071067811865475244008443621048490392848), - /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831), - /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729), - /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404), - /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971), - /* sin(2*pi* 37/ 256) */ 
f64(0.7883464276266062620091647053596892826565), - /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796), - /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471), - /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386), - /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198), - /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425), - /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108), - /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084), - /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006), - /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101), - /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447), - /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224), - /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295), - /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856), - /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522), - /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828), - /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631), - /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569), - /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644), - /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739), - /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601), - /* sin(2*pi* 58/ 256) */ f64(0.9891765099647809734516737380162430639837), - /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108), - /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755), - /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717), - /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432), - /* sin(2*pi* 63/ 256) */ 
f64(0.9996988186962042201157656496661721968501), - /* sin(2*pi* 64/ 256) */ f64(1.0000000000000000000000000000000000000000) -}; - -} // namespace data - -template <typename T> -constexpr inline T sin_using_table_256(size_t k) -{ - return (k > 128 ? -1 : +1) * data::c_sin_table<T>[k % 128 >= 64 ? 128 - k % 128 : k % 128]; -} - -template <typename T> -constexpr inline T sin_using_table(size_t size, size_t k) -{ - return sin_using_table_256<T>((k * 256 / size) % 256); -} -template <typename T> -constexpr inline T cos_using_table(size_t size, size_t k) -{ - return sin_using_table<T>(size, k + size / 4); -} -} // namespace kfr diff --git a/include/kfr/dft/impl/bitrev.hpp b/include/kfr/dft/impl/bitrev.hpp @@ -1,480 +0,0 @@ -/** @addtogroup dft - * @{ - */ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../../simd/complex.hpp" -#include "../../simd/constants.hpp" -#include "../../simd/digitreverse.hpp" -#include "../../simd/vec.hpp" - -#include "../data/bitrev.hpp" - -#include "ft.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -constexpr inline static bool fft_reorder_aligned = false; - -constexpr inline static size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table)); - -template <size_t Bits> -CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x) -{ - if constexpr (Bits > bitrev_table_log2N) - return bitreverse<Bits>(x); - - return data::bitrev_table[x] >> (bitrev_table_log2N - Bits); -} - -template <bool use_table> -CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits, cbool_t<use_table>) -{ - if constexpr (use_table) - { - return data::bitrev_table[x] >> (bitrev_table_log2N - bits); - } - else - { - return bitreverse<32>(x) >> (32 - bits); - } -} - -CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits) -{ - if (bits > bitrev_table_log2N) - { - if (bits <= 16) - return digitreverse4<16>(x) >> (16 - bits); - else - return digitreverse4<32>(x) >> (32 - bits); - } - - x = data::bitrev_table[x]; - x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1)); - x = x >> (bitrev_table_log2N - bits); - return x; -} - -template <size_t log2n, size_t bitrev, typename T> -KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i) -{ - using cxx = cvec<T, 16>; - constexpr size_t N = 1 << log2n; - constexpr size_t N4 = 2 * N / 4; - - cxx vi = cread_group<4, 4, N4 / 2, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i)); - vi = digitreverse<bitrev, 2>(vi); - cwrite_group<4, 4, N4 / 2, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), vi); -} - -template <size_t log2n, size_t bitrev, typename T> -KFR_INTRINSIC void fft_reorder_swap_two(T* inout, size_t i, size_t j) -{ - CMT_ASSUME(i != j); - using cxx = cvec<T, 16>; - constexpr size_t N = 1 << log2n; - constexpr 
size_t N4 = 2 * N / 4; - - cxx vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2); - cxx vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2); - - vi = digitreverse<bitrev, 2>(vi); - cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2, vi); - vj = digitreverse<bitrev, 2>(vj); - cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2, vj); -} - -template <size_t log2n, size_t bitrev, typename T> -KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i, size_t j) -{ - CMT_ASSUME(i != j); - using cxx = cvec<T, 16>; - constexpr size_t N = 1 << log2n; - constexpr size_t N4 = 2 * N / 4; - - cxx vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2); - cxx vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2); - - vi = digitreverse<bitrev, 2>(vi); - cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2, vi); - vj = digitreverse<bitrev, 2>(vj); - cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2, vj); -} - -template <size_t log2n, size_t bitrev, typename T> -KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i) -{ - fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2); -} - -template <size_t log2n, size_t bitrev, typename T> -KFR_INTRINSIC void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1) -{ - fft_reorder_swap_two<log2n, bitrev>(ptr_cast<T>(inout), i0 * 2, i1 * 2); -} - -template <size_t log2n, size_t bitrev, typename T> -KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i, size_t j) -{ - fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2, j * 2); -} - -template <typename T> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<11>) -{ - fft_reorder_swap_two<11>(inout, 0 * 4, 8 * 4); - fft_reorder_swap<11>(inout, 1 * 4, 64 * 4); - fft_reorder_swap<11>(inout, 2 * 4, 32 * 4); - 
fft_reorder_swap<11>(inout, 3 * 4, 96 * 4); - fft_reorder_swap<11>(inout, 4 * 4, 16 * 4); - fft_reorder_swap<11>(inout, 5 * 4, 80 * 4); - fft_reorder_swap<11>(inout, 6 * 4, 48 * 4); - fft_reorder_swap<11>(inout, 7 * 4, 112 * 4); - fft_reorder_swap<11>(inout, 9 * 4, 72 * 4); - fft_reorder_swap<11>(inout, 10 * 4, 40 * 4); - fft_reorder_swap<11>(inout, 11 * 4, 104 * 4); - fft_reorder_swap<11>(inout, 12 * 4, 24 * 4); - fft_reorder_swap<11>(inout, 13 * 4, 88 * 4); - fft_reorder_swap<11>(inout, 14 * 4, 56 * 4); - fft_reorder_swap<11>(inout, 15 * 4, 120 * 4); - fft_reorder_swap<11>(inout, 17 * 4, 68 * 4); - fft_reorder_swap<11>(inout, 18 * 4, 36 * 4); - fft_reorder_swap<11>(inout, 19 * 4, 100 * 4); - fft_reorder_swap_two<11>(inout, 20 * 4, 28 * 4); - fft_reorder_swap<11>(inout, 21 * 4, 84 * 4); - fft_reorder_swap<11>(inout, 22 * 4, 52 * 4); - fft_reorder_swap<11>(inout, 23 * 4, 116 * 4); - fft_reorder_swap<11>(inout, 25 * 4, 76 * 4); - fft_reorder_swap<11>(inout, 26 * 4, 44 * 4); - fft_reorder_swap<11>(inout, 27 * 4, 108 * 4); - fft_reorder_swap<11>(inout, 29 * 4, 92 * 4); - fft_reorder_swap<11>(inout, 30 * 4, 60 * 4); - fft_reorder_swap<11>(inout, 31 * 4, 124 * 4); - fft_reorder_swap<11>(inout, 33 * 4, 66 * 4); - fft_reorder_swap_two<11>(inout, 34 * 4, 42 * 4); - fft_reorder_swap<11>(inout, 35 * 4, 98 * 4); - fft_reorder_swap<11>(inout, 37 * 4, 82 * 4); - fft_reorder_swap<11>(inout, 38 * 4, 50 * 4); - fft_reorder_swap<11>(inout, 39 * 4, 114 * 4); - fft_reorder_swap<11>(inout, 41 * 4, 74 * 4); - fft_reorder_swap<11>(inout, 43 * 4, 106 * 4); - fft_reorder_swap<11>(inout, 45 * 4, 90 * 4); - fft_reorder_swap<11>(inout, 46 * 4, 58 * 4); - fft_reorder_swap<11>(inout, 47 * 4, 122 * 4); - fft_reorder_swap<11>(inout, 49 * 4, 70 * 4); - fft_reorder_swap<11>(inout, 51 * 4, 102 * 4); - fft_reorder_swap<11>(inout, 53 * 4, 86 * 4); - fft_reorder_swap_two<11>(inout, 54 * 4, 62 * 4); - fft_reorder_swap<11>(inout, 55 * 4, 118 * 4); - fft_reorder_swap<11>(inout, 57 * 4, 78 * 4); - 
fft_reorder_swap<11>(inout, 59 * 4, 110 * 4); - fft_reorder_swap<11>(inout, 61 * 4, 94 * 4); - fft_reorder_swap<11>(inout, 63 * 4, 126 * 4); - fft_reorder_swap_two<11>(inout, 65 * 4, 73 * 4); - fft_reorder_swap<11>(inout, 67 * 4, 97 * 4); - fft_reorder_swap<11>(inout, 69 * 4, 81 * 4); - fft_reorder_swap<11>(inout, 71 * 4, 113 * 4); - fft_reorder_swap<11>(inout, 75 * 4, 105 * 4); - fft_reorder_swap<11>(inout, 77 * 4, 89 * 4); - fft_reorder_swap<11>(inout, 79 * 4, 121 * 4); - fft_reorder_swap<11>(inout, 83 * 4, 101 * 4); - fft_reorder_swap_two<11>(inout, 85 * 4, 93 * 4); - fft_reorder_swap<11>(inout, 87 * 4, 117 * 4); - fft_reorder_swap<11>(inout, 91 * 4, 109 * 4); - fft_reorder_swap<11>(inout, 95 * 4, 125 * 4); - fft_reorder_swap_two<11>(inout, 99 * 4, 107 * 4); - fft_reorder_swap<11>(inout, 103 * 4, 115 * 4); - fft_reorder_swap<11>(inout, 111 * 4, 123 * 4); - fft_reorder_swap_two<11>(inout, 119 * 4, 127 * 4); -} - -template <typename T> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>) -{ - constexpr size_t bitrev = 2; - fft_reorder_swap_two<7, bitrev>(inout, 0 * 4, 2 * 4); - fft_reorder_swap<7, bitrev>(inout, 1 * 4, 4 * 4); - fft_reorder_swap<7, bitrev>(inout, 3 * 4, 6 * 4); - fft_reorder_swap_two<7, bitrev>(inout, 5 * 4, 7 * 4); -} - -template <typename T> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, cfalse_t /* use_br2 */) -{ - constexpr size_t bitrev = 4; - fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4); - fft_reorder_swap<8, bitrev>(inout, 1 * 4, 4 * 4); - fft_reorder_swap<8, bitrev>(inout, 2 * 4, 8 * 4); - fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4); - fft_reorder_swap<8, bitrev>(inout, 6 * 4, 9 * 4); - fft_reorder_swap<8, bitrev>(inout, 7 * 4, 13 * 4); - fft_reorder_swap_two<8, bitrev>(inout, 10 * 4, 15 * 4); - fft_reorder_swap<8, bitrev>(inout, 11 * 4, 14 * 4); -} - -template <typename T> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, ctrue_t /* use_br2 */) -{ - constexpr size_t bitrev = 2; - 
fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 6 * 4); - fft_reorder_swap<8, bitrev>(inout, 1 * 4, 8 * 4); - fft_reorder_swap<8, bitrev>(inout, 2 * 4, 4 * 4); - fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4); - fft_reorder_swap<8, bitrev>(inout, 5 * 4, 10 * 4); - fft_reorder_swap<8, bitrev>(inout, 7 * 4, 14 * 4); - fft_reorder_swap_two<8, bitrev>(inout, 9 * 4, 15 * 4); - fft_reorder_swap<8, bitrev>(inout, 11 * 4, 13 * 4); -} - -template <typename T> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>) -{ - constexpr size_t bitrev = 2; - fft_reorder_swap_two<9, bitrev>(inout, 0 * 4, 4 * 4); - fft_reorder_swap<9, bitrev>(inout, 1 * 4, 16 * 4); - fft_reorder_swap<9, bitrev>(inout, 2 * 4, 8 * 4); - fft_reorder_swap<9, bitrev>(inout, 3 * 4, 24 * 4); - fft_reorder_swap<9, bitrev>(inout, 5 * 4, 20 * 4); - fft_reorder_swap<9, bitrev>(inout, 6 * 4, 12 * 4); - fft_reorder_swap<9, bitrev>(inout, 7 * 4, 28 * 4); - fft_reorder_swap<9, bitrev>(inout, 9 * 4, 18 * 4); - fft_reorder_swap_two<9, bitrev>(inout, 10 * 4, 14 * 4); - fft_reorder_swap<9, bitrev>(inout, 11 * 4, 26 * 4); - fft_reorder_swap<9, bitrev>(inout, 13 * 4, 22 * 4); - fft_reorder_swap<9, bitrev>(inout, 15 * 4, 30 * 4); - fft_reorder_swap_two<9, bitrev>(inout, 17 * 4, 21 * 4); - fft_reorder_swap<9, bitrev>(inout, 19 * 4, 25 * 4); - fft_reorder_swap<9, bitrev>(inout, 23 * 4, 29 * 4); - fft_reorder_swap_two<9, bitrev>(inout, 27 * 4, 31 * 4); -} - -template <typename T> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<10>, ctrue_t /* use_br2 */) -{ - constexpr size_t bitrev = 2; - fft_reorder_swap_two<10, bitrev>(inout, 0 * 4, 12 * 4); - fft_reorder_swap<10, bitrev>(inout, 1 * 4, 32 * 4); - fft_reorder_swap<10, bitrev>(inout, 2 * 4, 16 * 4); - fft_reorder_swap<10, bitrev>(inout, 3 * 4, 48 * 4); - fft_reorder_swap<10, bitrev>(inout, 4 * 4, 8 * 4); - fft_reorder_swap<10, bitrev>(inout, 5 * 4, 40 * 4); - fft_reorder_swap<10, bitrev>(inout, 6 * 4, 24 * 4); - fft_reorder_swap<10, bitrev>(inout, 7 * 4, 
56 * 4); - fft_reorder_swap<10, bitrev>(inout, 9 * 4, 36 * 4); - fft_reorder_swap<10, bitrev>(inout, 10 * 4, 20 * 4); - fft_reorder_swap<10, bitrev>(inout, 11 * 4, 52 * 4); - fft_reorder_swap<10, bitrev>(inout, 13 * 4, 44 * 4); - fft_reorder_swap<10, bitrev>(inout, 14 * 4, 28 * 4); - fft_reorder_swap<10, bitrev>(inout, 15 * 4, 60 * 4); - fft_reorder_swap<10, bitrev>(inout, 17 * 4, 34 * 4); - fft_reorder_swap_two<10, bitrev>(inout, 18 * 4, 30 * 4); - fft_reorder_swap<10, bitrev>(inout, 19 * 4, 50 * 4); - fft_reorder_swap<10, bitrev>(inout, 21 * 4, 42 * 4); - fft_reorder_swap<10, bitrev>(inout, 22 * 4, 26 * 4); - fft_reorder_swap<10, bitrev>(inout, 23 * 4, 58 * 4); - fft_reorder_swap<10, bitrev>(inout, 25 * 4, 38 * 4); - fft_reorder_swap<10, bitrev>(inout, 27 * 4, 54 * 4); - fft_reorder_swap<10, bitrev>(inout, 29 * 4, 46 * 4); - fft_reorder_swap<10, bitrev>(inout, 31 * 4, 62 * 4); - fft_reorder_swap_two<10, bitrev>(inout, 33 * 4, 45 * 4); - fft_reorder_swap<10, bitrev>(inout, 35 * 4, 49 * 4); - fft_reorder_swap<10, bitrev>(inout, 37 * 4, 41 * 4); - fft_reorder_swap<10, bitrev>(inout, 39 * 4, 57 * 4); - fft_reorder_swap<10, bitrev>(inout, 43 * 4, 53 * 4); - fft_reorder_swap<10, bitrev>(inout, 47 * 4, 61 * 4); - fft_reorder_swap_two<10, bitrev>(inout, 51 * 4, 63 * 4); - fft_reorder_swap<10, bitrev>(inout, 55 * 4, 59 * 4); -} - -template <typename T, bool use_br2> -KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>) -{ - cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4, - digitreverse<(use_br2 ? 
2 : 4), 2>(value)); -} - -template <typename T, bool use_br2> -KFR_INTRINSIC void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>) -{ - CMT_ASSUME(i != j); - const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4); - const cvec<T, 16> vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4); - cwrite_reordered(inout + j, vi, N4, cbool_t<use_br2>()); - cwrite_reordered(inout + i, vj, N4, cbool_t<use_br2>()); -} - -template <typename T, bool use_table> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2, cbool_t<use_table>) -{ - const size_t N = size_t(1) << log2n; - const size_t N4 = N / 4; - const size_t iend = N / 16 * 4 * 2; - constexpr size_t istep = 2 * 4; - const size_t jstep1 = (1 << (log2n - 5)) * 4 * 2; - const size_t jstep2 = size_t(size_t(1) << (log2n - 5)) * 4 * 2 - size_t(size_t(1) << (log2n - 6)) * 4 * 2; - T* io = ptr_cast<T>(inout); - - for (size_t i = 0; i < iend;) - { - size_t j = bitrev_using_table(static_cast<u32>(i >> 3), log2n - 4, cbool<use_table>) << 3; - if (i >= j) - { - fft_reorder_swap_n4(io, i, j, N4, use_br2); - } - else - { - i += 4 * istep; - continue; - } - i += istep; - j = j + jstep1; - - if (i >= j) - { - fft_reorder_swap_n4(io, i, j, N4, use_br2); - } - i += istep; - j = j - jstep2; - - if (i >= j) - { - fft_reorder_swap_n4(io, i, j, N4, use_br2); - } - i += istep; - j = j + jstep1; - - if (i >= j) - { - fft_reorder_swap_n4(io, i, j, N4, use_br2); - } - i += istep; - } -} - -template <typename T> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) -{ - if (log2n - 4 > bitrev_table_log2N) - { - fft_reorder(inout, log2n, ctrue, cfalse); - } - else - { - fft_reorder(inout, log2n, ctrue, ctrue); - } -} - -template <typename T> -KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) -{ - const size_t N = size_t(1) << log2n; - const size_t N4 = N / 4; - 
const size_t N16 = N * 2 / 16; - size_t iend = N16; - constexpr size_t istep = 2 * 4; - const size_t jstep = N / 64 * 4 * 2; - T* io = ptr_cast<T>(inout); - - size_t i = 0; - CMT_PRAGMA_CLANG(clang loop unroll_count(2)) - for (; i < iend;) - { - size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; - - if (i >= j) - fft_reorder_swap_n4(io, i, j, N4, use_br2); - i += istep * 4; - } - iend += N16; - CMT_PRAGMA_CLANG(clang loop unroll_count(2)) - for (; i < iend;) - { - size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; - - fft_reorder_swap_n4(io, i, j, N4, use_br2); - - i += istep; - j = j + jstep; - - if (i >= j) - fft_reorder_swap_n4(io, i, j, N4, use_br2); - i += istep * 3; - } - iend += N16; - CMT_PRAGMA_CLANG(clang loop unroll_count(2)) - for (; i < iend;) - { - size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; - - fft_reorder_swap_n4(io, i, j, N4, use_br2); - - i += istep; - j = j + jstep; - - fft_reorder_swap_n4(io, i, j, N4, use_br2); - - i += istep; - j = j + jstep; - - if (i >= j) - fft_reorder_swap_n4(io, i, j, N4, use_br2); - i += istep * 2; - } - iend += N16; - CMT_PRAGMA_CLANG(clang loop unroll_count(2)) - for (; i < iend;) - { - size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; - - fft_reorder_swap_n4(io, i, j, N4, use_br2); - - i += istep; - j = j + jstep; - - fft_reorder_swap_n4(io, i, j, N4, use_br2); - - i += istep; - j = j + jstep; - - fft_reorder_swap_n4(io, i, j, N4, use_br2); - - i += istep; - j = j + jstep; - - if (i >= j) - fft_reorder_swap_n4(io, i, j, N4, use_br2); - i += istep; - } -} -} // namespace intrinsics -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/dft/impl/convolution-impl.cpp b/include/kfr/dft/impl/convolution-impl.cpp @@ -1,307 +0,0 @@ -/** @addtogroup dft - * @{ - */ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it 
and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ -#include "../../base/simd_expressions.hpp" -#include "../../simd/complex.hpp" -#include "../convolution.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -namespace intrinsics -{ - -template <typename T> -univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2) -{ - using ST = subtype<T>; - const size_t size = next_poweroftwo(src1.size() + src2.size() - 1); - univector<complex<ST>> src1padded = src1; - univector<complex<ST>> src2padded = src2; - src1padded.resize(size); - src2padded.resize(size); - - dft_plan_ptr<ST> dft = dft_cache::instance().get(ctype_t<ST>(), size); - univector<u8> temp(dft->temp_size); - dft->execute(src1padded, src1padded, temp); - dft->execute(src2padded, src2padded, temp); - src1padded = src1padded * src2padded; - dft->execute(src1padded, src1padded, temp, true); - const ST invsize = reciprocal<ST>(static_cast<ST>(size)); - return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; -} - -template <typename T> -univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2) -{ - using ST = subtype<T>; - const size_t size = next_poweroftwo(src1.size() + 
src2.size() - 1); - univector<complex<ST>> src1padded = src1; - univector<complex<ST>> src2padded = reverse(src2); - src1padded.resize(size); - src2padded.resize(size); - dft_plan_ptr<ST> dft = dft_cache::instance().get(ctype_t<ST>(), size); - univector<u8> temp(dft->temp_size); - dft->execute(src1padded, src1padded, temp); - dft->execute(src2padded, src2padded, temp); - src1padded = src1padded * src2padded; - dft->execute(src1padded, src1padded, temp, true); - const ST invsize = reciprocal<ST>(static_cast<ST>(size)); - return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; -} - -template <typename T> -univector<T> autocorrelate(const univector_ref<const T>& src1) -{ - univector<T> result = correlate(src1, src1); - result = result.slice(result.size() / 2); - return result; -} - -} // namespace intrinsics - -template <typename T> -convolve_filter<T>::convolve_filter(size_t size_, size_t block_size_) - : data_size(size_), block_size(next_poweroftwo(block_size_)), fft(2 * block_size), temp(fft.temp_size), - segments((data_size + block_size - 1) / block_size), position(0), ir_segments(segments.size()), - saved_input(block_size), input_position(0), premul(fft.csize()), cscratch(fft.csize()), - scratch1(fft.size), scratch2(fft.size), overlap(block_size) -{ -} - -template <typename T> -convolve_filter<T>::convolve_filter(const univector_ref<const T>& data, size_t block_size_) - : convolve_filter(data.size(), block_size_) -{ - set_data(data); -} - -template <typename T> -void convolve_filter<T>::set_data(const univector_ref<const T>& data) -{ - data_size = data.size(); - segments.resize((data_size + block_size - 1) / block_size); - ir_segments.resize(segments.size()); - univector<T> input(fft.size); - const ST ifftsize = reciprocal(static_cast<ST>(fft.size)); - for (size_t i = 0; i < ir_segments.size(); i++) - { - segments[i].resize(fft.csize()); - ir_segments[i].resize(fft.csize()); - input = padded(data.slice(i * block_size, block_size)); - - 
fft.execute(ir_segments[i], input, temp); - process(ir_segments[i], ir_segments[i] * ifftsize); - } - reset(); -} - -template <typename T> -void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) -{ - // Note that the conditionals in the following algorithm are meant to - // reduce complexity in the common cases of either processing complete - // blocks (processing == block_size) or only one segment. - - // For complex filtering, use CCs pack format to omit special processing in fft_multiply[_accumulate]. - const dft_pack_format fft_multiply_pack = this->real_fft ? dft_pack_format::Perm : dft_pack_format::CCs; - - size_t processed = 0; - while (processed < size) - { - // Calculate how many samples to process this iteration. - auto const processing = std::min(size - processed, block_size - input_position); - - // Prepare input to forward FFT: - if (processing == block_size) - { - // No need to work with saved_input. - builtin_memcpy(scratch1.data(), input + processed, processing * sizeof(T)); - } - else - { - // Append this iteration's input to the saved_input current block. - builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T)); - builtin_memcpy(scratch1.data(), saved_input.data(), block_size * sizeof(T)); - } - - // Forward FFT saved_input block. - fft.execute(segments[position], scratch1, temp); - - if (segments.size() == 1) - { - // Just one segment/block of history. - // Y_k = H * X_k - fft_multiply(cscratch, ir_segments[0], segments[0], fft_multiply_pack); - } - else - { - // More than one segment/block of history so this is more involved. - if (input_position == 0) - { - // At the start of an input block, we premultiply the history from - // previous input blocks with the extended filter blocks. 
- - // Y_(k-i,i) = H_i * X_(k-i) - // premul += Y_(k-i,i) for i=1,...,N - - fft_multiply(premul, ir_segments[1], segments[(position + 1) % segments.size()], - fft_multiply_pack); - for (size_t i = 2; i < segments.size(); i++) - { - const size_t n = (position + i) % segments.size(); - fft_multiply_accumulate(premul, ir_segments[i], segments[n], fft_multiply_pack); - } - } - // Y_(k,0) = H_0 * X_k - // Y_k = premul + Y_(k,0) - fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], fft_multiply_pack); - } - // y_k = IFFT( Y_k ) - fft.execute(scratch2, cscratch, temp, cinvert_t{}); - - // z_k = y_k + overlap - process(make_univector(output + processed, processing), - scratch2.slice(input_position, processing) + overlap.slice(input_position, processing)); - - input_position += processing; - processed += processing; - - // If a whole block was processed, prepare for next block. - if (input_position == block_size) - { - // Input block k is complete. Move to (k+1)-th input block. - input_position = 0; - - // Zero out the saved_input if it will be used in the next iteration. - auto const remaining = size - processed; - if (remaining < block_size && remaining > 0) - { - process(saved_input, zeros()); - } - - builtin_memcpy(overlap.data(), scratch2.data() + block_size, block_size * sizeof(T)); - - position = position > 0 ? 
position - 1 : segments.size() - 1; - } - } -} - -template <typename T> -void convolve_filter<T>::reset() -{ - for (auto& segment : segments) - { - process(segment, zeros()); - } - position = 0; - process(saved_input, zeros()); - input_position = 0; - process(overlap, zeros()); -} - -namespace intrinsics -{ - -template univector<float> convolve<float>(const univector_ref<const float>&, - const univector_ref<const float>&); -template univector<complex<float>> convolve<complex<float>>(const univector_ref<const complex<float>>&, - const univector_ref<const complex<float>>&); -template univector<float> correlate<float>(const univector_ref<const float>&, - const univector_ref<const float>&); -template univector<complex<float>> correlate<complex<float>>(const univector_ref<const complex<float>>&, - const univector_ref<const complex<float>>&); - -template univector<float> autocorrelate<float>(const univector_ref<const float>&); -template univector<complex<float>> autocorrelate<complex<float>>(const univector_ref<const complex<float>>&); - -} // namespace intrinsics - -template convolve_filter<float>::convolve_filter(size_t, size_t); -template convolve_filter<complex<float>>::convolve_filter(size_t, size_t); - -template convolve_filter<float>::convolve_filter(const univector_ref<const float>&, size_t); -template convolve_filter<complex<float>>::convolve_filter(const univector_ref<const complex<float>>&, size_t); - -template void convolve_filter<float>::set_data(const univector_ref<const float>&); -template void convolve_filter<complex<float>>::set_data(const univector_ref<const complex<float>>&); - -template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size); -template void convolve_filter<complex<float>>::process_buffer(complex<float>* output, - const complex<float>* input, size_t size); - -template void convolve_filter<float>::reset(); -template void convolve_filter<complex<float>>::reset(); - -namespace intrinsics -{ - -template 
univector<double> convolve<double>(const univector_ref<const double>&, - const univector_ref<const double>&); -template univector<complex<double>> convolve<complex<double>>(const univector_ref<const complex<double>>&, - const univector_ref<const complex<double>>&); -template univector<double> correlate<double>(const univector_ref<const double>&, - const univector_ref<const double>&); -template univector<complex<double>> correlate<complex<double>>(const univector_ref<const complex<double>>&, - const univector_ref<const complex<double>>&); - -template univector<double> autocorrelate<double>(const univector_ref<const double>&); -template univector<complex<double>> autocorrelate<complex<double>>( - const univector_ref<const complex<double>>&); - -} // namespace intrinsics - -template convolve_filter<double>::convolve_filter(size_t, size_t); -template convolve_filter<complex<double>>::convolve_filter(size_t, size_t); - -template convolve_filter<double>::convolve_filter(const univector_ref<const double>&, size_t); -template convolve_filter<complex<double>>::convolve_filter(const univector_ref<const complex<double>>&, - size_t); - -template void convolve_filter<double>::set_data(const univector_ref<const double>&); -template void convolve_filter<complex<double>>::set_data(const univector_ref<const complex<double>>&); - -template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size); -template void convolve_filter<complex<double>>::process_buffer(complex<double>* output, - const complex<double>* input, size_t size); - -template void convolve_filter<double>::reset(); -template void convolve_filter<complex<double>>::reset(); - -template <typename T> -filter<T>* make_convolve_filter(const univector_ref<const T>& taps, size_t block_size) -{ - return new convolve_filter<T>(taps, block_size); -} - -template filter<float>* make_convolve_filter(const univector_ref<const float>&, size_t); -template filter<complex<float>>* 
make_convolve_filter(const univector_ref<const complex<float>>&, size_t); -template filter<double>* make_convolve_filter(const univector_ref<const double>&, size_t); -template filter<complex<double>>* make_convolve_filter(const univector_ref<const complex<double>>&, size_t); - -} // namespace CMT_ARCH_NAME -} // namespace kfr diff --git a/include/kfr/dft/impl/dft-fft.hpp b/include/kfr/dft/impl/dft-fft.hpp @@ -1,114 +0,0 @@ -/** @addtogroup dft - * @{ - */ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../../base/basic_expressions.hpp" -#include "../../math/complex_math.hpp" -#include "../../testo/assert.hpp" -#include "../cache.hpp" -#include "../fft.hpp" -#include "bitrev.hpp" -#include "ft.hpp" - -namespace kfr -{ - -inline namespace CMT_ARCH_NAME -{ -namespace intrinsics -{ -struct name_test_impl -{ -}; -} // namespace intrinsics -} // namespace CMT_ARCH_NAME - -template <typename T, cpu_t cpu> -struct dft_name_impl -{ -}; - -template <typename Class> -inline const char* dft_name(Class*) -{ - constexpr static size_t prefix_len = ctype_name<intrinsics::name_test_impl>().length() - 14; - static constexpr cstring full_name = ctype_name<std::decay_t<Class>>(); - static constexpr cstring name_arch = - concat_cstring(full_name.slice(csize<prefix_len>), make_cstring("("), - make_cstring(CMT_STRINGIFY(CMT_ARCH_NAME)), make_cstring(")")); - return name_arch.c_str(); -} - -#define DFT_STAGE_FN \ - KFR_MEM_INTRINSIC void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) final \ - { \ - return do_execute<false>(out, in, temp); \ - } \ - KFR_MEM_INTRINSIC void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) final \ - { \ - return do_execute<true>(out, in, temp); \ - } -#define DFT_STAGE_FN_NONFINAL \ - void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ - { \ - return do_execute<false>(out, in, temp); \ - } \ - void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override \ - { \ - return do_execute<true>(out, in, temp); \ - } - -inline namespace CMT_ARCH_NAME -{ - -#define DFT_ASSERT TESTO_ASSERT_INACTIVE - -template <typename T> -constexpr size_t fft_vector_width = vector_width<T>; - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wassume") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") -#endif - -template <typename Stage, bool add_stages = true, typename T, typename... Args> -void add_stage(dft_plan<T>* plan, Args... 
args) -{ - dft_stage<T>* stage = new Stage(args...); - stage->need_reorder = true; - plan->data_size += stage->data_size; - plan->temp_size += stage->temp_size; - plan->all_stages.push_back(dft_stage_ptr<T>(stage)); - if constexpr (add_stages) - { - plan->stages[0].push_back(stage); - plan->stages[1].push_back(stage); - } -} - -} // namespace CMT_ARCH_NAME - -} // namespace kfr diff --git a/include/kfr/dft/impl/dft-impl.hpp b/include/kfr/dft/impl/dft-impl.hpp @@ -1,568 +0,0 @@ -/** @addtogroup dft - * @{ - */ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../../base/math_expressions.hpp" -#include "../../base/simd_expressions.hpp" -#include "dft-fft.hpp" - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wshadow") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") -#endif -#if CMT_HAS_WARNING("-Wunused-lambda-capture") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture") -#endif -#if CMT_HAS_WARNING("-Wpass-failed") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed") -#endif - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4100)) - -namespace kfr -{ - -inline namespace CMT_ARCH_NAME -{ -constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{}; - -namespace intrinsics -{ - -template <typename T> -void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) -{ - complex<T>* twiddle = ptr_cast<complex<T>>(stage->data); - const size_t N = stage->repeats * stage->radix; - const size_t Nord = stage->repeats; - size_t i = 0; - - while (width > 0) - { - CMT_LOOP_NOUNROLL - for (; i < Nord / width * width; i += width) - { - CMT_LOOP_NOUNROLL - for (size_t j = 1; j < stage->radix; j++) - { - CMT_LOOP_NOUNROLL - for (size_t k = 0; k < width; k++) - { - cvec<T, 1> xx = cossin_conj(broadcast<2, T>(c_pi<T, 2> * (i + k) * j / N)); - ref_cast<cvec<T, 1>>(twiddle[k]) = xx; - } - twiddle += width; - } - } - width = width / 2; - } -} - -template <typename T, size_t fixed_radix> -struct dft_stage_fixed_impl : dft_stage<T> -{ - dft_stage_fixed_impl(size_t, size_t iterations, size_t blocks) - { - this->name = dft_name(this); - this->radix = fixed_radix; - this->blocks = blocks; - this->repeats = iterations; - this->recursion = false; // true; - this->stage_size = fixed_radix * iterations * blocks; - this->data_size = align_up((this->repeats * (fixed_radix - 1)) * sizeof(complex<T>), - platform<>::native_cache_alignment); - } - - constexpr static size_t rradix = fixed_radix; - - constexpr static size_t width = fixed_radix >= 7 ? 
fft_vector_width<T> / 2 - : fixed_radix >= 4 ? fft_vector_width<T> - : fft_vector_width<T> * 2; - virtual void do_initialize(size_t) override final { dft_stage_fixed_initialize(this, width); } - - DFT_STAGE_FN - template <bool inverse> - KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - const size_t Nord = this->repeats; - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - - const size_t N = Nord * fixed_radix; - CMT_LOOP_NOUNROLL - for (size_t b = 0; b < this->blocks; b++) - { - butterflies(Nord, csize<width>, csize<fixed_radix>, cbool<inverse>, out, in, twiddle, Nord); - in += N; - out += N; - } - } -}; - -template <typename T, size_t fixed_radix> -struct dft_stage_fixed_final_impl : dft_stage<T> -{ - dft_stage_fixed_final_impl(size_t, size_t iterations, size_t blocks) - { - this->name = dft_name(this); - this->radix = fixed_radix; - this->blocks = blocks; - this->repeats = iterations; - this->stage_size = fixed_radix * iterations * blocks; - this->recursion = false; - this->can_inplace = false; - } - constexpr static size_t width = fixed_radix >= 7 ? fft_vector_width<T> / 2 - : fixed_radix >= 4 ? 
fft_vector_width<T> - : fft_vector_width<T> * 2; - - DFT_STAGE_FN - template <bool inverse> - KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - const size_t b = this->blocks; - - butterflies(b, csize<width>, csize<fixed_radix>, cbool<inverse>, out, in, b); - } -}; - -template <typename E> -inline E& apply_conj(E& e, cfalse_t) -{ - return e; -} - -template <typename E> -inline auto apply_conj(E& e, ctrue_t) -{ - return cconj(e); -} - -/// [0, N - 1, N - 2, N - 3, ..., 3, 2, 1] -template <typename E> -struct fft_inverse : expression_with_traits<E> -{ - using value_type = typename expression_with_traits<E>::value_type; - - KFR_MEM_INTRINSIC fft_inverse(E&& expr) CMT_NOEXCEPT : expression_with_traits<E>(std::forward<E>(expr)) {} - - friend KFR_INTRINSIC vec<value_type, 1> get_elements(const fft_inverse& self, shape<1> index, - axis_params<0, 1>) - { - const size_t size = get_shape(self).front(); - return get_elements(self.first(), index.front() == 0 ? 0 : size - index, axis_params<0, 1>()); - } - - template <size_t N> - friend KFR_MEM_INTRINSIC vec<value_type, N> get_elements(const fft_inverse& self, shape<1> index, - axis_params<0, N>) - { - const size_t size = get_shape(self).front(); - if (index.front() == 0) - { - return concat(get_elements(self.first(), index, axis_params<0, 1>()), - reverse(get_elements(self.first(), size - (N - 1), axis_params<0, N - 1>()))); - } - return reverse(get_elements(self.first(), size - index - (N - 1), axis_params<0, N>())); - } -}; - -template <typename E> -inline auto apply_fft_inverse(E&& e) -{ - return fft_inverse<E>(std::forward<E>(e)); -} - -template <typename T> -struct dft_arblen_stage_impl : dft_stage<T> -{ - dft_arblen_stage_impl(size_t size) - : size(size), fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal) - { - this->name = dft_name(this); - this->radix = size; - this->blocks = 1; - this->repeats = 1; - this->recursion = false; - this->can_inplace = false; - 
this->temp_size = plan.temp_size; - this->stage_size = size; - - chirp_ = render(cexp(sqr(linspace(T(1) - size, size - T(1), size * 2 - 1, true, ctrue)) * - complex<T>(0, -1) * c_pi<T> / size)); - - ichirpp_ = render(truncate(padded(1 / slice(chirp_, 0, 2 * size - 1)), fftsize)); - - univector<u8> temp(plan.temp_size); - plan.execute(ichirpp_, ichirpp_, temp); - xp.resize(fftsize, 0); - xp_fft.resize(fftsize); - invN2 = T(1) / fftsize; - } - - DFT_STAGE_FN - template <bool inverse> - KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) - { - const size_t n = this->size; - - auto&& chirp = apply_conj(chirp_, cbool<inverse>); - - xp.slice(0, n) = make_univector(in, n) * slice(chirp, n - 1); - - plan.execute(xp_fft.data(), xp.data(), temp); - - if (inverse) - xp_fft = xp_fft * cconj(apply_fft_inverse(ichirpp_)); - else - xp_fft = xp_fft * ichirpp_; - plan.execute(xp_fft.data(), xp_fft.data(), temp, ctrue); - - make_univector(out, n) = xp_fft.slice(n - 1, n) * slice(chirp, n - 1, n) * invN2; - } - - const size_t size; - const size_t fftsize; - T invN2; - dft_plan<T> plan; - univector<complex<T>> chirp_; - univector<complex<T>> ichirpp_; - univector<complex<T>> xp; - univector<complex<T>> xp_fft; -}; - -template <typename T, size_t radix1, size_t radix2, size_t size = radix1 * radix2> -struct dft_special_stage_impl : dft_stage<T> -{ - dft_special_stage_impl() : stage1(radix1, size / radix1, 1), stage2(radix2, 1, size / radix2) - { - this->name = dft_name(this); - this->radix = size; - this->blocks = 1; - this->repeats = 1; - this->recursion = false; - this->can_inplace = false; - this->stage_size = size; - this->temp_size = stage1.temp_size + stage2.temp_size + sizeof(complex<T>) * size; - this->data_size = stage1.data_size + stage2.data_size; - } - void dump() const override - { - dft_stage<T>::dump(); - printf(" "); - stage1.dump(); - printf(" "); - stage2.dump(); - } - void do_initialize(size_t stage_size) override - { - stage1.data = 
this->data; - stage2.data = this->data + stage1.data_size; - stage1.initialize(stage_size); - stage2.initialize(stage_size); - } - DFT_STAGE_FN - template <bool inverse> - KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) - { - complex<T>* scratch = ptr_cast<complex<T>>(temp + stage1.temp_size + stage2.temp_size); - stage1.do_execute(cbool<inverse>, scratch, in, temp); - stage2.do_execute(cbool<inverse>, out, scratch, temp + stage1.temp_size); - } - dft_stage_fixed_impl<T, radix1> stage1; - dft_stage_fixed_final_impl<T, radix2> stage2; -}; - -template <typename T, bool final> -struct dft_stage_generic_impl : dft_stage<T> -{ - dft_stage_generic_impl(size_t radix, size_t iterations, size_t blocks) - { - this->name = dft_name(this); - this->radix = radix; - this->blocks = blocks; - this->repeats = iterations; - this->recursion = false; // true; - this->can_inplace = false; - this->stage_size = radix * iterations * blocks; - this->temp_size = align_up(sizeof(complex<T>) * radix, platform<>::native_cache_alignment); - this->data_size = - align_up(sizeof(complex<T>) * sqr(this->radix / 2), platform<>::native_cache_alignment); - } - -protected: - virtual void do_initialize(size_t) override final - { - complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - CMT_LOOP_NOUNROLL - for (size_t i = 0; i < this->radix / 2; i++) - { - CMT_LOOP_NOUNROLL - for (size_t j = 0; j < this->radix / 2; j++) - { - cwrite<1>(twiddle++, cossin_conj(broadcast<2>((i + 1) * (j + 1) * c_pi<T, 2> / this->radix))); - } - } - } - - DFT_STAGE_FN - template <bool inverse> - KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) - { - const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); - const size_t bl = this->blocks; - - CMT_LOOP_NOUNROLL - for (size_t b = 0; b < bl; b++) - generic_butterfly(this->radix, cbool<inverse>, out + b, in + b * this->radix, - ptr_cast<complex<T>>(temp), twiddle, bl); - } -}; - -template <typename T, 
typename Tr2> -inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t r1, Tr2 first_radix) -{ - CMT_ASSUME(r0 > 1); - CMT_ASSUME(r1 > 1); - - CMT_LOOP_NOUNROLL - for (size_t p = 0; p < r0; p++) - { - const complex<T>* in1 = in; - CMT_LOOP_NOUNROLL - for (size_t i = 0; i < r1; i++) - { - const complex<T>* in2 = in1; - CMT_LOOP_UNROLL - for (size_t j = 0; j < first_radix; j++) - { - *out++ = *in2; - in2 += r1; - } - in1++; - in += first_radix; - } - } -} - -template <typename T, typename Tr2> -inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_t* radices, size_t count, - size_t index, size_t inscale, size_t inner_size, Tr2 first_radix) -{ - const bool b = index == 1; - const size_t radix = radices[index]; - if (b) - { - CMT_LOOP_NOUNROLL - for (size_t i = 0; i < radix; i++) - { - const complex<T>* in1 = in; - CMT_LOOP_UNROLL - for (size_t j = 0; j < first_radix; j++) - { - *out++ = *in1; - in1 += inner_size; - } - in += inscale; - } - } - else - { - const size_t steps = radix; - const size_t inscale_next = inscale * radix; - CMT_LOOP_NOUNROLL - for (size_t i = 0; i < steps; i++) - { - dft_permute_deep(out, in, radices, count, index - 1, inscale_next, inner_size, first_radix); - in += inscale; - } - } -} - -template <typename T> -struct dft_reorder_stage_impl : dft_stage<T> -{ - dft_reorder_stage_impl(const int* radices, size_t count) : count(count) - { - this->name = dft_name(this); - this->can_inplace = false; - this->data_size = 0; - std::copy(radices, radices + count, this->radices); - this->inner_size = 1; - this->size = 1; - for (size_t r = 0; r < count; r++) - { - if (r != 0 && r != count - 1) - this->inner_size *= radices[r]; - this->size *= radices[r]; - } - this->stage_size = this->size; - } - -protected: - size_t radices[32]; - size_t count = 0; - size_t size = 0; - size_t inner_size = 0; - virtual void do_initialize(size_t) override final {} - - DFT_STAGE_FN - template <bool inverse> - 
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) - { - cswitch( - dft_radices, radices[0], - [&](auto first_radix) - { - if (count == 3) - { - dft_permute(out, in, radices[2], radices[1], first_radix); - } - else - { - const size_t rlast = radices[count - 1]; - for (size_t p = 0; p < rlast; p++) - { - dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, first_radix); - in += size / rlast; - } - } - }, - [&]() - { - if (count == 3) - { - dft_permute(out, in, radices[2], radices[1], radices[0]); - } - else - { - const size_t rlast = radices[count - 1]; - for (size_t p = 0; p < rlast; p++) - { - dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, radices[0]); - in += size / rlast; - } - } - }); - } -}; -} // namespace intrinsics - -template <bool is_final, typename T> -void prepare_dft_stage(dft_plan<T>* self, size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>) -{ - return cswitch( - dft_radices, radix, - [self, iterations, blocks](auto radix) CMT_INLINE_LAMBDA - { - add_stage<std::conditional_t<is_final, intrinsics::dft_stage_fixed_final_impl<T, val_of(radix)>, - intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>( - self, radix, iterations, blocks); - }, - [self, radix, iterations, blocks]() - { add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(self, radix, iterations, blocks); }); -} - -template <typename T> -void init_dft(dft_plan<T>* self, size_t size, dft_order) -{ - if (size == 60) - { - add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(self); - } - else if (size == 48) - { - add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(self); - } - else - { - size_t cur_size = size; - constexpr size_t radices_count = dft_radices.back() + 1; - u8 count[radices_count] = { 0 }; - int radices[32] = { 0 }; - size_t radices_size = 0; - - cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], - [&](auto radix) - { - while (cur_size && cur_size % val_of(radix) 
== 0) - { - count[val_of(radix)]++; - cur_size /= val_of(radix); - } - }); - - int num_stages = 0; - if (cur_size >= 101) - { - add_stage<intrinsics::dft_arblen_stage_impl<T>>(self, size); - ++num_stages; - self->arblen = true; - } - else - { - size_t blocks = 1; - size_t iterations = size; - - for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++) - { - for (size_t i = 0; i < count[r]; i++) - { - iterations /= r; - radices[radices_size++] = static_cast<int>(r); - if (iterations == 1) - prepare_dft_stage(self, r, iterations, blocks, ctrue); - else - prepare_dft_stage(self, r, iterations, blocks, cfalse); - ++num_stages; - blocks *= r; - } - } - - if (cur_size > 1) - { - iterations /= cur_size; - radices[radices_size++] = static_cast<int>(cur_size); - if (iterations == 1) - prepare_dft_stage(self, cur_size, iterations, blocks, ctrue); - else - prepare_dft_stage(self, cur_size, iterations, blocks, cfalse); - ++num_stages; - } - - if (num_stages > 2) - add_stage<intrinsics::dft_reorder_stage_impl<T>>(self, radices, radices_size); - } - } -} - -} // namespace CMT_ARCH_NAME - -} // namespace kfr - -CMT_PRAGMA_GNU(GCC diagnostic pop) - -CMT_PRAGMA_MSVC(warning(pop)) diff --git a/include/kfr/dft/impl/dft-templates.hpp b/include/kfr/dft/impl/dft-templates.hpp @@ -1,41 +0,0 @@ -/** @addtogroup dft - * @{ - */ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ - -#ifdef FLOAT -#include "../fft.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -#ifndef KFR_DFT_NO_NPo2 -template void init_dft(dft_plan<FLOAT>*, size_t, dft_order); -#endif -} // namespace CMT_ARCH_NAME -} // namespace kfr - -#endif diff --git a/include/kfr/dft/impl/fft-templates.hpp b/include/kfr/dft/impl/fft-templates.hpp @@ -1,39 +0,0 @@ -/** @addtogroup dft - * @{ - */ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ - -#ifdef FLOAT -#include "../fft.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ -template void dft_initialize<FLOAT>(dft_plan<FLOAT>& plan); -template void dft_real_initialize<FLOAT>(dft_plan_real<FLOAT>& plan); -} // namespace CMT_ARCH_NAME -} // namespace kfr - -#endif diff --git a/include/kfr/dft/impl/ft.hpp b/include/kfr/dft/impl/ft.hpp @@ -1,1785 +0,0 @@ -/** @addtogroup dft - * @{ - */ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. - - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. 
- */ -#pragma once - -#include "../../base/univector.hpp" -#include "../../math/sin_cos.hpp" -#include "../../simd/complex.hpp" -#include "../../simd/constants.hpp" -#include "../../simd/digitreverse.hpp" -#include "../../simd/read_write.hpp" -#include "../../simd/vec.hpp" - -#include "../../base/memory.hpp" -#include "../data/sincos.hpp" - -CMT_PRAGMA_GNU(GCC diagnostic push) -#if CMT_HAS_WARNING("-Wpass-failed") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed") -#endif - -CMT_PRAGMA_MSVC(warning(push)) -CMT_PRAGMA_MSVC(warning(disable : 4127)) - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ - -template <typename T, size_t N> -using cvec = vec<T, N * 2>; - -namespace intrinsics -{ - -template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y) -{ - return subadd(x * dupeven(y), swap<2>(x) * dupodd(y)); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y) -{ - vec<T, N> yy = resize<N>(y); - return cmul_impl(x, yy); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) -{ - vec<T, N> xx = resize<N>(x); - return cmul_impl(xx, y); -} - -/// Complex Multiplication -template <typename T, size_t N1, size_t N2> -KFR_INTRINSIC vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y) -{ - return intrinsics::cmul_impl(x, y); -} - -template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y) -{ - return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y))); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> -KFR_INTRINSIC vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw) -{ - return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw); -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> 
-KFR_INTRINSIC void cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1, - const vec<T, N>& tw) -{ - const vec<T, N> twr = dupeven(tw); - const vec<T, N> twi = dupodd(tw); - const vec<T, 2> sum = (in0 + in1); - const vec<T, 2> dif = swap<2>(negodd(in0 - in1)); - const vec<T, N> sumtw = resize<N>(sum) * twr; - const vec<T, N> diftw = resize<N>(dif) * twi; - out0 += sumtw + diftw; - out1 += sumtw - diftw; -} -template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y) -{ - vec<T, N> yy = resize<N>(y); - return cmul_conj(x, yy); -} -template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> -KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y) -{ - vec<T, N> xx = resize<N>(x); - return cmul_conj(xx, y); -} - -template <size_t N, bool A = false, typename T> -KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src) -{ - return cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); -} - -template <size_t N, bool A = false, typename T> -KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value) -{ - value.write(ptr_cast<T>(dest), cbool_t<A>()); -} - -template <size_t count, size_t N, bool A = false, typename T> -KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) -{ - return internal::read_group_impl<2, count, N, A>(ptr_cast<T>(src), stride, csizeseq_t<count>()); -} - -template <size_t count, size_t N, bool A = false, typename T> -KFR_INTRINSIC void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value) -{ - return internal::write_group_impl<2, count, N, A>(ptr_cast<T>(dest), stride, value, csizeseq_t<count>()); -} - -template <size_t N, bool A = false, bool split = false, typename T> -KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src) -{ - cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); - if constexpr (split) - temp = splitpairs(temp); - return temp; -} - -template <size_t N, 
bool A = false, bool split = false, typename T> -KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value) -{ - cvec<T, N> v = value; - if constexpr (split) - v = interleavehalves(v); - v.write(ptr_cast<T>(dest), cbool_t<A>()); -} - -template <> -inline cvec<f32, 8> cread_split<8, false, true, f32>(const complex<f32>* src) -{ - const cvec<f32, 4> l = concat(cread<2>(src), cread<2>(src + 4)); - const cvec<f32, 4> h = concat(cread<2>(src + 2), cread<2>(src + 6)); - - return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h)); -} -template <> -inline cvec<f32, 8> cread_split<8, true, true, f32>(const complex<f32>* src) -{ - const cvec<f32, 4> l = concat(cread<2, true>(src), cread<2, true>(src + 4)); - const cvec<f32, 4> h = concat(cread<2, true>(src + 2), cread<2, true>(src + 6)); - - return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h)); -} - -template <> -inline cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* src) -{ - const cvec<f64, 2> l = concat(cread<1>(src), cread<1>(src + 2)); - const cvec<f64, 2> h = concat(cread<1>(src + 1), cread<1>(src + 3)); - - return concat(shuffle<0, 4, 2, 6>(l, h), shuffle<1, 5, 3, 7>(l, h)); -} - -template <> -inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) -{ - const cvec<f32, 8> xx = - concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); - - cvec<f32, 2> a, b, c, d; - split<f32, 16>(xx, a, b, c, d); - cwrite<2>(dest, a); - cwrite<2>(dest + 4, b); - cwrite<2>(dest + 2, c); - cwrite<2>(dest + 6, d); -} -template <> -inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) -{ - const cvec<f32, 8> xx = - concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); - - cvec<f32, 2> a, b, c, d; - split<f32, 16>(xx, a, b, c, d); - cwrite<2, true>(dest + 0, a); - cwrite<2, true>(dest + 4, b); - 
cwrite<2, true>(dest + 2, c); - cwrite<2, true>(dest + 6, d); -} - -template <> -inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) -{ - const cvec<f64, 4> xx = - concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); - cwrite<1>(dest, part<4, 0>(xx)); - cwrite<1>(dest + 2, part<4, 1>(xx)); - cwrite<1>(dest + 1, part<4, 2>(xx)); - cwrite<1>(dest + 3, part<4, 3>(xx)); -} -template <> -inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) -{ - const cvec<f64, 4> xx = - concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); - cwrite<1, true>(dest + 0, part<4, 0>(xx)); - cwrite<1, true>(dest + 2, part<4, 1>(xx)); - cwrite<1, true>(dest + 1, part<4, 2>(xx)); - cwrite<1, true>(dest + 3, part<4, 3>(xx)); -} - -template <size_t N, size_t stride, typename T, size_t... Indices> -KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) -{ - return concat(ref_cast<cvec<T, 1>>(base[Indices * stride])...); -} - -template <size_t N, size_t stride, typename T> -KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base) -{ - if constexpr (stride == 1) - { - return ref_cast<cvec<T, N>>(*base); - } - else - { - return cgather_helper<N, stride, T>(base, csizeseq_t<N>()); - } -} - -KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) -{ - size_t temp = index; - index += stride; - if (index >= size) - index -= size; - return temp; -} -KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t) -{ - size_t temp = index; - index += stride; - return temp; -} - -template <size_t N, typename T, size_t... 
Indices> -KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, - csizes_t<Indices...>) -{ - return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, Indices)])...); -} - -template <size_t N, typename T> -KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) -{ - return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); -} -template <size_t N, typename T> -KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t stride) -{ - size_t index = 0; - return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); -} - -template <size_t N, typename T, size_t... Indices> -KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, - csizes_t<Indices...>) -{ - return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, size, Indices)])...); -} - -template <size_t N, typename T> -KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) -{ - return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>()); -} - -template <size_t N, size_t stride, typename T, size_t... Indices> -KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>) -{ - swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; -} - -template <size_t N, size_t stride, typename T> -KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value) -{ - if constexpr (stride == 1) - { - cwrite<N>(base, value); - } - else - { - return cscatter_helper<N, stride, T>(base, value, csizeseq_t<N>()); - } -} - -template <size_t N, typename T, size_t... Indices> -KFR_INTRINSIC void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value, - csizes_t<Indices...>) -{ - swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... 
}; -} - -template <size_t N, typename T> -KFR_INTRINSIC void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value) -{ - return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>()); -} - -template <size_t groupsize = 1, typename T, size_t N, typename IT> -KFR_INTRINSIC vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset) -{ - return internal::gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>()); -} - -template <size_t groupsize = 1, typename T, size_t N, typename IT> -KFR_INTRINSIC void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value) -{ - return internal::scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>()); -} - -template <typename T> -KFR_INTRINSIC void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2, - const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, - cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, - cvec<T, 4>& w7) -{ - cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3)); - cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3)); - a = digitreverse4<2>(a); - b = digitreverse4<2>(b); - w0 = part<4, 0>(a); - w1 = part<4, 1>(a); - w2 = part<4, 2>(a); - w3 = part<4, 3>(a); - w4 = part<4, 0>(b); - w5 = part<4, 1>(b); - w6 = part<4, 2>(b); - w7 = part<4, 3>(b); -} - -template <typename T> -KFR_INTRINSIC void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2, - const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5, - const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1, - cvec<T, 8>& z2, cvec<T, 8>& z3) -{ - cvec<T, 16> a = concat(w0, w1, w2, w3); - cvec<T, 16> b = concat(w4, w5, w6, w7); - a = digitreverse4<2>(a); - b = digitreverse4<2>(b); - z0 = concat(part<4, 0>(a), part<4, 0>(b)); - z1 = concat(part<4, 1>(a), part<4, 1>(b)); - z2 = concat(part<4, 2>(a), part<4, 2>(b)); - z3 = 
concat(part<4, 3>(a), part<4, 3>(b)); -} - -template <typename T> -KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) -{ - cvec<T, 4> a0, a1, a2, a3; - cvec<T, 4> b0, b1, b2, b3; - cvec<T, 4> c0, c1, c2, c3; - cvec<T, 4> d0, d1, d2, d3; - - split<T, 32>(a, a0, a1, a2, a3); - split<T, 32>(b, b0, b1, b2, b3); - split<T, 32>(c, c0, c1, c2, c3); - split<T, 32>(d, d0, d1, d2, d3); - - a = concat(a0, b0, c0, d0); - b = concat(a1, b1, c1, d1); - c = concat(a2, b2, c2, d2); - d = concat(a3, b3, c3, d3); -} -template <typename T> -KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa, - cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd) -{ - cvec<T, 4> a0, a1, a2, a3; - cvec<T, 4> b0, b1, b2, b3; - cvec<T, 4> c0, c1, c2, c3; - cvec<T, 4> d0, d1, d2, d3; - - split<T, 32>(a, a0, a1, a2, a3); - split<T, 32>(b, b0, b1, b2, b3); - split<T, 32>(c, c0, c1, c2, c3); - split<T, 32>(d, d0, d1, d2, d3); - - aa = concat(a0, b0, c0, d0); - bb = concat(a1, b1, c1, d1); - cc = concat(a2, b2, c2, d2); - dd = concat(a3, b3, c3, d3); -} - -template <bool b, typename T> -constexpr KFR_INTRINSIC T chsign(T x) -{ - return b ? -x : x; -} - -template <typename T, size_t N, size_t size, size_t start, size_t step, bool inverse = false, - size_t... indices> -constexpr KFR_INTRINSIC cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>) -{ - return make_vector((indices & 1 ? chsign<inverse>(-sin_using_table<T>(size, (indices / 2 * step + start))) - : cos_using_table<T>(size, (indices / 2 * step + start)))...); -} - -template <typename T, size_t width, size_t... indices> -constexpr KFR_INTRINSIC cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, - size_t start, size_t step) -{ - return make_vector((indices & 1 ? 
-sin_using_table<T>(size, indices / 2 * step + start) - : cos_using_table<T>(size, indices / 2 * step + start))...); -} - -template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false> -constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle() -{ - return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>()); -} - -template <typename T, size_t width> -constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0) -{ - return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size); -} - -// template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false> -// constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>(); - -template <typename T, size_t N, bool inverse> -constexpr static inline cvec<T, N> twiddleimagmask() -{ - return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1); -} - -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wconversion") - -CMT_PRAGMA_GNU(GCC diagnostic pop) - -template <typename T, size_t N> -CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x) -{ - return negodd(cossin(x)); -} - -template <size_t k, size_t size, bool inverse = false, typename T, size_t width, - size_t kk = (inverse ? 
size - k : k) % size> -KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x) -{ - constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485); - if constexpr (kk == 0) - { - return x; - } - else if constexpr (kk == size * 1 / 8) - { - return swap<2>(subadd(swap<2>(x), x)) * isqrt2; - } - else if constexpr (kk == size * 2 / 8) - { - return negodd(swap<2>(x)); - } - else if constexpr (kk == size * 3 / 8) - { - return subadd(x, swap<2>(x)) * -isqrt2; - } - else if constexpr (kk == size * 4 / 8) - { - return -x; - } - else if constexpr (kk == size * 5 / 8) - { - return swap<2>(subadd(swap<2>(x), x)) * -isqrt2; - } - else if constexpr (kk == size * 6 / 8) - { - return swap<2>(negodd(x)); - } - else if constexpr (kk == size * 7 / 8) - { - return subadd(x, swap<2>(x)) * isqrt2; - } - else - { - return cmul(x, resize<width>(fixed_twiddle<T, 1, size, kk>())); - } -} - -template <size_t N, typename T> -KFR_INTRINSIC void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) -{ - const cvec<T, N> sum = a0 + a1; - const cvec<T, N> dif = a0 - a1; - w0 = sum; - w1 = dif; -} - -template <size_t N, typename T> -KFR_INTRINSIC void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1) -{ - butterfly2<N>(a0, a1, a0, a1); -} - -template <size_t N, bool inverse = false, typename T> -KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, - const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3) -{ - cvec<T, N> sum02, sum13, diff02, diff13; - cvec<T, N * 2> a01, a23, sum0213, diff0213; - - a01 = concat(a0, a1); - a23 = concat(a2, a3); - sum0213 = a01 + a23; - diff0213 = a01 - a23; - - sum02 = low(sum0213); - sum13 = high(sum0213); - diff02 = low(diff0213); - diff13 = high(diff0213); - w0 = sum02 + sum13; - w2 = sum02 - sum13; - if constexpr (inverse) - { - diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T())); - diff13 = swap<2>(diff13); - } - 
else - { - diff13 = swap<2>(diff13); - diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T())); - } - - w1 = diff02 + diff13; - w3 = diff02 - diff13; -} - -template <size_t N, bool inverse = false, typename T> -KFR_INTRINSIC void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, - const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3) -{ - vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3; - vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3; - - cvec<T, N> sum02, sum13, diff02, diff13; - vec<T, N> sum02re, sum13re, diff02re, diff13re; - vec<T, N> sum02im, sum13im, diff02im, diff13im; - - sum02 = a0 + a2; - sum13 = a1 + a3; - - w0 = sum02 + sum13; - w2 = sum02 - sum13; - - diff02 = a0 - a2; - diff13 = a1 - a3; - split(diff02, diff02re, diff02im); - split(diff13, diff13re, diff13im); - - (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); - (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); -} - -template <size_t N, bool inverse = false, typename T> -KFR_INTRINSIC void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, - const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, - const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1, - cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, - cvec<T, N>& w7) -{ - cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6; - butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6); - cvec<T, N> b1 = a1, b3 = a3, b5 = a5, b7 = a7; - butterfly4<N, inverse>(cfalse, b1, b3, b5, b7, b1, b3, b5, b7); - w0 = b0 + b1; - w4 = b0 - b1; - - b3 = cmul_by_twiddle<1, 8, inverse>(b3); - b5 = cmul_by_twiddle<2, 8, inverse>(b5); - b7 = cmul_by_twiddle<3, 8, inverse>(b7); - - w1 = b2 + b3; - w5 = b2 - b3; - w2 = b4 + b5; - w6 = b4 - b5; - w3 = b6 + b7; - w7 = b6 - b7; -} - -template <size_t N, bool inverse = false, typename T> 
-KFR_INTRINSIC void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, - cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7) -{ - butterfly8<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a0, a1, a2, a3, a4, a5, a6, a7); -} - -template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67) -{ - cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67; - - butterfly4<2, inverse>(cfalse, b01, b23, b45, b67, b01, b23, b45, b67); - - cvec<T, 2> b02, b13, b46, b57; - - cvec<T, 8> b01234567 = concat(b01, b23, b45, b67); - cvec<T, 8> b02461357 = concat(even<2>(b01234567), odd<2>(b01234567)); - split<T, 16>(b02461357, b02, b46, b13, b57); - - b13 = cmul(b13, fixed_twiddle<T, 2, 8, 0, 1, inverse>()); - b57 = cmul(b57, fixed_twiddle<T, 2, 8, 2, 1, inverse>()); - a01 = b02 + b13; - a23 = b46 + b57; - a45 = b02 - b13; - a67 = b46 - b57; -} - -template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly8_packed(cvec<T, 8>& v8) -{ - cvec<T, 2> w0, w1, w2, w3; - split<T, 16>(v8, w0, w1, w2, w3); - butterfly8<inverse>(w0, w1, w2, w3); - v8 = concat(w0, w1, w2, w3); -} - -template <bool inverse = false, typename T> -KFR_INTRINSIC void butterfly32_packed(cvec<T, 32>& v32) -{ - cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; - split(v32, w0, w1, w2, w3, w4, w5, w6, w7); - butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); - - w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); - w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); - w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); - w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); - w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); - w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); - w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); - - cvec<T, 8> z0, z1, z2, z3; - transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); - - butterfly4<8, inverse>(cfalse, z0, z1, 
z2, z3, z0, z1, z2, z3); - v32 = concat(z0, z1, z2, z3); -} - -template <size_t N, bool inverse = false, typename T> -KFR_INTRINSIC void butterfly4_packed(cvec<T, N * 4>& a0123) -{ - cvec<T, N> a0; - cvec<T, N> a1; - cvec<T, N> a2; - cvec<T, N> a3; - split<T, N * 4 * 2>(a0123, a0, a1, a2, a3); - butterfly4<N, inverse>(cfalse, a0, a1, a2, a3, a0, a1, a2, a3); - a0123 = concat(a0, a1, a2, a3); -} - -template <size_t N, typename T> -KFR_INTRINSIC void butterfly2_packed(cvec<T, N * 2>& a01) -{ - cvec<T, N> a0; - cvec<T, N> a1; - split(a01, a0, a1); - butterfly2<N>(a0, a1); - a01 = concat(a0, a1); -} - -template <size_t N, bool inverse = false, bool split_format = false, typename T> -KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) -{ - if constexpr (split_format) - { - vec<T, N> re1, im1, tw1re, tw1im; - split<T, 2 * N>(a1, re1, im1); - split<T, 2 * N>(tw1, tw1re, tw1im); - vec<T, N> b1re = re1 * tw1re; - vec<T, N> b1im = im1 * tw1re; - if constexpr (inverse) - w1 = concat(b1re + im1 * tw1im, b1im - re1 * tw1im); - else - w1 = concat(b1re - im1 * tw1im, b1im + re1 * tw1im); - } - else - { - const cvec<T, N> b1 = a1 * dupeven(tw1); - const cvec<T, N> a1_ = swap<2>(a1); - - cvec<T, N> tw1_ = tw1; - if constexpr (inverse) - tw1_ = -(tw1_); - w1 = subadd(b1, a1_ * dupodd(tw1_)); - } -} - -template <size_t N, bool inverse = false, bool split_format = false, typename T> -KFR_INTRINSIC void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3, - const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3, - cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) -{ - apply_twiddle<N, inverse, split_format>(a1, tw1, w1); - apply_twiddle<N, inverse, split_format>(a2, tw2, w2); - apply_twiddle<N, inverse, split_format>(a3, tw3, w3); -} - -template <size_t N, bool inverse = false, typename T> -KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, - cvec<T, N>& __restrict 
a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2,
                                   const cvec<T, N>& tw3)
{
    apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3);
}

// Overload for single-complex twiddles: broadcast to width N first.
template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]>
KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
                                   cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2,
                                   const cvec<T, 1>& tw3)
{
    apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3));
}

// Overload for half-width twiddles: widen to N first.
template <size_t N, bool inverse = false, typename T, typename = u8[N - 2]>
KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2,
                                   cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2,
                                   cvec<T, N / 2> tw3)
{
    apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3));
}

// Apply the fixed n = 64 column twiddles used by butterfly64_memory to three
// packed vectors (b, c, d), per 4-element sub-vector.
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d)
{
    cvec<T, 4> b0, b1, b2, b3;
    cvec<T, 4> c0, c1, c2, c3;
    cvec<T, 4> d0, d1, d2, d3;

    split(b, b0, b1, b2, b3);
    split(c, c0, c1, c2, c3);
    split(d, d0, d1, d2, d3);

    b1 = cmul_by_twiddle<4, 64, inverse>(b1);
    b2 = cmul_by_twiddle<8, 64, inverse>(b2);
    b3 = cmul_by_twiddle<12, 64, inverse>(b3);

    c1 = cmul_by_twiddle<8, 64, inverse>(c1);
    c2 = cmul_by_twiddle<16, 64, inverse>(c2);
    c3 = cmul_by_twiddle<24, 64, inverse>(c3);

    d1 = cmul_by_twiddle<12, 64, inverse>(d1);
    d2 = cmul_by_twiddle<24, 64, inverse>(d2);
    d3 = cmul_by_twiddle<36, 64, inverse>(d3);

    b = concat(b0, b1, b2, b3);
    c = concat(c0, c1, c2, c3);
    d = concat(d0, d1, d2, d3);
}

// Twiddle a packed radix-4 group with compile-time n = 64 twiddle factors,
// parameterized by starting index n2 and step nnstep.
template <size_t n2, size_t nnstep, size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123)
{
    cvec<T, N> a0;
    cvec<T, N> a1;
    cvec<T, N> a2;
    cvec<T, N> a3;
    split<T, 2 * N * 4>(a0123, a0, a1, a2, a3);

    cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1, inverse>(),
               tw2 = fixed_twiddle<T, N, 64, n2 * nnstep * 2, nnstep * 2, inverse>(),
               tw3 = fixed_twiddle<T, N, 64, n2 * nnstep * 3, nnstep * 3, inverse>();

    apply_twiddles4<N>(a1, a2, a3, tw1, tw2, tw3);

    a0123 = concat(a0, a1, a2, a3);
}

// 64-point transform to/from memory: three radix-4 passes with transposes
// between them. The trailing comments show the equivalent scalar reads/writes.
template <bool inverse, bool aligned, typename T>
KFR_INTRINSIC void butterfly64_memory(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out,
                                      const complex<T>* in)
{
    cvec<T, 16> w0, w1, w2, w3;

    // pass 1: strided loads (stride 16), radix-4 butterflies, twiddles.
    w0 = cread_group<4, 4, aligned>(
        in, 16); // concat(cread<4>(in + 0), cread<4>(in + 16), cread<4>(in + 32), cread<4>(in + 48));
    butterfly4_packed<4, inverse>(w0);
    apply_twiddles4<0, 1, 4, inverse>(w0);

    w1 = cread_group<4, 4, aligned>(
        in + 4, 16); // concat(cread<4>(in + 4), cread<4>(in + 20), cread<4>(in + 36), cread<4>(in + 52));
    butterfly4_packed<4, inverse>(w1);
    apply_twiddles4<4, 1, 4, inverse>(w1);

    w2 = cread_group<4, 4, aligned>(
        in + 8, 16); // concat(cread<4>(in + 8), cread<4>(in + 24), cread<4>(in + 40), cread<4>(in + 56));
    butterfly4_packed<4, inverse>(w2);
    apply_twiddles4<8, 1, 4, inverse>(w2);

    w3 = cread_group<4, 4, aligned>(
        in + 12, 16); // concat(cread<4>(in + 12), cread<4>(in + 28), cread<4>(in + 44), cread<4>(in + 60));
    butterfly4_packed<4, inverse>(w3);
    apply_twiddles4<12, 1, 4, inverse>(w3);

    transpose4(w0, w1, w2, w3);
    // pass 2:

    butterfly4_packed<4, inverse>(w0);
    butterfly4_packed<4, inverse>(w1);
    butterfly4_packed<4, inverse>(w2);
    butterfly4_packed<4, inverse>(w3);

    transpose4(w0, w1, w2, w3);

    w0 = digitreverse4<2>(w0);
    w1 = digitreverse4<2>(w1);
    w2 = digitreverse4<2>(w2);
    w3 = digitreverse4<2>(w3);

    apply_vertical_twiddles4<4, inverse>(w1, w2, w3);

    // pass 3:
    butterfly4_packed<4, inverse>(w3);
    cwrite_group<4, 4, aligned>(out + 12, 16, w3); // split(w3, out[3], out[7], out[11], out[15]);

    butterfly4_packed<4, inverse>(w2);
    cwrite_group<4, 4, aligned>(out + 8, 16, w2); // split(w2, out[2], out[6], out[10], out[14]);

    butterfly4_packed<4, inverse>(w1);
    cwrite_group<4, 4, aligned>(out + 4, 16, w1); // split(w1, out[1], out[5], out[9], out[13]);

    butterfly4_packed<4, inverse>(w0);
    cwrite_group<4, 4, aligned>(out, 16, w0); // split(w0, out[0], out[4], out[8], out[12]);
}

// 16-point transform over one packed vector: two radix-4 passes with
// digit-reversal and n = 16 twiddles in between.
template <bool inverse = false, typename T>
KFR_INTRINSIC void butterfly16_packed(cvec<T, 16>& v16)
{
    butterfly4_packed<4, inverse>(v16);
    apply_twiddles4<0, 4, 4, inverse>(v16);
    v16 = digitreverse4<2>(v16);
    butterfly4_packed<4, inverse>(v16);
}

// 16-point butterflies over stride-16 columns: two radix-4 stages with
// n = 16 twiddles, results written back in natural index order.
template <size_t index, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly16_multi_natural(complex<T>* out, const complex<T>* in)
{
    constexpr size_t N = 4;

    cvec<T, 4> a1  = cread<4>(in + index * 4 + 16 * 1);
    cvec<T, 4> a5  = cread<4>(in + index * 4 + 16 * 5);
    cvec<T, 4> a9  = cread<4>(in + index * 4 + 16 * 9);
    cvec<T, 4> a13 = cread<4>(in + index * 4 + 16 * 13);
    butterfly4<N, inverse>(cfalse, a1, a5, a9, a13, a1, a5, a9, a13);
    a5  = cmul_by_twiddle<1, 16, inverse>(a5);
    a9  = cmul_by_twiddle<2, 16, inverse>(a9);
    a13 = cmul_by_twiddle<3, 16, inverse>(a13);

    cvec<T, 4> a2  = cread<4>(in + index * 4 + 16 * 2);
    cvec<T, 4> a6  = cread<4>(in + index * 4 + 16 * 6);
    cvec<T, 4> a10 = cread<4>(in + index * 4 + 16 * 10);
    cvec<T, 4> a14 = cread<4>(in + index * 4 + 16 * 14);
    butterfly4<N, inverse>(cfalse, a2, a6, a10, a14, a2, a6, a10, a14);
    a6  = cmul_by_twiddle<2, 16, inverse>(a6);
    a10 = cmul_by_twiddle<4, 16, inverse>(a10);
    a14 = cmul_by_twiddle<6, 16, inverse>(a14);

    cvec<T, 4> a3  = cread<4>(in + index * 4 + 16 * 3);
    cvec<T, 4> a7  = cread<4>(in + index * 4 + 16 * 7);
    cvec<T, 4> a11 = cread<4>(in + index * 4 + 16 * 11);
    cvec<T, 4> a15 = cread<4>(in + index * 4 + 16 * 15);
    butterfly4<N, inverse>(cfalse, a3, a7, a11, a15, a3, a7, a11, a15);
    a7  = cmul_by_twiddle<3, 16, inverse>(a7);
    a11 = cmul_by_twiddle<6, 16,
inverse>(a11);
    a15 = cmul_by_twiddle<9, 16, inverse>(a15);

    cvec<T, 4> a0  = cread<4>(in + index * 4 + 16 * 0);
    cvec<T, 4> a4  = cread<4>(in + index * 4 + 16 * 4);
    cvec<T, 4> a8  = cread<4>(in + index * 4 + 16 * 8);
    cvec<T, 4> a12 = cread<4>(in + index * 4 + 16 * 12);
    butterfly4<N, inverse>(cfalse, a0, a4, a8, a12, a0, a4, a8, a12);
    // Second radix-4 stage; outputs are stored in natural index order.
    butterfly4<N, inverse>(cfalse, a0, a1, a2, a3, a0, a1, a2, a3);
    cwrite<4>(out + index * 4 + 16 * 0, a0);
    cwrite<4>(out + index * 4 + 16 * 4, a1);
    cwrite<4>(out + index * 4 + 16 * 8, a2);
    cwrite<4>(out + index * 4 + 16 * 12, a3);
    butterfly4<N, inverse>(cfalse, a4, a5, a6, a7, a4, a5, a6, a7);
    cwrite<4>(out + index * 4 + 16 * 1, a4);
    cwrite<4>(out + index * 4 + 16 * 5, a5);
    cwrite<4>(out + index * 4 + 16 * 9, a6);
    cwrite<4>(out + index * 4 + 16 * 13, a7);
    butterfly4<N, inverse>(cfalse, a8, a9, a10, a11, a8, a9, a10, a11);
    cwrite<4>(out + index * 4 + 16 * 2, a8);
    cwrite<4>(out + index * 4 + 16 * 6, a9);
    cwrite<4>(out + index * 4 + 16 * 10, a10);
    cwrite<4>(out + index * 4 + 16 * 14, a11);
    butterfly4<N, inverse>(cfalse, a12, a13, a14, a15, a12, a13, a14, a15);
    cwrite<4>(out + index * 4 + 16 * 3, a12);
    cwrite<4>(out + index * 4 + 16 * 7, a13);
    cwrite<4>(out + index * 4 + 16 * 11, a14);
    cwrite<4>(out + index * 4 + 16 * 15, a15);
}

// Same first stage as butterfly16_multi_natural, but the second radix-4 stage
// runs on packed 16-wide vectors; the results are digit-reversed, transposed
// and multiplied by n = 256 twiddles before contiguous writes ("flip" layout).
template <size_t index, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly16_multi_flip(complex<T>* out, const complex<T>* in)
{
    constexpr size_t N = 4;

    cvec<T, 4> a1  = cread<4>(in + index * 4 + 16 * 1);
    cvec<T, 4> a5  = cread<4>(in + index * 4 + 16 * 5);
    cvec<T, 4> a9  = cread<4>(in + index * 4 + 16 * 9);
    cvec<T, 4> a13 = cread<4>(in + index * 4 + 16 * 13);
    butterfly4<N, inverse>(cfalse, a1, a5, a9, a13, a1, a5, a9, a13);
    a5  = cmul_by_twiddle<1, 16, inverse>(a5);
    a9  = cmul_by_twiddle<2, 16, inverse>(a9);
    a13 = cmul_by_twiddle<3, 16, inverse>(a13);

    cvec<T, 4> a2  = cread<4>(in + index * 4 + 16 * 2);
    cvec<T, 4> a6  = cread<4>(in + index * 4 + 16 * 6);
    cvec<T, 4> a10 = cread<4>(in + index * 4 + 16 * 10);
    cvec<T, 4> a14 = cread<4>(in + index * 4 + 16 * 14);
    butterfly4<N, inverse>(cfalse, a2, a6, a10, a14, a2, a6, a10, a14);
    a6  = cmul_by_twiddle<2, 16, inverse>(a6);
    a10 = cmul_by_twiddle<4, 16, inverse>(a10);
    a14 = cmul_by_twiddle<6, 16, inverse>(a14);

    cvec<T, 4> a3  = cread<4>(in + index * 4 + 16 * 3);
    cvec<T, 4> a7  = cread<4>(in + index * 4 + 16 * 7);
    cvec<T, 4> a11 = cread<4>(in + index * 4 + 16 * 11);
    cvec<T, 4> a15 = cread<4>(in + index * 4 + 16 * 15);
    butterfly4<N, inverse>(cfalse, a3, a7, a11, a15, a3, a7, a11, a15);
    a7  = cmul_by_twiddle<3, 16, inverse>(a7);
    a11 = cmul_by_twiddle<6, 16, inverse>(a11);
    a15 = cmul_by_twiddle<9, 16, inverse>(a15);

    cvec<T, 16> w1 = concat(a1, a5, a9, a13);
    cvec<T, 16> w2 = concat(a2, a6, a10, a14);
    cvec<T, 16> w3 = concat(a3, a7, a11, a15);

    cvec<T, 4> a0  = cread<4>(in + index * 4 + 16 * 0);
    cvec<T, 4> a4  = cread<4>(in + index * 4 + 16 * 4);
    cvec<T, 4> a8  = cread<4>(in + index * 4 + 16 * 8);
    cvec<T, 4> a12 = cread<4>(in + index * 4 + 16 * 12);
    butterfly4<N, inverse>(cfalse, a0, a4, a8, a12, a0, a4, a8, a12);
    cvec<T, 16> w0 = concat(a0, a4, a8, a12);

    butterfly4<N * 4, inverse>(cfalse, w0, w1, w2, w3, w0, w1, w2, w3);

    w0 = digitreverse4<2>(w0);
    w1 = digitreverse4<2>(w1);
    w2 = digitreverse4<2>(w2);
    w3 = digitreverse4<2>(w3);

    transpose4(w0, w1, w2, w3);
    cwrite<16>(out + index * 64 + 16 * 0, cmul(w0, fixed_twiddle<T, 16, 256, 0, index * 4 + 0, inverse>()));
    cwrite<16>(out + index * 64 + 16 * 1, cmul(w1, fixed_twiddle<T, 16, 256, 0, index * 4 + 1, inverse>()));
    cwrite<16>(out + index * 64 + 16 * 2, cmul(w2, fixed_twiddle<T, 16, 256, 0, index * 4 + 2, inverse>()));
    cwrite<16>(out + index * 64 + 16 * 3, cmul(w3, fixed_twiddle<T, 16, 256, 0, index * 4 + 3, inverse>()));
}

// Multiply a1 by a single fixed n = 64 twiddle vector.
template <size_t n2, size_t nnstep, size_t N, typename T>
KFR_INTRINSIC void apply_twiddles2(cvec<T, N>& a1)
{
cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>();

    a1 = cmul(a1, tw1);
}

// Radix-3 constants: tw3r1 = cos(2*pi/3) - 1 (= -0.5 - 1); tw3i1 =
// sin(2*pi/3) ~ 0.8660, sign-masked per element for inverse transforms.
template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw3r1()
{
    return static_cast<T>(-0.5 - 1.0);
}

template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw3i1()
{
    return static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>();
}

// Radix-3 butterfly (out-of-place) using the symmetric sum/difference form.
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00,
                              cvec<T, N>& w01, cvec<T, N>& w02)
{

    const cvec<T, N> sum1 = a01 + a02;
    const cvec<T, N> dif1 = swap<2>(a01 - a02); // swap re/im of the difference
    w00 = a00 + sum1;

    const cvec<T, N> s1 = w00 + sum1 * tw3r1<T, N, inverse>();

    const cvec<T, N> d1 = dif1 * tw3i1<T, N, inverse>();

    w01 = s1 + d1;
    w02 = s1 - d1;
}

// Radix-3 butterfly (in-place).
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2)
{
    butterfly3<N, inverse>(a0, a1, a2, a0, a1, a2);
}

// Radix-6 butterfly built from a radix-3 stage over permuted input pairs
// followed by radix-2 combines (6 = 3 x 2).
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
                              const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
                              cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4,
                              cvec<T, N>& w5)
{
    cvec<T, N * 2> a03 = concat(a0, a3);
    cvec<T, N * 2> a25 = concat(a2, a5);
    cvec<T, N * 2> a41 = concat(a4, a1);
    butterfly3<N * 2, inverse>(a03, a25, a41, a03, a25, a41);
    cvec<T, N> t0, t1, t2, t3, t4, t5;
    split(a03, t0, t1);
    split(a25, t2, t3);
    split(a41, t4, t5);
    t3 = -t3;
    cvec<T, N * 2> a04 = concat(t0, t4);
    cvec<T, N * 2> a15 = concat(t1, t5);
    cvec<T, N * 2> w02, w35;
    butterfly2<N * 2>(a04, a15, w02, w35);
    split(w02, w0, w2);
    split(w35, w3, w5);

    butterfly2<N>(t2, t3, w1, w4);
}

// Radix-6 butterfly (in-place).
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
                              cvec<T, N>& a5)
{
    butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5);
}

// Radix-9 twiddles: e^(-2*pi*i*k/9) for k = 1, 2, 4; the imaginary part is
// sign-flipped for inverse transforms.
template <typename T, bool inverse = false>
static constexpr KFR_INTRINSIC cvec<T, 1> tw9_1()
{
    return { T(0.76604444311897803520239265055541),
             (inverse ? -1 : 1) * T(-0.64278760968653932632264340990727) };
}
template <typename T, bool inverse = false>
static constexpr KFR_INTRINSIC cvec<T, 1> tw9_2()
{
    return { T(0.17364817766693034885171662676931),
             (inverse ? -1 : 1) * T(-0.98480775301220805936674302458952) };
}
template <typename T, bool inverse = false>
static constexpr KFR_INTRINSIC cvec<T, 1> tw9_4()
{
    return { T(-0.93969262078590838405410927732473),
             (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) };
}

// Radix-9 butterfly: two radix-3 stages (9 = 3 x 3) with tw9_* twiddles
// applied between them.
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
                              const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
                              const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8,
                              cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4,
                              cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8)
{
    cvec<T, N * 3> a012 = concat(a0, a1, a2);
    cvec<T, N * 3> a345 = concat(a3, a4, a5);
    cvec<T, N * 3> a678 = concat(a6, a7, a8);
    butterfly3<N * 3, inverse>(a012, a345, a678, a012, a345, a678);
    cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8;
    split(a012, t0, t1, t2);
    split(a345, t3, t4, t5);
    split(a678, t6, t7, t8);

    t4 = cmul(t4, tw9_1<T, inverse>());
    t5 = cmul(t5, tw9_2<T, inverse>());
    t7 = cmul(t7, tw9_2<T, inverse>());
    t8 = cmul(t8, tw9_4<T, inverse>());

    cvec<T, N * 3> t036 = concat(t0, t3, t6);
    cvec<T, N * 3> t147 = concat(t1, t4, t7);
    cvec<T, N * 3> t258 = concat(t2, t5, t8);

    butterfly3<N * 3, inverse>(t036, t147, t258, t036, t147, t258);
    split(t036, w0, w1, w2);
    split(t147, w3, w4, w5);
    split(t258, w6, w7, w8);
}

// Radix-9 butterfly (in-place).
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
                              cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8)
{
    butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8);
}

// Radix-7 constants: cos(2*pi*k/7) - 1 and sin(2*pi*k/7) for k = 1..3
// (imaginary part sign-masked for inverse transforms).
template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw7r1()
{
    return static_cast<T>(0.623489801858733530525004884 - 1.0);
}

template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw7i1()
{
    return static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>();
}

template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw7r2()
{
    return static_cast<T>(-0.2225209339563144042889025645 - 1.0);
}

template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw7i2()
{
    return static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>();
}

template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw7r3()
{
    return static_cast<T>(-0.90096886790241912623610231951 - 1.0);
}

template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw7i3()
{
    return static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>();
}

// Radix-7 butterfly (out-of-place), symmetric sum/difference pairs.
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
                              cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01,
                              cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05,
                              cvec<T, N>& w06)
{
    const cvec<T, N> sum1 = a01 + a06;
    const cvec<T, N> dif1 = swap<2>(a01 - a06);
    const cvec<T, N> sum2 = a02 + a05;
    const cvec<T, N> dif2 =
swap<2>(a02 - a05);
    const cvec<T, N> sum3 = a03 + a04;
    const cvec<T, N> dif3 = swap<2>(a03 - a04);
    w00 = a00 + sum1 + sum2 + sum3;

    const cvec<T, N> s1 =
        w00 + sum1 * tw7r1<T, N, inverse>() + sum2 * tw7r2<T, N, inverse>() + sum3 * tw7r3<T, N, inverse>();
    const cvec<T, N> s2 =
        w00 + sum1 * tw7r2<T, N, inverse>() + sum2 * tw7r3<T, N, inverse>() + sum3 * tw7r1<T, N, inverse>();
    const cvec<T, N> s3 =
        w00 + sum1 * tw7r3<T, N, inverse>() + sum2 * tw7r1<T, N, inverse>() + sum3 * tw7r2<T, N, inverse>();

    const cvec<T, N> d1 =
        dif1 * tw7i1<T, N, inverse>() + dif2 * tw7i2<T, N, inverse>() + dif3 * tw7i3<T, N, inverse>();
    const cvec<T, N> d2 =
        dif1 * tw7i2<T, N, inverse>() - dif2 * tw7i3<T, N, inverse>() - dif3 * tw7i1<T, N, inverse>();
    const cvec<T, N> d3 =
        dif1 * tw7i3<T, N, inverse>() - dif2 * tw7i1<T, N, inverse>() + dif3 * tw7i2<T, N, inverse>();

    // Symmetric outputs: w0k = sk + dk, w0(7-k) = sk - dk.
    w01 = s1 + d1;
    w06 = s1 - d1;
    w02 = s2 + d2;
    w05 = s2 - d2;
    w03 = s3 + d3;
    w04 = s3 - d3;
}

// Radix-7 butterfly (in-place).
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4,
                              cvec<T, N>& a5, cvec<T, N>& a6)
{
    butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6);
}

// Radix-11 constants (variable templates): cos(2*pi*k/11) - 1 and
// sin(2*pi*k/11) for k = 1..5, imaginary part sign-masked for inverse.
template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11r1 = static_cast<T>(0.84125353283118116886181164892 - 1.0);

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11i1 =
    static_cast<T>(0.54064081745559758210763595432) * twiddleimagmask<T, N, inverse>();

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11r2 = static_cast<T>(0.41541501300188642552927414923 - 1.0);

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11i2 =
    static_cast<T>(0.90963199535451837141171538308) * twiddleimagmask<T, N, inverse>();

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11r3 = static_cast<T>(-0.14231483827328514044379266862 - 1.0);

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11i3 =
    static_cast<T>(0.98982144188093273237609203778) * twiddleimagmask<T, N, inverse>();

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11r4 = static_cast<T>(-0.65486073394528506405692507247 - 1.0);

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11i4 =
    static_cast<T>(0.75574957435425828377403584397) * twiddleimagmask<T, N, inverse>();

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11r5 = static_cast<T>(-0.95949297361449738989036805707 - 1.0);

template <typename T, size_t N, bool inverse>
static const cvec<T, N> tw11i5 =
    static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>();

// Radix-11 butterfly (out-of-place), symmetric sum/difference pairs.
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04,
                               cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09,
                               cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02,
                               cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06,
                               cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10)
{
    const cvec<T, N> sum1 = a01 + a10;
    const cvec<T, N> dif1 = swap<2>(a01 - a10);
    const cvec<T, N> sum2 = a02 + a09;
    const cvec<T, N> dif2 = swap<2>(a02 - a09);
    const cvec<T, N> sum3 = a03 + a08;
    const cvec<T, N> dif3 = swap<2>(a03 - a08);
    const cvec<T, N> sum4 = a04 + a07;
    const cvec<T, N> dif4 = swap<2>(a04 - a07);
    const cvec<T, N> sum5 = a05 + a06;
    const cvec<T, N> dif5 = swap<2>(a05 - a06);
    w00 = a00 + sum1 + sum2 + sum3 + sum4 + sum5;

    const cvec<T, N> s1 = w00 + sum1 * tw11r1<T, N, inverse> + sum2 * tw11r2<T, N, inverse> +
                          sum3 * tw11r3<T, N, inverse> + sum4 * tw11r4<T, N, inverse> +
                          sum5 * tw11r5<T, N, inverse>;
    const cvec<T, N> s2 = w00 + sum1 * tw11r2<T, N, inverse> + sum2 * tw11r3<T, N, inverse> +
                          sum3 * tw11r4<T, N, inverse> + sum4 * tw11r5<T, N, inverse> +
                          sum5 * tw11r1<T, N, inverse>;
    const cvec<T, N> s3 = w00 + sum1 * tw11r3<T, N, inverse> + sum2 * tw11r4<T, N, inverse> +
                          sum3 * tw11r5<T, N, inverse> + sum4 * tw11r1<T, N, inverse> +
                          sum5 * tw11r2<T, N, inverse>;
    const cvec<T, N> s4 = w00 + sum1 * tw11r4<T, N, inverse> + sum2 * tw11r5<T, N, inverse> +
                          sum3 * tw11r1<T, N, inverse> + sum4 * tw11r2<T, N, inverse> +
                          sum5 * tw11r3<T, N, inverse>;
    const cvec<T, N> s5 = w00 + sum1 * tw11r5<T, N, inverse> + sum2 * tw11r1<T, N, inverse> +
                          sum3 * tw11r2<T, N, inverse> + sum4 * tw11r3<T, N, inverse> +
                          sum5 * tw11r4<T, N, inverse>;

    const cvec<T, N> d1 = dif1 * tw11i1<T, N, inverse> + dif2 * tw11i2<T, N, inverse> +
                          dif3 * tw11i3<T, N, inverse> + dif4 * tw11i4<T, N, inverse> +
                          dif5 * tw11i5<T, N, inverse>;
    const cvec<T, N> d2 = dif1 * tw11i2<T, N, inverse> - dif2 * tw11i3<T, N, inverse> -
                          dif3 * tw11i4<T, N, inverse> - dif4 * tw11i5<T, N, inverse> -
                          dif5 * tw11i1<T, N, inverse>;
    const cvec<T, N> d3 = dif1 * tw11i3<T, N, inverse> - dif2 * tw11i4<T, N, inverse> +
                          dif3 * tw11i5<T, N, inverse> + dif4 * tw11i1<T, N, inverse> +
                          dif5 * tw11i2<T, N, inverse>;
    const cvec<T, N> d4 = dif1 * tw11i4<T, N, inverse> - dif2 * tw11i5<T, N, inverse> +
                          dif3 * tw11i1<T, N, inverse> - dif4 * tw11i2<T, N, inverse> -
                          dif5 * tw11i3<T, N, inverse>;
    const cvec<T, N> d5 = dif1 * tw11i5<T, N, inverse> - dif2 * tw11i1<T, N, inverse> +
                          dif3 * tw11i2<T, N, inverse> - dif4 * tw11i3<T, N, inverse> +
                          dif5 * tw11i4<T, N, inverse>;

    // Symmetric outputs: w0k = sk + dk, w(11-k) = sk - dk.
    w01 = s1 + d1;
    w10 = s1 - d1;
    w02 = s2 + d2;
    w09 = s2 - d2;
    w03 = s3 + d3;
    w08 = s3 - d3;
    w04 = s4 + d4;
    w07 = s4 - d4;
    w05 = s5 + d5;
    w06 = s5 - d5;
}

// Radix-5 constants: cos(2*pi*k/5) - 1 and sin(2*pi*k/5), k = 1, 2.
template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw5r1()
{
    return static_cast<T>(0.30901699437494742410229341718 - 1.0);
}
template <typename T,
size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw5i1()
{
    return static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>();
}
template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw5r2()
{
    return static_cast<T>(-0.80901699437494742410229341718 - 1.0);
}
template <typename T, size_t N, bool inverse>
static constexpr KFR_INTRINSIC cvec<T, N> tw5i2()
{
    return static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>();
}

// Radix-5 butterfly (out-of-place), symmetric sum/difference pairs.
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02,
                              const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01,
                              cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04)
{
    const cvec<T, N> sum1 = a01 + a04;
    const cvec<T, N> dif1 = swap<2>(a01 - a04);
    const cvec<T, N> sum2 = a02 + a03;
    const cvec<T, N> dif2 = swap<2>(a02 - a03);
    w00 = a00 + sum1 + sum2;

    const cvec<T, N> s1 = w00 + sum1 * tw5r1<T, N, inverse>() + sum2 * tw5r2<T, N, inverse>();
    const cvec<T, N> s2 = w00 + sum1 * tw5r2<T, N, inverse>() + sum2 * tw5r1<T, N, inverse>();

    const cvec<T, N> d1 = dif1 * tw5i1<T, N, inverse>() + dif2 * tw5i2<T, N, inverse>();
    const cvec<T, N> d2 = dif1 * tw5i2<T, N, inverse>() - dif2 * tw5i1<T, N, inverse>();

    w01 = s1 + d1;
    w04 = s1 - d1;
    w02 = s2 + d2;
    w03 = s2 - d2;
}

// Radix-10 butterfly built from a radix-5 stage over permuted input pairs
// followed by radix-2 combines (10 = 5 x 2).
template <size_t N, bool inverse = false, typename T>
KFR_INTRINSIC void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2,
                               const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5,
                               const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8,
                               const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2,
                               cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7,
                               cvec<T, N>& w8, cvec<T, N>& w9)
{
    cvec<T, N * 2> a05 = concat(a0, a5);
    cvec<T, N * 2> a27 = concat(a2, a7);
    cvec<T, N * 2> a49 = concat(a4, a9);
    cvec<T, N * 2> a61 = concat(a6, a1);
    cvec<T, N * 2> a83 = concat(a8, a3);
    butterfly5<N * 2, inverse>(a05, a27, a49, a61, a83, a05, a27, a49, a61, a83);
    cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8, t9;
    split(a05, t0, t1);
    split(a27, t2, t3);
    split(a49, t4, t5);
    split(a61, t6, t7);
    split(a83, t8, t9);
    t5 = -t5;

    cvec<T, N * 2> t02, t13;
    cvec<T, N * 2> w06, w51;
    t02 = concat(t0, t2);
    t13 = concat(t1, t3);
    butterfly2<N * 2>(t02, t13, w06, w51);
    split(w06, w0, w6);
    split(w51, w5, w1);

    cvec<T, N * 2> t68, t79;
    cvec<T, N * 2> w84, w39;
    t68 = concat(t6, t8);
    t79 = concat(t7, t9);
    butterfly2<N * 2>(t68, t79, w84, w39);
    split(w84, w8, w4);
    split(w39, w3, w9);
    butterfly2<N>(t4, t5, w7, w2);
}

// Generic `butterfly` dispatch: one overload per radix (arity of the
// argument list), each forwarding to the concrete butterflyN kernel.
// Each vec<T, N> holds N/2 complex values.
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0,
                             vec<T, N>& out1)
{
    butterfly2<N / 2>(in0, in1, out0, out1);
}
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2)
{
    butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2);
}

template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1,
                             vec<T, N>& out2, vec<T, N>& out3)
{
    butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3);
}
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
                             vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
                             vec<T, N>& out4)
{
    butterfly5<N / 2, inverse>(in0,
in1, in2, in3, in4, out0, out1, out2, out3, out4);
}
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
                             const vec<T, N>& in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
                             vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5)
{
    butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5);
}
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
                             const vec<T, N>& in5, const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1,
                             vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
                             vec<T, N>& out6)
{
    butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6);
}
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
                             const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
                             vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
                             vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7)
{
    butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5,
                               out6, out7);
}
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
                             const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
                             const vec<T, N>& in8, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2,
                             vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6,
                             vec<T, N>& out7, vec<T, N>& out8)
{
    butterfly9<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, out2, out3, out4,
                               out5, out6, out7, out8);
}
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
                             const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
                             const vec<T, N>& in8, const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1,
                             vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5,
                             vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9)
{
    butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3,
                                out4, out5, out6, out7, out8, out9);
}
template <bool inverse, typename T, size_t N>
KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1,
                             const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4,
                             const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7,
                             const vec<T, N>& in8, const vec<T, N>& in9, const vec<T, N>& in10,
                             vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3,
                             vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7,
                             vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10)
{
    butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2,
                                out3, out4, out5, out6, out7, out8, out9, out10);
}

// Read sizeof...(N) vectors from memory, optionally transposing the group
// before splitting into the outputs.
template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w)
{
    vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr));
    if constexpr (transposed)
        temp = ctranspose<sizeof...(N)>(temp);
    split(temp, w...);
}

// Warning: Reads past the end.
// Use with care.
KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0,
                                    cvec<f32, 4>& w1, cvec<f32, 4>& w2)
{
    // Transposed 3-column read: four stride-3 overlapping 4-wide loads
    // (reads up to ptr + 12, hence the over-read warning), digit-reversed to
    // recover the transpose; w3 is a scratch output that is discarded.
    cvec<f32, 4> w3;
    cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 3), cread<4>(ptr + 6), cread<4>(ptr + 9));
    v16 = digitreverse4<2>(v16);
    split<f32, 32>(v16, w0, w1, w2, w3);
}

KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0,
                                    cvec<f32, 4>& w1, cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4)
{
    // Transposed 5-column read: four stride-5 loads plus a gather for the
    // fifth column.
    cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 5), cread<4>(ptr + 10), cread<4>(ptr + 15));
    v16 = digitreverse4<2>(v16);
    split<f32, 32>(v16, w0, w1, w2, w3);
    w4 = cgather<4, 5>(ptr + 4);
}

// Concatenate the inputs, optionally apply the inverse transpose, and store.
template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()>
KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args)
{
    auto temp = concat(args...);
    if constexpr (transposed)
        temp = ctransposeinverse<sizeof...(N)>(temp);
    write(ptr_cast<T>(ptr), temp);
}

// Multiply x by the (I-1)-th twiddle from the table; I == 0 is implicitly 1.
template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2>
KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle)
{
    return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1)));
}
// Conjugated-twiddle variant (selected by cbool_t<true> for inverse).
template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2>
KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle)
{
    return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1)));
}

// Non-final
// One strided radix-`radix` butterfly at offset i: strided loads, butterfly,
// twiddle multiply (table `tw`), strided stores.
template <typename T, size_t width, size_t radix, bool inverse, size_t... I>
KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>,
                                    cbool_t<inverse>, complex<T>* out, const complex<T>* in,
                                    const complex<T>* tw, size_t stride)
{
    carray<cvec<T, width>, radix> inout;

    swallow{ (inout.get(csize_t<I>()) = cread<width>(in + i + stride * I))... };

    butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...);

    swallow{ (
        cwrite<width>(out + i + stride * I,
                      mul_tw<I, radix>(cbool_t<inverse>(), inout.template get<I>(), tw + i * (radix - 1))),
        0)... };
}

// Final
// Final-pass variant: transposed read, butterfly, plain strided stores
// (no twiddle table).
template <typename T, size_t width, size_t radix, bool inverse, size_t... I>
KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>,
                                    cbool_t<inverse>, complex<T>* out, const complex<T>* in, size_t stride)
{
    carray<cvec<T, width>, radix> inout;

    // swallow{ ( inout.get( csize<I> ) = infn( i, I, cvec<T, width>( ) ) )... };
    cread_transposed(ctrue, in + i * radix, inout.template get<I>()...);

    butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...);

    swallow{ (cwrite<width>(out + i + stride * I, inout.get(csize_t<I>())), 0)... };
}

// Expand the 0..radix-1 index sequence and forward to a helper above.
template <size_t width, size_t radix, typename... Args>
KFR_INTRINSIC void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args)
{
    butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...);
}

// Recursion terminator for butterfly_cycle (width 0).
template <typename... Args>
KFR_INTRINSIC void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...)
{
}
// Process as many butterflies as fit at this width, then recurse with half
// the width for the remainder; `i` carries the progress across calls.
template <size_t width, typename... Args>
KFR_INTRINSIC void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args)
{
    CMT_LOOP_NOUNROLL
    for (; i < count / width * width; i += width)
        butterfly(i, csize_t<width>(), std::forward<Args>(args)...);
    butterfly_cycle(i, count, csize_t<width / 2>(), std::forward<Args>(args)...);
}

// Entry point: run butterfly_cycle starting at the widest vector width.
template <size_t width, typename... Args>
KFR_INTRINSIC void butterflies(size_t count, csize_t<width>, Args&&...
args) -{ - CMT_ASSUME(count > 0); - size_t i = 0; - butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...); -} - -template <typename T, bool inverse, typename Tradix, typename Tstride> -KFR_INTRINSIC void generic_butterfly_cycle(csize_t<0>, Tradix, cbool_t<inverse>, complex<T>*, - const complex<T>*, Tstride, size_t, size_t, const complex<T>*, - size_t) -{ -} - -template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix, - typename Thalfradixsqr, typename Tstride> -KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out, - const complex<T>* in, Tstride ostride, Thalfradix halfradix, - Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i) -{ - CMT_LOOP_NOUNROLL - for (; i < halfradix / width * width; i += width) - { - const cvec<T, 1> in0 = cread<1>(in); - cvec<T, width> sum0 = resize<2 * width>(in0); - cvec<T, width> sum1 = sum0; - - for (size_t j = 0; j < halfradix; j++) - { - const cvec<T, 1> ina = cread<1>(in + (1 + j)); - const cvec<T, 1> inb = cread<1>(in + radix - (j + 1)); - cvec<T, width> tw = cread<width>(twiddle); - if constexpr (inverse) - tw = negodd /*cconj*/ (tw); - - cmul_2conj(sum0, sum1, ina, inb, tw); - twiddle += halfradix; - } - twiddle = twiddle - halfradix_sqr + width; - - if (is_constant_val(ostride)) - { - cwrite<width>(out + (1 + i), sum0); - cwrite<width>(out + (radix - (i + 1)) - (width - 1), reverse<2>(sum1)); - } - else - { - cscatter<width>(out + (i + 1) * ostride, ostride, sum0); - cscatter<width>(out + (radix - (i + 1)) * ostride - (width - 1) * ostride, ostride, - reverse<2>(sum1)); - } - } - generic_butterfly_cycle(csize_t<width / 2>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, - halfradix_sqr, twiddle, i); -} - -template <typename T> -KFR_INTRINSIC vec<T, 2> hcadd(vec<T, 2> value) -{ - return value; -} -template <typename T, size_t N, KFR_ENABLE_IF(N >= 4)> -KFR_INTRINSIC vec<T, 2> hcadd(vec<T, N> value) -{ - 
return hcadd(low(value) + high(value)); -} - -template <size_t width, typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRINSIC void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, - const complex<T>* twiddle, Tstride ostride = Tstride{}) -{ - CMT_ASSUME(radix > 0); - { - cvec<T, width> sum = T(); - size_t j = 0; - CMT_LOOP_NOUNROLL - for (; j < radix / width * width; j += width) - { - sum += cread<width>(in + j); - } - cvec<T, 1> sums = T(); - CMT_LOOP_NOUNROLL - for (; j < radix; j++) - { - sums += cread<1>(in + j); - } - cwrite<1>(out, hcadd(sum) + sums); - } - const auto halfradix = radix / 2; - CMT_ASSUME(halfradix > 0); - size_t i = 0; - - generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, - halfradix * halfradix, twiddle, i); -} - -template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRINSIC void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out, - const complex<T>* in, const complex<T>* twiddle, - Tstride ostride = Tstride{}) -{ - { - cvec<T, width> sum = T(); - size_t j = 0; - CMT_LOOP_UNROLL - for (; j < radix / width * width; j += width) - { - sum += cread<width>(in + j); - } - cvec<T, 1> sums = T(); - CMT_LOOP_UNROLL - for (; j < radix; j++) - { - sums += cread<1>(in + j); - } - cwrite<1>(out, hcadd(sum) + sums); - } - const size_t halfradix = radix / 2; - const size_t halfradix_sqr = halfradix * halfradix; - CMT_ASSUME(halfradix > 0); - size_t i = 0; - - generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, - halfradix_sqr, twiddle, i); -} - -template <typename T, bool inverse, typename Tstride = csize_t<1>> -KFR_INTRINSIC void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, - complex<T>*, const complex<T>* twiddle, Tstride ostride = {}) -{ - cswitch( - csizes_t<11, 13>(), radix, - [&](auto 
radix_) CMT_INLINE_LAMBDA - { - constexpr size_t width = vector_width<T>; - spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride); - }, - [&]() CMT_INLINE_LAMBDA - { - constexpr size_t width = vector_width<T>; - generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride); - }); -} - -template <typename T, size_t N> -constexpr cvec<T, N> cmask08 = broadcast<N * 2, T>(T(), -T()); - -template <typename T, size_t N> -constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T()); - -template <bool A = false, typename T, size_t N> -KFR_INTRINSIC void cbitreverse_write(complex<T>* dest, const vec<T, N>& x) -{ - cwrite<N / 2, A>(dest, bitreverse<2>(x)); -} - -template <bool A = false, typename T, size_t N> -KFR_INTRINSIC void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x) -{ - cwrite<N / 2, A>(dest, digitreverse4<2>(x)); -} - -template <size_t N, bool A = false, typename T> -KFR_INTRINSIC cvec<T, N> cbitreverse_read(const complex<T>* src) -{ - return bitreverse<2>(cread<N, A>(src)); -} - -template <size_t N, bool A = false, typename T> -KFR_INTRINSIC cvec<T, N> cdigitreverse4_read(const complex<T>* src) -{ - return digitreverse4<2>(cread<N, A>(src)); -} - -#if 1 - -template <> -KFR_INTRINSIC cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src) -{ - return concat(cread<1>(src + 0), cread<1>(src + 4), cread<1>(src + 8), cread<1>(src + 12), - cread<1>(src + 1), cread<1>(src + 5), cread<1>(src + 9), cread<1>(src + 13), - cread<1>(src + 2), cread<1>(src + 6), cread<1>(src + 10), cread<1>(src + 14), - cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15)); -} -template <> -KFR_INTRINSIC void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x) -{ - cwrite<1>(dest, part<16, 0>(x)); - cwrite<1>(dest + 4, part<16, 1>(x)); - cwrite<1>(dest + 8, part<16, 2>(x)); - cwrite<1>(dest + 12, part<16, 3>(x)); - - cwrite<1>(dest + 1, part<16, 
4>(x)); - cwrite<1>(dest + 5, part<16, 5>(x)); - cwrite<1>(dest + 9, part<16, 6>(x)); - cwrite<1>(dest + 13, part<16, 7>(x)); - - cwrite<1>(dest + 2, part<16, 8>(x)); - cwrite<1>(dest + 6, part<16, 9>(x)); - cwrite<1>(dest + 10, part<16, 10>(x)); - cwrite<1>(dest + 14, part<16, 11>(x)); - - cwrite<1>(dest + 3, part<16, 12>(x)); - cwrite<1>(dest + 7, part<16, 13>(x)); - cwrite<1>(dest + 11, part<16, 14>(x)); - cwrite<1>(dest + 15, part<16, 15>(x)); -} -#endif -} // namespace intrinsics -} // namespace CMT_ARCH_NAME -} // namespace kfr - -CMT_PRAGMA_MSVC(warning(pop)) - -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/dsp/impl/dsp-impl.cpp b/include/kfr/dsp/impl/dsp-impl.cpp @@ -1,28 +0,0 @@ -#include "../biquad.hpp" -#include "../fir.hpp" - -namespace kfr -{ -inline namespace CMT_ARCH_NAME -{ -template <typename U, typename T> -filter<U>* make_fir_filter(const univector_ref<const T>& taps) -{ - return new fir_filter<T, U>(taps); -} - -template filter<float>* make_fir_filter<float, float>(const univector_ref<const float>&); -template filter<double>* make_fir_filter<double, double>(const univector_ref<const double>&); -template filter<float>* make_fir_filter<float, double>(const univector_ref<const double>&); - -template <typename T, size_t maxfiltercount> -KFR_FUNCTION filter<T>* make_biquad_filter(const biquad_params<T>* bq, size_t count) -{ - return new biquad_filter<T, maxfiltercount>(bq, count); -} - -template filter<float>* make_biquad_filter<float, 64>(const biquad_params<float>* bq, size_t count); -template filter<double>* make_biquad_filter<double, 64>(const biquad_params<double>* bq, size_t count); - -} // namespace CMT_ARCH_NAME -} // namespace kfr -\ No newline at end of file diff --git a/include/kfr/io/audiofile.hpp b/include/kfr/io/audiofile.hpp @@ -32,35 +32,6 @@ #include "../simd/vec.hpp" #include "file.hpp" -#if !defined CMT_ARCH_SSE2 && !defined CMT_ARCH_ARM64 -#define DR_MP3_NO_SIMD 1 -#define DR_FLAC_NO_SIMD 1 -#endif - -#if !defined 
CMT_ARCH_SSE2 -#define DRFLAC_NO_SSE2 1 -#endif - -#if !defined CMT_ARCH_SSE41 -#define DRFLAC_NO_SSE41 1 -#endif - -#ifndef KFR_DISABLE_WAV -#define DR_WAV_NO_STDIO -#define DR_WAV_NO_CONVERSION_API -#include "dr/dr_wav.h" -#endif -#ifndef KFR_DISABLE_FLAC -#define DR_FLAC_NO_STDIO -#define DR_FLAC_NO_CONVERSION_API -#include "dr/dr_flac.h" -#endif -#ifndef KFR_DISABLE_MP3 -#define DR_MP3_NO_STDIO -#define DR_MP3_NO_CONVERSION_API -#include "dr/dr_mp3.h" -#endif - namespace kfr { @@ -122,113 +93,33 @@ struct audio_writer : public abstract_writer<T> virtual void close() = 0; }; -namespace internal_generic -{ #ifndef KFR_DISABLE_WAV -static inline size_t drwav_writer_write_proc(abstract_writer<void>* file, const void* pData, - size_t bytesToWrite) -{ - return file->write(pData, bytesToWrite); -} -static inline drwav_bool32 drwav_writer_seek_proc(abstract_writer<void>* file, int offset, - drwav_seek_origin origin) -{ - return file->seek(offset, origin == drwav_seek_origin_start ? seek_origin::begin : seek_origin::current); -} -static inline size_t drwav_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) -{ - return file->read(pBufferOut, bytesToRead); -} -static inline drwav_bool32 drwav_reader_seek_proc(abstract_reader<void>* file, int offset, - drwav_seek_origin origin) -{ - return file->seek(offset, origin == drwav_seek_origin_start ? seek_origin::begin : seek_origin::current); -} -#endif -#ifndef KFR_DISABLE_FLAC -static inline size_t drflac_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, - size_t bytesToRead) -{ - return file->read(pBufferOut, bytesToRead); -} -static inline drflac_bool32 drflac_reader_seek_proc(abstract_reader<void>* file, int offset, - drflac_seek_origin origin) -{ - return file->seek(offset, origin == drflac_seek_origin_start ? 
seek_origin::begin : seek_origin::current); -} -#endif -#ifndef KFR_DISABLE_MP3 -static inline size_t drmp3_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) + +namespace internal_generic { - return file->read(pBufferOut, bytesToRead); -} -static inline drmp3_bool32 drmp3_reader_seek_proc(abstract_reader<void>* file, int offset, - drmp3_seek_origin origin) +struct wav_file; +struct wav_file_deleter { - return file->seek(offset, origin == drmp3_seek_origin_start ? seek_origin::begin : seek_origin::current); -} -#endif - + void operator()(wav_file*); +}; } // namespace internal_generic -#ifndef KFR_DISABLE_WAV /// @brief WAV format writer template <typename T> struct audio_writer_wav : audio_writer<T> { /// @brief Constructs WAV writer using target writer and format - audio_writer_wav(std::shared_ptr<abstract_writer<>>&& writer, const audio_format& fmt) - : writer(std::move(writer)), fmt(fmt) - { - drwav_data_format wav_fmt; - wav_fmt.channels = static_cast<drwav_uint32>(fmt.channels); - wav_fmt.sampleRate = static_cast<drwav_uint32>(fmt.samplerate); - wav_fmt.format = - fmt.type >= audio_sample_type::first_float ? DR_WAVE_FORMAT_IEEE_FLOAT : DR_WAVE_FORMAT_PCM; - wav_fmt.bitsPerSample = static_cast<drwav_uint32>(audio_sample_bit_depth(fmt.type)); - wav_fmt.container = fmt.use_w64 ? 
drwav_container_w64 : drwav_container_riff; - closed = !drwav_init_write(&f, &wav_fmt, (drwav_write_proc)&internal_generic::drwav_writer_write_proc, - (drwav_seek_proc)&internal_generic::drwav_writer_seek_proc, - this->writer.get(), nullptr); - } - ~audio_writer_wav() override { close(); } + audio_writer_wav(std::shared_ptr<abstract_writer<>>&& writer, const audio_format& fmt); + ~audio_writer_wav() override; using audio_writer<T>::write; /// @brief Write data to underlying binary writer /// data is PCM samples in interleaved format /// size is the number of samples (PCM frames * channels) - size_t write(const T* data, size_t size) override - { - if (closed) - return 0; - if (fmt.type == audio_sample_type::unknown) - return 0; - if (fmt.type == audio_sample_traits<T>::type) - { - const size_t sz = drwav_write_pcm_frames_le(&f, size, data); - fmt.length += sz; - return sz * fmt.channels; - } - else - { - univector<uint8_t> native(size * audio_sample_sizeof(fmt.type)); - convert(native.data(), fmt.type, data, size); - const size_t sz = drwav_write_pcm_frames_le(&f, size / fmt.channels, native.data()); - fmt.length += sz; - return sz * fmt.channels; - } - } + size_t write(const T* data, size_t size) override; - void close() override - { - if (!closed) - { - drwav_uninit(&f); - writer.reset(); - closed = true; - } - } + void close() override; const audio_format_and_length& format() const override { return fmt; } @@ -238,11 +129,16 @@ struct audio_writer_wav : audio_writer<T> private: std::shared_ptr<abstract_writer<>> writer; - drwav f; + std::unique_ptr<internal_generic::wav_file, internal_generic::wav_file_deleter> f; audio_format_and_length fmt; - bool closed = false; }; +extern template struct audio_writer_wav<i16>; +extern template struct audio_writer_wav<i24>; +extern template struct audio_writer_wav<i32>; +extern template struct audio_writer_wav<f32>; +extern template struct audio_writer_wav<f64>; + /// @brief WAV format reader template <typename T> struct 
audio_reader_wav : audio_reader<T> @@ -250,249 +146,131 @@ struct audio_reader_wav : audio_reader<T> using audio_reader<T>::read; /// @brief Constructs WAV reader - audio_reader_wav(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) - { - drwav_init(&f, (drwav_read_proc)&internal_generic::drwav_reader_read_proc, - (drwav_seek_proc)&internal_generic::drwav_reader_seek_proc, this->reader.get(), nullptr); - fmt.channels = f.channels; - fmt.samplerate = f.sampleRate; - fmt.length = static_cast<imax>(f.totalPCMFrameCount); - switch (f.translatedFormatTag) - { - case DR_WAVE_FORMAT_IEEE_FLOAT: - switch (f.bitsPerSample) - { - case 32: - fmt.type = audio_sample_type::f32; - break; - case 64: - fmt.type = audio_sample_type::f64; - break; - default: - fmt.type = audio_sample_type::unknown; - break; - } - break; - case DR_WAVE_FORMAT_PCM: - switch (f.bitsPerSample) - { - case 8: - fmt.type = audio_sample_type::i8; - break; - case 16: - fmt.type = audio_sample_type::i16; - break; - case 24: - fmt.type = audio_sample_type::i24; - break; - case 32: - fmt.type = audio_sample_type::i32; - break; - case 64: - fmt.type = audio_sample_type::i64; - break; - default: - fmt.type = audio_sample_type::unknown; - break; - } - break; - default: - fmt.type = audio_sample_type::unknown; - break; - } - } - ~audio_reader_wav() override { drwav_uninit(&f); } + audio_reader_wav(std::shared_ptr<abstract_reader<>>&& reader); + ~audio_reader_wav() override; + + /// @brief Reads and decodes audio data + size_t read(T* data, size_t size) override; + + /// @brief Seeks to specific sample + bool seek(imax offset, seek_origin origin) override; /// @brief Returns audio format description const audio_format_and_length& format() const override { return fmt; } - /// @brief Reads and decodes audio data - size_t read(T* data, size_t size) override - { - if (fmt.type == audio_sample_type::unknown) - return 0; - if (fmt.type == audio_sample_traits<T>::type) - { - const size_t sz = 
drwav_read_pcm_frames(&f, size / fmt.channels, data); - position += sz; - return sz * fmt.channels; - } - else - { - univector<uint8_t> native(size * audio_sample_sizeof(fmt.type)); - const size_t sz = drwav_read_pcm_frames(&f, size / fmt.channels, native.data()); - position += sz; - convert(data, native.data(), fmt.type, sz * fmt.channels); - return sz * fmt.channels; - } - } - /// @brief Returns current position imax tell() const override { return position; } - /// @brief Seeks to specific sample - bool seek(imax offset, seek_origin origin) override - { - switch (origin) - { - case seek_origin::current: - return drwav_seek_to_pcm_frame(&f, static_cast<drmp3_uint64>(this->position + offset)); - case seek_origin::begin: - return drwav_seek_to_pcm_frame(&f, static_cast<drmp3_uint64>(offset)); - case seek_origin::end: - return drwav_seek_to_pcm_frame(&f, static_cast<drmp3_uint64>(fmt.length + offset)); - } - return false; - } - private: std::shared_ptr<abstract_reader<>> reader; - drwav f; + std::unique_ptr<internal_generic::wav_file> f; audio_format_and_length fmt; imax position = 0; }; + +extern template struct audio_reader_wav<i16>; +extern template struct audio_reader_wav<i24>; +extern template struct audio_reader_wav<i32>; +extern template struct audio_reader_wav<f32>; +extern template struct audio_reader_wav<f64>; #endif #ifndef KFR_DISABLE_FLAC +namespace internal_generic +{ +struct flac_file; +struct flac_file_deleter +{ + void operator()(flac_file*); +}; +} // namespace internal_generic + /// @brief FLAC format reader template <typename T> struct audio_reader_flac : audio_reader<T> { /// @brief Constructs FLAC reader - audio_reader_flac(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) - { - f = drflac_open((drflac_read_proc)&internal_generic::drflac_reader_read_proc, - (drflac_seek_proc)&internal_generic::drflac_reader_seek_proc, this->reader.get(), - nullptr); - fmt.channels = f->channels; - fmt.samplerate = f->sampleRate; - 
fmt.length = static_cast<imax>(f->totalPCMFrameCount); - fmt.type = audio_sample_type::i32; - } - ~audio_reader_flac() override { drflac_close(f); } + audio_reader_flac(std::shared_ptr<abstract_reader<>>&& reader); + ~audio_reader_flac() override; + + /// @brief Reads and decodes audio data + size_t read(T* data, size_t size) override; + + /// @brief Seeks to specific sample + bool seek(imax offset, seek_origin origin) override; /// @brief Returns audio format description const audio_format_and_length& format() const override { return fmt; } - /// @brief Reads and decodes audio data - size_t read(T* data, size_t size) override - { - if (fmt.type == audio_sample_type::unknown) - return 0; - if (audio_sample_traits<T>::type == audio_sample_type::i32) - { - const size_t sz = - drflac_read_pcm_frames_s32(f, size / fmt.channels, reinterpret_cast<i32*>(data)); - position += sz; - return sz * fmt.channels; - } - else - { - univector<i32> native(size * sizeof(i32)); - const size_t sz = drflac_read_pcm_frames_s32(f, size / fmt.channels, native.data()); - position += sz; - convert(data, native.data(), sz * fmt.channels); - return sz * fmt.channels; - } - } - /// @brief Returns current position imax tell() const override { return position; } - /// @brief Seeks to specific sample - bool seek(imax offset, seek_origin origin) override - { - switch (origin) - { - case seek_origin::current: - return drflac_seek_to_pcm_frame(f, static_cast<drmp3_uint64>(this->position + offset)); - case seek_origin::begin: - return drflac_seek_to_pcm_frame(f, static_cast<drmp3_uint64>(offset)); - case seek_origin::end: - return drflac_seek_to_pcm_frame(f, static_cast<drmp3_uint64>(fmt.length + offset)); - } - return false; - } - private: std::shared_ptr<abstract_reader<>> reader; - drflac* f; + std::unique_ptr<internal_generic::flac_file, internal_generic::flac_file_deleter> f; audio_format_and_length fmt; imax position = 0; }; + +extern template struct audio_reader_flac<i16>; +extern template 
struct audio_reader_flac<i24>; +extern template struct audio_reader_flac<i32>; +extern template struct audio_reader_flac<f32>; +extern template struct audio_reader_flac<f64>; #endif #ifndef KFR_DISABLE_MP3 +struct mp3_config +{ + uint32_t outputChannels; + uint32_t outputSampleRate; +}; + +namespace internal_generic +{ +struct mp3_file; +struct mp3_file_deleter +{ + void operator()(mp3_file*); +}; +} // namespace internal_generic + /// @brief MP3 format reader template <typename T> struct audio_reader_mp3 : audio_reader<T> { /// @brief Constructs MP3 reader - audio_reader_mp3(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) - { - drmp3_init(&f, (drmp3_read_proc)&internal_generic::drmp3_reader_read_proc, - (drmp3_seek_proc)&internal_generic::drmp3_reader_seek_proc, this->reader.get(), &config, - nullptr); - fmt.channels = f.channels; - fmt.samplerate = f.sampleRate; - fmt.length = static_cast<imax>(drmp3_get_pcm_frame_count(&f)); - fmt.type = audio_sample_type::i16; - } - ~audio_reader_mp3() override { drmp3_uninit(&f); } + audio_reader_mp3(std::shared_ptr<abstract_reader<>>&& reader); + ~audio_reader_mp3() override; + + /// @brief Reads and decodes audio data + size_t read(T* data, size_t size) override; + + /// @brief Seeks to specific sample + bool seek(imax offset, seek_origin origin) override; - drmp3_config config{ 0, 0 }; + mp3_config config{ 0, 0 }; /// @brief Returns audio format description const audio_format_and_length& format() const override { return fmt; } - /// @brief Reads and decodes audio data - size_t read(T* data, size_t size) override - { - if (fmt.type == audio_sample_type::unknown) - return 0; - if (audio_sample_traits<T>::type == audio_sample_type::i16) - { - const size_t sz = - drmp3_read_pcm_frames_s16(&f, size / fmt.channels, reinterpret_cast<i16*>(data)); - position += sz; - return sz * fmt.channels; - } - else - { - univector<i16> native(size * sizeof(i16)); - const size_t sz = drmp3_read_pcm_frames_s16(&f, size / 
fmt.channels, native.data()); - position += sz; - convert(data, native.data(), sz * fmt.channels); - return sz * fmt.channels; - } - } - /// @brief Returns current position imax tell() const override { return position; } - /// @brief Seeks to specific sample - bool seek(imax offset, seek_origin origin) override - { - switch (origin) - { - case seek_origin::current: - return drmp3_seek_to_pcm_frame(&f, static_cast<drmp3_uint64>(this->position + offset)); - case seek_origin::begin: - return drmp3_seek_to_pcm_frame(&f, static_cast<drmp3_uint64>(offset)); - case seek_origin::end: - return drmp3_seek_to_pcm_frame(&f, static_cast<drmp3_uint64>(fmt.length + offset)); - } - return false; - } - private: std::shared_ptr<abstract_reader<>> reader; - drmp3 f; + std::unique_ptr<internal_generic::mp3_file, internal_generic::mp3_file_deleter> f; audio_format_and_length fmt; imax position = 0; }; + +extern template struct audio_reader_mp3<i16>; +extern template struct audio_reader_mp3<i24>; +extern template struct audio_reader_mp3<i32>; +extern template struct audio_reader_mp3<f32>; +extern template struct audio_reader_mp3<f64>; #endif } // namespace kfr diff --git a/include/kfr/io/impl/audiofile-impl.cpp b/include/kfr/io/impl/audiofile-impl.cpp @@ -1,49 +0,0 @@ -/** @addtogroup io - * @{ - */ -/* - Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) - This file is part of KFR - - KFR is free software: you can redistribute it and/or modify - it under the terms of the GNU General Public License as published by - the Free Software Foundation, either version 2 of the License, or - (at your option) any later version. - - KFR is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - GNU General Public License for more details. - - You should have received a copy of the GNU General Public License - along with KFR. 
- - If GPL is not suitable for your project, you must purchase a commercial license to use KFR. - Buying a commercial license is mandatory as soon as you develop commercial activities without - disclosing the source code of your own applications. - See https://www.kfrlib.com for details. - */ - -#include "../audiofile.hpp" -CMT_PRAGMA_GNU(GCC diagnostic push) -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wimplicit-fallthrough") -CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-function") - -#ifndef KFR_DISABLE_WAV -#define DR_WAV_NO_STDIO -#define DR_WAV_NO_CONVERSION_API -#define DR_WAV_IMPLEMENTATION -#include "../dr/dr_wav.h" -#endif -#ifndef KFR_DISABLE_FLAC -#define DR_FLAC_IMPLEMENTATION -#define DR_FLAC_NO_STDIO -#include "../dr/dr_flac.h" -#endif -#ifndef KFR_DISABLE_MP3 -#define DR_MP3_IMPLEMENTATION -#define DR_MP3_NO_STDIO -#include "../dr/dr_mp3.h" -#endif - -CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/sources.cmake b/sources.cmake @@ -22,6 +22,7 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/kfr.h ${PROJECT_SOURCE_DIR}/include/kfr/base/basic_expressions.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/conversion.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/endianness.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/expression.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/filter.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/fraction.hpp @@ -37,6 +38,7 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/state_holder.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/tensor.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/transpose.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/static_array.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/array.hpp @@ -54,15 +56,6 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/dft/convolution.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/fft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/reference_dft.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/bitrev.hpp - 
${PROJECT_SOURCE_DIR}/include/kfr/dft/data/sincos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/bitrev.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-fft.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-templates.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-templates.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/ft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad_design.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/dcremove.hpp @@ -89,9 +82,6 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/io/file.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/python_plot.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/tostring.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h - ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_mp3.h - ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h ${PROJECT_SOURCE_DIR}/include/kfr/math/asin_acos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/math/atan.hpp ${PROJECT_SOURCE_DIR}/include/kfr/math/compiletime.hpp @@ -240,6 +230,7 @@ set( KFR_BASE_SRC ${PROJECT_SOURCE_DIR}/include/kfr/base/basic_expressions.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/conversion.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/endianness.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/expression.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/filter.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/fraction.hpp @@ -255,6 +246,7 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/state_holder.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/tensor.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/transpose.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/static_array.hpp ) @@ -290,9 +282,6 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/io/file.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/python_plot.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/tostring.hpp - 
${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h - ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_mp3.h - ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h ) @@ -331,6 +320,7 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/kfr.h ${PROJECT_SOURCE_DIR}/include/kfr/base/basic_expressions.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/conversion.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/endianness.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/expression.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/filter.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/fraction.hpp @@ -346,6 +336,7 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/base/small_buffer.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/state_holder.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/tensor.hpp + ${PROJECT_SOURCE_DIR}/include/kfr/base/transpose.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/univector.hpp ${PROJECT_SOURCE_DIR}/include/kfr/base/impl/static_array.hpp ${PROJECT_SOURCE_DIR}/include/kfr/cometa/array.hpp @@ -363,15 +354,6 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/dft/convolution.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/fft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dft/reference_dft.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/bitrev.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/data/sincos.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/bitrev.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-fft.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-templates.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-templates.hpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/ft.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/biquad_design.hpp ${PROJECT_SOURCE_DIR}/include/kfr/dsp/dcremove.hpp @@ -398,9 +380,6 @@ set( ${PROJECT_SOURCE_DIR}/include/kfr/io/file.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/python_plot.hpp ${PROJECT_SOURCE_DIR}/include/kfr/io/tostring.hpp - 
${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_flac.h - ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_mp3.h - ${PROJECT_SOURCE_DIR}/include/kfr/io/dr/dr_wav.h ${PROJECT_SOURCE_DIR}/include/kfr/math/asin_acos.hpp ${PROJECT_SOURCE_DIR}/include/kfr/math/atan.hpp ${PROJECT_SOURCE_DIR}/include/kfr/math/compiletime.hpp @@ -475,17 +454,17 @@ set( set( KFR_DFT_SRC - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/convolution-impl.cpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f32.cpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/dft-impl-f64.cpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl-f32.cpp - ${PROJECT_SOURCE_DIR}/include/kfr/dft/impl/fft-impl-f64.cpp + ${PROJECT_SOURCE_DIR}/src/dft/convolution-impl.cpp + ${PROJECT_SOURCE_DIR}/src/dft/dft-impl-f32.cpp + ${PROJECT_SOURCE_DIR}/src/dft/dft-impl-f64.cpp + ${PROJECT_SOURCE_DIR}/src/dft/fft-impl-f32.cpp + ${PROJECT_SOURCE_DIR}/src/dft/fft-impl-f64.cpp ) set( KFR_IO_SRC - ${PROJECT_SOURCE_DIR}/include/kfr/io/impl/audiofile-impl.cpp + ${PROJECT_SOURCE_DIR}/src/io/audiofile-impl.cpp ) diff --git a/src/capi/CMakeLists.txt b/src/capi/CMakeLists.txt @@ -0,0 +1,127 @@ +# Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) +# This file is part of KFR +# +# KFR is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# KFR is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with KFR. 
+ +cmake_minimum_required(VERSION 3.12) + +if (WIN32) + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif () + +set(CMAKE_CXX_VISIBILITY_PRESET "default") +set(CMAKE_C_VISIBILITY_PRESET "default") + +if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") + add_compile_options(-fdiagnostics-absolute-paths) +endif () + +if (MSVC) + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /MTd") +endif () + +if (APPLE) + add_compile_options(-mmacosx-version-min=10.9) +endif () + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELWITHDEBINFO ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/bin) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_RELWITHDEBINFO ${PROJECT_BINARY_DIR}/lib) +set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_RELWITHDEBINFO ${PROJECT_BINARY_DIR}/lib) +set(CMAKE_LIBRARY_OUTPUT_DIRECTORY_DEBUG ${PROJECT_BINARY_DIR}/lib) + +add_library(kfr_capi_all INTERFACE) +target_link_libraries(kfr_capi_all INTERFACE kfr) +if (NOT WIN32) + add_library(kfr_capi_all_pic INTERFACE) + target_link_libraries(kfr_capi_all_pic INTERFACE kfr) +endif () + +function (add_c_library ARCH) + add_library(kfr_capi_${ARCH} STATIC ${KFR_DFT_SRC} dsp.cpp) + target_link_libraries(kfr_capi_${ARCH} kfr) + target_set_arch(kfr_capi_${ARCH} PRIVATE ${ARCH}) + target_link_libraries(kfr_capi_all INTERFACE kfr_capi_${ARCH}) + dft_compile_options(kfr_capi_${ARCH}) + + if (NOT WIN32) + add_library(kfr_capi_${ARCH}_pic STATIC ${KFR_DFT_SRC} dsp.cpp) + set_property(TARGET kfr_capi_${ARCH}_pic + PROPERTY POSITION_INDEPENDENT_CODE 1) + target_link_libraries(kfr_capi_${ARCH}_pic kfr) + target_set_arch(kfr_capi_${ARCH}_pic PRIVATE ${ARCH}) + + 
target_link_libraries(kfr_capi_all_pic INTERFACE kfr_capi_${ARCH}_pic) + dft_compile_options(kfr_capi_${ARCH}_pic) + endif () + +endfunction () + +add_library(kfr_capi SHARED capi.cpp) + +add_c_library(sse2) +add_c_library(sse41) +add_c_library(avx) +add_c_library(avx2) +add_c_library(avx512) + +link_as_whole(kfr_capi_all INTERFACE kfr_capi_sse2) +if (NOT WIN32) + link_as_whole(kfr_capi_all_pic INTERFACE kfr_capi_sse2_pic) +endif () + +target_compile_definitions( + kfr_capi + PRIVATE -DKFR_DFT_MULTI=1 + -DCMT_MULTI=1 + -DCMT_MULTI_ENABLED_SSE2=1 + -DCMT_MULTI_ENABLED_SSE41=1 + -DCMT_MULTI_ENABLED_AVX=1 + -DCMT_MULTI_ENABLED_AVX2=1 + -DCMT_MULTI_ENABLED_AVX512=1 + -DKFR_BUILDING_DLL=1) + +target_set_arch(kfr_capi PRIVATE sse2) + +if (WIN32) + target_link_libraries(kfr_capi PRIVATE kfr kfr_capi_all) +else () + target_link_libraries(kfr_capi PRIVATE kfr kfr_capi_all_pic) + + if (APPLE) + message( + STATUS + "Minimum macOS version is set to ${CMAKE_OSX_DEPLOYMENT_TARGET}" + ) + message(STATUS "Set CMAKE_OSX_DEPLOYMENT_TARGET variable to change") + else () + set_property( + TARGET kfr_capi + APPEND + PROPERTY LINK_LIBRARIES + -nodefaultlibs + -Wl,-Bdynamic + -lm + -lc + -Wl,-Bstatic + -lstdc++ + -lgcc + -s) + endif () +endif () diff --git a/capi/capi.cpp b/src/capi/capi.cpp diff --git a/src/capi/dsp.cpp b/src/capi/dsp.cpp @@ -0,0 +1,28 @@ +#include <kfr/dsp/biquad.hpp> +#include <kfr/dsp/fir.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +template <typename U, typename T> +filter<U>* make_fir_filter(const univector_ref<const T>& taps) +{ + return new fir_filter<T, U>(taps); +} + +template filter<float>* make_fir_filter<float, float>(const univector_ref<const float>&); +template filter<double>* make_fir_filter<double, double>(const univector_ref<const double>&); +template filter<float>* make_fir_filter<float, double>(const univector_ref<const double>&); + +template <typename T, size_t maxfiltercount> +KFR_FUNCTION filter<T>* make_biquad_filter(const 
biquad_params<T>* bq, size_t count) +{ + return new biquad_filter<T, maxfiltercount>(bq, count); +} + +template filter<float>* make_biquad_filter<float, 64>(const biquad_params<float>* bq, size_t count); +template filter<double>* make_biquad_filter<double, 64>(const biquad_params<double>* bq, size_t count); + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/src/dft/CMakeLists.txt b/src/dft/CMakeLists.txt @@ -0,0 +1,74 @@ +cmake_minimum_required(VERSION 3.12) + +set(DFT_LIBS) + +if (KFR_ENABLE_DFT_MULTIARCH) + add_library(kfr_dft INTERFACE) + add_library(kfr_dft_all INTERFACE) + target_link_libraries(kfr_dft INTERFACE kfr kfr_dft_all) + target_compile_definitions( + kfr_dft + INTERFACE -DKFR_DFT_MULTI=1 + -DCMT_MULTI=1 + -DCMT_MULTI_ENABLED_SSE2=1 + -DCMT_MULTI_ENABLED_SSE41=1 + -DCMT_MULTI_ENABLED_AVX=1 + -DCMT_MULTI_ENABLED_AVX2=1 + -DCMT_MULTI_ENABLED_AVX512=1) + + add_arch_library(kfr_dft sse2 "${KFR_DFT_SRC}" "") + add_arch_library(kfr_dft sse41 "${KFR_DFT_SRC}" "") + add_arch_library(kfr_dft avx "${KFR_DFT_SRC}" "") + add_arch_library(kfr_dft avx2 "${KFR_DFT_SRC}" "") + add_arch_library(kfr_dft avx512 "${KFR_DFT_SRC}" "") + list( + APPEND + DFT_LIBS + kfr_dft_sse2 + kfr_dft_sse41 + kfr_dft_avx + kfr_dft_avx2 + kfr_dft_avx512) + + link_as_whole(kfr_dft_all INTERFACE kfr_dft_sse2) + +else () + add_library(kfr_dft ${KFR_DFT_SRC}) + target_link_libraries(kfr_dft kfr use_arch) + if (KFR_ENABLE_DFT_NP) + target_compile_definitions(kfr_dft PUBLIC -DKFR_DFT_NPo2) + else () + target_compile_definitions(kfr_dft PUBLIC -DKFR_DFT_NO_NPo2) + endif () + list(APPEND DFT_LIBS kfr_dft) +endif () + +function (dft_compile_options LIB) + if (MSVC AND CLANG) + target_compile_options(${LIB} PRIVATE SHELL:-Xclang -ffp-contract=fast + -Xclang -O3) + else () + target_compile_options(${LIB} PRIVATE -ffp-contract=fast -O3) + endif () +endfunction () + +foreach (LIB IN LISTS DFT_LIBS) + dft_compile_options(${LIB}) +endforeach () + +if (KFR_INSTALL_LIBRARIES) + if 
(KFR_ENABLE_DFT_MULTIARCH) + install( + TARGETS kfr_dft_sse2 kfr_dft_sse41 kfr_dft_avx kfr_dft_avx2 + kfr_dft_avx512 + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib + RUNTIME DESTINATION bin) + else () + install( + TARGETS kfr_dft + ARCHIVE DESTINATION lib + LIBRARY DESTINATION lib + RUNTIME DESTINATION bin) + endif () +endif () diff --git a/src/dft/bitrev.hpp b/src/dft/bitrev.hpp @@ -0,0 +1,480 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include <kfr/simd/complex.hpp> +#include <kfr/simd/constants.hpp> +#include <kfr/simd/digitreverse.hpp> +#include <kfr/simd/vec.hpp> + +#include "data/bitrev.hpp" + +#include "ft.hpp" + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +constexpr inline static bool fft_reorder_aligned = false; + +constexpr inline static size_t bitrev_table_log2N = ilog2(arraysize(data::bitrev_table)); + +template <size_t Bits> +CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x) +{ + if constexpr (Bits > bitrev_table_log2N) + return bitreverse<Bits>(x); + + return data::bitrev_table[x] >> (bitrev_table_log2N - Bits); +} + +template <bool use_table> +CMT_GNU_CONSTEXPR inline u32 bitrev_using_table(u32 x, size_t bits, cbool_t<use_table>) +{ + if constexpr (use_table) + { + return data::bitrev_table[x] >> (bitrev_table_log2N - bits); + } + else + { + return bitreverse<32>(x) >> (32 - bits); + } +} + +CMT_GNU_CONSTEXPR inline u32 dig4rev_using_table(u32 x, size_t bits) +{ + if (bits > bitrev_table_log2N) + { + if (bits <= 16) + return digitreverse4<16>(x) >> (16 - bits); + else + return digitreverse4<32>(x) >> (32 - bits); + } + + x = data::bitrev_table[x]; + x = (((x & 0xaaaaaaaa) >> 1) | ((x & 0x55555555) << 1)); + x = x >> (bitrev_table_log2N - bits); + return x; +} + +template <size_t log2n, size_t bitrev, typename T> +KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i) +{ + using cxx = cvec<T, 16>; + constexpr size_t N = 1 << log2n; + constexpr size_t N4 = 2 * N / 4; + + cxx vi = cread_group<4, 4, N4 / 2, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i)); + vi = digitreverse<bitrev, 2>(vi); + cwrite_group<4, 4, N4 / 2, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), vi); +} + +template <size_t log2n, size_t bitrev, typename T> +KFR_INTRINSIC void fft_reorder_swap_two(T* inout, size_t i, size_t j) +{ + CMT_ASSUME(i != j); + using cxx = cvec<T, 16>; + constexpr size_t N = 1 << log2n; + constexpr size_t N4 = 2 * 
N / 4; + + cxx vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2); + cxx vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2); + + vi = digitreverse<bitrev, 2>(vi); + cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2, vi); + vj = digitreverse<bitrev, 2>(vj); + cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2, vj); +} + +template <size_t log2n, size_t bitrev, typename T> +KFR_INTRINSIC void fft_reorder_swap(T* inout, size_t i, size_t j) +{ + CMT_ASSUME(i != j); + using cxx = cvec<T, 16>; + constexpr size_t N = 1 << log2n; + constexpr size_t N4 = 2 * N / 4; + + cxx vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2); + cxx vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2); + + vi = digitreverse<bitrev, 2>(vi); + cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4 / 2, vi); + vj = digitreverse<bitrev, 2>(vj); + cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4 / 2, vj); +} + +template <size_t log2n, size_t bitrev, typename T> +KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i) +{ + fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2); +} + +template <size_t log2n, size_t bitrev, typename T> +KFR_INTRINSIC void fft_reorder_swap_two(complex<T>* inout, size_t i0, size_t i1) +{ + fft_reorder_swap_two<log2n, bitrev>(ptr_cast<T>(inout), i0 * 2, i1 * 2); +} + +template <size_t log2n, size_t bitrev, typename T> +KFR_INTRINSIC void fft_reorder_swap(complex<T>* inout, size_t i, size_t j) +{ + fft_reorder_swap<log2n, bitrev>(ptr_cast<T>(inout), i * 2, j * 2); +} + +template <typename T> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<11>) +{ + fft_reorder_swap_two<11>(inout, 0 * 4, 8 * 4); + fft_reorder_swap<11>(inout, 1 * 4, 64 * 4); + fft_reorder_swap<11>(inout, 2 * 4, 32 * 4); + 
fft_reorder_swap<11>(inout, 3 * 4, 96 * 4); + fft_reorder_swap<11>(inout, 4 * 4, 16 * 4); + fft_reorder_swap<11>(inout, 5 * 4, 80 * 4); + fft_reorder_swap<11>(inout, 6 * 4, 48 * 4); + fft_reorder_swap<11>(inout, 7 * 4, 112 * 4); + fft_reorder_swap<11>(inout, 9 * 4, 72 * 4); + fft_reorder_swap<11>(inout, 10 * 4, 40 * 4); + fft_reorder_swap<11>(inout, 11 * 4, 104 * 4); + fft_reorder_swap<11>(inout, 12 * 4, 24 * 4); + fft_reorder_swap<11>(inout, 13 * 4, 88 * 4); + fft_reorder_swap<11>(inout, 14 * 4, 56 * 4); + fft_reorder_swap<11>(inout, 15 * 4, 120 * 4); + fft_reorder_swap<11>(inout, 17 * 4, 68 * 4); + fft_reorder_swap<11>(inout, 18 * 4, 36 * 4); + fft_reorder_swap<11>(inout, 19 * 4, 100 * 4); + fft_reorder_swap_two<11>(inout, 20 * 4, 28 * 4); + fft_reorder_swap<11>(inout, 21 * 4, 84 * 4); + fft_reorder_swap<11>(inout, 22 * 4, 52 * 4); + fft_reorder_swap<11>(inout, 23 * 4, 116 * 4); + fft_reorder_swap<11>(inout, 25 * 4, 76 * 4); + fft_reorder_swap<11>(inout, 26 * 4, 44 * 4); + fft_reorder_swap<11>(inout, 27 * 4, 108 * 4); + fft_reorder_swap<11>(inout, 29 * 4, 92 * 4); + fft_reorder_swap<11>(inout, 30 * 4, 60 * 4); + fft_reorder_swap<11>(inout, 31 * 4, 124 * 4); + fft_reorder_swap<11>(inout, 33 * 4, 66 * 4); + fft_reorder_swap_two<11>(inout, 34 * 4, 42 * 4); + fft_reorder_swap<11>(inout, 35 * 4, 98 * 4); + fft_reorder_swap<11>(inout, 37 * 4, 82 * 4); + fft_reorder_swap<11>(inout, 38 * 4, 50 * 4); + fft_reorder_swap<11>(inout, 39 * 4, 114 * 4); + fft_reorder_swap<11>(inout, 41 * 4, 74 * 4); + fft_reorder_swap<11>(inout, 43 * 4, 106 * 4); + fft_reorder_swap<11>(inout, 45 * 4, 90 * 4); + fft_reorder_swap<11>(inout, 46 * 4, 58 * 4); + fft_reorder_swap<11>(inout, 47 * 4, 122 * 4); + fft_reorder_swap<11>(inout, 49 * 4, 70 * 4); + fft_reorder_swap<11>(inout, 51 * 4, 102 * 4); + fft_reorder_swap<11>(inout, 53 * 4, 86 * 4); + fft_reorder_swap_two<11>(inout, 54 * 4, 62 * 4); + fft_reorder_swap<11>(inout, 55 * 4, 118 * 4); + fft_reorder_swap<11>(inout, 57 * 4, 78 * 4); + 
fft_reorder_swap<11>(inout, 59 * 4, 110 * 4); + fft_reorder_swap<11>(inout, 61 * 4, 94 * 4); + fft_reorder_swap<11>(inout, 63 * 4, 126 * 4); + fft_reorder_swap_two<11>(inout, 65 * 4, 73 * 4); + fft_reorder_swap<11>(inout, 67 * 4, 97 * 4); + fft_reorder_swap<11>(inout, 69 * 4, 81 * 4); + fft_reorder_swap<11>(inout, 71 * 4, 113 * 4); + fft_reorder_swap<11>(inout, 75 * 4, 105 * 4); + fft_reorder_swap<11>(inout, 77 * 4, 89 * 4); + fft_reorder_swap<11>(inout, 79 * 4, 121 * 4); + fft_reorder_swap<11>(inout, 83 * 4, 101 * 4); + fft_reorder_swap_two<11>(inout, 85 * 4, 93 * 4); + fft_reorder_swap<11>(inout, 87 * 4, 117 * 4); + fft_reorder_swap<11>(inout, 91 * 4, 109 * 4); + fft_reorder_swap<11>(inout, 95 * 4, 125 * 4); + fft_reorder_swap_two<11>(inout, 99 * 4, 107 * 4); + fft_reorder_swap<11>(inout, 103 * 4, 115 * 4); + fft_reorder_swap<11>(inout, 111 * 4, 123 * 4); + fft_reorder_swap_two<11>(inout, 119 * 4, 127 * 4); +} + +template <typename T> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<7>) +{ + constexpr size_t bitrev = 2; + fft_reorder_swap_two<7, bitrev>(inout, 0 * 4, 2 * 4); + fft_reorder_swap<7, bitrev>(inout, 1 * 4, 4 * 4); + fft_reorder_swap<7, bitrev>(inout, 3 * 4, 6 * 4); + fft_reorder_swap_two<7, bitrev>(inout, 5 * 4, 7 * 4); +} + +template <typename T> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, cfalse_t /* use_br2 */) +{ + constexpr size_t bitrev = 4; + fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 5 * 4); + fft_reorder_swap<8, bitrev>(inout, 1 * 4, 4 * 4); + fft_reorder_swap<8, bitrev>(inout, 2 * 4, 8 * 4); + fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4); + fft_reorder_swap<8, bitrev>(inout, 6 * 4, 9 * 4); + fft_reorder_swap<8, bitrev>(inout, 7 * 4, 13 * 4); + fft_reorder_swap_two<8, bitrev>(inout, 10 * 4, 15 * 4); + fft_reorder_swap<8, bitrev>(inout, 11 * 4, 14 * 4); +} + +template <typename T> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<8>, ctrue_t /* use_br2 */) +{ + constexpr size_t bitrev = 2; + 
fft_reorder_swap_two<8, bitrev>(inout, 0 * 4, 6 * 4); + fft_reorder_swap<8, bitrev>(inout, 1 * 4, 8 * 4); + fft_reorder_swap<8, bitrev>(inout, 2 * 4, 4 * 4); + fft_reorder_swap<8, bitrev>(inout, 3 * 4, 12 * 4); + fft_reorder_swap<8, bitrev>(inout, 5 * 4, 10 * 4); + fft_reorder_swap<8, bitrev>(inout, 7 * 4, 14 * 4); + fft_reorder_swap_two<8, bitrev>(inout, 9 * 4, 15 * 4); + fft_reorder_swap<8, bitrev>(inout, 11 * 4, 13 * 4); +} + +template <typename T> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<9>) +{ + constexpr size_t bitrev = 2; + fft_reorder_swap_two<9, bitrev>(inout, 0 * 4, 4 * 4); + fft_reorder_swap<9, bitrev>(inout, 1 * 4, 16 * 4); + fft_reorder_swap<9, bitrev>(inout, 2 * 4, 8 * 4); + fft_reorder_swap<9, bitrev>(inout, 3 * 4, 24 * 4); + fft_reorder_swap<9, bitrev>(inout, 5 * 4, 20 * 4); + fft_reorder_swap<9, bitrev>(inout, 6 * 4, 12 * 4); + fft_reorder_swap<9, bitrev>(inout, 7 * 4, 28 * 4); + fft_reorder_swap<9, bitrev>(inout, 9 * 4, 18 * 4); + fft_reorder_swap_two<9, bitrev>(inout, 10 * 4, 14 * 4); + fft_reorder_swap<9, bitrev>(inout, 11 * 4, 26 * 4); + fft_reorder_swap<9, bitrev>(inout, 13 * 4, 22 * 4); + fft_reorder_swap<9, bitrev>(inout, 15 * 4, 30 * 4); + fft_reorder_swap_two<9, bitrev>(inout, 17 * 4, 21 * 4); + fft_reorder_swap<9, bitrev>(inout, 19 * 4, 25 * 4); + fft_reorder_swap<9, bitrev>(inout, 23 * 4, 29 * 4); + fft_reorder_swap_two<9, bitrev>(inout, 27 * 4, 31 * 4); +} + +template <typename T> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, csize_t<10>, ctrue_t /* use_br2 */) +{ + constexpr size_t bitrev = 2; + fft_reorder_swap_two<10, bitrev>(inout, 0 * 4, 12 * 4); + fft_reorder_swap<10, bitrev>(inout, 1 * 4, 32 * 4); + fft_reorder_swap<10, bitrev>(inout, 2 * 4, 16 * 4); + fft_reorder_swap<10, bitrev>(inout, 3 * 4, 48 * 4); + fft_reorder_swap<10, bitrev>(inout, 4 * 4, 8 * 4); + fft_reorder_swap<10, bitrev>(inout, 5 * 4, 40 * 4); + fft_reorder_swap<10, bitrev>(inout, 6 * 4, 24 * 4); + fft_reorder_swap<10, bitrev>(inout, 7 * 4, 
56 * 4); + fft_reorder_swap<10, bitrev>(inout, 9 * 4, 36 * 4); + fft_reorder_swap<10, bitrev>(inout, 10 * 4, 20 * 4); + fft_reorder_swap<10, bitrev>(inout, 11 * 4, 52 * 4); + fft_reorder_swap<10, bitrev>(inout, 13 * 4, 44 * 4); + fft_reorder_swap<10, bitrev>(inout, 14 * 4, 28 * 4); + fft_reorder_swap<10, bitrev>(inout, 15 * 4, 60 * 4); + fft_reorder_swap<10, bitrev>(inout, 17 * 4, 34 * 4); + fft_reorder_swap_two<10, bitrev>(inout, 18 * 4, 30 * 4); + fft_reorder_swap<10, bitrev>(inout, 19 * 4, 50 * 4); + fft_reorder_swap<10, bitrev>(inout, 21 * 4, 42 * 4); + fft_reorder_swap<10, bitrev>(inout, 22 * 4, 26 * 4); + fft_reorder_swap<10, bitrev>(inout, 23 * 4, 58 * 4); + fft_reorder_swap<10, bitrev>(inout, 25 * 4, 38 * 4); + fft_reorder_swap<10, bitrev>(inout, 27 * 4, 54 * 4); + fft_reorder_swap<10, bitrev>(inout, 29 * 4, 46 * 4); + fft_reorder_swap<10, bitrev>(inout, 31 * 4, 62 * 4); + fft_reorder_swap_two<10, bitrev>(inout, 33 * 4, 45 * 4); + fft_reorder_swap<10, bitrev>(inout, 35 * 4, 49 * 4); + fft_reorder_swap<10, bitrev>(inout, 37 * 4, 41 * 4); + fft_reorder_swap<10, bitrev>(inout, 39 * 4, 57 * 4); + fft_reorder_swap<10, bitrev>(inout, 43 * 4, 53 * 4); + fft_reorder_swap<10, bitrev>(inout, 47 * 4, 61 * 4); + fft_reorder_swap_two<10, bitrev>(inout, 51 * 4, 63 * 4); + fft_reorder_swap<10, bitrev>(inout, 55 * 4, 59 * 4); +} + +template <typename T, bool use_br2> +KFR_INTRINSIC void cwrite_reordered(T* out, const cvec<T, 16>& value, size_t N4, cbool_t<use_br2>) +{ + cwrite_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(out), N4, + digitreverse<(use_br2 ? 
2 : 4), 2>(value)); +} + +template <typename T, bool use_br2> +KFR_INTRINSIC void fft_reorder_swap_n4(T* inout, size_t i, size_t j, size_t N4, cbool_t<use_br2>) +{ + CMT_ASSUME(i != j); + const cvec<T, 16> vi = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + i), N4); + const cvec<T, 16> vj = cread_group<4, 4, fft_reorder_aligned>(ptr_cast<complex<T>>(inout + j), N4); + cwrite_reordered(inout + j, vi, N4, cbool_t<use_br2>()); + cwrite_reordered(inout + i, vj, N4, cbool_t<use_br2>()); +} + +template <typename T, bool use_table> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2, cbool_t<use_table>) +{ + const size_t N = size_t(1) << log2n; + const size_t N4 = N / 4; + const size_t iend = N / 16 * 4 * 2; + constexpr size_t istep = 2 * 4; + const size_t jstep1 = (1 << (log2n - 5)) * 4 * 2; + const size_t jstep2 = size_t(size_t(1) << (log2n - 5)) * 4 * 2 - size_t(size_t(1) << (log2n - 6)) * 4 * 2; + T* io = ptr_cast<T>(inout); + + for (size_t i = 0; i < iend;) + { + size_t j = bitrev_using_table(static_cast<u32>(i >> 3), log2n - 4, cbool<use_table>) << 3; + if (i >= j) + { + fft_reorder_swap_n4(io, i, j, N4, use_br2); + } + else + { + i += 4 * istep; + continue; + } + i += istep; + j = j + jstep1; + + if (i >= j) + { + fft_reorder_swap_n4(io, i, j, N4, use_br2); + } + i += istep; + j = j - jstep2; + + if (i >= j) + { + fft_reorder_swap_n4(io, i, j, N4, use_br2); + } + i += istep; + j = j + jstep1; + + if (i >= j) + { + fft_reorder_swap_n4(io, i, j, N4, use_br2); + } + i += istep; + } +} + +template <typename T> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, ctrue_t use_br2) +{ + if (log2n - 4 > bitrev_table_log2N) + { + fft_reorder(inout, log2n, ctrue, cfalse); + } + else + { + fft_reorder(inout, log2n, ctrue, ctrue); + } +} + +template <typename T> +KFR_INTRINSIC void fft_reorder(complex<T>* inout, size_t log2n, cfalse_t use_br2) +{ + const size_t N = size_t(1) << log2n; + const size_t N4 = N / 4; + 
const size_t N16 = N * 2 / 16; + size_t iend = N16; + constexpr size_t istep = 2 * 4; + const size_t jstep = N / 64 * 4 * 2; + T* io = ptr_cast<T>(inout); + + size_t i = 0; + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) + for (; i < iend;) + { + size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; + + if (i >= j) + fft_reorder_swap_n4(io, i, j, N4, use_br2); + i += istep * 4; + } + iend += N16; + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) + for (; i < iend;) + { + size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; + + fft_reorder_swap_n4(io, i, j, N4, use_br2); + + i += istep; + j = j + jstep; + + if (i >= j) + fft_reorder_swap_n4(io, i, j, N4, use_br2); + i += istep * 3; + } + iend += N16; + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) + for (; i < iend;) + { + size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; + + fft_reorder_swap_n4(io, i, j, N4, use_br2); + + i += istep; + j = j + jstep; + + fft_reorder_swap_n4(io, i, j, N4, use_br2); + + i += istep; + j = j + jstep; + + if (i >= j) + fft_reorder_swap_n4(io, i, j, N4, use_br2); + i += istep * 2; + } + iend += N16; + CMT_PRAGMA_CLANG(clang loop unroll_count(2)) + for (; i < iend;) + { + size_t j = dig4rev_using_table(static_cast<u32>(i >> 3), log2n - 4) << 3; + + fft_reorder_swap_n4(io, i, j, N4, use_br2); + + i += istep; + j = j + jstep; + + fft_reorder_swap_n4(io, i, j, N4, use_br2); + + i += istep; + j = j + jstep; + + fft_reorder_swap_n4(io, i, j, N4, use_br2); + + i += istep; + j = j + jstep; + + if (i >= j) + fft_reorder_swap_n4(io, i, j, N4, use_br2); + i += istep; + } +} +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/src/dft/convolution-impl.cpp b/src/dft/convolution-impl.cpp @@ -0,0 +1,307 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under 
the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ +#include <kfr/base/simd_expressions.hpp> +#include <kfr/simd/complex.hpp> +#include <kfr/dft/convolution.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +namespace intrinsics +{ + +template <typename T> +univector<T> convolve(const univector_ref<const T>& src1, const univector_ref<const T>& src2) +{ + using ST = subtype<T>; + const size_t size = next_poweroftwo(src1.size() + src2.size() - 1); + univector<complex<ST>> src1padded = src1; + univector<complex<ST>> src2padded = src2; + src1padded.resize(size); + src2padded.resize(size); + + dft_plan_ptr<ST> dft = dft_cache::instance().get(ctype_t<ST>(), size); + univector<u8> temp(dft->temp_size); + dft->execute(src1padded, src1padded, temp); + dft->execute(src2padded, src2padded, temp); + src1padded = src1padded * src2padded; + dft->execute(src1padded, src1padded, temp, true); + const ST invsize = reciprocal<ST>(static_cast<ST>(size)); + return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; +} + +template <typename T> +univector<T> correlate(const univector_ref<const T>& src1, const univector_ref<const T>& src2) +{ + using ST = subtype<T>; + const size_t size = next_poweroftwo(src1.size() + src2.size() - 1); + 
univector<complex<ST>> src1padded = src1; + univector<complex<ST>> src2padded = reverse(src2); + src1padded.resize(size); + src2padded.resize(size); + dft_plan_ptr<ST> dft = dft_cache::instance().get(ctype_t<ST>(), size); + univector<u8> temp(dft->temp_size); + dft->execute(src1padded, src1padded, temp); + dft->execute(src2padded, src2padded, temp); + src1padded = src1padded * src2padded; + dft->execute(src1padded, src1padded, temp, true); + const ST invsize = reciprocal<ST>(static_cast<ST>(size)); + return truncate(real(src1padded), src1.size() + src2.size() - 1) * invsize; +} + +template <typename T> +univector<T> autocorrelate(const univector_ref<const T>& src1) +{ + univector<T> result = correlate(src1, src1); + result = result.slice(result.size() / 2); + return result; +} + +} // namespace intrinsics + +template <typename T> +convolve_filter<T>::convolve_filter(size_t size_, size_t block_size_) + : data_size(size_), block_size(next_poweroftwo(block_size_)), fft(2 * block_size), temp(fft.temp_size), + segments((data_size + block_size - 1) / block_size), position(0), ir_segments(segments.size()), + saved_input(block_size), input_position(0), premul(fft.csize()), cscratch(fft.csize()), + scratch1(fft.size), scratch2(fft.size), overlap(block_size) +{ +} + +template <typename T> +convolve_filter<T>::convolve_filter(const univector_ref<const T>& data, size_t block_size_) + : convolve_filter(data.size(), block_size_) +{ + set_data(data); +} + +template <typename T> +void convolve_filter<T>::set_data(const univector_ref<const T>& data) +{ + data_size = data.size(); + segments.resize((data_size + block_size - 1) / block_size); + ir_segments.resize(segments.size()); + univector<T> input(fft.size); + const ST ifftsize = reciprocal(static_cast<ST>(fft.size)); + for (size_t i = 0; i < ir_segments.size(); i++) + { + segments[i].resize(fft.csize()); + ir_segments[i].resize(fft.csize()); + input = padded(data.slice(i * block_size, block_size)); + + fft.execute(ir_segments[i], 
input, temp); + process(ir_segments[i], ir_segments[i] * ifftsize); + } + reset(); +} + +template <typename T> +void convolve_filter<T>::process_buffer(T* output, const T* input, size_t size) +{ + // Note that the conditionals in the following algorithm are meant to + // reduce complexity in the common cases of either processing complete + // blocks (processing == block_size) or only one segment. + + // For complex filtering, use CCs pack format to omit special processing in fft_multiply[_accumulate]. + const dft_pack_format fft_multiply_pack = this->real_fft ? dft_pack_format::Perm : dft_pack_format::CCs; + + size_t processed = 0; + while (processed < size) + { + // Calculate how many samples to process this iteration. + auto const processing = std::min(size - processed, block_size - input_position); + + // Prepare input to forward FFT: + if (processing == block_size) + { + // No need to work with saved_input. + builtin_memcpy(scratch1.data(), input + processed, processing * sizeof(T)); + } + else + { + // Append this iteration's input to the saved_input current block. + builtin_memcpy(saved_input.data() + input_position, input + processed, processing * sizeof(T)); + builtin_memcpy(scratch1.data(), saved_input.data(), block_size * sizeof(T)); + } + + // Forward FFT saved_input block. + fft.execute(segments[position], scratch1, temp); + + if (segments.size() == 1) + { + // Just one segment/block of history. + // Y_k = H * X_k + fft_multiply(cscratch, ir_segments[0], segments[0], fft_multiply_pack); + } + else + { + // More than one segment/block of history so this is more involved. + if (input_position == 0) + { + // At the start of an input block, we premultiply the history from + // previous input blocks with the extended filter blocks. 
+ + // Y_(k-i,i) = H_i * X_(k-i) + // premul += Y_(k-i,i) for i=1,...,N + + fft_multiply(premul, ir_segments[1], segments[(position + 1) % segments.size()], + fft_multiply_pack); + for (size_t i = 2; i < segments.size(); i++) + { + const size_t n = (position + i) % segments.size(); + fft_multiply_accumulate(premul, ir_segments[i], segments[n], fft_multiply_pack); + } + } + // Y_(k,0) = H_0 * X_k + // Y_k = premul + Y_(k,0) + fft_multiply_accumulate(cscratch, premul, ir_segments[0], segments[position], fft_multiply_pack); + } + // y_k = IFFT( Y_k ) + fft.execute(scratch2, cscratch, temp, cinvert_t{}); + + // z_k = y_k + overlap + process(make_univector(output + processed, processing), + scratch2.slice(input_position, processing) + overlap.slice(input_position, processing)); + + input_position += processing; + processed += processing; + + // If a whole block was processed, prepare for next block. + if (input_position == block_size) + { + // Input block k is complete. Move to (k+1)-th input block. + input_position = 0; + + // Zero out the saved_input if it will be used in the next iteration. + auto const remaining = size - processed; + if (remaining < block_size && remaining > 0) + { + process(saved_input, zeros()); + } + + builtin_memcpy(overlap.data(), scratch2.data() + block_size, block_size * sizeof(T)); + + position = position > 0 ? 
position - 1 : segments.size() - 1; + } + } +} + +template <typename T> +void convolve_filter<T>::reset() +{ + for (auto& segment : segments) + { + process(segment, zeros()); + } + position = 0; + process(saved_input, zeros()); + input_position = 0; + process(overlap, zeros()); +} + +namespace intrinsics +{ + +template univector<float> convolve<float>(const univector_ref<const float>&, + const univector_ref<const float>&); +template univector<complex<float>> convolve<complex<float>>(const univector_ref<const complex<float>>&, + const univector_ref<const complex<float>>&); +template univector<float> correlate<float>(const univector_ref<const float>&, + const univector_ref<const float>&); +template univector<complex<float>> correlate<complex<float>>(const univector_ref<const complex<float>>&, + const univector_ref<const complex<float>>&); + +template univector<float> autocorrelate<float>(const univector_ref<const float>&); +template univector<complex<float>> autocorrelate<complex<float>>(const univector_ref<const complex<float>>&); + +} // namespace intrinsics + +template convolve_filter<float>::convolve_filter(size_t, size_t); +template convolve_filter<complex<float>>::convolve_filter(size_t, size_t); + +template convolve_filter<float>::convolve_filter(const univector_ref<const float>&, size_t); +template convolve_filter<complex<float>>::convolve_filter(const univector_ref<const complex<float>>&, size_t); + +template void convolve_filter<float>::set_data(const univector_ref<const float>&); +template void convolve_filter<complex<float>>::set_data(const univector_ref<const complex<float>>&); + +template void convolve_filter<float>::process_buffer(float* output, const float* input, size_t size); +template void convolve_filter<complex<float>>::process_buffer(complex<float>* output, + const complex<float>* input, size_t size); + +template void convolve_filter<float>::reset(); +template void convolve_filter<complex<float>>::reset(); + +namespace intrinsics +{ + +template 
univector<double> convolve<double>(const univector_ref<const double>&, + const univector_ref<const double>&); +template univector<complex<double>> convolve<complex<double>>(const univector_ref<const complex<double>>&, + const univector_ref<const complex<double>>&); +template univector<double> correlate<double>(const univector_ref<const double>&, + const univector_ref<const double>&); +template univector<complex<double>> correlate<complex<double>>(const univector_ref<const complex<double>>&, + const univector_ref<const complex<double>>&); + +template univector<double> autocorrelate<double>(const univector_ref<const double>&); +template univector<complex<double>> autocorrelate<complex<double>>( + const univector_ref<const complex<double>>&); + +} // namespace intrinsics + +template convolve_filter<double>::convolve_filter(size_t, size_t); +template convolve_filter<complex<double>>::convolve_filter(size_t, size_t); + +template convolve_filter<double>::convolve_filter(const univector_ref<const double>&, size_t); +template convolve_filter<complex<double>>::convolve_filter(const univector_ref<const complex<double>>&, + size_t); + +template void convolve_filter<double>::set_data(const univector_ref<const double>&); +template void convolve_filter<complex<double>>::set_data(const univector_ref<const complex<double>>&); + +template void convolve_filter<double>::process_buffer(double* output, const double* input, size_t size); +template void convolve_filter<complex<double>>::process_buffer(complex<double>* output, + const complex<double>* input, size_t size); + +template void convolve_filter<double>::reset(); +template void convolve_filter<complex<double>>::reset(); + +template <typename T> +filter<T>* make_convolve_filter(const univector_ref<const T>& taps, size_t block_size) +{ + return new convolve_filter<T>(taps, block_size); +} + +template filter<float>* make_convolve_filter(const univector_ref<const float>&, size_t); +template filter<complex<float>>* 
make_convolve_filter(const univector_ref<const complex<float>>&, size_t); +template filter<double>* make_convolve_filter(const univector_ref<const double>&, size_t); +template filter<complex<double>>* make_convolve_filter(const univector_ref<const complex<double>>&, size_t); + +} // namespace CMT_ARCH_NAME +} // namespace kfr diff --git a/include/kfr/dft/data/bitrev.hpp b/src/dft/data/bitrev.hpp diff --git a/src/dft/data/sincos.hpp b/src/dft/data/sincos.hpp @@ -0,0 +1,192 @@ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include <kfr/kfr.h> +#include <kfr/simd/types.hpp> +#include <cstdint> + +namespace kfr +{ + +namespace data +{ + +template <typename T> +constexpr inline T c_sin_table[65] = { + /* sin(2*pi* 0/ 256) */ f32(0.0), + /* sin(2*pi* 1/ 256) */ f32(0.02454122852291228803173452945928292506547), + /* sin(2*pi* 2/ 256) */ f32(0.04906767432741801425495497694268265831475), + /* sin(2*pi* 3/ 256) */ f32(0.0735645635996674235294656215752343218133), + /* sin(2*pi* 4/ 256) */ f32(0.09801714032956060199419556388864184586114), + /* sin(2*pi* 5/ 256) */ f32(0.1224106751992161984987044741509457875752), + /* sin(2*pi* 6/ 256) */ f32(0.1467304744553617516588501296467178197062), + /* sin(2*pi* 7/ 256) */ f32(0.1709618887603012263636423572082635319663), + /* sin(2*pi* 8/ 256) */ f32(0.1950903220161282678482848684770222409277), + /* sin(2*pi* 9/ 256) */ f32(0.2191012401568697972277375474973577988484), + /* sin(2*pi* 10/ 256) */ f32(0.242980179903263889948274162077471118321), + /* sin(2*pi* 11/ 256) */ f32(0.2667127574748983863252865151164363940421), + /* sin(2*pi* 12/ 256) */ f32(0.2902846772544623676361923758173952746915), + /* sin(2*pi* 13/ 256) */ f32(0.3136817403988914766564788459941003099934), + /* sin(2*pi* 14/ 256) */ f32(0.3368898533922200506892532126191475704778), + /* sin(2*pi* 15/ 256) */ f32(0.3598950365349881487751045723267564202023), + /* sin(2*pi* 16/ 256) */ f32(0.3826834323650897717284599840303988667613), + /* sin(2*pi* 17/ 256) */ f32(0.4052413140049898709084813055050524665119), + /* sin(2*pi* 18/ 256) */ f32(0.4275550934302820943209668568887985343046), + /* sin(2*pi* 19/ 256) */ f32(0.4496113296546066000462945794242270758832), + /* sin(2*pi* 20/ 256) */ f32(0.4713967368259976485563876259052543776575), + /* sin(2*pi* 21/ 256) */ f32(0.4928981922297840368730266887588092682397), + /* sin(2*pi* 22/ 256) */ f32(0.514102744193221726593693838968815772608), + /* sin(2*pi* 23/ 256) */ f32(0.5349976198870972106630769046370179155603), + /* sin(2*pi* 24/ 256) 
*/ f32(0.5555702330196022247428308139485328743749), + /* sin(2*pi* 25/ 256) */ f32(0.575808191417845300745972453815730841776), + /* sin(2*pi* 26/ 256) */ f32(0.5956993044924333434670365288299698895119), + /* sin(2*pi* 27/ 256) */ f32(0.6152315905806268454849135634139842776594), + /* sin(2*pi* 28/ 256) */ f32(0.6343932841636454982151716132254933706757), + /* sin(2*pi* 29/ 256) */ f32(0.6531728429537767640842030136563054150769), + /* sin(2*pi* 30/ 256) */ f32(0.6715589548470184006253768504274218032288), + /* sin(2*pi* 31/ 256) */ f32(0.6895405447370669246167306299574847028455), + /* sin(2*pi* 32/ 256) */ f32(0.7071067811865475244008443621048490392848), + /* sin(2*pi* 33/ 256) */ f32(0.7242470829514669209410692432905531674831), + /* sin(2*pi* 34/ 256) */ f32(0.740951125354959091175616897495162729729), + /* sin(2*pi* 35/ 256) */ f32(0.7572088465064845475754640536057844730404), + /* sin(2*pi* 36/ 256) */ f32(0.773010453362736960810906609758469800971), + /* sin(2*pi* 37/ 256) */ f32(0.7883464276266062620091647053596892826565), + /* sin(2*pi* 38/ 256) */ f32(0.8032075314806449098066765129631419238796), + /* sin(2*pi* 39/ 256) */ f32(0.817584813151583696504920884130633809471), + /* sin(2*pi* 40/ 256) */ f32(0.8314696123025452370787883776179057567386), + /* sin(2*pi* 41/ 256) */ f32(0.8448535652497070732595712051049570977198), + /* sin(2*pi* 42/ 256) */ f32(0.8577286100002720699022699842847701370425), + /* sin(2*pi* 43/ 256) */ f32(0.8700869911087114186522924044838488439108), + /* sin(2*pi* 44/ 256) */ f32(0.8819212643483550297127568636603883495084), + /* sin(2*pi* 45/ 256) */ f32(0.8932243011955153203424164474933979780006), + /* sin(2*pi* 46/ 256) */ f32(0.9039892931234433315862002972305370487101), + /* sin(2*pi* 47/ 256) */ f32(0.9142097557035306546350148293935774010447), + /* sin(2*pi* 48/ 256) */ f32(0.9238795325112867561281831893967882868224), + /* sin(2*pi* 49/ 256) */ f32(0.932992798834738887711660255543302498295), + /* sin(2*pi* 50/ 256) */ 
f32(0.9415440651830207784125094025995023571856), + /* sin(2*pi* 51/ 256) */ f32(0.9495281805930366671959360741893450282522), + /* sin(2*pi* 52/ 256) */ f32(0.9569403357322088649357978869802699694828), + /* sin(2*pi* 53/ 256) */ f32(0.9637760657954398666864643555078351536631), + /* sin(2*pi* 54/ 256) */ f32(0.9700312531945439926039842072861002514569), + /* sin(2*pi* 55/ 256) */ f32(0.975702130038528544460395766419527971644), + /* sin(2*pi* 56/ 256) */ f32(0.9807852804032304491261822361342390369739), + /* sin(2*pi* 57/ 256) */ f32(0.9852776423889412447740184331785477871601), + /* sin(2*pi* 58/ 256) */ f32(0.9891765099647809734516737380162430639837), + /* sin(2*pi* 59/ 256) */ f32(0.9924795345987099981567672516611178200108), + /* sin(2*pi* 60/ 256) */ f32(0.9951847266721968862448369531094799215755), + /* sin(2*pi* 61/ 256) */ f32(0.9972904566786902161355971401825678211717), + /* sin(2*pi* 62/ 256) */ f32(0.9987954562051723927147716047591006944432), + /* sin(2*pi* 63/ 256) */ f32(0.9996988186962042201157656496661721968501), + /* sin(2*pi* 64/ 256) */ f32(1.0000000000000000000000000000000000000000) +}; + +// data generated by mpfr +template <> +constexpr inline f64 c_sin_table<f64>[65] = { + /* sin(2*pi* 0/ 256) */ f64(0.0), + /* sin(2*pi* 1/ 256) */ f64(0.02454122852291228803173452945928292506547), + /* sin(2*pi* 2/ 256) */ f64(0.04906767432741801425495497694268265831475), + /* sin(2*pi* 3/ 256) */ f64(0.0735645635996674235294656215752343218133), + /* sin(2*pi* 4/ 256) */ f64(0.09801714032956060199419556388864184586114), + /* sin(2*pi* 5/ 256) */ f64(0.1224106751992161984987044741509457875752), + /* sin(2*pi* 6/ 256) */ f64(0.1467304744553617516588501296467178197062), + /* sin(2*pi* 7/ 256) */ f64(0.1709618887603012263636423572082635319663), + /* sin(2*pi* 8/ 256) */ f64(0.1950903220161282678482848684770222409277), + /* sin(2*pi* 9/ 256) */ f64(0.2191012401568697972277375474973577988484), + /* sin(2*pi* 10/ 256) */ f64(0.242980179903263889948274162077471118321), + /* 
sin(2*pi* 11/ 256) */ f64(0.2667127574748983863252865151164363940421), + /* sin(2*pi* 12/ 256) */ f64(0.2902846772544623676361923758173952746915), + /* sin(2*pi* 13/ 256) */ f64(0.3136817403988914766564788459941003099934), + /* sin(2*pi* 14/ 256) */ f64(0.3368898533922200506892532126191475704778), + /* sin(2*pi* 15/ 256) */ f64(0.3598950365349881487751045723267564202023), + /* sin(2*pi* 16/ 256) */ f64(0.3826834323650897717284599840303988667613), + /* sin(2*pi* 17/ 256) */ f64(0.4052413140049898709084813055050524665119), + /* sin(2*pi* 18/ 256) */ f64(0.4275550934302820943209668568887985343046), + /* sin(2*pi* 19/ 256) */ f64(0.4496113296546066000462945794242270758832), + /* sin(2*pi* 20/ 256) */ f64(0.4713967368259976485563876259052543776575), + /* sin(2*pi* 21/ 256) */ f64(0.4928981922297840368730266887588092682397), + /* sin(2*pi* 22/ 256) */ f64(0.514102744193221726593693838968815772608), + /* sin(2*pi* 23/ 256) */ f64(0.5349976198870972106630769046370179155603), + /* sin(2*pi* 24/ 256) */ f64(0.5555702330196022247428308139485328743749), + /* sin(2*pi* 25/ 256) */ f64(0.575808191417845300745972453815730841776), + /* sin(2*pi* 26/ 256) */ f64(0.5956993044924333434670365288299698895119), + /* sin(2*pi* 27/ 256) */ f64(0.6152315905806268454849135634139842776594), + /* sin(2*pi* 28/ 256) */ f64(0.6343932841636454982151716132254933706757), + /* sin(2*pi* 29/ 256) */ f64(0.6531728429537767640842030136563054150769), + /* sin(2*pi* 30/ 256) */ f64(0.6715589548470184006253768504274218032288), + /* sin(2*pi* 31/ 256) */ f64(0.6895405447370669246167306299574847028455), + /* sin(2*pi* 32/ 256) */ f64(0.7071067811865475244008443621048490392848), + /* sin(2*pi* 33/ 256) */ f64(0.7242470829514669209410692432905531674831), + /* sin(2*pi* 34/ 256) */ f64(0.740951125354959091175616897495162729729), + /* sin(2*pi* 35/ 256) */ f64(0.7572088465064845475754640536057844730404), + /* sin(2*pi* 36/ 256) */ f64(0.773010453362736960810906609758469800971), + /* sin(2*pi* 37/ 256) */ 
f64(0.7883464276266062620091647053596892826565), + /* sin(2*pi* 38/ 256) */ f64(0.8032075314806449098066765129631419238796), + /* sin(2*pi* 39/ 256) */ f64(0.817584813151583696504920884130633809471), + /* sin(2*pi* 40/ 256) */ f64(0.8314696123025452370787883776179057567386), + /* sin(2*pi* 41/ 256) */ f64(0.8448535652497070732595712051049570977198), + /* sin(2*pi* 42/ 256) */ f64(0.8577286100002720699022699842847701370425), + /* sin(2*pi* 43/ 256) */ f64(0.8700869911087114186522924044838488439108), + /* sin(2*pi* 44/ 256) */ f64(0.8819212643483550297127568636603883495084), + /* sin(2*pi* 45/ 256) */ f64(0.8932243011955153203424164474933979780006), + /* sin(2*pi* 46/ 256) */ f64(0.9039892931234433315862002972305370487101), + /* sin(2*pi* 47/ 256) */ f64(0.9142097557035306546350148293935774010447), + /* sin(2*pi* 48/ 256) */ f64(0.9238795325112867561281831893967882868224), + /* sin(2*pi* 49/ 256) */ f64(0.932992798834738887711660255543302498295), + /* sin(2*pi* 50/ 256) */ f64(0.9415440651830207784125094025995023571856), + /* sin(2*pi* 51/ 256) */ f64(0.9495281805930366671959360741893450282522), + /* sin(2*pi* 52/ 256) */ f64(0.9569403357322088649357978869802699694828), + /* sin(2*pi* 53/ 256) */ f64(0.9637760657954398666864643555078351536631), + /* sin(2*pi* 54/ 256) */ f64(0.9700312531945439926039842072861002514569), + /* sin(2*pi* 55/ 256) */ f64(0.975702130038528544460395766419527971644), + /* sin(2*pi* 56/ 256) */ f64(0.9807852804032304491261822361342390369739), + /* sin(2*pi* 57/ 256) */ f64(0.9852776423889412447740184331785477871601), + /* sin(2*pi* 58/ 256) */ f64(0.9891765099647809734516737380162430639837), + /* sin(2*pi* 59/ 256) */ f64(0.9924795345987099981567672516611178200108), + /* sin(2*pi* 60/ 256) */ f64(0.9951847266721968862448369531094799215755), + /* sin(2*pi* 61/ 256) */ f64(0.9972904566786902161355971401825678211717), + /* sin(2*pi* 62/ 256) */ f64(0.9987954562051723927147716047591006944432), + /* sin(2*pi* 63/ 256) */ 
f64(0.9996988186962042201157656496661721968501), + /* sin(2*pi* 64/ 256) */ f64(1.0000000000000000000000000000000000000000) +}; + +} // namespace data + +template <typename T> +constexpr inline T sin_using_table_256(size_t k) +{ + return (k > 128 ? -1 : +1) * data::c_sin_table<T>[k % 128 >= 64 ? 128 - k % 128 : k % 128]; +} + +template <typename T> +constexpr inline T sin_using_table(size_t size, size_t k) +{ + return sin_using_table_256<T>((k * 256 / size) % 256); +} +template <typename T> +constexpr inline T cos_using_table(size_t size, size_t k) +{ + return sin_using_table<T>(size, k + size / 4); +} +} // namespace kfr diff --git a/src/dft/dft-fft.hpp b/src/dft/dft-fft.hpp @@ -0,0 +1,114 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include <kfr/base/basic_expressions.hpp> +#include <kfr/math/complex_math.hpp> +#include <kfr/testo/assert.hpp> +#include <kfr/dft/cache.hpp> +#include <kfr/dft/fft.hpp> +#include "bitrev.hpp" +#include "ft.hpp" + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ +namespace intrinsics +{ +struct name_test_impl +{ +}; +} // namespace intrinsics +} // namespace CMT_ARCH_NAME + +template <typename T, cpu_t cpu> +struct dft_name_impl +{ +}; + +template <typename Class> +inline const char* dft_name(Class*) +{ + constexpr static size_t prefix_len = ctype_name<intrinsics::name_test_impl>().length() - 14; + static constexpr cstring full_name = ctype_name<std::decay_t<Class>>(); + static constexpr cstring name_arch = + concat_cstring(full_name.slice(csize<prefix_len>), make_cstring("("), + make_cstring(CMT_STRINGIFY(CMT_ARCH_NAME)), make_cstring(")")); + return name_arch.c_str(); +} + +#define DFT_STAGE_FN \ + KFR_MEM_INTRINSIC void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) final \ + { \ + return do_execute<false>(out, in, temp); \ + } \ + KFR_MEM_INTRINSIC void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) final \ + { \ + return do_execute<true>(out, in, temp); \ + } +#define DFT_STAGE_FN_NONFINAL \ + void do_execute(cdirect_t, complex<T>* out, const complex<T>* in, u8* temp) override \ + { \ + return do_execute<false>(out, in, temp); \ + } \ + void do_execute(cinvert_t, complex<T>* out, const complex<T>* in, u8* temp) override \ + { \ + return do_execute<true>(out, in, temp); \ + } + +inline namespace CMT_ARCH_NAME +{ + +#define DFT_ASSERT TESTO_ASSERT_INACTIVE + +template <typename T> +constexpr size_t fft_vector_width = vector_width<T>; + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wassume") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wassume") +#endif + +template <typename Stage, bool add_stages = true, typename T, typename... Args> +void add_stage(dft_plan<T>* plan, Args... 
args) +{ + dft_stage<T>* stage = new Stage(args...); + stage->need_reorder = true; + plan->data_size += stage->data_size; + plan->temp_size += stage->temp_size; + plan->all_stages.push_back(dft_stage_ptr<T>(stage)); + if constexpr (add_stages) + { + plan->stages[0].push_back(stage); + plan->stages[1].push_back(stage); + } +} + +} // namespace CMT_ARCH_NAME + +} // namespace kfr diff --git a/include/kfr/dft/impl/dft-impl-f32.cpp b/src/dft/dft-impl-f32.cpp diff --git a/include/kfr/dft/impl/dft-impl-f64.cpp b/src/dft/dft-impl-f64.cpp diff --git a/src/dft/dft-impl.hpp b/src/dft/dft-impl.hpp @@ -0,0 +1,568 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include <kfr/base/math_expressions.hpp> +#include <kfr/base/simd_expressions.hpp> +#include "dft-fft.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wshadow") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wshadow") +#endif +#if CMT_HAS_WARNING("-Wunused-lambda-capture") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-lambda-capture") +#endif +#if CMT_HAS_WARNING("-Wpass-failed") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed") +#endif + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4100)) + +namespace kfr +{ + +inline namespace CMT_ARCH_NAME +{ +constexpr csizes_t<2, 3, 4, 5, 6, 7, 8, 9, 10> dft_radices{}; + +namespace intrinsics +{ + +template <typename T> +void dft_stage_fixed_initialize(dft_stage<T>* stage, size_t width) +{ + complex<T>* twiddle = ptr_cast<complex<T>>(stage->data); + const size_t N = stage->repeats * stage->radix; + const size_t Nord = stage->repeats; + size_t i = 0; + + while (width > 0) + { + CMT_LOOP_NOUNROLL + for (; i < Nord / width * width; i += width) + { + CMT_LOOP_NOUNROLL + for (size_t j = 1; j < stage->radix; j++) + { + CMT_LOOP_NOUNROLL + for (size_t k = 0; k < width; k++) + { + cvec<T, 1> xx = cossin_conj(broadcast<2, T>(c_pi<T, 2> * (i + k) * j / N)); + ref_cast<cvec<T, 1>>(twiddle[k]) = xx; + } + twiddle += width; + } + } + width = width / 2; + } +} + +template <typename T, size_t fixed_radix> +struct dft_stage_fixed_impl : dft_stage<T> +{ + dft_stage_fixed_impl(size_t, size_t iterations, size_t blocks) + { + this->name = dft_name(this); + this->radix = fixed_radix; + this->blocks = blocks; + this->repeats = iterations; + this->recursion = false; // true; + this->stage_size = fixed_radix * iterations * blocks; + this->data_size = align_up((this->repeats * (fixed_radix - 1)) * sizeof(complex<T>), + platform<>::native_cache_alignment); + } + + constexpr static size_t rradix = fixed_radix; + + constexpr static size_t width = fixed_radix >= 7 ? 
fft_vector_width<T> / 2 + : fixed_radix >= 4 ? fft_vector_width<T> + : fft_vector_width<T> * 2; + virtual void do_initialize(size_t) override final { dft_stage_fixed_initialize(this, width); } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + const size_t Nord = this->repeats; + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + + const size_t N = Nord * fixed_radix; + CMT_LOOP_NOUNROLL + for (size_t b = 0; b < this->blocks; b++) + { + butterflies(Nord, csize<width>, csize<fixed_radix>, cbool<inverse>, out, in, twiddle, Nord); + in += N; + out += N; + } + } +}; + +template <typename T, size_t fixed_radix> +struct dft_stage_fixed_final_impl : dft_stage<T> +{ + dft_stage_fixed_final_impl(size_t, size_t iterations, size_t blocks) + { + this->name = dft_name(this); + this->radix = fixed_radix; + this->blocks = blocks; + this->repeats = iterations; + this->stage_size = fixed_radix * iterations * blocks; + this->recursion = false; + this->can_inplace = false; + } + constexpr static size_t width = fixed_radix >= 7 ? fft_vector_width<T> / 2 + : fixed_radix >= 4 ? 
fft_vector_width<T> + : fft_vector_width<T> * 2; + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + const size_t b = this->blocks; + + butterflies(b, csize<width>, csize<fixed_radix>, cbool<inverse>, out, in, b); + } +}; + +template <typename E> +inline E& apply_conj(E& e, cfalse_t) +{ + return e; +} + +template <typename E> +inline auto apply_conj(E& e, ctrue_t) +{ + return cconj(e); +} + +/// [0, N - 1, N - 2, N - 3, ..., 3, 2, 1] +template <typename E> +struct fft_inverse : expression_with_traits<E> +{ + using value_type = typename expression_with_traits<E>::value_type; + + KFR_MEM_INTRINSIC fft_inverse(E&& expr) CMT_NOEXCEPT : expression_with_traits<E>(std::forward<E>(expr)) {} + + friend KFR_INTRINSIC vec<value_type, 1> get_elements(const fft_inverse& self, shape<1> index, + axis_params<0, 1>) + { + const size_t size = get_shape(self).front(); + return get_elements(self.first(), index.front() == 0 ? 0 : size - index, axis_params<0, 1>()); + } + + template <size_t N> + friend KFR_MEM_INTRINSIC vec<value_type, N> get_elements(const fft_inverse& self, shape<1> index, + axis_params<0, N>) + { + const size_t size = get_shape(self).front(); + if (index.front() == 0) + { + return concat(get_elements(self.first(), index, axis_params<0, 1>()), + reverse(get_elements(self.first(), size - (N - 1), axis_params<0, N - 1>()))); + } + return reverse(get_elements(self.first(), size - index - (N - 1), axis_params<0, N>())); + } +}; + +template <typename E> +inline auto apply_fft_inverse(E&& e) +{ + return fft_inverse<E>(std::forward<E>(e)); +} + +template <typename T> +struct dft_arblen_stage_impl : dft_stage<T> +{ + dft_arblen_stage_impl(size_t size) + : size(size), fftsize(next_poweroftwo(size) * 2), plan(fftsize, dft_order::internal) + { + this->name = dft_name(this); + this->radix = size; + this->blocks = 1; + this->repeats = 1; + this->recursion = false; + this->can_inplace = false; + 
this->temp_size = plan.temp_size; + this->stage_size = size; + + chirp_ = render(cexp(sqr(linspace(T(1) - size, size - T(1), size * 2 - 1, true, ctrue)) * + complex<T>(0, -1) * c_pi<T> / size)); + + ichirpp_ = render(truncate(padded(1 / slice(chirp_, 0, 2 * size - 1)), fftsize)); + + univector<u8> temp(plan.temp_size); + plan.execute(ichirpp_, ichirpp_, temp); + xp.resize(fftsize, 0); + xp_fft.resize(fftsize); + invN2 = T(1) / fftsize; + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + const size_t n = this->size; + + auto&& chirp = apply_conj(chirp_, cbool<inverse>); + + xp.slice(0, n) = make_univector(in, n) * slice(chirp, n - 1); + + plan.execute(xp_fft.data(), xp.data(), temp); + + if (inverse) + xp_fft = xp_fft * cconj(apply_fft_inverse(ichirpp_)); + else + xp_fft = xp_fft * ichirpp_; + plan.execute(xp_fft.data(), xp_fft.data(), temp, ctrue); + + make_univector(out, n) = xp_fft.slice(n - 1, n) * slice(chirp, n - 1, n) * invN2; + } + + const size_t size; + const size_t fftsize; + T invN2; + dft_plan<T> plan; + univector<complex<T>> chirp_; + univector<complex<T>> ichirpp_; + univector<complex<T>> xp; + univector<complex<T>> xp_fft; +}; + +template <typename T, size_t radix1, size_t radix2, size_t size = radix1 * radix2> +struct dft_special_stage_impl : dft_stage<T> +{ + dft_special_stage_impl() : stage1(radix1, size / radix1, 1), stage2(radix2, 1, size / radix2) + { + this->name = dft_name(this); + this->radix = size; + this->blocks = 1; + this->repeats = 1; + this->recursion = false; + this->can_inplace = false; + this->stage_size = size; + this->temp_size = stage1.temp_size + stage2.temp_size + sizeof(complex<T>) * size; + this->data_size = stage1.data_size + stage2.data_size; + } + void dump() const override + { + dft_stage<T>::dump(); + printf(" "); + stage1.dump(); + printf(" "); + stage2.dump(); + } + void do_initialize(size_t stage_size) override + { + stage1.data = 
this->data; + stage2.data = this->data + stage1.data_size; + stage1.initialize(stage_size); + stage2.initialize(stage_size); + } + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + complex<T>* scratch = ptr_cast<complex<T>>(temp + stage1.temp_size + stage2.temp_size); + stage1.do_execute(cbool<inverse>, scratch, in, temp); + stage2.do_execute(cbool<inverse>, out, scratch, temp + stage1.temp_size); + } + dft_stage_fixed_impl<T, radix1> stage1; + dft_stage_fixed_final_impl<T, radix2> stage2; +}; + +template <typename T, bool final> +struct dft_stage_generic_impl : dft_stage<T> +{ + dft_stage_generic_impl(size_t radix, size_t iterations, size_t blocks) + { + this->name = dft_name(this); + this->radix = radix; + this->blocks = blocks; + this->repeats = iterations; + this->recursion = false; // true; + this->can_inplace = false; + this->stage_size = radix * iterations * blocks; + this->temp_size = align_up(sizeof(complex<T>) * radix, platform<>::native_cache_alignment); + this->data_size = + align_up(sizeof(complex<T>) * sqr(this->radix / 2), platform<>::native_cache_alignment); + } + +protected: + virtual void do_initialize(size_t) override final + { + complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + CMT_LOOP_NOUNROLL + for (size_t i = 0; i < this->radix / 2; i++) + { + CMT_LOOP_NOUNROLL + for (size_t j = 0; j < this->radix / 2; j++) + { + cwrite<1>(twiddle++, cossin_conj(broadcast<2>((i + 1) * (j + 1) * c_pi<T, 2> / this->radix))); + } + } + } + + DFT_STAGE_FN + template <bool inverse> + KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8* temp) + { + const complex<T>* twiddle = ptr_cast<complex<T>>(this->data); + const size_t bl = this->blocks; + + CMT_LOOP_NOUNROLL + for (size_t b = 0; b < bl; b++) + generic_butterfly(this->radix, cbool<inverse>, out + b, in + b * this->radix, + ptr_cast<complex<T>>(temp), twiddle, bl); + } +}; + +template <typename T, 
typename Tr2> +inline void dft_permute(complex<T>* out, const complex<T>* in, size_t r0, size_t r1, Tr2 first_radix) +{ + CMT_ASSUME(r0 > 1); + CMT_ASSUME(r1 > 1); + + CMT_LOOP_NOUNROLL + for (size_t p = 0; p < r0; p++) + { + const complex<T>* in1 = in; + CMT_LOOP_NOUNROLL + for (size_t i = 0; i < r1; i++) + { + const complex<T>* in2 = in1; + CMT_LOOP_UNROLL + for (size_t j = 0; j < first_radix; j++) + { + *out++ = *in2; + in2 += r1; + } + in1++; + in += first_radix; + } + } +} + +template <typename T, typename Tr2> +inline void dft_permute_deep(complex<T>*& out, const complex<T>* in, const size_t* radices, size_t count, + size_t index, size_t inscale, size_t inner_size, Tr2 first_radix) +{ + const bool b = index == 1; + const size_t radix = radices[index]; + if (b) + { + CMT_LOOP_NOUNROLL + for (size_t i = 0; i < radix; i++) + { + const complex<T>* in1 = in; + CMT_LOOP_UNROLL + for (size_t j = 0; j < first_radix; j++) + { + *out++ = *in1; + in1 += inner_size; + } + in += inscale; + } + } + else + { + const size_t steps = radix; + const size_t inscale_next = inscale * radix; + CMT_LOOP_NOUNROLL + for (size_t i = 0; i < steps; i++) + { + dft_permute_deep(out, in, radices, count, index - 1, inscale_next, inner_size, first_radix); + in += inscale; + } + } +} + +template <typename T> +struct dft_reorder_stage_impl : dft_stage<T> +{ + dft_reorder_stage_impl(const int* radices, size_t count) : count(count) + { + this->name = dft_name(this); + this->can_inplace = false; + this->data_size = 0; + std::copy(radices, radices + count, this->radices); + this->inner_size = 1; + this->size = 1; + for (size_t r = 0; r < count; r++) + { + if (r != 0 && r != count - 1) + this->inner_size *= radices[r]; + this->size *= radices[r]; + } + this->stage_size = this->size; + } + +protected: + size_t radices[32]; + size_t count = 0; + size_t size = 0; + size_t inner_size = 0; + virtual void do_initialize(size_t) override final {} + + DFT_STAGE_FN + template <bool inverse> + 
KFR_MEM_INTRINSIC void do_execute(complex<T>* out, const complex<T>* in, u8*) + { + cswitch( + dft_radices, radices[0], + [&](auto first_radix) + { + if (count == 3) + { + dft_permute(out, in, radices[2], radices[1], first_radix); + } + else + { + const size_t rlast = radices[count - 1]; + for (size_t p = 0; p < rlast; p++) + { + dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, first_radix); + in += size / rlast; + } + } + }, + [&]() + { + if (count == 3) + { + dft_permute(out, in, radices[2], radices[1], radices[0]); + } + else + { + const size_t rlast = radices[count - 1]; + for (size_t p = 0; p < rlast; p++) + { + dft_permute_deep(out, in, radices, count, count - 2, 1, inner_size, radices[0]); + in += size / rlast; + } + } + }); + } +}; +} // namespace intrinsics + +template <bool is_final, typename T> +void prepare_dft_stage(dft_plan<T>* self, size_t radix, size_t iterations, size_t blocks, cbool_t<is_final>) +{ + return cswitch( + dft_radices, radix, + [self, iterations, blocks](auto radix) CMT_INLINE_LAMBDA + { + add_stage<std::conditional_t<is_final, intrinsics::dft_stage_fixed_final_impl<T, val_of(radix)>, + intrinsics::dft_stage_fixed_impl<T, val_of(radix)>>>( + self, radix, iterations, blocks); + }, + [self, radix, iterations, blocks]() + { add_stage<intrinsics::dft_stage_generic_impl<T, is_final>>(self, radix, iterations, blocks); }); +} + +template <typename T> +void init_dft(dft_plan<T>* self, size_t size, dft_order) +{ + if (size == 60) + { + add_stage<intrinsics::dft_special_stage_impl<T, 6, 10>>(self); + } + else if (size == 48) + { + add_stage<intrinsics::dft_special_stage_impl<T, 6, 8>>(self); + } + else + { + size_t cur_size = size; + constexpr size_t radices_count = dft_radices.back() + 1; + u8 count[radices_count] = { 0 }; + int radices[32] = { 0 }; + size_t radices_size = 0; + + cforeach(dft_radices[csizeseq<dft_radices.size(), dft_radices.size() - 1, -1>], + [&](auto radix) + { + while (cur_size && cur_size % val_of(radix) 
== 0) + { + count[val_of(radix)]++; + cur_size /= val_of(radix); + } + }); + + int num_stages = 0; + if (cur_size >= 101) + { + add_stage<intrinsics::dft_arblen_stage_impl<T>>(self, size); + ++num_stages; + self->arblen = true; + } + else + { + size_t blocks = 1; + size_t iterations = size; + + for (size_t r = dft_radices.front(); r <= dft_radices.back(); r++) + { + for (size_t i = 0; i < count[r]; i++) + { + iterations /= r; + radices[radices_size++] = static_cast<int>(r); + if (iterations == 1) + prepare_dft_stage(self, r, iterations, blocks, ctrue); + else + prepare_dft_stage(self, r, iterations, blocks, cfalse); + ++num_stages; + blocks *= r; + } + } + + if (cur_size > 1) + { + iterations /= cur_size; + radices[radices_size++] = static_cast<int>(cur_size); + if (iterations == 1) + prepare_dft_stage(self, cur_size, iterations, blocks, ctrue); + else + prepare_dft_stage(self, cur_size, iterations, blocks, cfalse); + ++num_stages; + } + + if (num_stages > 2) + add_stage<intrinsics::dft_reorder_stage_impl<T>>(self, radices, radices_size); + } + } +} + +} // namespace CMT_ARCH_NAME + +} // namespace kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +CMT_PRAGMA_MSVC(warning(pop)) diff --git a/src/dft/dft-templates.hpp b/src/dft/dft-templates.hpp @@ -0,0 +1,41 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. 
+ + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. + */ + +#ifdef FLOAT +#include <kfr/dft/fft.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +#ifndef KFR_DFT_NO_NPo2 +template void init_dft(dft_plan<FLOAT>*, size_t, dft_order); +#endif +} // namespace CMT_ARCH_NAME +} // namespace kfr + +#endif diff --git a/include/kfr/dft/impl/fft-impl-f32.cpp b/src/dft/fft-impl-f32.cpp diff --git a/include/kfr/dft/impl/fft-impl-f64.cpp b/src/dft/fft-impl-f64.cpp diff --git a/include/kfr/dft/impl/fft-impl.hpp b/src/dft/fft-impl.hpp diff --git a/src/dft/fft-templates.hpp b/src/dft/fft-templates.hpp @@ -0,0 +1,39 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ + +#ifdef FLOAT +#include <kfr/dft/fft.hpp> + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ +template void dft_initialize<FLOAT>(dft_plan<FLOAT>& plan); +template void dft_real_initialize<FLOAT>(dft_plan_real<FLOAT>& plan); +} // namespace CMT_ARCH_NAME +} // namespace kfr + +#endif diff --git a/src/dft/ft.hpp b/src/dft/ft.hpp @@ -0,0 +1,1785 @@ +/** @addtogroup dft + * @{ + */ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ +#pragma once + +#include <kfr/base/univector.hpp> +#include <kfr/math/sin_cos.hpp> +#include <kfr/simd/complex.hpp> +#include <kfr/simd/constants.hpp> +#include <kfr/simd/digitreverse.hpp> +#include <kfr/simd/read_write.hpp> +#include <kfr/simd/vec.hpp> + +#include <kfr/base/memory.hpp> +#include "data/sincos.hpp" + +CMT_PRAGMA_GNU(GCC diagnostic push) +#if CMT_HAS_WARNING("-Wpass-failed") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wpass-failed") +#endif + +CMT_PRAGMA_MSVC(warning(push)) +CMT_PRAGMA_MSVC(warning(disable : 4127)) + +namespace kfr +{ +inline namespace CMT_ARCH_NAME +{ + +template <typename T, size_t N> +using cvec = vec<T, N * 2>; + +namespace intrinsics +{ + +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, N>& y) +{ + return subadd(x * dupeven(y), swap<2>(x) * dupodd(y)); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, N>& x, const vec<T, 2>& y) +{ + vec<T, N> yy = resize<N>(y); + return cmul_impl(x, yy); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> +KFR_INTRINSIC vec<T, N> cmul_impl(const vec<T, 2>& x, const vec<T, N>& y) +{ + vec<T, N> xx = resize<N>(x); + return cmul_impl(xx, y); +} + +/// Complex Multiplication +template <typename T, size_t N1, size_t N2> +KFR_INTRINSIC vec<T, const_max(N1, N2)> cmul(const vec<T, N1>& x, const vec<T, N2>& y) +{ + return intrinsics::cmul_impl(x, y); +} + +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, N>& y) +{ + return swap<2>(subadd(swap<2>(x) * dupeven(y), x * dupodd(y))); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> +KFR_INTRINSIC vec<T, N> cmul_2conj(const vec<T, N>& in0, const vec<T, N>& in1, const vec<T, N>& tw) +{ + return (in0 + in1) * dupeven(tw) + swap<2>(cnegimag(in0 - in1)) * dupodd(tw); +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= 2)> +KFR_INTRINSIC void 
cmul_2conj(vec<T, N>& out0, vec<T, N>& out1, const vec<T, 2>& in0, const vec<T, 2>& in1, + const vec<T, N>& tw) +{ + const vec<T, N> twr = dupeven(tw); + const vec<T, N> twi = dupodd(tw); + const vec<T, 2> sum = (in0 + in1); + const vec<T, 2> dif = swap<2>(negodd(in0 - in1)); + const vec<T, N> sumtw = resize<N>(sum) * twr; + const vec<T, N> diftw = resize<N>(dif) * twi; + out0 += sumtw + diftw; + out1 += sumtw - diftw; +} +template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, N>& x, const vec<T, 2>& y) +{ + vec<T, N> yy = resize<N>(y); + return cmul_conj(x, yy); +} +template <typename T, size_t N, KFR_ENABLE_IF(N > 2)> +KFR_INTRINSIC vec<T, N> cmul_conj(const vec<T, 2>& x, const vec<T, N>& y) +{ + vec<T, N> xx = resize<N>(x); + return cmul_conj(xx, y); +} + +template <size_t N, bool A = false, typename T> +KFR_INTRINSIC cvec<T, N> cread(const complex<T>* src) +{ + return cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); +} + +template <size_t N, bool A = false, typename T> +KFR_INTRINSIC void cwrite(complex<T>* dest, const cvec<T, N>& value) +{ + value.write(ptr_cast<T>(dest), cbool_t<A>()); +} + +template <size_t count, size_t N, bool A = false, typename T> +KFR_INTRINSIC cvec<T, count * N> cread_group(const complex<T>* src, size_t stride) +{ + return internal::read_group_impl<2, count, N, A>(ptr_cast<T>(src), stride, csizeseq_t<count>()); +} + +template <size_t count, size_t N, bool A = false, typename T> +KFR_INTRINSIC void cwrite_group(complex<T>* dest, size_t stride, const cvec<T, count * N>& value) +{ + return internal::write_group_impl<2, count, N, A>(ptr_cast<T>(dest), stride, value, csizeseq_t<count>()); +} + +template <size_t N, bool A = false, bool split = false, typename T> +KFR_INTRINSIC cvec<T, N> cread_split(const complex<T>* src) +{ + cvec<T, N> temp = cvec<T, N>(ptr_cast<T>(src), cbool_t<A>()); + if constexpr (split) + temp = splitpairs(temp); + return temp; +} + +template <size_t N, bool A = false, bool 
split = false, typename T> +KFR_INTRINSIC void cwrite_split(complex<T>* dest, const cvec<T, N>& value) +{ + cvec<T, N> v = value; + if constexpr (split) + v = interleavehalves(v); + v.write(ptr_cast<T>(dest), cbool_t<A>()); +} + +template <> +inline cvec<f32, 8> cread_split<8, false, true, f32>(const complex<f32>* src) +{ + const cvec<f32, 4> l = concat(cread<2>(src), cread<2>(src + 4)); + const cvec<f32, 4> h = concat(cread<2>(src + 2), cread<2>(src + 6)); + + return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h)); +} +template <> +inline cvec<f32, 8> cread_split<8, true, true, f32>(const complex<f32>* src) +{ + const cvec<f32, 4> l = concat(cread<2, true>(src), cread<2, true>(src + 4)); + const cvec<f32, 4> h = concat(cread<2, true>(src + 2), cread<2, true>(src + 6)); + + return concat(shuffle<0, 2, 8 + 0, 8 + 2>(l, h), shuffle<1, 3, 8 + 1, 8 + 3>(l, h)); +} + +template <> +inline cvec<f64, 4> cread_split<4, false, true, f64>(const complex<f64>* src) +{ + const cvec<f64, 2> l = concat(cread<1>(src), cread<1>(src + 2)); + const cvec<f64, 2> h = concat(cread<1>(src + 1), cread<1>(src + 3)); + + return concat(shuffle<0, 4, 2, 6>(l, h), shuffle<1, 5, 3, 7>(l, h)); +} + +template <> +inline void cwrite_split<8, false, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) +{ + const cvec<f32, 8> xx = + concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); + + cvec<f32, 2> a, b, c, d; + split<f32, 16>(xx, a, b, c, d); + cwrite<2>(dest, a); + cwrite<2>(dest + 4, b); + cwrite<2>(dest + 2, c); + cwrite<2>(dest + 6, d); +} +template <> +inline void cwrite_split<8, true, true, f32>(complex<f32>* dest, const cvec<f32, 8>& x) +{ + const cvec<f32, 8> xx = + concat(shuffle<0, 8 + 0, 1, 8 + 1>(low(x), high(x)), shuffle<2, 8 + 2, 3, 8 + 3>(low(x), high(x))); + + cvec<f32, 2> a, b, c, d; + split<f32, 16>(xx, a, b, c, d); + cwrite<2, true>(dest + 0, a); + cwrite<2, true>(dest + 4, b); + cwrite<2, true>(dest + 
2, c); + cwrite<2, true>(dest + 6, d); +} + +template <> +inline void cwrite_split<4, false, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) +{ + const cvec<f64, 4> xx = + concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); + cwrite<1>(dest, part<4, 0>(xx)); + cwrite<1>(dest + 2, part<4, 1>(xx)); + cwrite<1>(dest + 1, part<4, 2>(xx)); + cwrite<1>(dest + 3, part<4, 3>(xx)); +} +template <> +inline void cwrite_split<4, true, true, f64>(complex<f64>* dest, const cvec<f64, 4>& x) +{ + const cvec<f64, 4> xx = + concat(shuffle<0, 4, 2, 6>(low(x), high(x)), shuffle<1, 5, 3, 7>(low(x), high(x))); + cwrite<1, true>(dest + 0, part<4, 0>(xx)); + cwrite<1, true>(dest + 2, part<4, 1>(xx)); + cwrite<1, true>(dest + 1, part<4, 2>(xx)); + cwrite<1, true>(dest + 3, part<4, 3>(xx)); +} + +template <size_t N, size_t stride, typename T, size_t... Indices> +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, csizes_t<Indices...>) +{ + return concat(ref_cast<cvec<T, 1>>(base[Indices * stride])...); +} + +template <size_t N, size_t stride, typename T> +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base) +{ + if constexpr (stride == 1) + { + return ref_cast<cvec<T, N>>(*base); + } + else + { + return cgather_helper<N, stride, T>(base, csizeseq_t<N>()); + } +} + +KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t size, size_t) +{ + size_t temp = index; + index += stride; + if (index >= size) + index -= size; + return temp; +} +KFR_INTRINSIC size_t cgather_next(size_t& index, size_t stride, size_t) +{ + size_t temp = index; + index += stride; + return temp; +} + +template <size_t N, typename T, size_t... 
Indices> +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, + csizes_t<Indices...>) +{ + return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, Indices)])...); +} + +template <size_t N, typename T> +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride) +{ + return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); +} +template <size_t N, typename T> +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t stride) +{ + size_t index = 0; + return cgather_helper<N, T>(base, index, stride, csizeseq_t<N>()); +} + +template <size_t N, typename T, size_t... Indices> +KFR_INTRINSIC cvec<T, N> cgather_helper(const complex<T>* base, size_t& index, size_t stride, size_t size, + csizes_t<Indices...>) +{ + return concat(ref_cast<cvec<T, 1>>(base[cgather_next(index, stride, size, Indices)])...); +} + +template <size_t N, typename T> +KFR_INTRINSIC cvec<T, N> cgather(const complex<T>* base, size_t& index, size_t stride, size_t size) +{ + return cgather_helper<N, T>(base, index, stride, size, csizeseq_t<N>()); +} + +template <size_t N, size_t stride, typename T, size_t... Indices> +KFR_INTRINSIC void cscatter_helper(complex<T>* base, const cvec<T, N>& value, csizes_t<Indices...>) +{ + swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... }; +} + +template <size_t N, size_t stride, typename T> +KFR_INTRINSIC void cscatter(complex<T>* base, const cvec<T, N>& value) +{ + if constexpr (stride == 1) + { + cwrite<N>(base, value); + } + else + { + return cscatter_helper<N, stride, T>(base, value, csizeseq_t<N>()); + } +} + +template <size_t N, typename T, size_t... Indices> +KFR_INTRINSIC void cscatter_helper(complex<T>* base, size_t stride, const cvec<T, N>& value, + csizes_t<Indices...>) +{ + swallow{ (cwrite<1>(base + Indices * stride, slice<Indices * 2, 2>(value)), 0)... 
}; +} + +template <size_t N, typename T> +KFR_INTRINSIC void cscatter(complex<T>* base, size_t stride, const cvec<T, N>& value) +{ + return cscatter_helper<N, T>(base, stride, value, csizeseq_t<N>()); +} + +template <size_t groupsize = 1, typename T, size_t N, typename IT> +KFR_INTRINSIC vec<T, N * 2 * groupsize> cgather(const complex<T>* base, const vec<IT, N>& offset) +{ + return internal::gather_helper<2 * groupsize>(ptr_cast<T>(base), offset, csizeseq_t<N>()); +} + +template <size_t groupsize = 1, typename T, size_t N, typename IT> +KFR_INTRINSIC void cscatter(complex<T>* base, const vec<IT, N>& offset, vec<T, N * 2 * groupsize> value) +{ + return internal::scatter_helper<2 * groupsize>(ptr_cast<T>(base), offset, value, csizeseq_t<N>()); +} + +template <typename T> +KFR_INTRINSIC void transpose4x8(const cvec<T, 8>& z0, const cvec<T, 8>& z1, const cvec<T, 8>& z2, + const cvec<T, 8>& z3, cvec<T, 4>& w0, cvec<T, 4>& w1, cvec<T, 4>& w2, + cvec<T, 4>& w3, cvec<T, 4>& w4, cvec<T, 4>& w5, cvec<T, 4>& w6, + cvec<T, 4>& w7) +{ + cvec<T, 16> a = concat(low(z0), low(z1), low(z2), low(z3)); + cvec<T, 16> b = concat(high(z0), high(z1), high(z2), high(z3)); + a = digitreverse4<2>(a); + b = digitreverse4<2>(b); + w0 = part<4, 0>(a); + w1 = part<4, 1>(a); + w2 = part<4, 2>(a); + w3 = part<4, 3>(a); + w4 = part<4, 0>(b); + w5 = part<4, 1>(b); + w6 = part<4, 2>(b); + w7 = part<4, 3>(b); +} + +template <typename T> +KFR_INTRINSIC void transpose4x8(const cvec<T, 4>& w0, const cvec<T, 4>& w1, const cvec<T, 4>& w2, + const cvec<T, 4>& w3, const cvec<T, 4>& w4, const cvec<T, 4>& w5, + const cvec<T, 4>& w6, const cvec<T, 4>& w7, cvec<T, 8>& z0, cvec<T, 8>& z1, + cvec<T, 8>& z2, cvec<T, 8>& z3) +{ + cvec<T, 16> a = concat(w0, w1, w2, w3); + cvec<T, 16> b = concat(w4, w5, w6, w7); + a = digitreverse4<2>(a); + b = digitreverse4<2>(b); + z0 = concat(part<4, 0>(a), part<4, 0>(b)); + z1 = concat(part<4, 1>(a), part<4, 1>(b)); + z2 = concat(part<4, 2>(a), part<4, 2>(b)); + z3 = 
concat(part<4, 3>(a), part<4, 3>(b)); +} + +template <typename T> +KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d) +{ + cvec<T, 4> a0, a1, a2, a3; + cvec<T, 4> b0, b1, b2, b3; + cvec<T, 4> c0, c1, c2, c3; + cvec<T, 4> d0, d1, d2, d3; + + split<T, 32>(a, a0, a1, a2, a3); + split<T, 32>(b, b0, b1, b2, b3); + split<T, 32>(c, c0, c1, c2, c3); + split<T, 32>(d, d0, d1, d2, d3); + + a = concat(a0, b0, c0, d0); + b = concat(a1, b1, c1, d1); + c = concat(a2, b2, c2, d2); + d = concat(a3, b3, c3, d3); +} +template <typename T> +KFR_INTRINSIC void transpose4(cvec<T, 16>& a, cvec<T, 16>& b, cvec<T, 16>& c, cvec<T, 16>& d, cvec<T, 16>& aa, + cvec<T, 16>& bb, cvec<T, 16>& cc, cvec<T, 16>& dd) +{ + cvec<T, 4> a0, a1, a2, a3; + cvec<T, 4> b0, b1, b2, b3; + cvec<T, 4> c0, c1, c2, c3; + cvec<T, 4> d0, d1, d2, d3; + + split<T, 32>(a, a0, a1, a2, a3); + split<T, 32>(b, b0, b1, b2, b3); + split<T, 32>(c, c0, c1, c2, c3); + split<T, 32>(d, d0, d1, d2, d3); + + aa = concat(a0, b0, c0, d0); + bb = concat(a1, b1, c1, d1); + cc = concat(a2, b2, c2, d2); + dd = concat(a3, b3, c3, d3); +} + +template <bool b, typename T> +constexpr KFR_INTRINSIC T chsign(T x) +{ + return b ? -x : x; +} + +template <typename T, size_t N, size_t size, size_t start, size_t step, bool inverse = false, + size_t... indices> +constexpr KFR_INTRINSIC cvec<T, N> get_fixed_twiddle_helper(csizes_t<indices...>) +{ + return make_vector((indices & 1 ? chsign<inverse>(-sin_using_table<T>(size, (indices / 2 * step + start))) + : cos_using_table<T>(size, (indices / 2 * step + start)))...); +} + +template <typename T, size_t width, size_t... indices> +constexpr KFR_INTRINSIC cvec<T, width> get_fixed_twiddle_helper(csizes_t<indices...>, size_t size, + size_t start, size_t step) +{ + return make_vector((indices & 1 ? 
-sin_using_table<T>(size, indices / 2 * step + start) + : cos_using_table<T>(size, indices / 2 * step + start))...); +} + +template <typename T, size_t width, size_t size, size_t start, size_t step = 0, bool inverse = false> +constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle() +{ + return get_fixed_twiddle_helper<T, width, size, start, step, inverse>(csizeseq_t<width * 2>()); +} + +template <typename T, size_t width> +constexpr KFR_INTRINSIC cvec<T, width> fixed_twiddle(size_t size, size_t start, size_t step = 0) +{ + return get_fixed_twiddle_helper<T, width>(csizeseq_t<width * 2>(), start, step, size); +} + +// template <typename T, size_t N, size_t size, size_t start, size_t step = 0, bool inverse = false> +// constexpr cvec<T, N> fixed_twiddle = get_fixed_twiddle<T, N, size, start, step, inverse>(); + +template <typename T, size_t N, bool inverse> +constexpr static inline cvec<T, N> twiddleimagmask() +{ + return inverse ? broadcast<N * 2, T>(-1, +1) : broadcast<N * 2, T>(+1, -1); +} + +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wconversion") + +CMT_PRAGMA_GNU(GCC diagnostic pop) + +template <typename T, size_t N> +CMT_NOINLINE static vec<T, N> cossin_conj(const vec<T, N>& x) +{ + return negodd(cossin(x)); +} + +template <size_t k, size_t size, bool inverse = false, typename T, size_t width, + size_t kk = (inverse ? 
size - k : k) % size> +KFR_INTRINSIC vec<T, width> cmul_by_twiddle(const vec<T, width>& x) +{ + constexpr T isqrt2 = static_cast<T>(0.70710678118654752440084436210485); + if constexpr (kk == 0) + { + return x; + } + else if constexpr (kk == size * 1 / 8) + { + return swap<2>(subadd(swap<2>(x), x)) * isqrt2; + } + else if constexpr (kk == size * 2 / 8) + { + return negodd(swap<2>(x)); + } + else if constexpr (kk == size * 3 / 8) + { + return subadd(x, swap<2>(x)) * -isqrt2; + } + else if constexpr (kk == size * 4 / 8) + { + return -x; + } + else if constexpr (kk == size * 5 / 8) + { + return swap<2>(subadd(swap<2>(x), x)) * -isqrt2; + } + else if constexpr (kk == size * 6 / 8) + { + return swap<2>(negodd(x)); + } + else if constexpr (kk == size * 7 / 8) + { + return subadd(x, swap<2>(x)) * isqrt2; + } + else + { + return cmul(x, resize<width>(fixed_twiddle<T, 1, size, kk>())); + } +} + +template <size_t N, typename T> +KFR_INTRINSIC void butterfly2(const cvec<T, N>& a0, const cvec<T, N>& a1, cvec<T, N>& w0, cvec<T, N>& w1) +{ + const cvec<T, N> sum = a0 + a1; + const cvec<T, N> dif = a0 - a1; + w0 = sum; + w1 = dif; +} + +template <size_t N, typename T> +KFR_INTRINSIC void butterfly2(cvec<T, N>& a0, cvec<T, N>& a1) +{ + butterfly2<N>(a0, a1, a0, a1); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly4(cfalse_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, + const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3) +{ + cvec<T, N> sum02, sum13, diff02, diff13; + cvec<T, N * 2> a01, a23, sum0213, diff0213; + + a01 = concat(a0, a1); + a23 = concat(a2, a3); + sum0213 = a01 + a23; + diff0213 = a01 - a23; + + sum02 = low(sum0213); + sum13 = high(sum0213); + diff02 = low(diff0213); + diff13 = high(diff0213); + w0 = sum02 + sum13; + w2 = sum02 - sum13; + if constexpr (inverse) + { + diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T())); + diff13 = swap<2>(diff13); + } + 
else + { + diff13 = swap<2>(diff13); + diff13 = (diff13 ^ broadcast<N * 2, T>(T(), -T())); + } + + w1 = diff02 + diff13; + w3 = diff02 - diff13; +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly4(ctrue_t /*split_format*/, const cvec<T, N>& a0, const cvec<T, N>& a1, + const cvec<T, N>& a2, const cvec<T, N>& a3, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3) +{ + vec<T, N> re0, im0, re1, im1, re2, im2, re3, im3; + vec<T, N> wre0, wim0, wre1, wim1, wre2, wim2, wre3, wim3; + + cvec<T, N> sum02, sum13, diff02, diff13; + vec<T, N> sum02re, sum13re, diff02re, diff13re; + vec<T, N> sum02im, sum13im, diff02im, diff13im; + + sum02 = a0 + a2; + sum13 = a1 + a3; + + w0 = sum02 + sum13; + w2 = sum02 - sum13; + + diff02 = a0 - a2; + diff13 = a1 - a3; + split(diff02, diff02re, diff02im); + split(diff13, diff13re, diff13im); + + (inverse ? w3 : w1) = concat(diff02re + diff13im, diff02im - diff13re); + (inverse ? w1 : w3) = concat(diff02re - diff13im, diff02im + diff13re); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly8(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, cvec<T, N>& w0, cvec<T, N>& w1, + cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, + cvec<T, N>& w7) +{ + cvec<T, N> b0 = a0, b2 = a2, b4 = a4, b6 = a6; + butterfly4<N, inverse>(cfalse, b0, b2, b4, b6, b0, b2, b4, b6); + cvec<T, N> b1 = a1, b3 = a3, b5 = a5, b7 = a7; + butterfly4<N, inverse>(cfalse, b1, b3, b5, b7, b1, b3, b5, b7); + w0 = b0 + b1; + w4 = b0 - b1; + + b3 = cmul_by_twiddle<1, 8, inverse>(b3); + b5 = cmul_by_twiddle<2, 8, inverse>(b5); + b7 = cmul_by_twiddle<3, 8, inverse>(b7); + + w1 = b2 + b3; + w5 = b2 - b3; + w2 = b4 + b5; + w6 = b4 - b5; + w3 = b6 + b7; + w7 = b6 - b7; +} + +template <size_t N, bool inverse = false, typename T> 
+KFR_INTRINSIC void butterfly8(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7) +{ + butterfly8<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a0, a1, a2, a3, a4, a5, a6, a7); +} + +template <bool inverse = false, typename T> +KFR_INTRINSIC void butterfly8(cvec<T, 2>& a01, cvec<T, 2>& a23, cvec<T, 2>& a45, cvec<T, 2>& a67) +{ + cvec<T, 2> b01 = a01, b23 = a23, b45 = a45, b67 = a67; + + butterfly4<2, inverse>(cfalse, b01, b23, b45, b67, b01, b23, b45, b67); + + cvec<T, 2> b02, b13, b46, b57; + + cvec<T, 8> b01234567 = concat(b01, b23, b45, b67); + cvec<T, 8> b02461357 = concat(even<2>(b01234567), odd<2>(b01234567)); + split<T, 16>(b02461357, b02, b46, b13, b57); + + b13 = cmul(b13, fixed_twiddle<T, 2, 8, 0, 1, inverse>()); + b57 = cmul(b57, fixed_twiddle<T, 2, 8, 2, 1, inverse>()); + a01 = b02 + b13; + a23 = b46 + b57; + a45 = b02 - b13; + a67 = b46 - b57; +} + +template <bool inverse = false, typename T> +KFR_INTRINSIC void butterfly8_packed(cvec<T, 8>& v8) +{ + cvec<T, 2> w0, w1, w2, w3; + split<T, 16>(v8, w0, w1, w2, w3); + butterfly8<inverse>(w0, w1, w2, w3); + v8 = concat(w0, w1, w2, w3); +} + +template <bool inverse = false, typename T> +KFR_INTRINSIC void butterfly32_packed(cvec<T, 32>& v32) +{ + cvec<T, 4> w0, w1, w2, w3, w4, w5, w6, w7; + split(v32, w0, w1, w2, w3, w4, w5, w6, w7); + butterfly8<4, inverse>(w0, w1, w2, w3, w4, w5, w6, w7); + + w1 = cmul(w1, fixed_twiddle<T, 4, 32, 0, 1, inverse>()); + w2 = cmul(w2, fixed_twiddle<T, 4, 32, 0, 2, inverse>()); + w3 = cmul(w3, fixed_twiddle<T, 4, 32, 0, 3, inverse>()); + w4 = cmul(w4, fixed_twiddle<T, 4, 32, 0, 4, inverse>()); + w5 = cmul(w5, fixed_twiddle<T, 4, 32, 0, 5, inverse>()); + w6 = cmul(w6, fixed_twiddle<T, 4, 32, 0, 6, inverse>()); + w7 = cmul(w7, fixed_twiddle<T, 4, 32, 0, 7, inverse>()); + + cvec<T, 8> z0, z1, z2, z3; + transpose4x8(w0, w1, w2, w3, w4, w5, w6, w7, z0, z1, z2, z3); + + butterfly4<8, inverse>(cfalse, z0, z1, 
z2, z3, z0, z1, z2, z3); + v32 = concat(z0, z1, z2, z3); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly4_packed(cvec<T, N * 4>& a0123) +{ + cvec<T, N> a0; + cvec<T, N> a1; + cvec<T, N> a2; + cvec<T, N> a3; + split<T, N * 4 * 2>(a0123, a0, a1, a2, a3); + butterfly4<N, inverse>(cfalse, a0, a1, a2, a3, a0, a1, a2, a3); + a0123 = concat(a0, a1, a2, a3); +} + +template <size_t N, typename T> +KFR_INTRINSIC void butterfly2_packed(cvec<T, N * 2>& a01) +{ + cvec<T, N> a0; + cvec<T, N> a1; + split(a01, a0, a1); + butterfly2<N>(a0, a1); + a01 = concat(a0, a1); +} + +template <size_t N, bool inverse = false, bool split_format = false, typename T> +KFR_INTRINSIC void apply_twiddle(const cvec<T, N>& a1, const cvec<T, N>& tw1, cvec<T, N>& w1) +{ + if constexpr (split_format) + { + vec<T, N> re1, im1, tw1re, tw1im; + split<T, 2 * N>(a1, re1, im1); + split<T, 2 * N>(tw1, tw1re, tw1im); + vec<T, N> b1re = re1 * tw1re; + vec<T, N> b1im = im1 * tw1re; + if constexpr (inverse) + w1 = concat(b1re + im1 * tw1im, b1im - re1 * tw1im); + else + w1 = concat(b1re - im1 * tw1im, b1im + re1 * tw1im); + } + else + { + const cvec<T, N> b1 = a1 * dupeven(tw1); + const cvec<T, N> a1_ = swap<2>(a1); + + cvec<T, N> tw1_ = tw1; + if constexpr (inverse) + tw1_ = -(tw1_); + w1 = subadd(b1, a1_ * dupodd(tw1_)); + } +} + +template <size_t N, bool inverse = false, bool split_format = false, typename T> +KFR_INTRINSIC void apply_twiddles4(const cvec<T, N>& a1, const cvec<T, N>& a2, const cvec<T, N>& a3, + const cvec<T, N>& tw1, const cvec<T, N>& tw2, const cvec<T, N>& tw3, + cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3) +{ + apply_twiddle<N, inverse, split_format>(a1, tw1, w1); + apply_twiddle<N, inverse, split_format>(a2, tw2, w2); + apply_twiddle<N, inverse, split_format>(a3, tw3, w3); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict 
a3, const cvec<T, N>& tw1, const cvec<T, N>& tw2, + const cvec<T, N>& tw3) +{ + apply_twiddles4<N, inverse>(a1, a2, a3, tw1, tw2, tw3, a1, a2, a3); +} + +template <size_t N, bool inverse = false, typename T, typename = u8[N - 1]> +KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict a3, const cvec<T, 1>& tw1, const cvec<T, 1>& tw2, + const cvec<T, 1>& tw3) +{ + apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); +} + +template <size_t N, bool inverse = false, typename T, typename = u8[N - 2]> +KFR_INTRINSIC void apply_twiddles4(cvec<T, N>& __restrict a1, cvec<T, N>& __restrict a2, + cvec<T, N>& __restrict a3, cvec<T, N / 2> tw1, cvec<T, N / 2> tw2, + cvec<T, N / 2> tw3) +{ + apply_twiddles4<N, inverse>(a1, a2, a3, resize<N * 2>(tw1), resize<N * 2>(tw2), resize<N * 2>(tw3)); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void apply_vertical_twiddles4(cvec<T, N * 4>& b, cvec<T, N * 4>& c, cvec<T, N * 4>& d) +{ + cvec<T, 4> b0, b1, b2, b3; + cvec<T, 4> c0, c1, c2, c3; + cvec<T, 4> d0, d1, d2, d3; + + split(b, b0, b1, b2, b3); + split(c, c0, c1, c2, c3); + split(d, d0, d1, d2, d3); + + b1 = cmul_by_twiddle<4, 64, inverse>(b1); + b2 = cmul_by_twiddle<8, 64, inverse>(b2); + b3 = cmul_by_twiddle<12, 64, inverse>(b3); + + c1 = cmul_by_twiddle<8, 64, inverse>(c1); + c2 = cmul_by_twiddle<16, 64, inverse>(c2); + c3 = cmul_by_twiddle<24, 64, inverse>(c3); + + d1 = cmul_by_twiddle<12, 64, inverse>(d1); + d2 = cmul_by_twiddle<24, 64, inverse>(d2); + d3 = cmul_by_twiddle<36, 64, inverse>(d3); + + b = concat(b0, b1, b2, b3); + c = concat(c0, c1, c2, c3); + d = concat(d0, d1, d2, d3); +} + +template <size_t n2, size_t nnstep, size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void apply_twiddles4(cvec<T, N * 4>& __restrict a0123) +{ + cvec<T, N> a0; + cvec<T, N> a1; + cvec<T, N> a2; + cvec<T, N> a3; + split<T, 2 * N * 4>(a0123, a0, a1, 
a2, a3); + + cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1, inverse>(), + tw2 = fixed_twiddle<T, N, 64, n2 * nnstep * 2, nnstep * 2, inverse>(), + tw3 = fixed_twiddle<T, N, 64, n2 * nnstep * 3, nnstep * 3, inverse>(); + + apply_twiddles4<N>(a1, a2, a3, tw1, tw2, tw3); + + a0123 = concat(a0, a1, a2, a3); +} + +template <bool inverse, bool aligned, typename T> +KFR_INTRINSIC void butterfly64_memory(cbool_t<inverse>, cbool_t<aligned>, complex<T>* out, + const complex<T>* in) +{ + cvec<T, 16> w0, w1, w2, w3; + + w0 = cread_group<4, 4, aligned>( + in, 16); // concat(cread<4>(in + 0), cread<4>(in + 16), cread<4>(in + 32), cread<4>(in + 48)); + butterfly4_packed<4, inverse>(w0); + apply_twiddles4<0, 1, 4, inverse>(w0); + + w1 = cread_group<4, 4, aligned>( + in + 4, 16); // concat(cread<4>(in + 4), cread<4>(in + 20), cread<4>(in + 36), cread<4>(in + 52)); + butterfly4_packed<4, inverse>(w1); + apply_twiddles4<4, 1, 4, inverse>(w1); + + w2 = cread_group<4, 4, aligned>( + in + 8, 16); // concat(cread<4>(in + 8), cread<4>(in + 24), cread<4>(in + 40), cread<4>(in + 56)); + butterfly4_packed<4, inverse>(w2); + apply_twiddles4<8, 1, 4, inverse>(w2); + + w3 = cread_group<4, 4, aligned>( + in + 12, 16); // concat(cread<4>(in + 12), cread<4>(in + 28), cread<4>(in + 44), cread<4>(in + 60)); + butterfly4_packed<4, inverse>(w3); + apply_twiddles4<12, 1, 4, inverse>(w3); + + transpose4(w0, w1, w2, w3); + // pass 2: + + butterfly4_packed<4, inverse>(w0); + butterfly4_packed<4, inverse>(w1); + butterfly4_packed<4, inverse>(w2); + butterfly4_packed<4, inverse>(w3); + + transpose4(w0, w1, w2, w3); + + w0 = digitreverse4<2>(w0); + w1 = digitreverse4<2>(w1); + w2 = digitreverse4<2>(w2); + w3 = digitreverse4<2>(w3); + + apply_vertical_twiddles4<4, inverse>(w1, w2, w3); + + // pass 3: + butterfly4_packed<4, inverse>(w3); + cwrite_group<4, 4, aligned>(out + 12, 16, w3); // split(w3, out[3], out[7], out[11], out[15]); + + butterfly4_packed<4, inverse>(w2); + 
cwrite_group<4, 4, aligned>(out + 8, 16, w2); // split(w2, out[2], out[6], out[10], out[14]); + + butterfly4_packed<4, inverse>(w1); + cwrite_group<4, 4, aligned>(out + 4, 16, w1); // split(w1, out[1], out[5], out[9], out[13]); + + butterfly4_packed<4, inverse>(w0); + cwrite_group<4, 4, aligned>(out, 16, w0); // split(w0, out[0], out[4], out[8], out[12]); +} + +template <bool inverse = false, typename T> +KFR_INTRINSIC void butterfly16_packed(cvec<T, 16>& v16) +{ + butterfly4_packed<4, inverse>(v16); + apply_twiddles4<0, 4, 4, inverse>(v16); + v16 = digitreverse4<2>(v16); + butterfly4_packed<4, inverse>(v16); +} + +template <size_t index, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly16_multi_natural(complex<T>* out, const complex<T>* in) +{ + constexpr size_t N = 4; + + cvec<T, 4> a1 = cread<4>(in + index * 4 + 16 * 1); + cvec<T, 4> a5 = cread<4>(in + index * 4 + 16 * 5); + cvec<T, 4> a9 = cread<4>(in + index * 4 + 16 * 9); + cvec<T, 4> a13 = cread<4>(in + index * 4 + 16 * 13); + butterfly4<N, inverse>(cfalse, a1, a5, a9, a13, a1, a5, a9, a13); + a5 = cmul_by_twiddle<1, 16, inverse>(a5); + a9 = cmul_by_twiddle<2, 16, inverse>(a9); + a13 = cmul_by_twiddle<3, 16, inverse>(a13); + + cvec<T, 4> a2 = cread<4>(in + index * 4 + 16 * 2); + cvec<T, 4> a6 = cread<4>(in + index * 4 + 16 * 6); + cvec<T, 4> a10 = cread<4>(in + index * 4 + 16 * 10); + cvec<T, 4> a14 = cread<4>(in + index * 4 + 16 * 14); + butterfly4<N, inverse>(cfalse, a2, a6, a10, a14, a2, a6, a10, a14); + a6 = cmul_by_twiddle<2, 16, inverse>(a6); + a10 = cmul_by_twiddle<4, 16, inverse>(a10); + a14 = cmul_by_twiddle<6, 16, inverse>(a14); + + cvec<T, 4> a3 = cread<4>(in + index * 4 + 16 * 3); + cvec<T, 4> a7 = cread<4>(in + index * 4 + 16 * 7); + cvec<T, 4> a11 = cread<4>(in + index * 4 + 16 * 11); + cvec<T, 4> a15 = cread<4>(in + index * 4 + 16 * 15); + butterfly4<N, inverse>(cfalse, a3, a7, a11, a15, a3, a7, a11, a15); + a7 = cmul_by_twiddle<3, 16, inverse>(a7); + a11 = cmul_by_twiddle<6, 16, 
inverse>(a11); + a15 = cmul_by_twiddle<9, 16, inverse>(a15); + + cvec<T, 4> a0 = cread<4>(in + index * 4 + 16 * 0); + cvec<T, 4> a4 = cread<4>(in + index * 4 + 16 * 4); + cvec<T, 4> a8 = cread<4>(in + index * 4 + 16 * 8); + cvec<T, 4> a12 = cread<4>(in + index * 4 + 16 * 12); + butterfly4<N, inverse>(cfalse, a0, a4, a8, a12, a0, a4, a8, a12); + butterfly4<N, inverse>(cfalse, a0, a1, a2, a3, a0, a1, a2, a3); + cwrite<4>(out + index * 4 + 16 * 0, a0); + cwrite<4>(out + index * 4 + 16 * 4, a1); + cwrite<4>(out + index * 4 + 16 * 8, a2); + cwrite<4>(out + index * 4 + 16 * 12, a3); + butterfly4<N, inverse>(cfalse, a4, a5, a6, a7, a4, a5, a6, a7); + cwrite<4>(out + index * 4 + 16 * 1, a4); + cwrite<4>(out + index * 4 + 16 * 5, a5); + cwrite<4>(out + index * 4 + 16 * 9, a6); + cwrite<4>(out + index * 4 + 16 * 13, a7); + butterfly4<N, inverse>(cfalse, a8, a9, a10, a11, a8, a9, a10, a11); + cwrite<4>(out + index * 4 + 16 * 2, a8); + cwrite<4>(out + index * 4 + 16 * 6, a9); + cwrite<4>(out + index * 4 + 16 * 10, a10); + cwrite<4>(out + index * 4 + 16 * 14, a11); + butterfly4<N, inverse>(cfalse, a12, a13, a14, a15, a12, a13, a14, a15); + cwrite<4>(out + index * 4 + 16 * 3, a12); + cwrite<4>(out + index * 4 + 16 * 7, a13); + cwrite<4>(out + index * 4 + 16 * 11, a14); + cwrite<4>(out + index * 4 + 16 * 15, a15); +} + +template <size_t index, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly16_multi_flip(complex<T>* out, const complex<T>* in) +{ + constexpr size_t N = 4; + + cvec<T, 4> a1 = cread<4>(in + index * 4 + 16 * 1); + cvec<T, 4> a5 = cread<4>(in + index * 4 + 16 * 5); + cvec<T, 4> a9 = cread<4>(in + index * 4 + 16 * 9); + cvec<T, 4> a13 = cread<4>(in + index * 4 + 16 * 13); + butterfly4<N, inverse>(cfalse, a1, a5, a9, a13, a1, a5, a9, a13); + a5 = cmul_by_twiddle<1, 16, inverse>(a5); + a9 = cmul_by_twiddle<2, 16, inverse>(a9); + a13 = cmul_by_twiddle<3, 16, inverse>(a13); + + cvec<T, 4> a2 = cread<4>(in + index * 4 + 16 * 2); + cvec<T, 4> a6 = cread<4>(in 
+ index * 4 + 16 * 6); + cvec<T, 4> a10 = cread<4>(in + index * 4 + 16 * 10); + cvec<T, 4> a14 = cread<4>(in + index * 4 + 16 * 14); + butterfly4<N, inverse>(cfalse, a2, a6, a10, a14, a2, a6, a10, a14); + a6 = cmul_by_twiddle<2, 16, inverse>(a6); + a10 = cmul_by_twiddle<4, 16, inverse>(a10); + a14 = cmul_by_twiddle<6, 16, inverse>(a14); + + cvec<T, 4> a3 = cread<4>(in + index * 4 + 16 * 3); + cvec<T, 4> a7 = cread<4>(in + index * 4 + 16 * 7); + cvec<T, 4> a11 = cread<4>(in + index * 4 + 16 * 11); + cvec<T, 4> a15 = cread<4>(in + index * 4 + 16 * 15); + butterfly4<N, inverse>(cfalse, a3, a7, a11, a15, a3, a7, a11, a15); + a7 = cmul_by_twiddle<3, 16, inverse>(a7); + a11 = cmul_by_twiddle<6, 16, inverse>(a11); + a15 = cmul_by_twiddle<9, 16, inverse>(a15); + + cvec<T, 16> w1 = concat(a1, a5, a9, a13); + cvec<T, 16> w2 = concat(a2, a6, a10, a14); + cvec<T, 16> w3 = concat(a3, a7, a11, a15); + + cvec<T, 4> a0 = cread<4>(in + index * 4 + 16 * 0); + cvec<T, 4> a4 = cread<4>(in + index * 4 + 16 * 4); + cvec<T, 4> a8 = cread<4>(in + index * 4 + 16 * 8); + cvec<T, 4> a12 = cread<4>(in + index * 4 + 16 * 12); + butterfly4<N, inverse>(cfalse, a0, a4, a8, a12, a0, a4, a8, a12); + cvec<T, 16> w0 = concat(a0, a4, a8, a12); + + butterfly4<N * 4, inverse>(cfalse, w0, w1, w2, w3, w0, w1, w2, w3); + + w0 = digitreverse4<2>(w0); + w1 = digitreverse4<2>(w1); + w2 = digitreverse4<2>(w2); + w3 = digitreverse4<2>(w3); + + transpose4(w0, w1, w2, w3); + cwrite<16>(out + index * 64 + 16 * 0, cmul(w0, fixed_twiddle<T, 16, 256, 0, index * 4 + 0, inverse>())); + cwrite<16>(out + index * 64 + 16 * 1, cmul(w1, fixed_twiddle<T, 16, 256, 0, index * 4 + 1, inverse>())); + cwrite<16>(out + index * 64 + 16 * 2, cmul(w2, fixed_twiddle<T, 16, 256, 0, index * 4 + 2, inverse>())); + cwrite<16>(out + index * 64 + 16 * 3, cmul(w3, fixed_twiddle<T, 16, 256, 0, index * 4 + 3, inverse>())); +} + +template <size_t n2, size_t nnstep, size_t N, typename T> +KFR_INTRINSIC void apply_twiddles2(cvec<T, N>& a1) +{ + 
cvec<T, N> tw1 = fixed_twiddle<T, N, 64, n2 * nnstep * 1, nnstep * 1>(); + + a1 = cmul(a1, tw1); +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw3r1() +{ + return static_cast<T>(-0.5 - 1.0); +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw3i1() +{ + return static_cast<T>(0.86602540378443864676372317075) * twiddleimagmask<T, N, inverse>(); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly3(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N>& w00, + cvec<T, N>& w01, cvec<T, N>& w02) +{ + + const cvec<T, N> sum1 = a01 + a02; + const cvec<T, N> dif1 = swap<2>(a01 - a02); + w00 = a00 + sum1; + + const cvec<T, N> s1 = w00 + sum1 * tw3r1<T, N, inverse>(); + + const cvec<T, N> d1 = dif1 * tw3i1<T, N, inverse>(); + + w01 = s1 + d1; + w02 = s1 - d1; +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly3(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2) +{ + butterfly3<N, inverse>(a0, a1, a2, a0, a1, a2); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly6(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, + cvec<T, N>& w5) +{ + cvec<T, N * 2> a03 = concat(a0, a3); + cvec<T, N * 2> a25 = concat(a2, a5); + cvec<T, N * 2> a41 = concat(a4, a1); + butterfly3<N * 2, inverse>(a03, a25, a41, a03, a25, a41); + cvec<T, N> t0, t1, t2, t3, t4, t5; + split(a03, t0, t1); + split(a25, t2, t3); + split(a41, t4, t5); + t3 = -t3; + cvec<T, N * 2> a04 = concat(t0, t4); + cvec<T, N * 2> a15 = concat(t1, t5); + cvec<T, N * 2> w02, w35; + butterfly2<N * 2>(a04, a15, w02, w35); + split(w02, w0, w2); + split(w35, w3, w5); + + butterfly2<N>(t2, t3, w1, w4); +} + +template <size_t N, bool inverse = false, typename T> 
+KFR_INTRINSIC void butterfly6(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5) +{ + butterfly6<N, inverse>(a0, a1, a2, a3, a4, a5, a0, a1, a2, a3, a4, a5); +} + +template <typename T, bool inverse = false> +static constexpr KFR_INTRINSIC cvec<T, 1> tw9_1() +{ + return { T(0.76604444311897803520239265055541), + (inverse ? -1 : 1) * T(-0.64278760968653932632264340990727) }; +} +template <typename T, bool inverse = false> +static constexpr KFR_INTRINSIC cvec<T, 1> tw9_2() +{ + return { T(0.17364817766693034885171662676931), + (inverse ? -1 : 1) * T(-0.98480775301220805936674302458952) }; +} +template <typename T, bool inverse = false> +static constexpr KFR_INTRINSIC cvec<T, 1> tw9_4() +{ + return { T(-0.93969262078590838405410927732473), + (inverse ? -1 : 1) * T(-0.34202014332566873304409961468226) }; +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly9(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, + cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, cvec<T, N>& w3, cvec<T, N>& w4, + cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, cvec<T, N>& w8) +{ + cvec<T, N * 3> a012 = concat(a0, a1, a2); + cvec<T, N * 3> a345 = concat(a3, a4, a5); + cvec<T, N * 3> a678 = concat(a6, a7, a8); + butterfly3<N * 3, inverse>(a012, a345, a678, a012, a345, a678); + cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8; + split(a012, t0, t1, t2); + split(a345, t3, t4, t5); + split(a678, t6, t7, t8); + + t4 = cmul(t4, tw9_1<T, inverse>()); + t5 = cmul(t5, tw9_2<T, inverse>()); + t7 = cmul(t7, tw9_2<T, inverse>()); + t8 = cmul(t8, tw9_4<T, inverse>()); + + cvec<T, N * 3> t036 = concat(t0, t3, t6); + cvec<T, N * 3> t147 = concat(t1, t4, t7); + cvec<T, N * 3> t258 = concat(t2, t5, t8); + + butterfly3<N * 3, inverse>(t036, t147, t258, t036, t147, t258); + 
split(t036, w0, w1, w2); + split(t147, w3, w4, w5); + split(t258, w6, w7, w8); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly9(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6, cvec<T, N>& a7, cvec<T, N>& a8) +{ + butterfly9<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a7, a8, a0, a1, a2, a3, a4, a5, a6, a7, a8); +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw7r1() +{ + return static_cast<T>(0.623489801858733530525004884 - 1.0); +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw7i1() +{ + return static_cast<T>(0.78183148246802980870844452667) * twiddleimagmask<T, N, inverse>(); +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw7r2() +{ + return static_cast<T>(-0.2225209339563144042889025645 - 1.0); +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw7i2() +{ + return static_cast<T>(0.97492791218182360701813168299) * twiddleimagmask<T, N, inverse>(); +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw7r3() +{ + return static_cast<T>(-0.90096886790241912623610231951 - 1.0); +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw7i3() +{ + return static_cast<T>(0.43388373911755812047576833285) * twiddleimagmask<T, N, inverse>(); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly7(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, + cvec<T, N> a05, cvec<T, N> a06, cvec<T, N>& w00, cvec<T, N>& w01, + cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, + cvec<T, N>& w06) +{ + const cvec<T, N> sum1 = a01 + a06; + const cvec<T, N> dif1 = swap<2>(a01 - a06); + const cvec<T, N> sum2 = a02 + a05; + const cvec<T, N> dif2 = 
swap<2>(a02 - a05); + const cvec<T, N> sum3 = a03 + a04; + const cvec<T, N> dif3 = swap<2>(a03 - a04); + w00 = a00 + sum1 + sum2 + sum3; + + const cvec<T, N> s1 = + w00 + sum1 * tw7r1<T, N, inverse>() + sum2 * tw7r2<T, N, inverse>() + sum3 * tw7r3<T, N, inverse>(); + const cvec<T, N> s2 = + w00 + sum1 * tw7r2<T, N, inverse>() + sum2 * tw7r3<T, N, inverse>() + sum3 * tw7r1<T, N, inverse>(); + const cvec<T, N> s3 = + w00 + sum1 * tw7r3<T, N, inverse>() + sum2 * tw7r1<T, N, inverse>() + sum3 * tw7r2<T, N, inverse>(); + + const cvec<T, N> d1 = + dif1 * tw7i1<T, N, inverse>() + dif2 * tw7i2<T, N, inverse>() + dif3 * tw7i3<T, N, inverse>(); + const cvec<T, N> d2 = + dif1 * tw7i2<T, N, inverse>() - dif2 * tw7i3<T, N, inverse>() - dif3 * tw7i1<T, N, inverse>(); + const cvec<T, N> d3 = + dif1 * tw7i3<T, N, inverse>() - dif2 * tw7i1<T, N, inverse>() + dif3 * tw7i2<T, N, inverse>(); + + w01 = s1 + d1; + w06 = s1 - d1; + w02 = s2 + d2; + w05 = s2 - d2; + w03 = s3 + d3; + w04 = s3 - d3; +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly7(cvec<T, N>& a0, cvec<T, N>& a1, cvec<T, N>& a2, cvec<T, N>& a3, cvec<T, N>& a4, + cvec<T, N>& a5, cvec<T, N>& a6) +{ + butterfly7<N, inverse>(a0, a1, a2, a3, a4, a5, a6, a0, a1, a2, a3, a4, a5, a6); +} + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r1 = static_cast<T>(0.84125353283118116886181164892 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i1 = + static_cast<T>(0.54064081745559758210763595432) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r2 = static_cast<T>(0.41541501300188642552927414923 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i2 = + static_cast<T>(0.90963199535451837141171538308) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r3 = 
static_cast<T>(-0.14231483827328514044379266862 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i3 = + static_cast<T>(0.98982144188093273237609203778) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r4 = static_cast<T>(-0.65486073394528506405692507247 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i4 = + static_cast<T>(0.75574957435425828377403584397) * twiddleimagmask<T, N, inverse>(); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11r5 = static_cast<T>(-0.95949297361449738989036805707 - 1.0); + +template <typename T, size_t N, bool inverse> +static const cvec<T, N> tw11i5 = + static_cast<T>(0.28173255684142969771141791535) * twiddleimagmask<T, N, inverse>(); + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly11(cvec<T, N> a00, cvec<T, N> a01, cvec<T, N> a02, cvec<T, N> a03, cvec<T, N> a04, + cvec<T, N> a05, cvec<T, N> a06, cvec<T, N> a07, cvec<T, N> a08, cvec<T, N> a09, + cvec<T, N> a10, cvec<T, N>& w00, cvec<T, N>& w01, cvec<T, N>& w02, + cvec<T, N>& w03, cvec<T, N>& w04, cvec<T, N>& w05, cvec<T, N>& w06, + cvec<T, N>& w07, cvec<T, N>& w08, cvec<T, N>& w09, cvec<T, N>& w10) +{ + const cvec<T, N> sum1 = a01 + a10; + const cvec<T, N> dif1 = swap<2>(a01 - a10); + const cvec<T, N> sum2 = a02 + a09; + const cvec<T, N> dif2 = swap<2>(a02 - a09); + const cvec<T, N> sum3 = a03 + a08; + const cvec<T, N> dif3 = swap<2>(a03 - a08); + const cvec<T, N> sum4 = a04 + a07; + const cvec<T, N> dif4 = swap<2>(a04 - a07); + const cvec<T, N> sum5 = a05 + a06; + const cvec<T, N> dif5 = swap<2>(a05 - a06); + w00 = a00 + sum1 + sum2 + sum3 + sum4 + sum5; + + const cvec<T, N> s1 = w00 + sum1 * tw11r1<T, N, inverse> + sum2 * tw11r2<T, N, inverse> + + sum3 * tw11r3<T, N, inverse> + sum4 * tw11r4<T, N, inverse> + + sum5 * tw11r5<T, N, inverse>; + const cvec<T, N> s2 = w00 + sum1 * 
tw11r2<T, N, inverse> + sum2 * tw11r3<T, N, inverse> + + sum3 * tw11r4<T, N, inverse> + sum4 * tw11r5<T, N, inverse> + + sum5 * tw11r1<T, N, inverse>; + const cvec<T, N> s3 = w00 + sum1 * tw11r3<T, N, inverse> + sum2 * tw11r4<T, N, inverse> + + sum3 * tw11r5<T, N, inverse> + sum4 * tw11r1<T, N, inverse> + + sum5 * tw11r2<T, N, inverse>; + const cvec<T, N> s4 = w00 + sum1 * tw11r4<T, N, inverse> + sum2 * tw11r5<T, N, inverse> + + sum3 * tw11r1<T, N, inverse> + sum4 * tw11r2<T, N, inverse> + + sum5 * tw11r3<T, N, inverse>; + const cvec<T, N> s5 = w00 + sum1 * tw11r5<T, N, inverse> + sum2 * tw11r1<T, N, inverse> + + sum3 * tw11r2<T, N, inverse> + sum4 * tw11r3<T, N, inverse> + + sum5 * tw11r4<T, N, inverse>; + + const cvec<T, N> d1 = dif1 * tw11i1<T, N, inverse> + dif2 * tw11i2<T, N, inverse> + + dif3 * tw11i3<T, N, inverse> + dif4 * tw11i4<T, N, inverse> + + dif5 * tw11i5<T, N, inverse>; + const cvec<T, N> d2 = dif1 * tw11i2<T, N, inverse> - dif2 * tw11i3<T, N, inverse> - + dif3 * tw11i4<T, N, inverse> - dif4 * tw11i5<T, N, inverse> - + dif5 * tw11i1<T, N, inverse>; + const cvec<T, N> d3 = dif1 * tw11i3<T, N, inverse> - dif2 * tw11i4<T, N, inverse> + + dif3 * tw11i5<T, N, inverse> + dif4 * tw11i1<T, N, inverse> + + dif5 * tw11i2<T, N, inverse>; + const cvec<T, N> d4 = dif1 * tw11i4<T, N, inverse> - dif2 * tw11i5<T, N, inverse> + + dif3 * tw11i1<T, N, inverse> - dif4 * tw11i2<T, N, inverse> - + dif5 * tw11i3<T, N, inverse>; + const cvec<T, N> d5 = dif1 * tw11i5<T, N, inverse> - dif2 * tw11i1<T, N, inverse> + + dif3 * tw11i2<T, N, inverse> - dif4 * tw11i3<T, N, inverse> + + dif5 * tw11i4<T, N, inverse>; + + w01 = s1 + d1; + w10 = s1 - d1; + w02 = s2 + d2; + w09 = s2 - d2; + w03 = s3 + d3; + w08 = s3 - d3; + w04 = s4 + d4; + w07 = s4 - d4; + w05 = s5 + d5; + w06 = s5 - d5; +} + +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw5r1() +{ + return static_cast<T>(0.30901699437494742410229341718 - 1.0); +} +template <typename T, 
size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw5i1() +{ + return static_cast<T>(0.95105651629515357211643933338) * twiddleimagmask<T, N, inverse>(); +} +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw5r2() +{ + return static_cast<T>(-0.80901699437494742410229341718 - 1.0); +} +template <typename T, size_t N, bool inverse> +static constexpr KFR_INTRINSIC cvec<T, N> tw5i2() +{ + return static_cast<T>(0.58778525229247312916870595464) * twiddleimagmask<T, N, inverse>(); +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly5(const cvec<T, N>& a00, const cvec<T, N>& a01, const cvec<T, N>& a02, + const cvec<T, N>& a03, const cvec<T, N>& a04, cvec<T, N>& w00, cvec<T, N>& w01, + cvec<T, N>& w02, cvec<T, N>& w03, cvec<T, N>& w04) +{ + const cvec<T, N> sum1 = a01 + a04; + const cvec<T, N> dif1 = swap<2>(a01 - a04); + const cvec<T, N> sum2 = a02 + a03; + const cvec<T, N> dif2 = swap<2>(a02 - a03); + w00 = a00 + sum1 + sum2; + + const cvec<T, N> s1 = w00 + sum1 * tw5r1<T, N, inverse>() + sum2 * tw5r2<T, N, inverse>(); + const cvec<T, N> s2 = w00 + sum1 * tw5r2<T, N, inverse>() + sum2 * tw5r1<T, N, inverse>(); + + const cvec<T, N> d1 = dif1 * tw5i1<T, N, inverse>() + dif2 * tw5i2<T, N, inverse>(); + const cvec<T, N> d2 = dif1 * tw5i2<T, N, inverse>() - dif2 * tw5i1<T, N, inverse>(); + + w01 = s1 + d1; + w04 = s1 - d1; + w02 = s2 + d2; + w03 = s2 - d2; +} + +template <size_t N, bool inverse = false, typename T> +KFR_INTRINSIC void butterfly10(const cvec<T, N>& a0, const cvec<T, N>& a1, const cvec<T, N>& a2, + const cvec<T, N>& a3, const cvec<T, N>& a4, const cvec<T, N>& a5, + const cvec<T, N>& a6, const cvec<T, N>& a7, const cvec<T, N>& a8, + const cvec<T, N>& a9, cvec<T, N>& w0, cvec<T, N>& w1, cvec<T, N>& w2, + cvec<T, N>& w3, cvec<T, N>& w4, cvec<T, N>& w5, cvec<T, N>& w6, cvec<T, N>& w7, + cvec<T, N>& w8, cvec<T, N>& w9) +{ + cvec<T, N * 2> a05 = concat(a0, a5); + cvec<T, N * 
2> a27 = concat(a2, a7); + cvec<T, N * 2> a49 = concat(a4, a9); + cvec<T, N * 2> a61 = concat(a6, a1); + cvec<T, N * 2> a83 = concat(a8, a3); + butterfly5<N * 2, inverse>(a05, a27, a49, a61, a83, a05, a27, a49, a61, a83); + cvec<T, N> t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + split(a05, t0, t1); + split(a27, t2, t3); + split(a49, t4, t5); + split(a61, t6, t7); + split(a83, t8, t9); + t5 = -t5; + + cvec<T, N * 2> t02, t13; + cvec<T, N * 2> w06, w51; + t02 = concat(t0, t2); + t13 = concat(t1, t3); + butterfly2<N * 2>(t02, t13, w06, w51); + split(w06, w0, w6); + split(w51, w5, w1); + + cvec<T, N * 2> t68, t79; + cvec<T, N * 2> w84, w39; + t68 = concat(t6, t8); + t79 = concat(t7, t9); + butterfly2<N * 2>(t68, t79, w84, w39); + split(w84, w8, w4); + split(w39, w3, w9); + butterfly2<N>(t4, t5, w7, w2); +} + +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, vec<T, N>& out0, + vec<T, N>& out1) +{ + butterfly2<N / 2>(in0, in1, out0, out1); +} +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2) +{ + butterfly3<N / 2, inverse>(in0, in1, in2, out0, out1, out2); +} + +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3) +{ + butterfly4<N / 2, inverse>(cfalse, in0, in1, in2, in3, out0, out1, out2, out3); +} +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, + vec<T, N>& out4) +{ + butterfly5<N / 2, inverse>(in0, 
in1, in2, in3, in4, out0, out1, out2, out3, out4); +} +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5) +{ + butterfly6<N / 2, inverse>(in0, in1, in2, in3, in4, in5, out0, out1, out2, out3, out4, out5); +} +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, + vec<T, N>& out6) +{ + butterfly7<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, out0, out1, out2, out3, out4, out5, out6); +} +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, + vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7) +{ + butterfly8<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, out3, out4, out5, + out6, out7); +} +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, + vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, + vec<T, N>& out7, vec<T, N>& out8) +{ + butterfly9<N / 2, 
inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, out0, out1, out2, out3, out4, + out5, out6, out7, out8); +} +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, const vec<T, N>& in9, vec<T, N>& out0, vec<T, N>& out1, + vec<T, N>& out2, vec<T, N>& out3, vec<T, N>& out4, vec<T, N>& out5, + vec<T, N>& out6, vec<T, N>& out7, vec<T, N>& out8, vec<T, N>& out9) +{ + butterfly10<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, out0, out1, out2, out3, + out4, out5, out6, out7, out8, out9); +} +template <bool inverse, typename T, size_t N> +KFR_INTRINSIC void butterfly(cbool_t<inverse>, const vec<T, N>& in0, const vec<T, N>& in1, + const vec<T, N>& in2, const vec<T, N>& in3, const vec<T, N>& in4, + const vec<T, N>& in5, const vec<T, N>& in6, const vec<T, N>& in7, + const vec<T, N>& in8, const vec<T, N>& in9, const vec<T, N>& in10, + vec<T, N>& out0, vec<T, N>& out1, vec<T, N>& out2, vec<T, N>& out3, + vec<T, N>& out4, vec<T, N>& out5, vec<T, N>& out6, vec<T, N>& out7, + vec<T, N>& out8, vec<T, N>& out9, vec<T, N>& out10) +{ + butterfly11<N / 2, inverse>(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, out0, out1, out2, + out3, out4, out5, out6, out7, out8, out9, out10); +} +template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> +KFR_INTRINSIC void cread_transposed(cbool_t<transposed>, const complex<T>* ptr, vec<T, N>&... w) +{ + vec<T, Nout> temp = read(cunaligned, csize<Nout>, ptr_cast<T>(ptr)); + if constexpr (transposed) + temp = ctranspose<sizeof...(N)>(temp); + split(temp, w...); +} + +// Warning: Reads past the end. 
Use with care +KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, + cvec<f32, 4>& w1, cvec<f32, 4>& w2) +{ + cvec<f32, 4> w3; + cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 3), cread<4>(ptr + 6), cread<4>(ptr + 9)); + v16 = digitreverse4<2>(v16); + split<f32, 32>(v16, w0, w1, w2, w3); +} + +KFR_INTRINSIC void cread_transposed(cbool_t<true>, const complex<f32>* ptr, cvec<f32, 4>& w0, + cvec<f32, 4>& w1, cvec<f32, 4>& w2, cvec<f32, 4>& w3, cvec<f32, 4>& w4) +{ + cvec<f32, 16> v16 = concat(cread<4>(ptr), cread<4>(ptr + 5), cread<4>(ptr + 10), cread<4>(ptr + 15)); + v16 = digitreverse4<2>(v16); + split<f32, 32>(v16, w0, w1, w2, w3); + w4 = cgather<4, 5>(ptr + 4); +} + +template <bool transposed, typename T, size_t... N, size_t Nout = csum<size_t, N...>()> +KFR_INTRINSIC void cwrite_transposed(cbool_t<transposed>, complex<T>* ptr, vec<T, N>... args) +{ + auto temp = concat(args...); + if constexpr (transposed) + temp = ctransposeinverse<sizeof...(N)>(temp); + write(ptr_cast<T>(ptr), temp); +} + +template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> +KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<false>, const vec<T, N>& x, const complex<T>* twiddle) +{ + return I == 0 ? x : cmul(x, cread<width>(twiddle + width * (I - 1))); +} +template <size_t I, size_t radix, typename T, size_t N, size_t width = N / 2> +KFR_INTRINSIC vec<T, N> mul_tw(cbool_t<true>, const vec<T, N>& x, const complex<T>* twiddle) +{ + return I == 0 ? x : cmul_conj(x, cread<width>(twiddle + width * (I - 1))); +} + +// Non-final +template <typename T, size_t width, size_t radix, bool inverse, size_t... I> +KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, + cbool_t<inverse>, complex<T>* out, const complex<T>* in, + const complex<T>* tw, size_t stride) +{ + carray<cvec<T, width>, radix> inout; + + swallow{ (inout.get(csize_t<I>()) = cread<width>(in + i + stride * I))... 
}; + + butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...); + + swallow{ ( + cwrite<width>(out + i + stride * I, + mul_tw<I, radix>(cbool_t<inverse>(), inout.template get<I>(), tw + i * (radix - 1))), + 0)... }; +} + +// Final +template <typename T, size_t width, size_t radix, bool inverse, size_t... I> +KFR_INTRINSIC void butterfly_helper(csizes_t<I...>, size_t i, csize_t<width>, csize_t<radix>, + cbool_t<inverse>, complex<T>* out, const complex<T>* in, size_t stride) +{ + carray<cvec<T, width>, radix> inout; + + // swallow{ ( inout.get( csize<I> ) = infn( i, I, cvec<T, width>( ) ) )... }; + cread_transposed(ctrue, in + i * radix, inout.template get<I>()...); + + butterfly(cbool_t<inverse>(), inout.template get<I>()..., inout.template get<I>()...); + + swallow{ (cwrite<width>(out + i + stride * I, inout.get(csize_t<I>())), 0)... }; +} + +template <size_t width, size_t radix, typename... Args> +KFR_INTRINSIC void butterfly(size_t i, csize_t<width>, csize_t<radix>, Args&&... args) +{ + butterfly_helper(csizeseq_t<radix>(), i, csize_t<width>(), csize_t<radix>(), std::forward<Args>(args)...); +} + +template <typename... Args> +KFR_INTRINSIC void butterfly_cycle(size_t&, size_t, csize_t<0>, Args&&...) +{ +} +template <size_t width, typename... Args> +KFR_INTRINSIC void butterfly_cycle(size_t& i, size_t count, csize_t<width>, Args&&... args) +{ + CMT_LOOP_NOUNROLL + for (; i < count / width * width; i += width) + butterfly(i, csize_t<width>(), std::forward<Args>(args)...); + butterfly_cycle(i, count, csize_t<width / 2>(), std::forward<Args>(args)...); +} + +template <size_t width, typename... Args> +KFR_INTRINSIC void butterflies(size_t count, csize_t<width>, Args&&... 
args) +{ + CMT_ASSUME(count > 0); + size_t i = 0; + butterfly_cycle(i, count, csize_t<width>(), std::forward<Args>(args)...); +} + +template <typename T, bool inverse, typename Tradix, typename Tstride> +KFR_INTRINSIC void generic_butterfly_cycle(csize_t<0>, Tradix, cbool_t<inverse>, complex<T>*, + const complex<T>*, Tstride, size_t, size_t, const complex<T>*, + size_t) +{ +} + +template <size_t width, bool inverse, typename T, typename Tradix, typename Thalfradix, + typename Thalfradixsqr, typename Tstride> +KFR_INTRINSIC void generic_butterfly_cycle(csize_t<width>, Tradix radix, cbool_t<inverse>, complex<T>* out, + const complex<T>* in, Tstride ostride, Thalfradix halfradix, + Thalfradixsqr halfradix_sqr, const complex<T>* twiddle, size_t i) +{ + CMT_LOOP_NOUNROLL + for (; i < halfradix / width * width; i += width) + { + const cvec<T, 1> in0 = cread<1>(in); + cvec<T, width> sum0 = resize<2 * width>(in0); + cvec<T, width> sum1 = sum0; + + for (size_t j = 0; j < halfradix; j++) + { + const cvec<T, 1> ina = cread<1>(in + (1 + j)); + const cvec<T, 1> inb = cread<1>(in + radix - (j + 1)); + cvec<T, width> tw = cread<width>(twiddle); + if constexpr (inverse) + tw = negodd /*cconj*/ (tw); + + cmul_2conj(sum0, sum1, ina, inb, tw); + twiddle += halfradix; + } + twiddle = twiddle - halfradix_sqr + width; + + if (is_constant_val(ostride)) + { + cwrite<width>(out + (1 + i), sum0); + cwrite<width>(out + (radix - (i + 1)) - (width - 1), reverse<2>(sum1)); + } + else + { + cscatter<width>(out + (i + 1) * ostride, ostride, sum0); + cscatter<width>(out + (radix - (i + 1)) * ostride - (width - 1) * ostride, ostride, + reverse<2>(sum1)); + } + } + generic_butterfly_cycle(csize_t<width / 2>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, + halfradix_sqr, twiddle, i); +} + +template <typename T> +KFR_INTRINSIC vec<T, 2> hcadd(vec<T, 2> value) +{ + return value; +} +template <typename T, size_t N, KFR_ENABLE_IF(N >= 4)> +KFR_INTRINSIC vec<T, 2> hcadd(vec<T, N> value) +{ + 
return hcadd(low(value) + high(value)); +} + +template <size_t width, typename T, bool inverse, typename Tstride = csize_t<1>> +KFR_INTRINSIC void generic_butterfly_w(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, + const complex<T>* twiddle, Tstride ostride = Tstride{}) +{ + CMT_ASSUME(radix > 0); + { + cvec<T, width> sum = T(); + size_t j = 0; + CMT_LOOP_NOUNROLL + for (; j < radix / width * width; j += width) + { + sum += cread<width>(in + j); + } + cvec<T, 1> sums = T(); + CMT_LOOP_NOUNROLL + for (; j < radix; j++) + { + sums += cread<1>(in + j); + } + cwrite<1>(out, hcadd(sum) + sums); + } + const auto halfradix = radix / 2; + CMT_ASSUME(halfradix > 0); + size_t i = 0; + + generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, + halfradix * halfradix, twiddle, i); +} + +template <size_t width, size_t radix, typename T, bool inverse, typename Tstride = csize_t<1>> +KFR_INTRINSIC void spec_generic_butterfly_w(csize_t<radix>, cbool_t<inverse>, complex<T>* out, + const complex<T>* in, const complex<T>* twiddle, + Tstride ostride = Tstride{}) +{ + { + cvec<T, width> sum = T(); + size_t j = 0; + CMT_LOOP_UNROLL + for (; j < radix / width * width; j += width) + { + sum += cread<width>(in + j); + } + cvec<T, 1> sums = T(); + CMT_LOOP_UNROLL + for (; j < radix; j++) + { + sums += cread<1>(in + j); + } + cwrite<1>(out, hcadd(sum) + sums); + } + const size_t halfradix = radix / 2; + const size_t halfradix_sqr = halfradix * halfradix; + CMT_ASSUME(halfradix > 0); + size_t i = 0; + + generic_butterfly_cycle(csize_t<width>(), radix, cbool_t<inverse>(), out, in, ostride, halfradix, + halfradix_sqr, twiddle, i); +} + +template <typename T, bool inverse, typename Tstride = csize_t<1>> +KFR_INTRINSIC void generic_butterfly(size_t radix, cbool_t<inverse>, complex<T>* out, const complex<T>* in, + complex<T>*, const complex<T>* twiddle, Tstride ostride = {}) +{ + cswitch( + csizes_t<11, 13>(), radix, + [&](auto 
radix_) CMT_INLINE_LAMBDA + { + constexpr size_t width = vector_width<T>; + spec_generic_butterfly_w<width>(radix_, cbool_t<inverse>(), out, in, twiddle, ostride); + }, + [&]() CMT_INLINE_LAMBDA + { + constexpr size_t width = vector_width<T>; + generic_butterfly_w<width>(radix, cbool_t<inverse>(), out, in, twiddle, ostride); + }); +} + +template <typename T, size_t N> +constexpr cvec<T, N> cmask08 = broadcast<N * 2, T>(T(), -T()); + +template <typename T, size_t N> +constexpr cvec<T, N> cmask0088 = broadcast<N * 4, T>(T(), T(), -T(), -T()); + +template <bool A = false, typename T, size_t N> +KFR_INTRINSIC void cbitreverse_write(complex<T>* dest, const vec<T, N>& x) +{ + cwrite<N / 2, A>(dest, bitreverse<2>(x)); +} + +template <bool A = false, typename T, size_t N> +KFR_INTRINSIC void cdigitreverse4_write(complex<T>* dest, const vec<T, N>& x) +{ + cwrite<N / 2, A>(dest, digitreverse4<2>(x)); +} + +template <size_t N, bool A = false, typename T> +KFR_INTRINSIC cvec<T, N> cbitreverse_read(const complex<T>* src) +{ + return bitreverse<2>(cread<N, A>(src)); +} + +template <size_t N, bool A = false, typename T> +KFR_INTRINSIC cvec<T, N> cdigitreverse4_read(const complex<T>* src) +{ + return digitreverse4<2>(cread<N, A>(src)); +} + +#if 1 + +template <> +KFR_INTRINSIC cvec<f64, 16> cdigitreverse4_read<16, false, f64>(const complex<f64>* src) +{ + return concat(cread<1>(src + 0), cread<1>(src + 4), cread<1>(src + 8), cread<1>(src + 12), + cread<1>(src + 1), cread<1>(src + 5), cread<1>(src + 9), cread<1>(src + 13), + cread<1>(src + 2), cread<1>(src + 6), cread<1>(src + 10), cread<1>(src + 14), + cread<1>(src + 3), cread<1>(src + 7), cread<1>(src + 11), cread<1>(src + 15)); +} +template <> +KFR_INTRINSIC void cdigitreverse4_write<false, f64, 32>(complex<f64>* dest, const vec<f64, 32>& x) +{ + cwrite<1>(dest, part<16, 0>(x)); + cwrite<1>(dest + 4, part<16, 1>(x)); + cwrite<1>(dest + 8, part<16, 2>(x)); + cwrite<1>(dest + 12, part<16, 3>(x)); + + cwrite<1>(dest + 1, part<16, 
4>(x)); + cwrite<1>(dest + 5, part<16, 5>(x)); + cwrite<1>(dest + 9, part<16, 6>(x)); + cwrite<1>(dest + 13, part<16, 7>(x)); + + cwrite<1>(dest + 2, part<16, 8>(x)); + cwrite<1>(dest + 6, part<16, 9>(x)); + cwrite<1>(dest + 10, part<16, 10>(x)); + cwrite<1>(dest + 14, part<16, 11>(x)); + + cwrite<1>(dest + 3, part<16, 12>(x)); + cwrite<1>(dest + 7, part<16, 13>(x)); + cwrite<1>(dest + 11, part<16, 14>(x)); + cwrite<1>(dest + 15, part<16, 15>(x)); +} +#endif +} // namespace intrinsics +} // namespace CMT_ARCH_NAME +} // namespace kfr + +CMT_PRAGMA_MSVC(warning(pop)) + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/src/io/CMakeLists.txt b/src/io/CMakeLists.txt diff --git a/src/io/audiofile-impl.cpp b/src/io/audiofile-impl.cpp @@ -0,0 +1,407 @@ +/** @addtogroup io + * @{ + */ +/* + Copyright (C) 2016-2023 Dan Cazarin (https://www.kfrlib.com) + This file is part of KFR + + KFR is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 2 of the License, or + (at your option) any later version. + + KFR is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with KFR. + + If GPL is not suitable for your project, you must purchase a commercial license to use KFR. + Buying a commercial license is mandatory as soon as you develop commercial activities without + disclosing the source code of your own applications. + See https://www.kfrlib.com for details. 
+ */ + +#include <kfr/io/audiofile.hpp> +CMT_PRAGMA_GNU(GCC diagnostic push) +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wimplicit-fallthrough") +CMT_PRAGMA_GNU(GCC diagnostic ignored "-Wunused-function") + +#ifndef KFR_DISABLE_WAV +#define DR_WAV_NO_STDIO +#define DR_WAV_NO_CONVERSION_API +#define DR_WAV_IMPLEMENTATION +#include "dr/dr_wav.h" +#endif +#ifndef KFR_DISABLE_FLAC +#define DR_FLAC_IMPLEMENTATION +#define DR_FLAC_NO_STDIO +#include "dr/dr_flac.h" +#endif +#ifndef KFR_DISABLE_MP3 +#define DR_MP3_IMPLEMENTATION +#define DR_MP3_NO_STDIO +#include "dr/dr_mp3.h" +#endif + +namespace kfr +{ + +namespace internal_generic +{ +#ifndef KFR_DISABLE_WAV +size_t drwav_writer_write_proc(abstract_writer<void>* file, const void* pData, size_t bytesToWrite) +{ + return file->write(pData, bytesToWrite); +} +drwav_bool32 drwav_writer_seek_proc(abstract_writer<void>* file, int offset, drwav_seek_origin origin) +{ + return file->seek(offset, origin == drwav_seek_origin_start ? seek_origin::begin : seek_origin::current); +} +size_t drwav_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) +{ + return file->read(pBufferOut, bytesToRead); +} +drwav_bool32 drwav_reader_seek_proc(abstract_reader<void>* file, int offset, drwav_seek_origin origin) +{ + return file->seek(offset, origin == drwav_seek_origin_start ? seek_origin::begin : seek_origin::current); +} +#endif +#ifndef KFR_DISABLE_FLAC +size_t drflac_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) +{ + return file->read(pBufferOut, bytesToRead); +} +drflac_bool32 drflac_reader_seek_proc(abstract_reader<void>* file, int offset, drflac_seek_origin origin) +{ + return file->seek(offset, origin == drflac_seek_origin_start ? 
seek_origin::begin : seek_origin::current); +} +#endif +#ifndef KFR_DISABLE_MP3 +size_t drmp3_reader_read_proc(abstract_reader<void>* file, void* pBufferOut, size_t bytesToRead) +{ + return file->read(pBufferOut, bytesToRead); +} +drmp3_bool32 drmp3_reader_seek_proc(abstract_reader<void>* file, int offset, drmp3_seek_origin origin) +{ + return file->seek(offset, origin == drmp3_seek_origin_start ? seek_origin::begin : seek_origin::current); +} +#endif + +struct wav_file : drwav +{ +}; +struct flac_file : drflac +{ +}; +struct mp3_file : drmp3 +{ +}; + +void wav_file_deleter::operator()(wav_file* f) +{ + drwav_uninit(f); + delete f; +} + +void flac_file_deleter::operator()(flac_file* f) +{ + drflac_close(f); +} +void mp3_file_deleter::operator()(mp3_file* f) +{ + drmp3_uninit(f); + delete f; +} + +} // namespace internal_generic + +template <typename T> +audio_writer_wav<T>::audio_writer_wav(std::shared_ptr<abstract_writer<>>&& writer, const audio_format& fmt) + : writer(std::move(writer)), fmt(fmt) +{ + drwav_data_format wav_fmt; + wav_fmt.channels = static_cast<drwav_uint32>(fmt.channels); + wav_fmt.sampleRate = static_cast<drwav_uint32>(fmt.samplerate); + wav_fmt.format = + fmt.type >= audio_sample_type::first_float ? DR_WAVE_FORMAT_IEEE_FLOAT : DR_WAVE_FORMAT_PCM; + wav_fmt.bitsPerSample = static_cast<drwav_uint32>(audio_sample_bit_depth(fmt.type)); + wav_fmt.container = fmt.use_w64 ? 
drwav_container_w64 : drwav_container_riff; + f.reset(new internal_generic::wav_file()); + if (!drwav_init_write(f.get(), &wav_fmt, (drwav_write_proc)&internal_generic::drwav_writer_write_proc, + (drwav_seek_proc)&internal_generic::drwav_writer_seek_proc, this->writer.get(), + nullptr)) + { + delete f.release(); + } +} + +template <typename T> +size_t audio_writer_wav<T>::write(const T* data, size_t size) +{ + if (!f) + return 0; + if (fmt.type == audio_sample_type::unknown) + return 0; + if (fmt.type == audio_sample_traits<T>::type) + { + const size_t sz = drwav_write_pcm_frames_le(f.get(), size, data); + fmt.length += sz; + return sz * fmt.channels; + } + else + { + univector<uint8_t> native(size * audio_sample_sizeof(fmt.type)); + convert(native.data(), fmt.type, data, size); + const size_t sz = drwav_write_pcm_frames_le(f.get(), size / fmt.channels, native.data()); + fmt.length += sz; + return sz * fmt.channels; + } +} + +template <typename T> +audio_writer_wav<T>::~audio_writer_wav() +{ +} + +template <typename T> +void audio_writer_wav<T>::close() +{ + f.reset(); + writer.reset(); +} + +template struct audio_writer_wav<i16>; +template struct audio_writer_wav<i24>; +template struct audio_writer_wav<i32>; +template struct audio_writer_wav<f32>; +template struct audio_writer_wav<f64>; + +template <typename T> +audio_reader_wav<T>::audio_reader_wav(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) +{ + f.reset(new internal_generic::wav_file()); + drwav_init(f.get(), (drwav_read_proc)&internal_generic::drwav_reader_read_proc, + (drwav_seek_proc)&internal_generic::drwav_reader_seek_proc, this->reader.get(), nullptr); + fmt.channels = f->channels; + fmt.samplerate = f->sampleRate; + fmt.length = static_cast<imax>(f->totalPCMFrameCount); + switch (f->translatedFormatTag) + { + case DR_WAVE_FORMAT_IEEE_FLOAT: + switch (f->bitsPerSample) + { + case 32: + fmt.type = audio_sample_type::f32; + break; + case 64: + fmt.type = audio_sample_type::f64; + 
break; + default: + fmt.type = audio_sample_type::unknown; + break; + } + break; + case DR_WAVE_FORMAT_PCM: + switch (f->bitsPerSample) + { + case 8: + fmt.type = audio_sample_type::i8; + break; + case 16: + fmt.type = audio_sample_type::i16; + break; + case 24: + fmt.type = audio_sample_type::i24; + break; + case 32: + fmt.type = audio_sample_type::i32; + break; + case 64: + fmt.type = audio_sample_type::i64; + break; + default: + fmt.type = audio_sample_type::unknown; + break; + } + break; + default: + fmt.type = audio_sample_type::unknown; + break; + } +} +template <typename T> +audio_reader_wav<T>::~audio_reader_wav() +{ +} + +template <typename T> +size_t audio_reader_wav<T>::read(T* data, size_t size) +{ + if (fmt.type == audio_sample_type::unknown) + return 0; + if (fmt.type == audio_sample_traits<T>::type) + { + const size_t sz = drwav_read_pcm_frames(f.get(), size / fmt.channels, data); + position += sz; + return sz * fmt.channels; + } + else + { + univector<uint8_t> native(size * audio_sample_sizeof(fmt.type)); + const size_t sz = drwav_read_pcm_frames(f.get(), size / fmt.channels, native.data()); + position += sz; + convert(data, native.data(), fmt.type, sz * fmt.channels); + return sz * fmt.channels; + } +} + +template <typename T> +bool audio_reader_wav<T>::seek(imax offset, seek_origin origin) +{ + switch (origin) + { + case seek_origin::current: + return drwav_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(this->position + offset)); + case seek_origin::begin: + return drwav_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(offset)); + case seek_origin::end: + return drwav_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(fmt.length + offset)); + } + return false; +} + +template struct audio_reader_wav<i16>; +template struct audio_reader_wav<i24>; +template struct audio_reader_wav<i32>; +template struct audio_reader_wav<f32>; +template struct audio_reader_wav<f64>; + +template <typename T> 
+audio_reader_flac<T>::audio_reader_flac(std::shared_ptr<abstract_reader<>>&& reader) + : reader(std::move(reader)) +{ + f.reset(reinterpret_cast<internal_generic::flac_file*>(drflac_open( + (drflac_read_proc)&internal_generic::drflac_reader_read_proc, + (drflac_seek_proc)&internal_generic::drflac_reader_seek_proc, this->reader.get(), nullptr))); + fmt.channels = f->channels; + fmt.samplerate = f->sampleRate; + fmt.length = static_cast<imax>(f->totalPCMFrameCount); + fmt.type = audio_sample_type::i32; +} +template <typename T> +audio_reader_flac<T>::~audio_reader_flac() +{ +} + +template <typename T> +size_t audio_reader_flac<T>::read(T* data, size_t size) +{ + if (fmt.type == audio_sample_type::unknown) + return 0; + if (audio_sample_traits<T>::type == audio_sample_type::i32) + { + const size_t sz = + drflac_read_pcm_frames_s32(f.get(), size / fmt.channels, reinterpret_cast<i32*>(data)); + position += sz; + return sz * fmt.channels; + } + else + { + univector<i32> native(size * sizeof(i32)); + const size_t sz = drflac_read_pcm_frames_s32(f.get(), size / fmt.channels, native.data()); + position += sz; + convert(data, native.data(), sz * fmt.channels); + return sz * fmt.channels; + } +} + +template <typename T> +bool audio_reader_flac<T>::seek(imax offset, seek_origin origin) +{ + switch (origin) + { + case seek_origin::current: + return drflac_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(this->position + offset)); + case seek_origin::begin: + return drflac_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(offset)); + case seek_origin::end: + return drflac_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(fmt.length + offset)); + } + return false; +} + +template struct audio_reader_flac<i16>; +template struct audio_reader_flac<i24>; +template struct audio_reader_flac<i32>; +template struct audio_reader_flac<f32>; +template struct audio_reader_flac<f64>; + +static_assert(sizeof(drmp3_config) == sizeof(uint32_t) * 2); +static_assert(sizeof(mp3_config) == 
sizeof(uint32_t) * 2); + +template <typename T> +audio_reader_mp3<T>::audio_reader_mp3(std::shared_ptr<abstract_reader<>>&& reader) : reader(std::move(reader)) +{ + f.reset(new internal_generic::mp3_file()); + drmp3_init(f.get(), (drmp3_read_proc)&internal_generic::drmp3_reader_read_proc, + (drmp3_seek_proc)&internal_generic::drmp3_reader_seek_proc, this->reader.get(), + reinterpret_cast<const drmp3_config*>(&config), nullptr); + fmt.channels = f->channels; + fmt.samplerate = f->sampleRate; + fmt.length = static_cast<imax>(drmp3_get_pcm_frame_count(f.get())); + fmt.type = audio_sample_type::i16; +} +template <typename T> +audio_reader_mp3<T>::~audio_reader_mp3() +{ +} + +template <typename T> +size_t audio_reader_mp3<T>::read(T* data, size_t size) +{ + if (fmt.type == audio_sample_type::unknown) + return 0; + if (audio_sample_traits<T>::type == audio_sample_type::i16) + { + const size_t sz = + drmp3_read_pcm_frames_s16(f.get(), size / fmt.channels, reinterpret_cast<i16*>(data)); + position += sz; + return sz * fmt.channels; + } + else + { + univector<i16> native(size * sizeof(i16)); + const size_t sz = drmp3_read_pcm_frames_s16(f.get(), size / fmt.channels, native.data()); + position += sz; + convert(data, native.data(), sz * fmt.channels); + return sz * fmt.channels; + } +} + +template <typename T> +bool audio_reader_mp3<T>::seek(imax offset, seek_origin origin) +{ + switch (origin) + { + case seek_origin::current: + return drmp3_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(this->position + offset)); + case seek_origin::begin: + return drmp3_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(offset)); + case seek_origin::end: + return drmp3_seek_to_pcm_frame(f.get(), static_cast<drmp3_uint64>(fmt.length + offset)); + } + return false; +} + +template struct audio_reader_mp3<i16>; +template struct audio_reader_mp3<i24>; +template struct audio_reader_mp3<i32>; +template struct audio_reader_mp3<f32>; +template struct audio_reader_mp3<f64>; + +} // namespace 
kfr + +CMT_PRAGMA_GNU(GCC diagnostic pop) diff --git a/include/kfr/io/dr/README.txt b/src/io/dr/README.txt diff --git a/include/kfr/io/dr/dr_flac.h b/src/io/dr/dr_flac.h diff --git a/include/kfr/io/dr/dr_mp3.h b/src/io/dr/dr_mp3.h diff --git a/include/kfr/io/dr/dr_wav.h b/src/io/dr/dr_wav.h diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt @@ -14,7 +14,7 @@ # You should have received a copy of the GNU General Public License # along with KFR. -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.12) add_definitions(-DKFR_TESTING=1) add_definitions(-DKFR_SRC_DIR=\"${CMAKE_SOURCE_DIR}\") diff --git a/tests/dft_test.cpp b/tests/dft_test.cpp @@ -10,7 +10,7 @@ #include <kfr/base.hpp> #include <kfr/dft.hpp> #include <kfr/dsp.hpp> -#include <kfr/io.hpp> +#include <kfr/io/tostring.hpp> #include <set> using namespace kfr; diff --git a/tools/CMakeLists.txt b/tools/CMakeLists.txt @@ -14,7 +14,7 @@ # You should have received a copy of the GNU General Public License # along with KFR. -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.12) # Binary output directories set(CMAKE_RUNTIME_OUTPUT_DIRECTORY_RELEASE ${PROJECT_BINARY_DIR}/bin) diff --git a/update-sources.py b/update-sources.py @@ -39,8 +39,8 @@ list_sources("KFR_IO_SRC", "include/kfr/io", ['*.hpp', '*.h']) list_sources("KFR_RUNTIME_SRC", "include/kfr/runtime", ['*.hpp', '*.h']) list_sources("KFR_GRAPHICS_SRC", "include/kfr/graphics", ['*.hpp', '*.h']) list_sources("KFR_SRC", "include", ['*.hpp', '*.h']) -list_sources("KFR_DFT_SRC", "include/kfr/dft", ['*.cpp'], ["dft-src.cpp"]) -list_sources("KFR_IO_SRC", "include/kfr/io", ['*.cpp']) +list_sources("KFR_DFT_SRC", "src/dft", ['*.cpp'], ["dft-src.cpp"]) +list_sources("KFR_IO_SRC", "src/io", ['*.cpp']) list_sources("KFR_UNITTEST_SRC", "tests/unit", ['*.cpp'])