commit 68b99bb06aea25f096c483f674ae3f53c7d0e18b
parent a3abae2873fcddd0a4c41aefea8b13c5fbd30084
Author: [email protected] <[email protected]>
Date: Mon, 12 Feb 2024 14:31:11 +0000
Move dft_plan::execute to src
Diffstat:
4 files changed, 246 insertions(+), 167 deletions(-)
diff --git a/include/kfr/dft/fft.hpp b/include/kfr/dft/fft.hpp
@@ -130,10 +130,21 @@ struct dft_stage;
template <typename T>
using dft_stage_ptr = std::unique_ptr<dft_stage<T>>;
+namespace internal_generic
+{
template <typename T>
void dft_initialize(dft_plan<T>& plan);
template <typename T>
void dft_real_initialize(dft_plan_real<T>& plan);
+template <typename T, bool inverse>
+void dft_execute(const dft_plan<T>& plan, cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp);
+
+template <typename T>
+using fn_transpose = void (*)(complex<T>*, const complex<T>*, shape<2>);
+template <typename T>
+void dft_initialize_transpose(fn_transpose<T>& transpose);
+
+} // namespace internal_generic
/// @brief 1D DFT/FFT
template <typename T>
@@ -164,16 +175,10 @@ struct dft_plan
explicit dft_plan(size_t size, dft_order order = dft_order::normal)
: size(size), temp_size(0), data_size(0), arblen(false)
{
- dft_initialize(*this);
+ internal_generic::dft_initialize(*this);
}
- void dump() const
- {
- for (const std::unique_ptr<dft_stage<T>>& s : all_stages)
- {
- s->dump();
- }
- }
+ void dump() const;
KFR_MEM_INTRINSIC void execute(complex<T>* out, const complex<T>* in, u8* temp,
bool inverse = false) const
@@ -233,81 +238,10 @@ struct dft_plan
std::array<bitset, 2> disposition_inplace;
std::array<bitset, 2> disposition_outofplace;
- void calc_disposition()
- {
- for (bool inverse : { false, true })
- {
- auto&& stages = this->stages[inverse];
- bitset can_inplace_per_stage;
- for (int i = 0; i < stages.size(); ++i)
- {
- can_inplace_per_stage[i] = stages[i]->can_inplace;
- }
+ void calc_disposition();
- disposition_inplace[static_cast<int>(inverse)] =
- precompute_disposition(stages.size(), can_inplace_per_stage, true);
- disposition_outofplace[static_cast<int>(inverse)] =
- precompute_disposition(stages.size(), can_inplace_per_stage, false);
- }
- }
-
- static bitset precompute_disposition(int num_stages, bitset can_inplace_per_stage, bool inplace_requested)
- {
- static bitset even{ 0x5555555555555555ull };
- bitset mask = ~bitset() >> (DFT_MAX_STAGES - num_stages);
- bitset result;
- // disposition indicates where is input for corresponding stage
- // first bit : 0 - input, 1 - scratch
- // other bits: 0 - output, 1 - scratch
-
- // build disposition that works always
- if (num_stages % 2 == 0)
- { // even
- result = ~even & mask;
- }
- else
- { // odd
- result = even & mask;
- }
-
- int num_inplace = can_inplace_per_stage.count();
-
-#ifdef KFR_DFT_ELIMINATE_MEMCPY
- if (num_inplace > 0 && inplace_requested)
- {
- if (result.test(0)) // input is in scratch
- {
- // num_inplace must be odd
- if (num_inplace % 2 == 0)
- --num_inplace;
- }
- else
- {
- // num_inplace must be even
- if (num_inplace % 2 != 0)
- --num_inplace;
- }
- }
-#endif
- if (num_inplace > 0)
- {
- for (int i = num_stages - 1; i >= 0; --i)
- {
- if (can_inplace_per_stage.test(i))
- {
- result ^= ~bitset() >> (DFT_MAX_STAGES - (i + 1));
-
- if (--num_inplace == 0)
- break;
- }
- }
- }
-
- if (!inplace_requested) // out-of-place first stage; IN->OUT
- result.reset(0);
-
- return result;
- }
+ static bitset precompute_disposition(int num_stages, bitset can_inplace_per_stage,
+ bool inplace_requested);
protected:
struct noinit
@@ -317,89 +251,11 @@ protected:
: size(size), temp_size(0), data_size(0), arblen(false)
{
}
- const complex<T>* select_in(bitset disposition, size_t stage, const complex<T>* out, const complex<T>* in,
- const complex<T>* scratch) const
- {
- return disposition.test(stage) ? scratch : stage == 0 ? in : out;
- }
- complex<T>* select_out(bitset disposition, size_t stage, size_t total_stages, complex<T>* out,
- complex<T>* scratch) const
- {
- return stage == total_stages - 1 ? out : disposition.test(stage + 1) ? scratch : out;
- }
template <bool inverse>
- void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
+ KFR_INTRINSIC void execute_dft(cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp) const
{
- if (temp == nullptr && temp_size > 0)
- {
- return call_with_temp(temp_size, std::bind(&dft_plan<T>::execute_dft<inverse>, this,
- cbool_t<inverse>{}, out, in, std::placeholders::_1));
- }
- auto&& stages = this->stages[inverse];
- if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
- {
- return stages[0]->execute(cbool<inverse>, out, in, temp);
- }
- size_t stack[DFT_MAX_STAGES] = { 0 };
-
- bitset disposition =
- in == out ? this->disposition_inplace[inverse] : this->disposition_outofplace[inverse];
-
- complex<T>* scratch = ptr_cast<complex<T>>(
- temp + this->temp_size -
- align_up(sizeof(complex<T>) * (this->size + 1), platform<>::native_cache_alignment));
-
- bool in_scratch = disposition.test(0);
- if (in_scratch)
- {
- stages[0]->copy_input(inverse, scratch, in, this->size);
- }
-
- const size_t count = stages.size();
-
- for (size_t depth = 0; depth < count;)
- {
- if (stages[depth]->recursion)
- {
- size_t offset = 0;
- size_t rdepth = depth;
- size_t maxdepth = depth;
- do
- {
- if (stack[rdepth] == stages[rdepth]->repeats)
- {
- stack[rdepth] = 0;
- rdepth--;
- }
- else
- {
- complex<T>* rout = select_out(disposition, rdepth, stages.size(), out, scratch);
- const complex<T>* rin = select_in(disposition, rdepth, out, in, scratch);
- stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
- offset += stages[rdepth]->out_offset;
- stack[rdepth]++;
- if (rdepth < count - 1 && stages[rdepth + 1]->recursion)
- rdepth++;
- else
- maxdepth = rdepth;
- }
- } while (rdepth != depth);
- depth = maxdepth + 1;
- }
- else
- {
- size_t offset = 0;
- while (offset < this->size)
- {
- stages[depth]->execute(
- cbool<inverse>, select_out(disposition, depth, stages.size(), out, scratch) + offset,
- select_in(disposition, depth, out, in, scratch) + offset, temp);
- offset += stages[depth]->stage_size;
- }
- depth++;
- }
- }
+ internal_generic::dft_execute(*this, cbool<inverse>, out, in, temp);
}
};
@@ -434,7 +290,7 @@ struct dft_plan_real : dft_plan<T>
: dft_plan<T>(typename dft_plan<T>::noinit{}, size / 2), size(size), fmt(fmt)
{
KFR_LOGIC_CHECK(is_even(size), "dft_plan_real requires size to be even");
- dft_real_initialize(*this);
+ internal_generic::dft_real_initialize(*this);
}
void execute(complex<T>*, const complex<T>*, u8*, bool = false) const = delete;
@@ -527,6 +383,7 @@ struct dft_plan_md
dfts[i] = dft_plan<T>(this->size[i]);
temp_size = std::max(temp_size, dfts[i].temp_size);
}
+ internal_generic::dft_initialize_transpose(transpose);
}
void execute(complex<T>* out, const complex<T>* in, u8* temp, bool inverse = false) const
@@ -590,7 +447,7 @@ private:
builtin_memcpy(out, in, sizeof(complex<T>) * total);
}
- matrix_transpose(out, out, shape{ sh.remove_back().product(), sh.back() });
+ transpose(out, out, shape{ sh.remove_back().product(), sh.back() });
if (axis == 0)
break;
@@ -607,7 +464,7 @@ private:
index_t axis = 0;
for (;;)
{
- matrix_transpose(out, in, shape{ sh.front(), sh.remove_front().product() });
+ transpose(out, in, shape{ sh.front(), sh.remove_front().product() });
if (size[axis] > 1)
{
@@ -626,6 +483,7 @@ private:
using dft_list =
std::conditional_t<Dims == dynamic_shape, std::vector<dft_plan<T>>, std::array<dft_plan<T>, Dims>>;
dft_list dfts;
+ internal_generic::fn_transpose<T> transpose;
};
/// @brief Multidimensional DFT
@@ -687,6 +545,7 @@ struct dft_plan_md_real
{
temp_size += complex_size().product() * sizeof(complex<T>);
}
+ internal_generic::dft_initialize_transpose(transpose);
}
void execute(complex<T>* out, const T* in, u8* temp, cdirect_t = {}) const
@@ -803,7 +662,7 @@ private:
dfts[axis].execute(out + o, out + o, temp, cfalse);
}
- matrix_transpose(out, out, shape{ sh.remove_back().product(), sh.back() });
+ transpose(out, out, shape{ sh.remove_back().product(), sh.back() });
if (axis == 0)
break;
@@ -822,7 +681,7 @@ private:
index_t axis = 0;
for (;;)
{
- matrix_transpose(out, in, shape{ sh.front(), sh.remove_front().product() });
+ transpose(out, in, shape{ sh.front(), sh.remove_front().product() });
if (size[axis] > 1)
{
@@ -847,6 +706,7 @@ private:
std::array<dft_plan<T>, const_max(Dims, 1) - 1>>;
dft_list dfts;
dft_plan_real<T> dft_real;
+ internal_generic::fn_transpose<T> transpose;
};
/// @brief DCT type 2 (unscaled)
diff --git a/src/dft/dft.cpp b/src/dft/dft.cpp
@@ -30,15 +30,114 @@
namespace kfr
{
+template <typename T>
+void dft_plan<T>::dump() const
+{
+ for (const std::unique_ptr<dft_stage<T>>& s : all_stages)
+ {
+ s->dump();
+ }
+}
+
+template <typename T>
+void dft_plan<T>::calc_disposition()
+{
+ for (bool inverse : { false, true })
+ {
+ auto&& stages = this->stages[inverse];
+ bitset can_inplace_per_stage;
+ for (int i = 0; i < stages.size(); ++i)
+ {
+ can_inplace_per_stage[i] = stages[i]->can_inplace;
+ }
+
+ disposition_inplace[static_cast<int>(inverse)] =
+ precompute_disposition(stages.size(), can_inplace_per_stage, true);
+ disposition_outofplace[static_cast<int>(inverse)] =
+ precompute_disposition(stages.size(), can_inplace_per_stage, false);
+ }
+}
+
+template <typename T>
+typename dft_plan<T>::bitset dft_plan<T>::precompute_disposition(int num_stages, bitset can_inplace_per_stage,
+ bool inplace_requested)
+{
+ static bitset even{ 0x5555555555555555ull };
+ bitset mask = ~bitset() >> (DFT_MAX_STAGES - num_stages);
+ bitset result;
+ // disposition indicates where is input for corresponding stage
+ // first bit : 0 - input, 1 - scratch
+ // other bits: 0 - output, 1 - scratch
+
+ // build disposition that works always
+ if (num_stages % 2 == 0)
+ { // even
+ result = ~even & mask;
+ }
+ else
+ { // odd
+ result = even & mask;
+ }
+
+ int num_inplace = can_inplace_per_stage.count();
+
+#ifdef KFR_DFT_ELIMINATE_MEMCPY
+ if (num_inplace > 0 && inplace_requested)
+ {
+ if (result.test(0)) // input is in scratch
+ {
+ // num_inplace must be odd
+ if (num_inplace % 2 == 0)
+ --num_inplace;
+ }
+ else
+ {
+ // num_inplace must be even
+ if (num_inplace % 2 != 0)
+ --num_inplace;
+ }
+ }
+#endif
+ if (num_inplace > 0)
+ {
+ for (int i = num_stages - 1; i >= 0; --i)
+ {
+ if (can_inplace_per_stage.test(i))
+ {
+ result ^= ~bitset() >> (DFT_MAX_STAGES - (i + 1));
+
+ if (--num_inplace == 0)
+ break;
+ }
+ }
+ }
+
+ if (!inplace_requested) // out-of-place first stage; IN->OUT
+ result.reset(0);
+
+ return result;
+}
+
+template struct dft_plan<float>;
+template struct dft_plan<double>;
+
CMT_MULTI_PROTO(namespace impl {
template <typename T>
void dft_initialize(dft_plan<T> & plan);
template <typename T>
void dft_real_initialize(dft_plan_real<T> & plan);
+ template <typename T, bool inverse>
+ void dft_execute(const dft_plan<T>& plan, cbool_t<inverse>, complex<T>* out, const complex<T>* in,
+ u8* temp);
+ template <typename T>
+ void dft_initialize_transpose(internal_generic::fn_transpose<T> & transpose);
})
#ifdef CMT_MULTI_NEEDS_GATE
+namespace internal_generic
+{
+
template <typename T>
void dft_initialize(dft_plan<T>& plan)
{
@@ -49,11 +148,33 @@ void dft_real_initialize(dft_plan_real<T>& plan)
{
CMT_MULTI_GATE(ns::impl::dft_real_initialize(plan));
}
+template <typename T, bool inverse>
+void dft_execute(const dft_plan<T>& plan, cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp)
+{
+ CMT_MULTI_GATE(ns::impl::dft_execute(plan, cbool<inverse>, out, in, temp));
+}
+template <typename T>
+void dft_initialize_transpose(fn_transpose<T>& transpose)
+{
+ CMT_MULTI_GATE(ns::impl::dft_initialize_transpose(transpose));
+}
template void dft_initialize<float>(dft_plan<float>&);
template void dft_initialize<double>(dft_plan<double>&);
template void dft_real_initialize<float>(dft_plan_real<float>&);
template void dft_real_initialize<double>(dft_plan_real<double>&);
+template void dft_execute<float>(const dft_plan<float>&, cbool_t<false>, complex<float>*,
+ const complex<float>*, u8*);
+template void dft_execute<float>(const dft_plan<float>&, cbool_t<true>, complex<float>*,
+ const complex<float>*, u8*);
+template void dft_execute<double>(const dft_plan<double>&, cbool_t<false>, complex<double>*,
+ const complex<double>*, u8*);
+template void dft_execute<double>(const dft_plan<double>&, cbool_t<true>, complex<double>*,
+ const complex<double>*, u8*);
+template void dft_initialize_transpose<float>(fn_transpose<float>&);
+template void dft_initialize_transpose<double>(fn_transpose<double>&);
+
+} // namespace internal_generic
#endif
diff --git a/src/dft/fft-impl.hpp b/src/dft/fft-impl.hpp
@@ -1936,6 +1936,99 @@ void dft_initialize(dft_plan<T>& plan)
initialize_data(&plan);
initialize_order(&plan);
}
+
+template <typename T>
+const complex<T>* select_in(const dft_plan<T>& plan, typename dft_plan<T>::bitset disposition, size_t stage,
+ const complex<T>* out, const complex<T>* in, const complex<T>* scratch)
+{
+ return disposition.test(stage) ? scratch : stage == 0 ? in : out;
+}
+template <typename T>
+complex<T>* select_out(const dft_plan<T>& plan, typename dft_plan<T>::bitset disposition, size_t stage,
+ size_t total_stages, complex<T>* out, complex<T>* scratch)
+{
+ return stage == total_stages - 1 ? out : disposition.test(stage + 1) ? scratch : out;
+}
+
+template <typename T, bool inverse>
+void dft_execute(const dft_plan<T>& plan, cbool_t<inverse>, complex<T>* out, const complex<T>* in, u8* temp)
+{
+ if (temp == nullptr && plan.temp_size > 0)
+ {
+ return call_with_temp(plan.temp_size, std::bind(&impl::dft_execute<T, inverse>, std::cref(plan),
+ cbool_t<inverse>{}, out, in, std::placeholders::_1));
+ }
+ auto&& stages = plan.stages[inverse];
+ if (stages.size() == 1 && (stages[0]->can_inplace || in != out))
+ {
+ return stages[0]->execute(cbool<inverse>, out, in, temp);
+ }
+ size_t stack[DFT_MAX_STAGES] = { 0 };
+
+ typename dft_plan<T>::bitset disposition =
+ in == out ? plan.disposition_inplace[inverse] : plan.disposition_outofplace[inverse];
+
+ complex<T>* scratch = ptr_cast<complex<T>>(
+ temp + plan.temp_size -
+ align_up(sizeof(complex<T>) * (plan.size + 1), platform<>::native_cache_alignment));
+
+ bool in_scratch = disposition.test(0);
+ if (in_scratch)
+ {
+ stages[0]->copy_input(inverse, scratch, in, plan.size);
+ }
+
+ const size_t count = stages.size();
+
+ for (size_t depth = 0; depth < count;)
+ {
+ if (stages[depth]->recursion)
+ {
+ size_t offset = 0;
+ size_t rdepth = depth;
+ size_t maxdepth = depth;
+ do
+ {
+ if (stack[rdepth] == stages[rdepth]->repeats)
+ {
+ stack[rdepth] = 0;
+ rdepth--;
+ }
+ else
+ {
+ complex<T>* rout = select_out(plan, disposition, rdepth, stages.size(), out, scratch);
+ const complex<T>* rin = select_in(plan, disposition, rdepth, out, in, scratch);
+ stages[rdepth]->execute(cbool<inverse>, rout + offset, rin + offset, temp);
+ offset += stages[rdepth]->out_offset;
+ stack[rdepth]++;
+ if (rdepth < count - 1 && stages[rdepth + 1]->recursion)
+ rdepth++;
+ else
+ maxdepth = rdepth;
+ }
+ } while (rdepth != depth);
+ depth = maxdepth + 1;
+ }
+ else
+ {
+ size_t offset = 0;
+ while (offset < plan.size)
+ {
+ stages[depth]->execute(cbool<inverse>,
+ select_out(plan, disposition, depth, stages.size(), out, scratch) +
+ offset,
+ select_in(plan, disposition, depth, out, in, scratch) + offset, temp);
+ offset += stages[depth]->stage_size;
+ }
+ depth++;
+ }
+ }
+}
+template <typename T>
+void dft_initialize_transpose(internal_generic::fn_transpose<T>& transpose)
+{
+ transpose = &kfr::CMT_ARCH_NAME::matrix_transpose;
+}
} // namespace impl
namespace intrinsics
diff --git a/src/dft/fft-templates.hpp b/src/dft/fft-templates.hpp
@@ -35,6 +35,11 @@ namespace impl
{
template void dft_initialize<FLOAT>(dft_plan<FLOAT>& plan);
template void dft_real_initialize<FLOAT>(dft_plan_real<FLOAT>& plan);
+template void dft_execute<FLOAT>(const dft_plan<FLOAT>&, cbool_t<false>, complex<FLOAT>*,
+ const complex<FLOAT>*, u8*);
+template void dft_execute<FLOAT>(const dft_plan<FLOAT>&, cbool_t<true>, complex<FLOAT>*,
+ const complex<FLOAT>*, u8*);
+template void dft_initialize_transpose<FLOAT>(internal_generic::fn_transpose<FLOAT>& transpose);
} // namespace impl
} // namespace CMT_ARCH_NAME
} // namespace kfr