commit d38a5aa048a510c6e3a447874a48a91ac8f7d4a3
parent 46038f070e74f5044e2069ea6d5e5f2c55adcd19
Author: d.levin256@gmail.com <d.levin256@gmail.com>
Date: Wed, 14 Nov 2018 00:17:45 +0300
AVX-512 fixes
Diffstat:
4 files changed, 50 insertions(+), 26 deletions(-)
diff --git a/include/kfr/base/logical.hpp b/include/kfr/base/logical.hpp
@@ -109,28 +109,28 @@ KFR_SINTRIN bool bittestall(const i64avx& x) { return _mm256_testc_si256(*x, *al
#if defined CMT_ARCH_AVX512
// horizontal OR
-KFR_SINTRIN bool bittestany(const f32avx512& x) { return !_mm512_test_epi32_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const f64avx512& x) { return !_mm512_test_epi64_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const u8avx512& x) { return !_mm512_test_epi8_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const u16avx512& x) { return !_mm512_test_epi16_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const u32avx512& x) { return !_mm512_test_epi32_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const u64avx512& x) { return !_mm512_test_epi64_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const i8avx512& x) { return !_mm512_test_epi8_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const i16avx512& x) { return !_mm512_test_epi16_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const i32avx512& x) { return !_mm512_test_epi32_mask(*x, *x); }
-KFR_SINTRIN bool bittestany(const i64avx512& x) { return !_mm512_test_epi64_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const f32avx512& x) { return _mm512_test_epi32_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const f64avx512& x) { return _mm512_test_epi64_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const u8avx512& x) { return _mm512_test_epi8_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const u16avx512& x) { return _mm512_test_epi16_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const u32avx512& x) { return _mm512_test_epi32_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const u64avx512& x) { return _mm512_test_epi64_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const i8avx512& x) { return _mm512_test_epi8_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const i16avx512& x) { return _mm512_test_epi16_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const i32avx512& x) { return _mm512_test_epi32_mask(*x, *x); }
+KFR_SINTRIN bool bittestany(const i64avx512& x) { return _mm512_test_epi64_mask(*x, *x); }
// horizontal AND
-KFR_SINTRIN bool bittestall(const f32avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const f64avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const u8avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const u16avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const u32avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const u64avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const i8avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const i16avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const i32avx512& x) { return ~bittestany(~x); }
-KFR_SINTRIN bool bittestall(const i64avx512& x) { return ~bittestany(~x); }
+KFR_SINTRIN bool bittestall(const f32avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const f64avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const u8avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const u16avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const u32avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const u64avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const i8avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const i16avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const i32avx512& x) { return !bittestany(~x); }
+KFR_SINTRIN bool bittestall(const i64avx512& x) { return !bittestany(~x); }
#endif
diff --git a/include/kfr/base/min_max.hpp b/include/kfr/base/min_max.hpp
@@ -87,6 +87,11 @@ KFR_SINTRIN i64avx min(const i64avx& x, const i64avx& y) { return _mm256_min_epi
KFR_SINTRIN u64avx min(const u64avx& x, const u64avx& y) { return _mm256_min_epu64(*x, *y); }
KFR_SINTRIN i64avx max(const i64avx& x, const i64avx& y) { return _mm256_max_epi64(*x, *y); }
KFR_SINTRIN u64avx max(const u64avx& x, const u64avx& y) { return _mm256_max_epu64(*x, *y); }
+
+KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return _mm_min_epi64(*x, *y); }
+KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return _mm_min_epu64(*x, *y); }
+KFR_SINTRIN i64sse max(const i64sse& x, const i64sse& y) { return _mm_max_epi64(*x, *y); }
+KFR_SINTRIN u64sse max(const u64sse& x, const u64sse& y) { return _mm_max_epu64(*x, *y); }
#else
KFR_SINTRIN i64sse min(const i64sse& x, const i64sse& y) { return select(x < y, x, y); }
KFR_SINTRIN u64sse min(const u64sse& x, const u64sse& y) { return select(x < y, x, y); }
diff --git a/include/kfr/base/saturation.hpp b/include/kfr/base/saturation.hpp
@@ -103,6 +103,16 @@ KFR_SINTRIN u8avx satsub(const u8avx& x, const u8avx& y) { return _mm256_subs_ep
KFR_SINTRIN i8avx satsub(const i8avx& x, const i8avx& y) { return _mm256_subs_epi8(*x, *y); }
KFR_SINTRIN u16avx satsub(const u16avx& x, const u16avx& y) { return _mm256_subs_epu16(*x, *y); }
KFR_SINTRIN i16avx satsub(const i16avx& x, const i16avx& y) { return _mm256_subs_epi16(*x, *y); }
+
+KFR_SINTRIN i32avx satadd(const i32avx& a, const i32avx& b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN i64avx satadd(const i64avx& a, const i64avx& b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN u32avx satadd(const u32avx& a, const u32avx& b) { return saturated_unsigned_add(a, b); }
+KFR_SINTRIN u64avx satadd(const u64avx& a, const u64avx& b) { return saturated_unsigned_add(a, b); }
+
+KFR_SINTRIN i32avx satsub(const i32avx& a, const i32avx& b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN i64avx satsub(const i64avx& a, const i64avx& b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN u32avx satsub(const u32avx& a, const u32avx& b) { return saturated_unsigned_sub(a, b); }
+KFR_SINTRIN u64avx satsub(const u64avx& a, const u64avx& b) { return saturated_unsigned_sub(a, b); }
#endif
#if defined CMT_ARCH_AVX512
@@ -114,6 +124,15 @@ KFR_SINTRIN u8avx512 satsub(const u8avx512& x, const u8avx512& y) { return _mm51
KFR_SINTRIN i8avx512 satsub(const i8avx512& x, const i8avx512& y) { return _mm512_subs_epi8(*x, *y); }
KFR_SINTRIN u16avx512 satsub(const u16avx512& x, const u16avx512& y) { return _mm512_subs_epu16(*x, *y); }
KFR_SINTRIN i16avx512 satsub(const i16avx512& x, const i16avx512& y) { return _mm512_subs_epi16(*x, *y); }
+
+KFR_SINTRIN i32avx512 satadd(const i32avx512& a, const i32avx512& b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN i64avx512 satadd(const i64avx512& a, const i64avx512& b) { return saturated_signed_add(a, b); }
+KFR_SINTRIN u32avx512 satadd(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_add(a, b); }
+KFR_SINTRIN u64avx512 satadd(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_add(a, b); }
+KFR_SINTRIN i32avx512 satsub(const i32avx512& a, const i32avx512& b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN i64avx512 satsub(const i64avx512& a, const i64avx512& b) { return saturated_signed_sub(a, b); }
+KFR_SINTRIN u32avx512 satsub(const u32avx512& a, const u32avx512& b) { return saturated_unsigned_sub(a, b); }
+KFR_SINTRIN u64avx512 satsub(const u64avx512& a, const u64avx512& b) { return saturated_unsigned_sub(a, b); }
#endif
KFR_HANDLE_ALL_SIZES_2(satadd)
diff --git a/include/kfr/base/select.hpp b/include/kfr/base/select.hpp
@@ -136,15 +136,15 @@ KFR_SINTRIN u8avx512 select(const maskfor<u8avx512>& m, const u8avx512& x, const
}
KFR_SINTRIN u16avx512 select(const maskfor<u16avx512>& m, const u16avx512& x, const u16avx512& y)
{
- return _mm512_mask_blend_epi8(_mm512_test_epi16_mask(*m, *m), *y, *x);
+ return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x);
}
KFR_SINTRIN u32avx512 select(const maskfor<u32avx512>& m, const u32avx512& x, const u32avx512& y)
{
- return _mm512_mask_blend_epi8(_mm512_test_epi32_mask(*m, *m), *y, *x);
+ return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x);
}
KFR_SINTRIN u64avx512 select(const maskfor<u64avx512>& m, const u64avx512& x, const u64avx512& y)
{
- return _mm512_mask_blend_epi8(_mm512_test_epi64_mask(*m, *m), *y, *x);
+ return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x);
}
KFR_SINTRIN i8avx512 select(const maskfor<i8avx512>& m, const i8avx512& x, const i8avx512& y)
{
@@ -152,15 +152,15 @@ KFR_SINTRIN i8avx512 select(const maskfor<i8avx512>& m, const i8avx512& x, const
}
KFR_SINTRIN i16avx512 select(const maskfor<i16avx512>& m, const i16avx512& x, const i16avx512& y)
{
- return _mm512_mask_blend_epi8(_mm512_test_epi16_mask(*m, *m), *y, *x);
+ return _mm512_mask_blend_epi16(_mm512_test_epi16_mask(*m, *m), *y, *x);
}
KFR_SINTRIN i32avx512 select(const maskfor<i32avx512>& m, const i32avx512& x, const i32avx512& y)
{
- return _mm512_mask_blend_epi8(_mm512_test_epi32_mask(*m, *m), *y, *x);
+ return _mm512_mask_blend_epi32(_mm512_test_epi32_mask(*m, *m), *y, *x);
}
KFR_SINTRIN i64avx512 select(const maskfor<i64avx512>& m, const i64avx512& x, const i64avx512& y)
{
- return _mm512_mask_blend_epi8(_mm512_test_epi64_mask(*m, *m), *y, *x);
+ return _mm512_mask_blend_epi64(_mm512_test_epi64_mask(*m, *m), *y, *x);
}
#endif