SIMD: Add sum intrinsics for float/double.#17681
Conversation
|
@seiko2plus looks ok? |
|
|
||
| // Horizontal add: Calculates the sum of all vector elements. | ||
| NPY_FINLINE float npyv_sum_f32(float32x4_t a) | ||
| { | ||
| float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); | ||
| return vget_lane_f32(vpadd_f32(r, r), 0); | ||
| } | ||
| #ifdef __aarch64__ | ||
| NPY_FINLINE double npyv_sum_f64(float64x2_t a) | ||
| { | ||
| return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); | ||
| } | ||
| #endif |
There was a problem hiding this comment.
| // Horizontal add: Calculates the sum of all vector elements. | |
| NPY_FINLINE float npyv_sum_f32(float32x4_t a) | |
| { | |
| float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); | |
| return vget_lane_f32(vpadd_f32(r, r), 0); | |
| } | |
| #ifdef __aarch64__ | |
| NPY_FINLINE double npyv_sum_f64(float64x2_t a) | |
| { | |
| return vget_lane_f64(vget_low_f64(a) + vget_high_f64(a), 0); | |
| } | |
| #endif | |
| // Horizontal add: Calculates the sum of all vector elements. | |
| #if NPY_SIMD_F64 | |
| #define npyv_sum_f32 vaddvq_f32 | |
| #define npyv_sum_f64 vaddvq_f64 | |
| #else | |
| NPY_FINLINE float npyv_sum_f32(npyv_f32 a) | |
| { | |
| float32x2_t r = vadd_f32(vget_high_f32(a), vget_low_f32(a)); | |
| return vget_lane_f32(vpadd_f32(r, r), 0); | |
| } | |
| #endif |
EDIT: bring vpadd_f32 again as it was, It should perform better than extracting two scalars.
| NPY_FINLINE float npyv_sum_f32(npyv_f32 a) | ||
| { | ||
| return vec_extract(a, 0) + vec_extract(a, 1) + | ||
| vec_extract(a, 2) + vec_extract(a, 3); | ||
| } |
There was a problem hiding this comment.
| NPY_FINLINE float npyv_sum_f32(npyv_f32 a) | |
| { | |
| return vec_extract(a, 0) + vec_extract(a, 1) + | |
| vec_extract(a, 2) + vec_extract(a, 3); | |
| } | |
| NPY_FINLINE float npyv_sum_f32(npyv_f32 a) | |
| { | |
| npyv_f32 sum = vec_add(a, npyv_combineh_f32(a, a)); | |
| return vec_extract(sum, 0) + vec_extract(sum, 1); | |
| } |
EDIT: my bad, fix swapping the highest half
| #endif // !NPY_HAVE_FMA3 | ||
| #endif // _NPY_SIMD_AVX2_ARITHMETIC_H | ||
|
|
||
| // Horizontal add: Calculates the sum of all vector elements. |
There was a problem hiding this comment.
please, move the intrinsics inside the header guard _NPY_SIMD_AVX2_ARITHMETIC_H,
same thing for other SIMD extensions.
|
I prefer adding a testing unit for any new intrinsics to keep things under control. // try to follow the current definitions in the way of sorting the source
// this is how you should define the new python methods
SIMD_IMPL_INTRIN_1(sum_f32, f32, vf32)
#if NPY_SIMD_F64
SIMD_IMPL_INTRIN_1(sum_f64, f64, vf64)
#endif
// and this is how we attach them
SIMD_INTRIN_DEF(sum_f32)
#if NPY_SIMD_F64
SIMD_INTRIN_DEF(sum_f64)
#endif
Once you get done bringing the new methods in, add a test like:
def test_reduce(self):
data = self._data()
vdata = self.load(data)
# reduce sum
data_sum = sum(data)
vsum = self.sum(vdata)
assert vsum == data_sum
You can also have some fun and try to use:
# 1- bring the baseline via dict `targets` or attribute `baseline`
from numpy.core._simd import targets, baseline
npyv = targets["baseline"] # or baseline
# you can also dump `targets` to get the supported SIMD extensions
# by default, the build option `--simd-test` contains the most common SIMD extensions
if not npyv.simd: # equivalent to C def `NPY_SIMD`
print((
"How that possible? changed the default build settings?\n"
"maybe you running it under armhf, then get targets['NEON']"
))
return
a = npyv.load_f32(range(npyv.nlanes_f32))
print("sum of f32", npyv.sum_f32(a))
if npyv.simd_f64: # equivalent to C def `NPY_SIMD_F64`
b = npyv.load_f64(range(npyv.nlanes_f64))
    print("sum of f64", npyv.sum_f64(b))
EDIT: improve the examples |
|
@seiko2plus Thanks for your detailed recommendations, The |
|
|
||
| data_sum = sum(data) | ||
| vsum = self.sum(vdata) | ||
| assert vsum == data_sum |
|
Thanks @Qiyu8 |
The original PR is too large to review, so I will split it into several small PRs. Here are the sum intrinsics that were about to be used in
einsum. The intrinsics have been fully discussed and tested.