16#if defined(HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_) == \
17 defined(HWY_TARGET_TOGGLE)
18#ifdef HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_
19#undef HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_
21#define HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_
34template <
class DERIVED,
typename IN_T,
typename OUT_T>
39 DERIVED*
me() {
return static_cast<DERIVED*
>(
this); }
55 return me()->Func(idx, x, y);
67 return me()->LoadImpl(idx, from);
80 X_VEC MaskLoad(
const ptrdiff_t idx, IN_T* from,
const ptrdiff_t places) {
81 return me()->MaskLoadImpl(idx, from, places);
88 static_cast<size_t>(places +
static_cast<ptrdiff_t
>(
ActualLanes()))));
89 if (places < 0) mask = maskneg;
95 return me()->StoreAndShortCircuitImpl(idx, to, x);
104 ptrdiff_t
const places) {
105 return me()->MaskStoreImpl(idx, to, x, places);
109 const ptrdiff_t places) {
113 static_cast<size_t>(places +
static_cast<ptrdiff_t
>(
ActualLanes()))));
114 if (places < 0) mask = maskneg;
117 return std::abs(places);
120 ptrdiff_t
Reduce(
const Y_VEC x, OUT_T* to) {
return me()->ReduceImpl(x, to); }
130 me()->ReduceImpl(x0, x1, x2, y);
142template <
class DERIVED,
typename IN0_T,
typename IN1_T,
typename OUT_T>
144 DERIVED*
me() {
return static_cast<DERIVED*
>(
this); }
169 return me()->Func(idx, x0, x1, y);
185 return me()->Load0Impl(idx, from);
193 return me()->Load1Impl(idx, from);
207 return me()->MaskLoad0Impl(idx, from, places);
211 const ptrdiff_t places) {
215 static_cast<size_t>(places +
static_cast<ptrdiff_t
>(
ActualLanes()))));
216 if (places < 0) mask = maskneg;
222 const ptrdiff_t places) {
223 return me()->MaskLoad1Impl(idx, from, places);
227 const ptrdiff_t places) {
231 static_cast<size_t>(places +
static_cast<ptrdiff_t
>(
ActualLanes()))));
232 if (places < 0) mask = maskneg;
239 return me()->StoreAndShortCircuitImpl(idx, to, x);
248 const ptrdiff_t places) {
249 return me()->MaskStoreImpl(idx, to, x, places);
253 const ptrdiff_t places) {
257 static_cast<size_t>(places +
static_cast<ptrdiff_t
>(
ActualLanes()))));
258 if (places < 0) mask = maskneg;
261 return std::abs(places);
264 ptrdiff_t
Reduce(
const Y_VEC x, OUT_T* to) {
return me()->ReduceImpl(x, to); }
274 me()->ReduceImpl(x0, x1, x2, y);
286template <
class FUNC,
typename IN_T,
typename OUT_T>
289 auto xx = f.X0Init();
293#if HWY_MEM_OPS_MIGHT_FAULT
294 constexpr auto lane_sz =
300 HWY_ALIGN IN_T xtmp[
static_cast<size_t>(lane_sz)];
301 HWY_ALIGN OUT_T ytmp[
static_cast<size_t>(lane_sz)];
303 CopyBytes(x, xtmp,
static_cast<size_t>(n) *
sizeof(IN_T));
304 xx = f.MaskLoad(0, xtmp, n);
305 yy = f.Func(0, xx, yy);
307 i += f.MaskStore(0, ytmp, yy, n);
308 i += f.Reduce(yy, ytmp);
309 CopyBytes(ytmp, y,
static_cast<size_t>(i) *
sizeof(OUT_T));
314 const ptrdiff_t actual_lanes =
316 if (n > 4 * actual_lanes) {
317 auto xx1 = f.X0Init();
318 auto yy1 = f.YInit();
319 auto xx2 = f.X0Init();
320 auto yy2 = f.YInit();
321 auto xx3 = f.X0Init();
322 auto yy3 = f.YInit();
324 while (i + 4 * actual_lanes - 1 < n) {
332 i -= 3 * actual_lanes;
334 yy = f.Func(i, xx, yy);
335 yy1 = f.Func(i + actual_lanes, xx1, yy1);
336 yy2 = f.Func(i + 2 * actual_lanes, xx2, yy2);
337 yy3 = f.Func(i + 3 * actual_lanes, xx3, yy3);
339 if (!f.StoreAndShortCircuit(i, y, yy))
return;
341 if (!f.StoreAndShortCircuit(i, y, yy1))
return;
343 if (!f.StoreAndShortCircuit(i, y, yy2))
return;
345 if (!f.StoreAndShortCircuit(i, y, yy3))
return;
349 f.Reduce(yy3, yy2, yy1, &yy);
352 while (i + actual_lanes - 1 < n) {
354 yy = f.Func(i, xx, yy);
355 if (!f.StoreAndShortCircuit(i, y, yy))
return;
360 xx = f.MaskLoad(n - actual_lanes, x, i - n);
361 yy = f.Func(n - actual_lanes, xx, yy);
362 f.MaskStore(n - actual_lanes, y, yy, i - n);
368template <
class FUNC,
typename IN0_T,
typename IN1_T,
typename OUT_T>
372 const ptrdiff_t lane_sz =
375 auto xx00 = f.X0Init();
376 auto xx10 = f.X1Init();
381#if HWY_MEM_OPS_MIGHT_FAULT
386 constexpr auto max_lane_sz =
388 HWY_ALIGN IN0_T xtmp0[
static_cast<size_t>(max_lane_sz)];
389 HWY_ALIGN IN1_T xtmp1[
static_cast<size_t>(max_lane_sz)];
390 HWY_ALIGN OUT_T ytmp[
static_cast<size_t>(max_lane_sz)];
392 CopyBytes(x0, xtmp0,
static_cast<size_t>(n) *
sizeof(IN0_T));
393 CopyBytes(x1, xtmp1,
static_cast<size_t>(n) *
sizeof(IN1_T));
394 xx00 = f.MaskLoad0(0, xtmp0, n);
395 xx10 = f.MaskLoad1(0, xtmp1, n);
396 yy = f.Func(0, xx00, xx10, yy);
398 i += f.MaskStore(0, ytmp, yy, n);
399 i += f.Reduce(yy, ytmp);
400 CopyBytes(ytmp, y,
static_cast<size_t>(i) *
sizeof(OUT_T));
405 if (n > 4 * lane_sz) {
406 auto xx01 = f.X0Init();
407 auto xx11 = f.X1Init();
408 auto yy1 = f.YInit();
409 auto xx02 = f.X0Init();
410 auto xx12 = f.X1Init();
411 auto yy2 = f.YInit();
412 auto xx03 = f.X0Init();
413 auto xx13 = f.X1Init();
414 auto yy3 = f.YInit();
416 while (i + 4 * lane_sz - 1 < n) {
417 xx00 = f.Load0(i, x0);
418 xx10 = f.Load1(i, x1);
420 xx01 = f.Load0(i, x0);
421 xx11 = f.Load1(i, x1);
423 xx02 = f.Load0(i, x0);
424 xx12 = f.Load1(i, x1);
426 xx03 = f.Load0(i, x0);
427 xx13 = f.Load1(i, x1);
430 yy = f.Func(i, xx00, xx10, yy);
431 yy1 = f.Func(i + lane_sz, xx01, xx11, yy1);
432 yy2 = f.Func(i + 2 * lane_sz, xx02, xx12, yy2);
433 yy3 = f.Func(i + 3 * lane_sz, xx03, xx13, yy3);
435 if (!f.StoreAndShortCircuit(i, y, yy))
return;
437 if (!f.StoreAndShortCircuit(i, y, yy1))
return;
439 if (!f.StoreAndShortCircuit(i, y, yy2))
return;
441 if (!f.StoreAndShortCircuit(i, y, yy3))
return;
445 f.Reduce(yy3, yy2, yy1, &yy);
448 while (i + lane_sz - 1 < n) {
449 xx00 = f.Load0(i, x0);
450 xx10 = f.Load1(i, x1);
451 yy = f.Func(i, xx00, xx10, yy);
452 if (!f.StoreAndShortCircuit(i, y, yy))
return;
457 xx00 = f.MaskLoad0(n - lane_sz, x0, i - n);
458 xx10 = f.MaskLoad1(n - lane_sz, x1, i - n);
459 yy = f.Func(n - lane_sz, xx00, xx10, yy);
460 f.MaskStore(n - lane_sz, y, yy, i - n);
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_RESTRICT
Definition base.h:95
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
D d
Definition arm_sve-inl.h:1915
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
typename detail::CappedTagChecker< T, kLimit, kPow2 >::type CappedTag
Definition ops/shared-inl.h:379
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:367
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
void Unroller(FUNC &f, IN_T *HWY_RESTRICT x, OUT_T *HWY_RESTRICT y, const ptrdiff_t n)
Definition unroller-inl.h:287
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
typename detail::TypeFromSize< N >::Signed SignedFromSize
Definition base.h:2094
typename RemoveRefT< T >::type RemoveRef
Definition base.h:575
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_ALIGN
Definition set_macros-inl.h:167
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition ops/shared-inl.h:198
Definition unroller-inl.h:143
hn::Rebind< IN0_T, LargerD > I0T
Definition unroller-inl.h:157
I0T d_in0
Definition unroller-inl.h:160
X1_VEC Load1Impl(const ptrdiff_t idx, IN1_T *from)
Definition unroller-inl.h:196
X1_VEC Load1(const ptrdiff_t idx, IN1_T *from)
Definition unroller-inl.h:192
void ReduceImpl(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC *y)
Definition unroller-inl.h:277
X1_VEC X1InitImpl()
Definition unroller-inl.h:178
ptrdiff_t MaskStore(const ptrdiff_t idx, OUT_T *to, const Y_VEC x, const ptrdiff_t places)
Definition unroller-inl.h:247
X0_VEC Load0Impl(const ptrdiff_t idx, IN0_T *from)
Definition unroller-inl.h:188
hn::Vec< I1T > X1_VEC
Definition unroller-inl.h:165
static size_t ActualLanes()
Definition unroller-inl.h:153
X0_VEC MaskLoad0(const ptrdiff_t idx, IN0_T *from, const ptrdiff_t places)
Definition unroller-inl.h:206
hn::Vec< I1T > MaskLoad1Impl(const ptrdiff_t idx, IN1_T *from, const ptrdiff_t places)
Definition unroller-inl.h:226
X0_VEC Load0(const ptrdiff_t idx, IN0_T *from)
Definition unroller-inl.h:184
ptrdiff_t MaskStoreImpl(const ptrdiff_t idx, OUT_T *to, const Y_VEC x, const ptrdiff_t places)
Definition unroller-inl.h:252
X0_VEC X0InitImpl()
Definition unroller-inl.h:174
bool StoreAndShortCircuit(const ptrdiff_t idx, OUT_T *to, const Y_VEC x)
Definition unroller-inl.h:238
static constexpr size_t MaxUnitLanes()
Definition unroller-inl.h:150
X0_VEC X0Init()
Definition unroller-inl.h:172
ptrdiff_t ReduceImpl(const Y_VEC x, OUT_T *to)
Definition unroller-inl.h:266
hn::Rebind< OUT_T, LargerD > OT
Definition unroller-inl.h:159
Y_VEC YInitImpl()
Definition unroller-inl.h:182
OT d_out
Definition unroller-inl.h:162
hn::Rebind< IN1_T, LargerD > I1T
Definition unroller-inl.h:158
bool StoreAndShortCircuitImpl(const ptrdiff_t idx, OUT_T *to, const Y_VEC x)
Definition unroller-inl.h:242
X1_VEC X1Init()
Definition unroller-inl.h:176
SignedFromSize< kMaxTSize > LargerT
Definition unroller-inl.h:148
hn::Vec< I0T > X0_VEC
Definition unroller-inl.h:164
static constexpr size_t kMaxTSize
Definition unroller-inl.h:146
X0_VEC MaskLoad0Impl(const ptrdiff_t idx, IN0_T *from, const ptrdiff_t places)
Definition unroller-inl.h:210
hn::Vec< OT > Func(const ptrdiff_t idx, const hn::Vec< I0T > x0, const hn::Vec< I1T > x1, const Y_VEC y)
Definition unroller-inl.h:167
I1T d_in1
Definition unroller-inl.h:161
DERIVED * me()
Definition unroller-inl.h:144
hn::Vec< I1T > MaskLoad1(const ptrdiff_t idx, IN1_T *from, const ptrdiff_t places)
Definition unroller-inl.h:221
Y_VEC YInit()
Definition unroller-inl.h:180
hn::CappedTag< LargerT, MaxUnitLanes()> LargerD
Definition unroller-inl.h:155
void Reduce(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC *y)
Definition unroller-inl.h:273
hn::Vec< OT > Y_VEC
Definition unroller-inl.h:163
ptrdiff_t Reduce(const Y_VEC x, OUT_T *to)
Definition unroller-inl.h:264
Definition unroller-inl.h:35
bool StoreAndShortCircuitImpl(const ptrdiff_t idx, OUT_T *to, const Y_VEC x)
Definition unroller-inl.h:98
X_VEC MaskLoadImpl(const ptrdiff_t idx, IN_T *from, const ptrdiff_t places)
Definition unroller-inl.h:84
OT d_out
Definition unroller-inl.h:50
bool StoreAndShortCircuit(const ptrdiff_t idx, OUT_T *to, const Y_VEC x)
Definition unroller-inl.h:94
ptrdiff_t MaskStoreImpl(const ptrdiff_t idx, OUT_T *to, const Y_VEC x, const ptrdiff_t places)
Definition unroller-inl.h:108
void ReduceImpl(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC *y)
Definition unroller-inl.h:133
Y_VEC YInit()
Definition unroller-inl.h:62
X_VEC X0InitImpl()
Definition unroller-inl.h:60
Y_VEC YInitImpl()
Definition unroller-inl.h:64
hn::CappedTag< LargerT, MaxUnitLanes()> LargerD
Definition unroller-inl.h:46
hn::Vec< IT > X_VEC
Definition unroller-inl.h:52
X_VEC LoadImpl(const ptrdiff_t idx, IN_T *from)
Definition unroller-inl.h:70
void Reduce(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC *y)
Definition unroller-inl.h:129
ptrdiff_t Reduce(const Y_VEC x, OUT_T *to)
Definition unroller-inl.h:120
static size_t ActualLanes()
Definition unroller-inl.h:44
DERIVED * me()
Definition unroller-inl.h:39
X_VEC X0Init()
Definition unroller-inl.h:58
X_VEC MaskLoad(const ptrdiff_t idx, IN_T *from, const ptrdiff_t places)
Definition unroller-inl.h:80
X_VEC Load(const ptrdiff_t idx, IN_T *from)
Definition unroller-inl.h:66
hn::Rebind< OUT_T, LargerD > OT
Definition unroller-inl.h:48
ptrdiff_t MaskStore(const ptrdiff_t idx, OUT_T *to, const Y_VEC x, ptrdiff_t const places)
Definition unroller-inl.h:103
SignedFromSize< kMaxTSize > LargerT
Definition unroller-inl.h:37
Y_VEC Func(const ptrdiff_t idx, const X_VEC x, const Y_VEC y)
Definition unroller-inl.h:54
static constexpr size_t kMaxTSize
Definition unroller-inl.h:36
hn::Rebind< IN_T, LargerD > IT
Definition unroller-inl.h:47
static constexpr size_t MaxUnitLanes()
Definition unroller-inl.h:41
ptrdiff_t ReduceImpl(const Y_VEC x, OUT_T *to)
Definition unroller-inl.h:122
hn::Vec< OT > Y_VEC
Definition unroller-inl.h:51
IT d_in
Definition unroller-inl.h:49