Grok 12.0.1
unroller-inl.h
Go to the documentation of this file.
1// Copyright 2023 Matthew Kolbe
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16#if defined(HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_) == \
17 defined(HWY_TARGET_TOGGLE)
18#ifdef HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_
19#undef HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_
20#else
21#define HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_
22#endif
23
24#include <cstdlib> // std::abs
25
26#include "hwy/highway.h"
27
29namespace hwy {
30namespace HWY_NAMESPACE {
31
32namespace hn = hwy::HWY_NAMESPACE;
33
// CRTP-style base for single-input kernels driven by Unroller(): each public
// hook casts `this` to DERIVED (see me()) and calls the matching function on
// it, so DERIVED may replace any *Impl default defined below.
// NOTE(review): this listing omits several original lines of the class:
// 35 (the class-head, presumably `struct UnrollerUnit {`), 42 (the body of
// MaxUnitLanes()), 46-52 (the LargerD/IT/OT aliases, the d_in/d_out tag
// members and the X_VEC/Y_VEC aliases), 60 (X0InitImpl) and 64 (YInitImpl).
// Confirm against the real header before relying on the notes below.
34template <class DERIVED, typename IN_T, typename OUT_T>
// Size in bytes of the wider of the input and output element types.
36 static constexpr size_t kMaxTSize = HWY_MAX(sizeof(IN_T), sizeof(OUT_T));
37 using LargerT = SignedFromSize<kMaxTSize>; // only the size matters.
38
// Downcast used by every forwarding hook below.
39 DERIVED* me() { return static_cast<DERIVED*>(this); }
40
// Compile-time lane count; Unroller() uses it to size its stack buffers.
// NOTE(review): the return statement (original line 42) is not in this listing.
41 static constexpr size_t MaxUnitLanes() {
43 }
// Lane count of a full ScalableTag<LargerT> vector, queried via Lanes().
44 static size_t ActualLanes() { return Lanes(hn::ScalableTag<LargerT>()); }
45
53
// Per-vector kernel: receives the element index `idx` of the first lane, the
// loaded input `x` and the previous output/accumulator `y`, and returns the
// new output vector. There is no default: if DERIVED does not declare its own
// Func, the call below resolves back to this function.
54 Y_VEC Func(const ptrdiff_t idx, const X_VEC x, const Y_VEC y) {
55 return me()->Func(idx, x, y);
56 }
57
// Initial value of the input vector; forwards to X0InitImpl (original line
// 60, not shown in this listing).
58 X_VEC X0Init() { return me()->X0InitImpl(); }
59
61
// Initial value of the output/accumulator vector; forwards to YInitImpl
// (original line 64, not shown in this listing).
62 Y_VEC YInit() { return me()->YInitImpl(); }
63
65
// Loads one full input vector starting at element `from + idx`.
66 X_VEC Load(const ptrdiff_t idx, IN_T* from) {
67 return me()->LoadImpl(idx, from);
68 }
69
// Default load: unaligned full-vector load.
70 X_VEC LoadImpl(const ptrdiff_t idx, IN_T* from) {
71 return hn::LoadU(d_in, from + idx);
72 }
73
74 // MaskLoad accepts either a positive or a negative `places`. If `places` is
75 // positive, only the first `places` lanes are loaded; if it is negative,
76 // only the last |places| lanes are loaded. Example: places = 3
77 // | o | o | o | x | x | x | x | x |
78 // example places = -3
79 // | x | x | x | x | x | o | o | o |
80 X_VEC MaskLoad(const ptrdiff_t idx, IN_T* from, const ptrdiff_t places) {
81 return me()->MaskLoadImpl(idx, from, places);
82 }
83
// Default masked load. Both masks are built unconditionally and the negative
// one is selected afterwards; for places < 0 the cast on the first line wraps
// to a huge size_t, but that mask is then discarded.
84 X_VEC MaskLoadImpl(const ptrdiff_t idx, IN_T* from, const ptrdiff_t places) {
85 auto mask = hn::FirstN(d_in, static_cast<size_t>(places));
// Not(FirstN(lanes - |places|)) keeps exactly the last |places| lanes.
86 auto maskneg = hn::Not(hn::FirstN(
87 d_in,
88 static_cast<size_t>(places + static_cast<ptrdiff_t>(ActualLanes()))));
89 if (places < 0) mask = maskneg;
90
91 return hn::MaskedLoad(mask, d_in, from + idx);
92 }
93
// Stores one full output vector at `to + idx`. Unroller() returns immediately
// when this yields false, so an override can use it to stop early.
94 bool StoreAndShortCircuit(const ptrdiff_t idx, OUT_T* to, const Y_VEC x) {
95 return me()->StoreAndShortCircuitImpl(idx, to, x);
96 }
97
// Default store: unaligned full-vector store, never requests an early exit.
98 bool StoreAndShortCircuitImpl(const ptrdiff_t idx, OUT_T* to, const Y_VEC x) {
99 hn::StoreU(x, d_out, to + idx);
100 return true;
101 }
102
// Stores only the lanes selected by `places` (same sign convention as
// MaskLoad) and returns how many elements were written.
103 ptrdiff_t MaskStore(const ptrdiff_t idx, OUT_T* to, const Y_VEC x,
104 ptrdiff_t const places) {
105 return me()->MaskStoreImpl(idx, to, x, places);
106 }
107
// Default masked store: builds the mask exactly like MaskLoadImpl, writes the
// selected lanes with BlendedStore and reports |places|.
108 ptrdiff_t MaskStoreImpl(const ptrdiff_t idx, OUT_T* to, const Y_VEC x,
109 const ptrdiff_t places) {
110 auto mask = hn::FirstN(d_out, static_cast<size_t>(places));
111 auto maskneg = hn::Not(hn::FirstN(
112 d_out,
113 static_cast<size_t>(places + static_cast<ptrdiff_t>(ActualLanes()))));
114 if (places < 0) mask = maskneg;
115
116 hn::BlendedStore(x, mask, d_out, to + idx);
117 return std::abs(places);
118 }
119
// Final reduction hook, called by Unroller() once all vectors are processed.
// The returned count is added to the number of output elements Unroller()
// copies back in its small-input path.
120 ptrdiff_t Reduce(const Y_VEC x, OUT_T* to) { return me()->ReduceImpl(x, to); }
121
122 ptrdiff_t ReduceImpl(const Y_VEC x, OUT_T* to) {
123 // default does nothing
124 (void)x;
125 (void)to;
126 return 0;
127 }
128
// Hook for merging the three extra accumulators of the 4x-unrolled loop into
// `*y`; Unroller() calls it as Reduce(yy3, yy2, yy1, &yy).
129 void Reduce(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC* y) {
130 me()->ReduceImpl(x0, x1, x2, y);
131 }
132
133 void ReduceImpl(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC* y) {
134 // default does nothing
135 (void)x0;
136 (void)x1;
137 (void)x2;
138 (void)y;
139 }
140};
141
// CRTP-style base for two-input kernels driven by the four-pointer Unroller()
// overload: each public hook casts `this` to DERIVED (see me()) and calls the
// matching function on it, so DERIVED may replace any *Impl default below.
// NOTE(review): this listing omits several original lines of the class:
// 143 (the class-head, presumably `struct UnrollerUnit2D {`), 151 (the body
// of MaxUnitLanes()), 155 and 157-165 (the LargerD/I0T/I1T/OT aliases, the
// d_in0/d_in1/d_out tag members and the X0_VEC/X1_VEC/Y_VEC aliases), and
// 174/178/182 (X0InitImpl, X1InitImpl, YInitImpl). Confirm against the real
// header before relying on the notes below.
142template <class DERIVED, typename IN0_T, typename IN1_T, typename OUT_T>
// Downcast used by every forwarding hook below.
144 DERIVED* me() { return static_cast<DERIVED*>(this); }
145
// Size in bytes of the widest of the two input types and the output type.
146 static constexpr size_t kMaxTSize =
147 HWY_MAX(sizeof(IN0_T), HWY_MAX(sizeof(IN1_T), sizeof(OUT_T)));
148 using LargerT = SignedFromSize<kMaxTSize>; // only the size matters.
149
// Compile-time lane count; Unroller() uses it to size its stack buffers.
// NOTE(review): the return statement (original line 151) is not in this
// listing.
150 static constexpr size_t MaxUnitLanes() {
152 }
// Lane count of a full ScalableTag<LargerT> vector, queried via Lanes().
153 static size_t ActualLanes() { return Lanes(hn::ScalableTag<LargerT>()); }
154
156
166
// Per-vector kernel: receives the element index `idx` of the first lane, both
// loaded inputs and the previous output/accumulator `y`, and returns the new
// output vector. There is no default: if DERIVED does not declare its own
// Func, the call below resolves back to this function.
167 hn::Vec<OT> Func(const ptrdiff_t idx, const hn::Vec<I0T> x0,
168 const hn::Vec<I1T> x1, const Y_VEC y) {
169 return me()->Func(idx, x0, x1, y);
170 }
171
// Initial value of the first input vector; forwards to X0InitImpl (original
// line 174, not shown in this listing).
172 X0_VEC X0Init() { return me()->X0InitImpl(); }
173
175
// Initial value of the second input vector; forwards to X1InitImpl (original
// line 178, not shown in this listing).
176 X1_VEC X1Init() { return me()->X1InitImpl(); }
177
179
// Initial value of the output/accumulator vector; forwards to YInitImpl
// (original line 182, not shown in this listing).
180 Y_VEC YInit() { return me()->YInitImpl(); }
181
183
// Loads one full vector of the first input starting at `from + idx`.
184 X0_VEC Load0(const ptrdiff_t idx, IN0_T* from) {
185 return me()->Load0Impl(idx, from);
186 }
187
// Default: unaligned full-vector load.
188 X0_VEC Load0Impl(const ptrdiff_t idx, IN0_T* from) {
189 return hn::LoadU(d_in0, from + idx);
190 }
191
// Loads one full vector of the second input starting at `from + idx`.
192 X1_VEC Load1(const ptrdiff_t idx, IN1_T* from) {
193 return me()->Load1Impl(idx, from);
194 }
195
// Default: unaligned full-vector load.
196 X1_VEC Load1Impl(const ptrdiff_t idx, IN1_T* from) {
197 return hn::LoadU(d_in1, from + idx);
198 }
199
200 // MaskLoad0/MaskLoad1 accept either a positive or a negative `places`. If
201 // it is positive, only the first `places` lanes are loaded; if it is
202 // negative, only the last |places| lanes are loaded. Example: places = 3
203 // | o | o | o | x | x | x | x | x |
204 // example places = -3
205 // | x | x | x | x | x | o | o | o |
206 X0_VEC MaskLoad0(const ptrdiff_t idx, IN0_T* from, const ptrdiff_t places) {
207 return me()->MaskLoad0Impl(idx, from, places);
208 }
209
// Default masked load of the first input. Both masks are built
// unconditionally and the negative one is selected afterwards; for
// places < 0 the cast on the first line wraps to a huge size_t, but that mask
// is then discarded.
210 X0_VEC MaskLoad0Impl(const ptrdiff_t idx, IN0_T* from,
211 const ptrdiff_t places) {
212 auto mask = hn::FirstN(d_in0, static_cast<size_t>(places));
// Not(FirstN(lanes - |places|)) keeps exactly the last |places| lanes.
213 auto maskneg = hn::Not(hn::FirstN(
214 d_in0,
215 static_cast<size_t>(places + static_cast<ptrdiff_t>(ActualLanes()))));
216 if (places < 0) mask = maskneg;
217
218 return hn::MaskedLoad(mask, d_in0, from + idx);
219 }
220
// Masked load of the second input; same `places` convention as MaskLoad0.
221 hn::Vec<I1T> MaskLoad1(const ptrdiff_t idx, IN1_T* from,
222 const ptrdiff_t places) {
223 return me()->MaskLoad1Impl(idx, from, places);
224 }
225
// Default masked load of the second input; mirrors MaskLoad0Impl with d_in1.
226 hn::Vec<I1T> MaskLoad1Impl(const ptrdiff_t idx, IN1_T* from,
227 const ptrdiff_t places) {
228 auto mask = hn::FirstN(d_in1, static_cast<size_t>(places));
229 auto maskneg = hn::Not(hn::FirstN(
230 d_in1,
231 static_cast<size_t>(places + static_cast<ptrdiff_t>(ActualLanes()))));
232 if (places < 0) mask = maskneg;
233
234 return hn::MaskedLoad(mask, d_in1, from + idx);
235 }
236
237 // The store returns a bool that is `false` when Unroller() should stop.
// Unroller() returns immediately on `false`, without running the remaining
// loops, the masked tail or the final Reduce. The default always returns true.
238 bool StoreAndShortCircuit(const ptrdiff_t idx, OUT_T* to, const Y_VEC x) {
239 return me()->StoreAndShortCircuitImpl(idx, to, x);
240 }
241
// Default store: unaligned full-vector store, never requests an early exit.
242 bool StoreAndShortCircuitImpl(const ptrdiff_t idx, OUT_T* to, const Y_VEC x) {
243 hn::StoreU(x, d_out, to + idx);
244 return true;
245 }
246
// Stores only the lanes selected by `places` (same sign convention as the
// masked loads) and returns how many elements were written.
247 ptrdiff_t MaskStore(const ptrdiff_t idx, OUT_T* to, const Y_VEC x,
248 const ptrdiff_t places) {
249 return me()->MaskStoreImpl(idx, to, x, places);
250 }
251
// Default masked store: builds the mask exactly like the masked loads, writes
// the selected lanes with BlendedStore and reports |places|.
252 ptrdiff_t MaskStoreImpl(const ptrdiff_t idx, OUT_T* to, const Y_VEC x,
253 const ptrdiff_t places) {
254 auto mask = hn::FirstN(d_out, static_cast<size_t>(places));
255 auto maskneg = hn::Not(hn::FirstN(
256 d_out,
257 static_cast<size_t>(places + static_cast<ptrdiff_t>(ActualLanes()))));
258 if (places < 0) mask = maskneg;
259
260 hn::BlendedStore(x, mask, d_out, to + idx);
261 return std::abs(places);
262 }
263
// Final reduction hook, called by Unroller() once all vectors are processed.
// The returned count is added to the number of output elements Unroller()
// copies back in its small-input path.
264 ptrdiff_t Reduce(const Y_VEC x, OUT_T* to) { return me()->ReduceImpl(x, to); }
265
266 ptrdiff_t ReduceImpl(const Y_VEC x, OUT_T* to) {
267 // default does nothing
268 (void)x;
269 (void)to;
270 return 0;
271 }
272
// Hook for merging the three extra accumulators of the 4x-unrolled loop into
// `*y`; Unroller() calls it as Reduce(yy3, yy2, yy1, &yy).
273 void Reduce(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC* y) {
274 me()->ReduceImpl(x0, x1, x2, y);
275 }
276
277 void ReduceImpl(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC* y) {
278 // default does nothing
279 (void)x0;
280 (void)x1;
281 (void)x2;
282 (void)y;
283 }
284};
285
286template <class FUNC, typename IN_T, typename OUT_T>
287inline void Unroller(FUNC& f, IN_T* HWY_RESTRICT x, OUT_T* HWY_RESTRICT y,
288 const ptrdiff_t n) {
289 auto xx = f.X0Init();
290 auto yy = f.YInit();
291 ptrdiff_t i = 0;
292
293#if HWY_MEM_OPS_MIGHT_FAULT
294 constexpr auto lane_sz =
295 static_cast<ptrdiff_t>(RemoveRef<FUNC>::MaxUnitLanes());
296 if (n < lane_sz) {
297 const DFromV<decltype(yy)> d;
298 // this may not fit on the stack for HWY_RVV, but we do not reach this code
299 // there
300 HWY_ALIGN IN_T xtmp[static_cast<size_t>(lane_sz)];
301 HWY_ALIGN OUT_T ytmp[static_cast<size_t>(lane_sz)];
302
303 CopyBytes(x, xtmp, static_cast<size_t>(n) * sizeof(IN_T));
304 xx = f.MaskLoad(0, xtmp, n);
305 yy = f.Func(0, xx, yy);
306 Store(Zero(d), d, ytmp);
307 i += f.MaskStore(0, ytmp, yy, n);
308 i += f.Reduce(yy, ytmp);
309 CopyBytes(ytmp, y, static_cast<size_t>(i) * sizeof(OUT_T));
310 return;
311 }
312#endif
313
314 const ptrdiff_t actual_lanes =
315 static_cast<ptrdiff_t>(RemoveRef<FUNC>::ActualLanes());
316 if (n > 4 * actual_lanes) {
317 auto xx1 = f.X0Init();
318 auto yy1 = f.YInit();
319 auto xx2 = f.X0Init();
320 auto yy2 = f.YInit();
321 auto xx3 = f.X0Init();
322 auto yy3 = f.YInit();
323
324 while (i + 4 * actual_lanes - 1 < n) {
325 xx = f.Load(i, x);
326 i += actual_lanes;
327 xx1 = f.Load(i, x);
328 i += actual_lanes;
329 xx2 = f.Load(i, x);
330 i += actual_lanes;
331 xx3 = f.Load(i, x);
332 i -= 3 * actual_lanes;
333
334 yy = f.Func(i, xx, yy);
335 yy1 = f.Func(i + actual_lanes, xx1, yy1);
336 yy2 = f.Func(i + 2 * actual_lanes, xx2, yy2);
337 yy3 = f.Func(i + 3 * actual_lanes, xx3, yy3);
338
339 if (!f.StoreAndShortCircuit(i, y, yy)) return;
340 i += actual_lanes;
341 if (!f.StoreAndShortCircuit(i, y, yy1)) return;
342 i += actual_lanes;
343 if (!f.StoreAndShortCircuit(i, y, yy2)) return;
344 i += actual_lanes;
345 if (!f.StoreAndShortCircuit(i, y, yy3)) return;
346 i += actual_lanes;
347 }
348
349 f.Reduce(yy3, yy2, yy1, &yy);
350 }
351
352 while (i + actual_lanes - 1 < n) {
353 xx = f.Load(i, x);
354 yy = f.Func(i, xx, yy);
355 if (!f.StoreAndShortCircuit(i, y, yy)) return;
356 i += actual_lanes;
357 }
358
359 if (i != n) {
360 xx = f.MaskLoad(n - actual_lanes, x, i - n);
361 yy = f.Func(n - actual_lanes, xx, yy);
362 f.MaskStore(n - actual_lanes, y, yy, i - n);
363 }
364
365 f.Reduce(yy, y);
366}
367
368template <class FUNC, typename IN0_T, typename IN1_T, typename OUT_T>
369inline void Unroller(FUNC& HWY_RESTRICT f, IN0_T* HWY_RESTRICT x0,
370 IN1_T* HWY_RESTRICT x1, OUT_T* HWY_RESTRICT y,
371 const ptrdiff_t n) {
372 const ptrdiff_t lane_sz =
373 static_cast<ptrdiff_t>(RemoveRef<FUNC>::ActualLanes());
374
375 auto xx00 = f.X0Init();
376 auto xx10 = f.X1Init();
377 auto yy = f.YInit();
378
379 ptrdiff_t i = 0;
380
381#if HWY_MEM_OPS_MIGHT_FAULT
382 if (n < lane_sz) {
383 const DFromV<decltype(yy)> d;
384 // this may not fit on the stack for HWY_RVV, but we do not reach this code
385 // there
386 constexpr auto max_lane_sz =
387 static_cast<ptrdiff_t>(RemoveRef<FUNC>::MaxUnitLanes());
388 HWY_ALIGN IN0_T xtmp0[static_cast<size_t>(max_lane_sz)];
389 HWY_ALIGN IN1_T xtmp1[static_cast<size_t>(max_lane_sz)];
390 HWY_ALIGN OUT_T ytmp[static_cast<size_t>(max_lane_sz)];
391
392 CopyBytes(x0, xtmp0, static_cast<size_t>(n) * sizeof(IN0_T));
393 CopyBytes(x1, xtmp1, static_cast<size_t>(n) * sizeof(IN1_T));
394 xx00 = f.MaskLoad0(0, xtmp0, n);
395 xx10 = f.MaskLoad1(0, xtmp1, n);
396 yy = f.Func(0, xx00, xx10, yy);
397 Store(Zero(d), d, ytmp);
398 i += f.MaskStore(0, ytmp, yy, n);
399 i += f.Reduce(yy, ytmp);
400 CopyBytes(ytmp, y, static_cast<size_t>(i) * sizeof(OUT_T));
401 return;
402 }
403#endif
404
405 if (n > 4 * lane_sz) {
406 auto xx01 = f.X0Init();
407 auto xx11 = f.X1Init();
408 auto yy1 = f.YInit();
409 auto xx02 = f.X0Init();
410 auto xx12 = f.X1Init();
411 auto yy2 = f.YInit();
412 auto xx03 = f.X0Init();
413 auto xx13 = f.X1Init();
414 auto yy3 = f.YInit();
415
416 while (i + 4 * lane_sz - 1 < n) {
417 xx00 = f.Load0(i, x0);
418 xx10 = f.Load1(i, x1);
419 i += lane_sz;
420 xx01 = f.Load0(i, x0);
421 xx11 = f.Load1(i, x1);
422 i += lane_sz;
423 xx02 = f.Load0(i, x0);
424 xx12 = f.Load1(i, x1);
425 i += lane_sz;
426 xx03 = f.Load0(i, x0);
427 xx13 = f.Load1(i, x1);
428 i -= 3 * lane_sz;
429
430 yy = f.Func(i, xx00, xx10, yy);
431 yy1 = f.Func(i + lane_sz, xx01, xx11, yy1);
432 yy2 = f.Func(i + 2 * lane_sz, xx02, xx12, yy2);
433 yy3 = f.Func(i + 3 * lane_sz, xx03, xx13, yy3);
434
435 if (!f.StoreAndShortCircuit(i, y, yy)) return;
436 i += lane_sz;
437 if (!f.StoreAndShortCircuit(i, y, yy1)) return;
438 i += lane_sz;
439 if (!f.StoreAndShortCircuit(i, y, yy2)) return;
440 i += lane_sz;
441 if (!f.StoreAndShortCircuit(i, y, yy3)) return;
442 i += lane_sz;
443 }
444
445 f.Reduce(yy3, yy2, yy1, &yy);
446 }
447
448 while (i + lane_sz - 1 < n) {
449 xx00 = f.Load0(i, x0);
450 xx10 = f.Load1(i, x1);
451 yy = f.Func(i, xx00, xx10, yy);
452 if (!f.StoreAndShortCircuit(i, y, yy)) return;
453 i += lane_sz;
454 }
455
456 if (i != n) {
457 xx00 = f.MaskLoad0(n - lane_sz, x0, i - n);
458 xx10 = f.MaskLoad1(n - lane_sz, x1, i - n);
459 yy = f.Func(n - lane_sz, xx00, xx10, yy);
460 f.MaskStore(n - lane_sz, y, yy, i - n);
461 }
462
463 f.Reduce(yy, y);
464}
465
466} // namespace HWY_NAMESPACE
467} // namespace hwy
469
470#endif // HIGHWAY_HWY_CONTRIB_UNROLLER_UNROLLER_INL_H_
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_RESTRICT
Definition base.h:95
Definition copy-inl.h:32
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
D d
Definition arm_sve-inl.h:1915
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
typename detail::CappedTagChecker< T, kLimit, kPow2 >::type CappedTag
Definition ops/shared-inl.h:379
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:367
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
void Unroller(FUNC &f, IN_T *HWY_RESTRICT x, OUT_T *HWY_RESTRICT y, const ptrdiff_t n)
Definition unroller-inl.h:287
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
Definition abort.h:8
HWY_API void CopyBytes(const From *from, To *to)
Definition base.h:327
typename detail::TypeFromSize< N >::Signed SignedFromSize
Definition base.h:2094
typename RemoveRefT< T >::type RemoveRef
Definition base.h:575
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_ALIGN
Definition set_macros-inl.h:167
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition ops/shared-inl.h:198
Definition unroller-inl.h:143
hn::Rebind< IN0_T, LargerD > I0T
Definition unroller-inl.h:157
I0T d_in0
Definition unroller-inl.h:160
X1_VEC Load1Impl(const ptrdiff_t idx, IN1_T *from)
Definition unroller-inl.h:196
X1_VEC Load1(const ptrdiff_t idx, IN1_T *from)
Definition unroller-inl.h:192
void ReduceImpl(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC *y)
Definition unroller-inl.h:277
X1_VEC X1InitImpl()
Definition unroller-inl.h:178
ptrdiff_t MaskStore(const ptrdiff_t idx, OUT_T *to, const Y_VEC x, const ptrdiff_t places)
Definition unroller-inl.h:247
X0_VEC Load0Impl(const ptrdiff_t idx, IN0_T *from)
Definition unroller-inl.h:188
hn::Vec< I1T > X1_VEC
Definition unroller-inl.h:165
static size_t ActualLanes()
Definition unroller-inl.h:153
X0_VEC MaskLoad0(const ptrdiff_t idx, IN0_T *from, const ptrdiff_t places)
Definition unroller-inl.h:206
hn::Vec< I1T > MaskLoad1Impl(const ptrdiff_t idx, IN1_T *from, const ptrdiff_t places)
Definition unroller-inl.h:226
X0_VEC Load0(const ptrdiff_t idx, IN0_T *from)
Definition unroller-inl.h:184
ptrdiff_t MaskStoreImpl(const ptrdiff_t idx, OUT_T *to, const Y_VEC x, const ptrdiff_t places)
Definition unroller-inl.h:252
X0_VEC X0InitImpl()
Definition unroller-inl.h:174
bool StoreAndShortCircuit(const ptrdiff_t idx, OUT_T *to, const Y_VEC x)
Definition unroller-inl.h:238
static constexpr size_t MaxUnitLanes()
Definition unroller-inl.h:150
X0_VEC X0Init()
Definition unroller-inl.h:172
ptrdiff_t ReduceImpl(const Y_VEC x, OUT_T *to)
Definition unroller-inl.h:266
hn::Rebind< OUT_T, LargerD > OT
Definition unroller-inl.h:159
Y_VEC YInitImpl()
Definition unroller-inl.h:182
OT d_out
Definition unroller-inl.h:162
hn::Rebind< IN1_T, LargerD > I1T
Definition unroller-inl.h:158
bool StoreAndShortCircuitImpl(const ptrdiff_t idx, OUT_T *to, const Y_VEC x)
Definition unroller-inl.h:242
X1_VEC X1Init()
Definition unroller-inl.h:176
SignedFromSize< kMaxTSize > LargerT
Definition unroller-inl.h:148
hn::Vec< I0T > X0_VEC
Definition unroller-inl.h:164
static constexpr size_t kMaxTSize
Definition unroller-inl.h:146
X0_VEC MaskLoad0Impl(const ptrdiff_t idx, IN0_T *from, const ptrdiff_t places)
Definition unroller-inl.h:210
hn::Vec< OT > Func(const ptrdiff_t idx, const hn::Vec< I0T > x0, const hn::Vec< I1T > x1, const Y_VEC y)
Definition unroller-inl.h:167
I1T d_in1
Definition unroller-inl.h:161
DERIVED * me()
Definition unroller-inl.h:144
hn::Vec< I1T > MaskLoad1(const ptrdiff_t idx, IN1_T *from, const ptrdiff_t places)
Definition unroller-inl.h:221
Y_VEC YInit()
Definition unroller-inl.h:180
hn::CappedTag< LargerT, MaxUnitLanes()> LargerD
Definition unroller-inl.h:155
void Reduce(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC *y)
Definition unroller-inl.h:273
hn::Vec< OT > Y_VEC
Definition unroller-inl.h:163
ptrdiff_t Reduce(const Y_VEC x, OUT_T *to)
Definition unroller-inl.h:264
Definition unroller-inl.h:35
bool StoreAndShortCircuitImpl(const ptrdiff_t idx, OUT_T *to, const Y_VEC x)
Definition unroller-inl.h:98
X_VEC MaskLoadImpl(const ptrdiff_t idx, IN_T *from, const ptrdiff_t places)
Definition unroller-inl.h:84
OT d_out
Definition unroller-inl.h:50
bool StoreAndShortCircuit(const ptrdiff_t idx, OUT_T *to, const Y_VEC x)
Definition unroller-inl.h:94
ptrdiff_t MaskStoreImpl(const ptrdiff_t idx, OUT_T *to, const Y_VEC x, const ptrdiff_t places)
Definition unroller-inl.h:108
void ReduceImpl(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC *y)
Definition unroller-inl.h:133
Y_VEC YInit()
Definition unroller-inl.h:62
X_VEC X0InitImpl()
Definition unroller-inl.h:60
Y_VEC YInitImpl()
Definition unroller-inl.h:64
hn::CappedTag< LargerT, MaxUnitLanes()> LargerD
Definition unroller-inl.h:46
hn::Vec< IT > X_VEC
Definition unroller-inl.h:52
X_VEC LoadImpl(const ptrdiff_t idx, IN_T *from)
Definition unroller-inl.h:70
void Reduce(const Y_VEC x0, const Y_VEC x1, const Y_VEC x2, Y_VEC *y)
Definition unroller-inl.h:129
ptrdiff_t Reduce(const Y_VEC x, OUT_T *to)
Definition unroller-inl.h:120
static size_t ActualLanes()
Definition unroller-inl.h:44
DERIVED * me()
Definition unroller-inl.h:39
X_VEC X0Init()
Definition unroller-inl.h:58
X_VEC MaskLoad(const ptrdiff_t idx, IN_T *from, const ptrdiff_t places)
Definition unroller-inl.h:80
X_VEC Load(const ptrdiff_t idx, IN_T *from)
Definition unroller-inl.h:66
hn::Rebind< OUT_T, LargerD > OT
Definition unroller-inl.h:48
ptrdiff_t MaskStore(const ptrdiff_t idx, OUT_T *to, const Y_VEC x, ptrdiff_t const places)
Definition unroller-inl.h:103
SignedFromSize< kMaxTSize > LargerT
Definition unroller-inl.h:37
Y_VEC Func(const ptrdiff_t idx, const X_VEC x, const Y_VEC y)
Definition unroller-inl.h:54
static constexpr size_t kMaxTSize
Definition unroller-inl.h:36
hn::Rebind< IN_T, LargerD > IT
Definition unroller-inl.h:47
static constexpr size_t MaxUnitLanes()
Definition unroller-inl.h:41
ptrdiff_t ReduceImpl(const Y_VEC x, OUT_T *to)
Definition unroller-inl.h:122
hn::Vec< OT > Y_VEC
Definition unroller-inl.h:51
IT d_in
Definition unroller-inl.h:49
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()