19#include <riscv_vector.h>
29#define HWY_RVV_HAVE_F16C 1
31#define HWY_RVV_HAVE_F16C 0
37using DFromV =
typename DFromV_t<RemoveConst<V>>::type;
40using TFromV = TFromD<DFromV<V>>;
42template <
typename T,
size_t N,
int kPow2>
55template <
typename T,
size_t N,
int kPow2>
59 static constexpr int kMinVecPow2 =
60 -3 +
static_cast<int>(
FloorLog2(
sizeof(T)));
62 static constexpr int kNewPow2 =
HWY_MAX(kPow2, kMinVecPow2);
63 static constexpr size_t kNewN = D::template NewN<kNewPow2, kNumMaxLanes>();
// Invokes X_MACRO once per RVV mask register type (vbool1_t .. vbool64_t).
// Argument order is (SEW, SHIFT, MLEN, NAME, OP), matching the parameter list
// of mask macros such as HWY_RVV_RETM_ARGM below; MLEN = SEW >> SHIFT.
85#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
86  X_MACRO(64, 0, 64, NAME, OP)                \
87  X_MACRO(32, 0, 32, NAME, OP)                \
88  X_MACRO(16, 0, 16, NAME, OP)                \
89  X_MACRO(8, 0, 8, NAME, OP)                  \
90  X_MACRO(8, 1, 4, NAME, OP)                  \
91  X_MACRO(8, 2, 2, NAME, OP)                  \
92  X_MACRO(8, 3, 1, NAME, OP)
// Per-SEW iteration over the LMUL values for which a half-width register group
// (the LMULH column) exists — used to define ops such as Trunc that narrow a
// vector to its lower half. Each X_MACRO call receives the full argument list
// (BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP);
// "__" marks a column (double SEW, half SEW, or double LMUL) that does not
// exist for that row.
104#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
105  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, 32, NAME, OP) \
106  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, 16, NAME, OP)  \
107  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, 8, NAME, OP)     \
108  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, 4, NAME, OP)      \
109  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, 2, NAME, OP)      \
110  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, 1, NAME, OP)
112#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
113  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, 32, NAME, OP) \
114  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, 16, NAME, OP)   \
115  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, 8, NAME, OP)     \
116  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, 4, NAME, OP)     \
117  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, 2, NAME, OP)
119#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
120  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, 32, NAME, OP) \
121  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, 16, NAME, OP)  \
122  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, 8, NAME, OP)   \
123  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, 4, NAME, OP)
125#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
126  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, 32, NAME, OP) \
127  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, 16, NAME, OP) \
128  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, 8, NAME, OP)
// Per-SEW iteration over LMUL values usable as the *wide* side of a demotion:
// compared to the _TRUNC sets, 16/32/64-bit include one extra fractional LMUL
// row (down to mf4 / mf2 / m1 respectively) so that the half-SEW result type
// exists for every row. Same 12-argument X_MACRO signature as the other
// HWY_RVV_FOREACH_* tables; "__" marks a nonexistent column.
131#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
132  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, 32, NAME, OP) \
133  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, 16, NAME, OP)  \
134  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, 8, NAME, OP)     \
135  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, 4, NAME, OP)      \
136  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, 2, NAME, OP)      \
137  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, 1, NAME, OP)
139#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
140  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, 64, NAME, OP) \
141  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, 32, NAME, OP)  \
142  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, 16, NAME, OP)    \
143  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, 8, NAME, OP)      \
144  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, 4, NAME, OP)      \
145  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, 2, NAME, OP)
147#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
148  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, 64, NAME, OP) \
149  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, 32, NAME, OP)   \
150  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, 16, NAME, OP)    \
151  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, 8, NAME, OP)     \
152  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, 4, NAME, OP)
154#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
155  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, 64, NAME, OP) \
156  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, 32, NAME, OP)  \
157  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, 16, NAME, OP)  \
158  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, 8, NAME, OP)
// Per-SEW iteration restricted to LMUL <= 2 (i.e. rows whose doubled LMUL,
// the LMULD column, is still a valid register group). Same 12-argument
// X_MACRO signature as the other HWY_RVV_FOREACH_* tables.
161#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
162  X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, 64, NAME, OP)  \
163  X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, 32, NAME, OP) \
164  X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, 16, NAME, OP)  \
165  X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, 8, NAME, OP)     \
166  X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, 4, NAME, OP)
168#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
169  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, 64, NAME, OP) \
170  X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, 32, NAME, OP)  \
171  X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, 16, NAME, OP)    \
172  X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, 8, NAME, OP)
174#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
175  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, 64, NAME, OP) \
176  X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, 32, NAME, OP)   \
177  X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, 16, NAME, OP)
179#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
180  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, 64, NAME, OP) \
181  X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, 32, NAME, OP)
// LE2 set plus the m4 row (LMULD = m8): all LMUL values for which a
// double-width register group exists — the set usable by Ext-style ops.
184#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
185  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)        \
186  X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, 2, NAME, OP)
188#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
189  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)        \
190  X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, 4, NAME, OP)
192#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
193  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)        \
194  X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, 8, NAME, OP)
196#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
197  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)        \
198  X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, 16, NAME, OP)
// EXT set plus the m8 row: every hardware-supported LMUL for the SEW.
// m8 has no double (LMULD column is "__").
201#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
202  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP)        \
203  X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, 1, NAME, OP)
205#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
206  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP)        \
207  X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, 2, NAME, OP)
209#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
210  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP)        \
211  X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, 4, NAME, OP)
213#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
214  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP)        \
215  X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, 8, NAME, OP)
// "Virtual" rows: one extra SHIFT value per SEW below the smallest in the
// *_LE2 tables (e.g. SHIFT -3 for 16-bit, reusing the mf4 vector type).
// Presumably these emulate fractional LMULs the hardware does not provide
// for that SEW — TODO(review): confirm against the elided surrounding
// comments. 8-bit has no virtual rows, hence the empty definition.
233#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
235#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
236  X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, 64, NAME, OP)
238#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
239  X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, 64, NAME, OP)
241#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
242  X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, 64, NAME, OP)
// Union of the ALL set and the virtual rows for each SEW.
245#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
246  HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP)             \
247  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
249#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
250  HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP)             \
251  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
253#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
254  HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP)             \
255  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
257#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
258  HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP)             \
259  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
// Union of the LE2 set and the virtual rows for each SEW.
262#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
263  HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP)             \
264  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
266#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
267  HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP)             \
268  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
270#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
271  HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP)             \
272  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
274#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
275  HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP)             \
276  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
// Union of the EXT set and the virtual rows for each SEW.
279#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
280  HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP)             \
281  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
283#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
284  HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP)             \
285  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
287#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
288  HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP)             \
289  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
291#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
292  HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP)             \
293  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
// Union of the DEMOTE set and the virtual rows for each SEW.
296#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
297  HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)             \
298  HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
300#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
301  HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)             \
302  HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
304#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
305  HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)             \
306  HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
308#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
309  HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP)             \
310  HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
// Instantiate a per-SEW table for a concrete lane type. LMULS selects the
// row set by token-pasting onto HWY_RVV_FOREACH_<SEW> (e.g. LMULS = _ALL,
// _EXT_VIRT, ...), and BASE/CHAR are fixed to uint/u or int/i so the
// X_MACRO expands for the matching vuintN/vintN types.
313#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
314  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
315#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
316  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
317#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
318  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
319#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
320  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)
323#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
324  HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
325#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
326  HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
327#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
328  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
329#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
330  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)
335#define HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS) \
336 HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
340#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
341 HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
343#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
345#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
346#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
// Floating-point instantiations (BASE = float, CHAR = f) for 32/64-bit lanes;
// same LMULS token-pasting scheme as the U/I macros above.
348#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
349  HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
350#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
351  HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)
// Convenience unions over the per-type macros: UIxx = unsigned + signed of
// one lane size; the 163264/3264 variants cover the listed lane sizes.
354#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
355  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)        \
356  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
358#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
359  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)        \
360  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
362#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
363  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)        \
364  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
366#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
367  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)        \
368  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
370#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
371  HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS)         \
372  HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)
374#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
375  HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)           \
376  HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)           \
377  HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
379#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
380  HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)           \
381  HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)           \
382  HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
384#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
385  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)        \
386  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
388#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
389  HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS)         \
390  HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)
// Top-level unions: U = all unsigned lane sizes, I = all signed, F = all
// float (f16 only when available — see HWY_RVV_FOREACH_F16 above), UI = all
// integers, and HWY_RVV_FOREACH = every supported lane type.
393#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
394  HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)     \
395  HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
397#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
398  HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)     \
399  HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
401#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
402  HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)     \
403  HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)
406#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
407  HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS)        \
408  HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
410#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
411  HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS)    \
412  HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
// Token-pasting helpers that build the RVV intrinsic type names:
//   HWY_RVV_T(uint, 32)        -> uint32_t            (lane type)
//   HWY_RVV_D(...)             -> Simd<T, N, SHIFT>   (descriptor tag)
//   HWY_RVV_V(uint, 8, m1)     -> vuint8m1_t          (vector type)
//   HWY_RVV_TUP(uint, 8, m1,2) -> vuint8m1x2_t        (tuple/segment type)
//   HWY_RVV_M(16)              -> vbool16_t           (mask type)
415#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
416#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
417#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
418#define HWY_RVV_TUP(BASE, SEW, LMUL, TUP) v##BASE##SEW##LMUL##x##TUP##_t
419#define HWY_RVV_M(MLEN) vbool##MLEN##_t
425#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
428 struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
429 using Lane = HWY_RVV_T(BASE, SEW); \
430 using type = ScalableTag<Lane, SHIFT>; \
440#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
443#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
444 if (__builtin_constant_p(cap >= kMaxLanes) && (cap >= kMaxLanes)) { \
450 if ((__builtin_constant_p((cap & (cap - 1)) == 0) && \
451 ((cap & (cap - 1)) == 0)) || \
452 (__builtin_constant_p(cap <= HWY_MAX(kMinLanesPerFullVec, 4)) && \
453 (cap <= HWY_MAX(kMinLanesPerFullVec, 4)))) { \
481 return detail::IsFull(d) \
482 ? __riscv_vsetvl_e##SEW##LMUL(cap) \
483 : __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
486#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)
489#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
491 template <size_t N> \
492 HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
493 constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \
494 constexpr size_t kCap = MaxLanes(d); \
496 return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \
497 : __riscv_vsetvl_e##SEW##LMUL(kCap); \
499 template <size_t N> \
500 HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
537 constexpr size_t kMinLanesPerFullVec = \
538 detail::ScaleByPower(16 / (SEW / 8), SHIFT); \
540 constexpr size_t kMaxLanes = MaxLanes(d); \
542 HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
544 if (kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4)) { \
559 return __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
564 const size_t actual = Lanes(d); \
565 return HWY_MIN(actual, cap); \
569#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
570 SHIFT, MLEN, NAME, OP) \
571 template <size_t N> \
572 HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
573 constexpr size_t kCap = MaxLanes(d); \
576 const size_t actual = \
577 detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT); \
578 return HWY_MIN(actual, kCap); \
580 template <size_t N> \
581 HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
584 const size_t actual = \
585 detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT); \
587 return detail::IsFull(d) ? HWY_MIN(actual, cap) \
588 : HWY_MIN(HWY_MIN(actual, cap), MaxLanes(d)); \
594#undef HWY_RVV_LANES_VIRT
595#undef HWY_RVV_CAPPED_LANES_SPECIAL_CASES
597template <
class D, HWY_RVV_IF_EMULATED_D(D)>
602template <
class D, HWY_RVV_IF_EMULATED_D(D)>
// Application vector length (AVL) passed to the intrinsics: the runtime lane
// count of a full vector with the given SEW/SHIFT, obtained via Lanes() on a
// full ScalableTag. Uses the uint lane type since only SEW/SHIFT matter.
611#define HWY_RVV_AVL(SEW, SHIFT) \
612  Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())
615#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
616 SHIFT, MLEN, NAME, OP) \
617 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
618 return __riscv_v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
622#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
623 SHIFT, MLEN, NAME, OP) \
624 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
625 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
626 return __riscv_v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
630#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
631 SHIFT, MLEN, NAME, OP) \
632 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
633 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
634 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(a, b, \
635 HWY_RVV_AVL(SEW, SHIFT)); \
639#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
640 SHIFT, MLEN, NAME, OP) \
641 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
642 NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
643 HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
644 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(m, no, a, b, \
645 HWY_RVV_AVL(SEW, SHIFT)); \
649#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
650 HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
651 return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
658#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
660 template <size_t N> \
661 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
662 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \
663 return __riscv_v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \
672template <
size_t N,
int kPow2>
681template <
size_t N,
int kPow2>
689using VFromD =
decltype(
Set(D(), TFromD<D>()));
706#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
707 SHIFT, MLEN, NAME, OP) \
708 template <size_t N> \
709 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
710 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) ) { \
711 return __riscv_v##OP##_##CHAR##SEW##LMUL(); \
715#undef HWY_RVV_UNDEFINED
728#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
730 HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
731 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \
738#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
740 template <size_t N> \
741 HWY_API HWY_RVV_V(BASE, SEW, LMULD) \
742 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) , \
743 HWY_RVV_V(BASE, SEW, LMUL) v) { \
744 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD( \
752#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
753 SHIFT, MLEN, NAME, OP) \
754 template <size_t N> \
755 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
756 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) , \
757 HWY_RVV_V(BASE, SEW, LMUL) v) { \
761#undef HWY_RVV_EXT_VIRT
763template <
class D, HWY_RVV_IF_EMULATED_D(D)>
766 const Half<
decltype(du)> duh;
774#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
775 SHIFT, MLEN, NAME, OP) \
776 template <typename T, size_t N> \
777 HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> , \
778 vuint8##LMUL##_t v) { \
781 template <size_t N> \
782 HWY_API vuint8##LMUL##_t BitCastFromByte( \
783 HWY_RVV_D(BASE, SEW, N, SHIFT) , vuint8##LMUL##_t v) { \
788#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
789 SHIFT, MLEN, NAME, OP) \
790 template <typename T, size_t N> \
791 HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> , \
792 vint8##LMUL##_t v) { \
793 return __riscv_vreinterpret_v_i8##LMUL##_u8##LMUL(v); \
795 template <size_t N> \
796 HWY_API vint8##LMUL##_t BitCastFromByte( \
797 HWY_RVV_D(BASE, SEW, N, SHIFT) , vuint8##LMUL##_t v) { \
798 return __riscv_vreinterpret_v_u8##LMUL##_i8##LMUL(v); \
803#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
805 template <typename T, size_t N> \
806 HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> , \
807 HWY_RVV_V(BASE, SEW, LMUL) v) { \
808 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
810 template <size_t N> \
811 HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
812 HWY_RVV_D(BASE, SEW, N, SHIFT) , vuint8##LMUL##_t v) { \
813 return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
817#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
818 SHIFT, MLEN, NAME, OP) \
819 template <typename T, size_t N> \
820 HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> , \
821 HWY_RVV_V(BASE, SEW, LMUL) v) { \
822 return __riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
823 __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \
825 template <size_t N> \
826 HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
827 HWY_RVV_D(BASE, SEW, N, SHIFT) , vuint8##LMUL##_t v) { \
828 return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
829 __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
833#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
834 SHIFT, MLEN, NAME, OP) \
835 template <typename T, size_t N> \
836 HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> , \
837 HWY_RVV_V(BASE, SEW, LMUL) v) { \
838 return detail::Trunc(__riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
840 template <size_t N> \
841 HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
842 HWY_RVV_D(BASE, SEW, N, SHIFT) , vuint8##LMULH##_t v) { \
843 HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
844 const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
845 return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \
849#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
850 SHIFT, MLEN, NAME, OP) \
851 template <typename T, size_t N> \
852 HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> , \
853 HWY_RVV_V(BASE, SEW, LMUL) v) { \
854 return detail::Trunc(__riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
855 __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \
857 template <size_t N> \
858 HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
859 HWY_RVV_D(BASE, SEW, N, SHIFT) , vuint8##LMULH##_t v) { \
860 HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
861 const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
862 return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
863 __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \
875#elif HWY_RVV_HAVE_F16C
879template <
size_t N,
int kPow2>
886#undef HWY_RVV_CAST_U8
887#undef HWY_RVV_CAST_I8
889#undef HWY_RVV_CAST_IF
890#undef HWY_RVV_CAST_VIRT_U
891#undef HWY_RVV_CAST_VIRT_IF
893template <
size_t N,
int kPow2>
902template <
class D,
class FromV>
911#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
913 template <size_t N> \
914 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
915 return __riscv_v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \
923#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
924 SHIFT, MLEN, NAME, OP) \
925 template <size_t N> \
926 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
927 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) mask) { \
928 return __riscv_v##OP##_##CHAR##SEW##LMUL(mask, Lanes(d)); \
932#undef HWY_RVV_MASKED_IOTA
942template <
class V, HWY_IF_FLOAT_V(V)>
958template <
class V, HWY_IF_FLOAT_V(V)>
960 using DF = DFromV<V>;
961 using DU = RebindToUnsigned<DF>;
974template <
class V, HWY_IF_FLOAT_V(V)>
976 using DF = DFromV<V>;
977 using DU = RebindToUnsigned<DF>;
990template <
class V, HWY_IF_FLOAT_V(V)>
992 using DF = DFromV<V>;
993 using DU = RebindToUnsigned<DF>;
1000 return And(
Not(not_a), b);
1006 return Xor(x1,
Xor(x2, x3));
1012 return Or(o1,
Or(o2, o3));
1018 return Or(o,
And(a1, a2));
1034#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
1035#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
1037#define HWY_NATIVE_OPERATOR_REPLACEMENTS
1062#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
1063#undef HWY_NATIVE_I32_SATURATED_ADDSUB
1065#define HWY_NATIVE_I32_SATURATED_ADDSUB
1068#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
1069#undef HWY_NATIVE_U32_SATURATED_ADDSUB
1071#define HWY_NATIVE_U32_SATURATED_ADDSUB
1074#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
1075#undef HWY_NATIVE_I64_SATURATED_ADDSUB
1077#define HWY_NATIVE_I64_SATURATED_ADDSUB
1080#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
1081#undef HWY_NATIVE_U64_SATURATED_ADDSUB
1083#define HWY_NATIVE_U64_SATURATED_ADDSUB
1099#ifndef HWY_RVV_CHOOSE_VXRM
1102#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
1103#define HWY_RVV_AVOID_VXRM
1106#elif HWY_COMPILER_CLANG && \
1107 (HWY_COMPILER_CLANG < 1600 || __riscv_v_intrinsic < 11000)
1108#define HWY_RVV_AVOID_VXRM
1118#ifdef HWY_RVV_AVOID_VXRM
1119#define HWY_RVV_INSERT_VXRM(vxrm, avl) avl
1120#define __RISCV_VXRM_RNU
1121#define __RISCV_VXRM_RDN
1123#define HWY_RVV_INSERT_VXRM(vxrm, avl) vxrm, avl
1127#define HWY_RVV_RETV_AVERAGE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1128 SHIFT, MLEN, NAME, OP) \
1129 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1130 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
1131 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
1132 a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
1138#undef HWY_RVV_RETV_AVERAGE
1143#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1145 template <int kBits> \
1146 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1147 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, \
1148 HWY_RVV_AVL(SEW, SHIFT)); \
1150 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1151 NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
1152 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
1153 HWY_RVV_AVL(SEW, SHIFT)); \
1166template <
class VU8, HWY_IF_U8_D(DFromV<VU8>)>
1172 using VU16 =
VFromD<
decltype(du16)>;
1174 const VU16 vFDB97531 = ShiftRight<8>(
BitCast(du16, v));
1175 const VU16 vECA86420 = detail::AndS(
BitCast(du16, v), 0xFF);
1176 const VU16 sFE_DC_BA_98_76_54_32_10 =
Add(vFDB97531, vECA86420);
1178 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
1179 BitCast(du16, ShiftRight<16>(
BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
1180 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
1181 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
1182 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
1183 BitCast(du16, ShiftRight<32>(
BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
1184 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
1185 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
1186 return detail::AndS(
BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
1189template <
class VI8, HWY_IF_I8_D(DFromV<VI8>)>
1197 using VI16 =
VFromD<
decltype(di16)>;
1199 const VI16 vFDB97531 = ShiftRight<8>(
BitCast(di16, v));
1200 const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(
BitCast(di16, v)));
1201 const VI16 sFE_DC_BA_98_76_54_32_10 =
Add(vFDB97531, vECA86420);
1203 const VI16 sDC_zz_98_zz_54_zz_10_zz =
1204 BitCast(di16, ShiftLeft<16>(
BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
1205 const VI16 sFC_xx_B8_xx_74_xx_30_xx =
1206 Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
1207 const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
1208 BitCast(di16, ShiftLeft<32>(
BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
1209 const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
1210 Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
1211 return ShiftRight<48>(
BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
1215template <
int kBits,
class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
1220 constexpr size_t kSizeInBits =
sizeof(
TFromV<V>) * 8;
1221 static_assert(0 <= kBits && kBits < kSizeInBits,
"Invalid shift count");
1222 if (kBits == 0)
return v;
1229#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1230 SHIFT, MLEN, NAME, OP) \
1231 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1232 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
1233 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, bits, \
1234 HWY_RVV_AVL(SEW, SHIFT)); \
1239#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1240 SHIFT, MLEN, NAME, OP) \
1241 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1242 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
1243 const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du; \
1244 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, BitCast(du, bits), \
1245 HWY_RVV_AVL(SEW, SHIFT)); \
1255#undef HWY_RVV_SHIFT_II
1256#undef HWY_RVV_SHIFT_VV
1289#ifdef HWY_NATIVE_MUL_8
1290#undef HWY_NATIVE_MUL_8
1292#define HWY_NATIVE_MUL_8
1294#ifdef HWY_NATIVE_MUL_64
1295#undef HWY_NATIVE_MUL_64
1297#define HWY_NATIVE_MUL_64
1311#define HWY_RVV_MUL15(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1313 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1314 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
1315 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
1316 a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
1324#ifdef HWY_NATIVE_INT_DIV
1325#undef HWY_NATIVE_INT_DIV
1327#define HWY_NATIVE_INT_DIV
1339#ifdef HWY_NATIVE_MASKED_ARITH
1340#undef HWY_NATIVE_MASKED_ARITH
1342#define HWY_NATIVE_MASKED_ARITH
1376#ifdef HWY_NATIVE_F64_APPROX_RECIP
1377#undef HWY_NATIVE_F64_APPROX_RECIP
1379#define HWY_NATIVE_F64_APPROX_RECIP
1388#ifdef HWY_NATIVE_F64_APPROX_RSQRT
1389#undef HWY_NATIVE_F64_APPROX_RSQRT
1391#define HWY_NATIVE_F64_APPROX_RSQRT
1399#ifdef HWY_NATIVE_INT_FMA
1400#undef HWY_NATIVE_INT_FMA
1402#define HWY_NATIVE_INT_FMA
1406#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1408 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1409 NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
1410 HWY_RVV_V(BASE, SEW, LMUL) add) { \
1411 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, \
1412 HWY_RVV_AVL(SEW, SHIFT)); \
1437#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1438 SHIFT, MLEN, NAME, OP) \
1439 HWY_API HWY_RVV_M(MLEN) \
1440 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
1441 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN( \
1442 a, b, HWY_RVV_AVL(SEW, SHIFT)); \
1446#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1447 SHIFT, MLEN, NAME, OP) \
1448 HWY_API HWY_RVV_M(MLEN) \
1449 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
1450 return __riscv_v##OP##_##CHAR##SEW##LMUL##_b##MLEN( \
1451 a, b, HWY_RVV_AVL(SEW, SHIFT)); \
1488#undef HWY_RVV_RETM_ARGVV
1489#undef HWY_RVV_RETM_ARGVS
1494HWY_API auto Ge(
const V a,
const V b) ->
decltype(
Le(a, b)) {
1499HWY_API auto Gt(
const V a,
const V b) ->
decltype(
Lt(a, b)) {
1506 return detail::NeS(
And(a, bit), 0);
1516#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \
1517 HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
1518 return __riscv_vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
1535#undef HWY_RVV_RETM_ARGMM
1539#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1540 SHIFT, MLEN, NAME, OP) \
1541 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1542 NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
1543 HWY_RVV_V(BASE, SEW, LMUL) no) { \
1544 return __riscv_v##OP##_vvm_##CHAR##SEW##LMUL(no, yes, m, \
1545 HWY_RVV_AVL(SEW, SHIFT)); \
1550#undef HWY_RVV_IF_THEN_ELSE
1553template <
class M,
class V>
1560#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
1561 LMULH, SHIFT, MLEN, NAME, OP) \
1562 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1563 NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \
1564 return __riscv_v##OP##_##CHAR##SEW##LMUL(no, 0, m, \
1565 HWY_RVV_AVL(SEW, SHIFT)); \
1571#undef HWY_RVV_IF_THEN_ZERO_ELSE
1580 return detail::NeS(v, 0);
1584#ifdef HWY_NATIVE_IS_NEGATIVE
1585#undef HWY_NATIVE_IS_NEGATIVE
1587#define HWY_NATIVE_IS_NEGATIVE
1591template <
class V, HWY_IF_NOT_UNSIGNED_V(V)>
1595 using TI =
TFromD<
decltype(di)>;
1597 return detail::LtS(
BitCast(di, v),
static_cast<TI
>(0));
1605#ifdef HWY_NATIVE_MASK_FALSE
1606#undef HWY_NATIVE_MASK_FALSE
1608#define HWY_NATIVE_MASK_FALSE
1618template <
class D,
typename MFrom>
1629#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1630 SHIFT, MLEN, NAME, OP) \
1631 template <size_t N> \
1632 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1633 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \
1635 const DFromV<VFromD<decltype(d)>> d_full; \
1636 const RebindToSigned<decltype(d_full)> di; \
1637 using TI = TFromD<decltype(di)>; \
1638 return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, \
1644#undef HWY_RVV_VEC_FROM_MASK
1646template <
class D, HWY_IF_FLOAT_D(D)>
1660 return ShiftRight<
sizeof(TFromV<V>) * 8 - 1>(v);
1666 static_assert(IsSigned<TFromV<V>>(),
"Only works for signed/float");
1672#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1673 template <class D> \
1674 HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
1675 static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1676 return __riscv_vfirst_m_b##MLEN(m, Lanes(d)); \
1678 template <class D> \
1679 HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
1680 static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1681 return static_cast<size_t>(__riscv_vfirst_m_b##MLEN(m, Lanes(d))); \
1685#undef HWY_RVV_FIND_FIRST_TRUE
1695#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1696 template <class D> \
1697 HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \
1698 static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1699 return AllFalse(d, __riscv_vmnot_m_b##MLEN(m, Lanes(d))); \
1703#undef HWY_RVV_ALL_TRUE
1707#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1708 template <class D> \
1709 HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \
1710 static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1711 return __riscv_vcpop_m_b##MLEN(m, Lanes(d)); \
1715#undef HWY_RVV_COUNT_TRUE
1719#ifdef HWY_NATIVE_PROMOTE_MASK_TO
1720#undef HWY_NATIVE_PROMOTE_MASK_TO
1722#define HWY_NATIVE_PROMOTE_MASK_TO
1725template <
class DTo,
class DFrom,
1735#ifdef HWY_NATIVE_DEMOTE_MASK_TO
1736#undef HWY_NATIVE_DEMOTE_MASK_TO
1738#define HWY_NATIVE_DEMOTE_MASK_TO
1741template <
class DTo,
class DFrom,
1753#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1755 template <size_t N> \
1756 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1757 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1758 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1759 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1760 detail::NativeLanePointer(p), Lanes(d)); \
1765template <
class D, HWY_RVV_IF_EMULATED_D(D)>
1780#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1781 SHIFT, MLEN, NAME, OP) \
1782 template <size_t N> \
1783 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1784 NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1785 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1786 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
1787 m, Zero(d), detail::NativeLanePointer(p), Lanes(d)); \
1789 template <size_t N> \
1790 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1791 NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
1792 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1793 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1794 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
1795 m, v, detail::NativeLanePointer(p), Lanes(d)); \
1799#undef HWY_RVV_MASKED_LOAD
1801template <
class D, HWY_RVV_IF_EMULATED_D(D)>
1809template <
class D, HWY_RVV_IF_EMULATED_D(D)>
1820#ifdef HWY_NATIVE_LOAD_N
1821#undef HWY_NATIVE_LOAD_N
1823#define HWY_NATIVE_LOAD_N
1826#define HWY_RVV_LOADN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1828 template <size_t N> \
1829 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1830 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1831 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
1835 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
1836 Zero(d), detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
1838 template <size_t N> \
1839 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME##Or( \
1840 HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1841 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
1846 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
1847 no, detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
1853template <
class D, HWY_RVV_IF_EMULATED_D(D)>
1859template <
class D, HWY_RVV_IF_EMULATED_D(D)>
1869#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1871 template <size_t N> \
1872 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
1873 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1874 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1875 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1876 detail::NativeLanePointer(p), v, Lanes(d)); \
1881template <
class D, HWY_RVV_IF_EMULATED_D(D)>
1889#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1890 SHIFT, MLEN, NAME, OP) \
1891 template <size_t N> \
1892 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
1893 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1894 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1895 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m( \
1896 m, detail::NativeLanePointer(p), v, Lanes(d)); \
1899#undef HWY_RVV_BLENDED_STORE
1901template <
class D, HWY_RVV_IF_EMULATED_D(D)>
1913#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1915 template <size_t N> \
1916 HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
1917 HWY_RVV_D(BASE, SEW, N, SHIFT) , \
1918 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1919 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1920 detail::NativeLanePointer(p), v, count); \
1923#undef HWY_RVV_STOREN
1925template <
class D, HWY_RVV_IF_EMULATED_D(D)>
1933#ifdef HWY_NATIVE_STORE_N
1934#undef HWY_NATIVE_STORE_N
1936#define HWY_NATIVE_STORE_N
1941 size_t max_lanes_to_store) {
1957template <
class V,
class D>
1964template <
class V,
class D,
typename T>
1971#ifdef HWY_NATIVE_SCATTER
1972#undef HWY_NATIVE_SCATTER
1974#define HWY_NATIVE_SCATTER
1977#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1978 SHIFT, MLEN, NAME, OP) \
1979 template <size_t N> \
1980 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
1981 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1982 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
1983 HWY_RVV_V(int, SEW, LMUL) offset) { \
1984 const RebindToUnsigned<decltype(d)> du; \
1985 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
1986 detail::NativeLanePointer(base), BitCast(du, offset), v, Lanes(d)); \
1989#undef HWY_RVV_SCATTER
2001#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
2002 LMULH, SHIFT, MLEN, NAME, OP) \
2003 template <size_t N> \
2004 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
2005 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2006 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
2007 HWY_RVV_V(int, SEW, LMUL) indices) { \
2008 const RebindToUnsigned<decltype(d)> du; \
2009 constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
2010 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
2011 m, detail::NativeLanePointer(base), \
2012 ShiftLeft<kBits>(BitCast(du, indices)), v, Lanes(d)); \
2015#undef HWY_RVV_MASKED_SCATTER
2019#ifdef HWY_NATIVE_GATHER
2020#undef HWY_NATIVE_GATHER
2022#define HWY_NATIVE_GATHER
2025#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2027 template <size_t N> \
2028 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2029 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2030 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
2031 HWY_RVV_V(int, SEW, LMUL) offset) { \
2032 const RebindToUnsigned<decltype(d)> du; \
2033 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
2034 detail::NativeLanePointer(base), BitCast(du, offset), Lanes(d)); \
2037#undef HWY_RVV_GATHER
2043 const VFromD<RebindToSigned<D>> index) {
2044 constexpr size_t kBits =
CeilLog2(
sizeof(TFromD<D>));
2050#define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2051 SHIFT, MLEN, NAME, OP) \
2052 template <size_t N> \
2053 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2054 NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
2055 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2056 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
2057 HWY_RVV_V(int, SEW, LMUL) indices) { \
2058 const RebindToUnsigned<decltype(d)> du; \
2059 const RebindToSigned<decltype(d)> di; \
2061 constexpr size_t kBits = CeilLog2(SEW / 8); \
2062 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
2063 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu( \
2064 m, no, detail::NativeLanePointer(base), \
2065 ShiftLeft<kBits>(BitCast(du, indices)), Lanes(d)); \
2068#undef HWY_RVV_MASKED_GATHER
2081#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2082 SHIFT, MLEN, NAME, OP) \
2083 template <size_t N> \
2084 HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
2085 HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2086 return __riscv_v##OP##CHAR##SEWD##LMULD(v, Lanes(d)); \
2097#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
2103#ifdef HWY_NATIVE_F16C
2104#undef HWY_NATIVE_F16C
2106#define HWY_NATIVE_F16C
2110#undef HWY_RVV_PROMOTE
2114#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
2116 template <size_t N> \
2117 HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
2118 PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \
2119 HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
2120 return __riscv_v##OP##CHAR##BITS##LMUL(v, Lanes(d)); \
2123#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
2124 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
2125 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
2126 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \
2127 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \
2128 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)
2130#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
2131 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
2132 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
2133 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \
2134 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)
2136#define HWY_RVV_PROMOTE_X4_FROM_U8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
2137 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
2138 HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
2140#define HWY_RVV_PROMOTE_X8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
2141 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf8, -3, 3) \
2142 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf4, -2, 3) \
2143 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, mf2, -1, 3) \
2144 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m1, 0, 3)
2166#undef HWY_RVV_PROMOTE_X8
2167#undef HWY_RVV_PROMOTE_X4_FROM_U8
2168#undef HWY_RVV_PROMOTE_X4
2169#undef HWY_RVV_PROMOTE_X2
2170#undef HWY_RVV_PROMOTE
2188template <
size_t N,
int kPow2>
2195template <
size_t N,
int kPow2>
2202template <
size_t N,
int kPow2>
2209template <
size_t N,
int kPow2>
2216template <
size_t N,
int kPow2>
2223template <
size_t N,
int kPow2>
2230template <
size_t N,
int kPow2>
2235 const Rebind<uint16_t,
decltype(
d)> du16;
2242#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2244 template <size_t N> \
2245 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
2246 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2247 return __riscv_v##OP##CHAR##SEWH##LMULH( \
2248 v, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); \
2257#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2258 SHIFT, MLEN, NAME, OP) \
2259 template <size_t N> \
2260 HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \
2261 HWY_RVV_D(uint, SEWH, N, SHIFT - 1) dn, HWY_RVV_V(int, SEW, LMUL) v) { \
2262 const HWY_RVV_D(uint, SEW, N, SHIFT) du; \
2264 return DemoteTo(dn, BitCast(du, detail::MaxS(v, 0))); \
2269#undef HWY_RVV_DEMOTE_I_TO_U
2273 return __riscv_vnclipu_wx_u8mf8(
2279 return __riscv_vnclipu_wx_u8mf4(
2285 return __riscv_vnclipu_wx_u8mf2(
2291 return __riscv_vnclipu_wx_u8m1(
2297 return __riscv_vnclipu_wx_u8m2(
2304 return __riscv_vnclipu_wx_u8mf8(
2310 return __riscv_vnclipu_wx_u8mf4(
2316 return __riscv_vnclipu_wx_u8mf2(
2322 return __riscv_vnclipu_wx_u8m1(
2328 return __riscv_vnclipu_wx_u8m2(
2333template <
size_t N,
int kPow2>
2339template <
size_t N,
int kPow2>
2345template <
size_t N,
int kPow2>
2351template <
size_t N,
int kPow2>
2359 return __riscv_vnclipu_wx_u8mf8(
2360 __riscv_vnclipu_wx_u16mf4(v, 0,
2366 return __riscv_vnclipu_wx_u8mf4(
2367 __riscv_vnclipu_wx_u16mf2(v, 0,
2373 return __riscv_vnclipu_wx_u8mf2(
2374 __riscv_vnclipu_wx_u16m1(v, 0,
2380 return __riscv_vnclipu_wx_u8m1(
2381 __riscv_vnclipu_wx_u16m2(v, 0,
2387 return __riscv_vnclipu_wx_u8m2(
2388 __riscv_vnclipu_wx_u16m4(v, 0,
2398 const size_t avl =
Lanes(
d);
2399 const vuint64m1_t v1 = __riscv_vand(v, 0xFF, avl);
2400 const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
2402 const vuint16mf4_t v3 = __riscv_vnclipu_wx_u16mf4(
2404 return __riscv_vnclipu_wx_u8mf8(v3, 0,
2411 const size_t avl =
Lanes(
d);
2412 const vuint64m2_t v1 = __riscv_vand(v, 0xFF, avl);
2413 const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1(
2415 const vuint16mf2_t v3 = __riscv_vnclipu_wx_u16mf2(
2417 return __riscv_vnclipu_wx_u8mf4(v3, 0,
2424 const size_t avl =
Lanes(
d);
2425 const vuint64m4_t v1 = __riscv_vand(v, 0xFF, avl);
2426 const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2(
2428 const vuint16m1_t v3 = __riscv_vnclipu_wx_u16m1(
2430 return __riscv_vnclipu_wx_u8mf2(v3, 0,
2437 const size_t avl =
Lanes(
d);
2438 const vuint64m8_t v1 = __riscv_vand(v, 0xFF, avl);
2439 const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4(
2441 const vuint16m2_t v3 = __riscv_vnclipu_wx_u16m2(
2443 return __riscv_vnclipu_wx_u8m1(v3, 0,
2450 const size_t avl =
Lanes(
d);
2451 const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
2452 const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
2454 return __riscv_vnclipu_wx_u16mf4(v2, 0,
2461 const size_t avl =
Lanes(
d);
2462 const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
2463 const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
2465 return __riscv_vnclipu_wx_u16mf4(v2, 0,
2472 const size_t avl =
Lanes(
d);
2473 const vuint64m2_t v1 = __riscv_vand(v, 0xFFFF, avl);
2474 const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1(
2476 return __riscv_vnclipu_wx_u16mf2(v2, 0,
2483 const size_t avl =
Lanes(
d);
2484 const vuint64m4_t v1 = __riscv_vand(v, 0xFFFF, avl);
2485 const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2(
2487 return __riscv_vnclipu_wx_u16m1(v2, 0,
2494 const size_t avl =
Lanes(
d);
2495 const vuint64m8_t v1 = __riscv_vand(v, 0xFFFF, avl);
2496 const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4(
2498 return __riscv_vnclipu_wx_u16m2(v2, 0,
2505 const size_t avl =
Lanes(
d);
2506 const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2507 return __riscv_vnclipu_wx_u32mf2(v1, 0,
2514 const size_t avl =
Lanes(
d);
2515 const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2516 return __riscv_vnclipu_wx_u32mf2(v1, 0,
2523 const size_t avl =
Lanes(
d);
2524 const vuint64m2_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2525 return __riscv_vnclipu_wx_u32m1(v1, 0,
2532 const size_t avl =
Lanes(
d);
2533 const vuint64m4_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2534 return __riscv_vnclipu_wx_u32m2(v1, 0,
2541 const size_t avl =
Lanes(
d);
2542 const vuint64m8_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2543 return __riscv_vnclipu_wx_u32m4(v1, 0,
2550 const size_t avl =
Lanes(
d);
2551 const vuint32mf2_t v1 = __riscv_vand(v, 0xFF, avl);
2552 const vuint16mf4_t v2 = __riscv_vnclipu_wx_u16mf4(
2554 return __riscv_vnclipu_wx_u8mf8(v2, 0,
2561 const size_t avl =
Lanes(
d);
2562 const vuint32m1_t v1 = __riscv_vand(v, 0xFF, avl);
2563 const vuint16mf2_t v2 = __riscv_vnclipu_wx_u16mf2(
2565 return __riscv_vnclipu_wx_u8mf4(v2, 0,
2572 const size_t avl =
Lanes(
d);
2573 const vuint32m2_t v1 = __riscv_vand(v, 0xFF, avl);
2574 const vuint16m1_t v2 = __riscv_vnclipu_wx_u16m1(
2576 return __riscv_vnclipu_wx_u8mf2(v2, 0,
2583 const size_t avl =
Lanes(
d);
2584 const vuint32m4_t v1 = __riscv_vand(v, 0xFF, avl);
2585 const vuint16m2_t v2 = __riscv_vnclipu_wx_u16m2(
2587 return __riscv_vnclipu_wx_u8m1(v2, 0,
2594 const size_t avl =
Lanes(
d);
2595 const vuint32m8_t v1 = __riscv_vand(v, 0xFF, avl);
2596 const vuint16m4_t v2 = __riscv_vnclipu_wx_u16m4(
2598 return __riscv_vnclipu_wx_u8m2(v2, 0,
2605 const size_t avl =
Lanes(
d);
2606 const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl);
2607 return __riscv_vnclipu_wx_u16mf4(v1, 0,
2614 const size_t avl =
Lanes(
d);
2615 const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl);
2616 return __riscv_vnclipu_wx_u16mf4(v1, 0,
2623 const size_t avl =
Lanes(
d);
2624 const vuint32m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
2625 return __riscv_vnclipu_wx_u16mf2(v1, 0,
2632 const size_t avl =
Lanes(
d);
2633 const vuint32m2_t v1 = __riscv_vand(v, 0xFFFF, avl);
2634 return __riscv_vnclipu_wx_u16m1(v1, 0,
2641 const size_t avl =
Lanes(
d);
2642 const vuint32m4_t v1 = __riscv_vand(v, 0xFFFF, avl);
2643 return __riscv_vnclipu_wx_u16m2(v1, 0,
2650 const size_t avl =
Lanes(
d);
2651 const vuint32m8_t v1 = __riscv_vand(v, 0xFFFF, avl);
2652 return __riscv_vnclipu_wx_u16m4(v1, 0,
2659 const size_t avl =
Lanes(
d);
2660 const vuint16mf4_t v1 = __riscv_vand(v, 0xFF, avl);
2661 return __riscv_vnclipu_wx_u8mf8(v1, 0,
2668 const size_t avl =
Lanes(
d);
2669 const vuint16mf2_t v1 = __riscv_vand(v, 0xFF, avl);
2670 return __riscv_vnclipu_wx_u8mf4(v1, 0,
2677 const size_t avl =
Lanes(
d);
2678 const vuint16m1_t v1 = __riscv_vand(v, 0xFF, avl);
2679 return __riscv_vnclipu_wx_u8mf2(v1, 0,
2686 const size_t avl =
Lanes(
d);
2687 const vuint16m2_t v1 = __riscv_vand(v, 0xFF, avl);
2688 return __riscv_vnclipu_wx_u8m1(v1, 0,
2695 const size_t avl =
Lanes(
d);
2696 const vuint16m4_t v1 = __riscv_vand(v, 0xFF, avl);
2697 return __riscv_vnclipu_wx_u8m2(v1, 0,
2704 const size_t avl =
Lanes(
d);
2705 const vuint16m8_t v1 = __riscv_vand(v, 0xFF, avl);
2706 return __riscv_vnclipu_wx_u8m4(v1, 0,
2737template <
size_t N,
int kPow2>
2743template <
size_t N,
int kPow2>
2749#undef HWY_RVV_DEMOTE
2754#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2755 SHIFT, MLEN, NAME, OP) \
2756 template <size_t N> \
2757 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
2758 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2759 return __riscv_v##OP##SEWH##LMULH(v, Lanes(d)); \
2762#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
2766#undef HWY_RVV_DEMOTE_F
2771 return __riscv_vfncvt_rtz_x_f_w_i32mf2(v,
Lanes(
d));
2775 return __riscv_vfncvt_rtz_x_f_w_i32mf2(v,
Lanes(
d));
2779 return __riscv_vfncvt_rtz_x_f_w_i32m1(v,
Lanes(
d));
2783 return __riscv_vfncvt_rtz_x_f_w_i32m2(v,
Lanes(
d));
2787 return __riscv_vfncvt_rtz_x_f_w_i32m4(v,
Lanes(
d));
2792 return __riscv_vfncvt_rtz_xu_f_w_u32mf2(v,
Lanes(
d));
2796 return __riscv_vfncvt_rtz_xu_f_w_u32mf2(v,
Lanes(
d));
2800 return __riscv_vfncvt_rtz_xu_f_w_u32m1(v,
Lanes(
d));
2804 return __riscv_vfncvt_rtz_xu_f_w_u32m2(v,
Lanes(
d));
2808 return __riscv_vfncvt_rtz_xu_f_w_u32m4(v,
Lanes(
d));
2813 return __riscv_vfncvt_f_x_w_f32mf2(v,
Lanes(
d));
2817 return __riscv_vfncvt_f_x_w_f32mf2(v,
Lanes(
d));
2821 return __riscv_vfncvt_f_x_w_f32m1(v,
Lanes(
d));
2825 return __riscv_vfncvt_f_x_w_f32m2(v,
Lanes(
d));
2829 return __riscv_vfncvt_f_x_w_f32m4(v,
Lanes(
d));
2834 return __riscv_vfncvt_f_xu_w_f32mf2(v,
Lanes(
d));
2838 return __riscv_vfncvt_f_xu_w_f32mf2(v,
Lanes(
d));
2842 return __riscv_vfncvt_f_xu_w_f32m1(v,
Lanes(
d));
2846 return __riscv_vfncvt_f_xu_w_f32m2(v,
Lanes(
d));
2850 return __riscv_vfncvt_f_xu_w_f32m4(v,
Lanes(
d));
2854#define HWY_RVV_DEMOTE_TO_SHR_16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
2855 LMULH, SHIFT, MLEN, NAME, OP) \
2856 template <size_t N> \
2857 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
2858 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2859 return __riscv_v##OP##CHAR##SEWH##LMULH( \
2860 v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); \
2866#undef HWY_RVV_DEMOTE_TO_SHR_16
2875template <
class V, HWY_IF_F32(TFromV<V>)>
2877 const RebindToUnsigned<DFromV<V>> du32;
2878 const auto is_non_nan =
Eq(v, v);
2879 const auto bits32 =
BitCast(du32, v);
2881 const auto round_incr =
2882 detail::AddS(detail::AndS(ShiftRight<16>(bits32), 1u), 0x7FFFu);
2883 return MaskedAddOr(detail::OrS(bits32, 0x00400000u), is_non_nan, bits32,
2889#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
2890#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2892#define HWY_NATIVE_DEMOTE_F32_TO_BF16
2895template <
size_t N,
int kPow2>
2905#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2906 SHIFT, MLEN, NAME, OP) \
2907 template <size_t N> \
2908 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
2909 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
2910 return __riscv_vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
2912 template <size_t N> \
2913 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
2914 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) { \
2915 return __riscv_vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \
2918 template <size_t N> \
2919 HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
2920 HWY_RVV_V(BASE, SEW, LMUL) v) { \
2921 return __riscv_vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \
2923 template <size_t N> \
2924 HWY_API HWY_RVV_V(uint, SEW, LMUL) ConvertTo( \
2925 HWY_RVV_D(uint, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2926 return __riscv_vfcvt_rtz_xu_f_v_u##SEW##LMUL(v, Lanes(d)); \
2930#undef HWY_RVV_CONVERT
2933#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2934 SHIFT, MLEN, NAME, OP) \
2935 HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2936 return __riscv_vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
2939#undef HWY_RVV_NEAREST
2947template <
typename T,
size_t N,
int kPow2>
2950 constexpr size_t kMinVecBytes =
2953 constexpr size_t kMinVecLanes = (kMinVecBytes +
sizeof(T) - 1) /
sizeof(T);
2958 if (kMaxLpb <= kMinVecLanes)
return kMaxLpb;
2961 const size_t lanes_per_vec =
Lanes(
d);
2962 return HWY_MIN(lanes_per_vec, kMaxLpb);
2965template <
class D,
class V>
2971template <
size_t kLanes,
class D>
2975 using TU =
TFromD<
decltype(du)>;
2977 return LtS(
BitCast(di, idx_mod),
static_cast<TFromD<decltype(di)
>>(kLanes));
2980#define HWY_RVV_SLIDE_UP(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2981 SHIFT, MLEN, NAME, OP) \
2982 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2983 NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
2985 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \
2986 HWY_RVV_AVL(SEW, SHIFT)); \
2989#define HWY_RVV_SLIDE_DOWN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2990 SHIFT, MLEN, NAME, OP) \
2991 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2992 NAME(HWY_RVV_V(BASE, SEW, LMUL) src, size_t lanes) { \
2993 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(src, lanes, \
2994 HWY_RVV_AVL(SEW, SHIFT)); \
3000#undef HWY_RVV_SLIDE_UP
3001#undef HWY_RVV_SLIDE_DOWN
3008 return detail::SlideUp(
Zero(
d), v, amt);
3014 v = detail::SlideDown(v, amt);
3023template <
class D,
class V>
3025 const size_t half =
Lanes(
d) / 2;
3026 const V hi_down = detail::SlideDown(hi, half);
3027 return detail::SlideUp(lo, hi_down, half);
3031template <
class D,
class V>
3033 return detail::SlideUp(lo, hi,
Lanes(
d) / 2);
3037template <
class D,
class V>
3039 const size_t half =
Lanes(
d) / 2;
3040 const V hi_down = detail::SlideDown(hi, half);
3041 const V lo_down = detail::SlideDown(lo, half);
3042 return detail::SlideUp(lo_down, hi_down, half);
3046template <
class D,
class V>
3048 const size_t half =
Lanes(
d) / 2;
3049 const V lo_down = detail::SlideDown(lo, half);
3050 return detail::SlideUp(lo_down, hi, half);
3054template <
class D2,
class V>
3061template <
class D2,
class V>
3075 return (
size_t{1} << (
d.Pow2() + 3)) >=
sizeof(
TFromD<D>);
3081template <
class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* =
nullptr>
3083 return detail::Trunc(v);
3091template <
class DH,
class V,
3112#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
3114 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
3115 return __riscv_v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \
3122#undef HWY_RVV_SLIDE1
3126#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
3127#undef HWY_NATIVE_SLIDE1_UP_DOWN
3129#define HWY_NATIVE_SLIDE1_UP_DOWN
3134 return detail::Slide1Up(v);
3139 v = detail::Slide1Down(v);
3149#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3150 SHIFT, MLEN, NAME, OP) \
3151 HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
3152 return __riscv_v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); \
3157#undef HWY_RVV_GET_LANE
3162 return GetLane(detail::SlideDown(v, i));
3171#define HWY_RVV_SET_AT_OR_AFTER_FIRST(SEW, SHIFT, MLEN, NAME, OP) \
3172 HWY_API HWY_RVV_M(MLEN) SetAtOrAfterFirst(HWY_RVV_M(MLEN) m) { \
3173 return Not(SetBeforeFirst(m)); \
3177#undef HWY_RVV_SET_AT_OR_AFTER_FIRST
3182template <
class V,
typename T, HWY_IF_NOT_T_SIZE_V(V, 1)>
3186 using TU =
TFromD<
decltype(du)>;
3187 const auto is_i = detail::EqS(
detail::Iota0(du),
static_cast<TU
>(i));
3192template <
class V,
typename T, HWY_IF_T_SIZE_V(V, 1)>
3194 const Rebind<T, DFromV<V>>
d;
3195 const auto zero =
Zero(
d);
3196 const auto one =
Set(
d, 1);
3197 const auto ge_i =
Eq(detail::SlideUp(zero, one, i), one);
3207template <
class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
3214template <
class D, HWY_IF_T_SIZE_D(D, 8)>
3221template <
class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
3228template <
class D, HWY_IF_T_SIZE_D(D, 8)>
3244 const V up = detail::Slide1Up(v);
3251 const V down = detail::Slide1Down(v);
3258 return OddEven(detail::Slide1Up(b), a);
3264 return OddEven(b, detail::Slide1Down(a));
3270 const RebindToUnsigned<DFromV<V>> du;
3271 constexpr size_t kShift =
CeilLog2(16 /
sizeof(TFromV<V>));
3272 const auto idx_block = ShiftRight<kShift>(
detail::Iota0(du));
3273 const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
3282 const V down = detail::SlideDown(v, lpb);
3283 const V up = detail::SlideUp(v, v, lpb);
3289template <
class D,
class VI>
3291 static_assert(
sizeof(TFromD<D>) ==
sizeof(TFromV<VI>),
"Index != lane");
3294#if HWY_IS_DEBUG_BUILD
3295 using TU =
TFromD<
decltype(du)>;
3296 const size_t twice_num_of_lanes =
Lanes(
d) * 2;
3299 detail::AndS(
indices,
static_cast<TU
>(twice_num_of_lanes - 1)))));
3304template <
class D,
typename TI>
3306 static_assert(
sizeof(TFromD<D>) ==
sizeof(TI),
"Index size must match lane");
3310#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
3312 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3313 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
3314 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \
3315 HWY_RVV_AVL(SEW, SHIFT)); \
3326#define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3327 SHIFT, MLEN, NAME, OP) \
3328 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3329 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \
3330 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \
3331 HWY_RVV_AVL(SEW, SHIFT)); \
3335#undef HWY_RVV_TABLE16
3338#define HWY_RVV_MASKED_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3339 SHIFT, MLEN, NAME, OP) \
3340 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3341 NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \
3342 HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
3343 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \
3344 HWY_RVV_AVL(SEW, SHIFT)); \
3348#undef HWY_RVV_MASKED_TABLE
3350#define HWY_RVV_MASKED_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
3351 LMULH, SHIFT, MLEN, NAME, OP) \
3352 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3353 NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \
3354 HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \
3355 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \
3356 HWY_RVV_AVL(SEW, SHIFT)); \
3361#undef HWY_RVV_MASKED_TABLE16
3366template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_LE_D(D, 2)>
3368 const Rebind<uint16_t,
decltype(
d)> du16;
3369 const size_t N =
Lanes(
d);
3371 detail::ReverseSubS(
detail::Iota0(du16),
static_cast<uint16_t
>(N - 1));
3372 return detail::TableLookupLanes16(v, idx);
3375template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_GT_D(D, 2)>
3377 const Half<
decltype(
d)> dh;
3378 const Rebind<uint16_t,
decltype(dh)> du16;
3379 const size_t half_n =
Lanes(dh);
3381 static_cast<uint16_t
>(half_n - 1));
3382 const auto reversed_lo = detail::TableLookupLanes16(
LowerHalf(dh, v), idx);
3383 const auto reversed_hi = detail::TableLookupLanes16(
UpperHalf(dh, v), idx);
3384 return Combine(
d, reversed_lo, reversed_hi);
3387template <
class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
3388HWY_API VFromD<D> Reverse(D , VFromD<D> v) {
3389 const RebindToUn
signed<D> du;
3390 using TU = TFromD<decltype(du)>;
3391 const
size_t N = Lanes(du);
3393 detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
3394 return TableLookupLanes(v,
idx);
3410#define HWY_RVV_IF_SAME_T_DV(D, V) \
3411 hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr
3414template <
class D,
class V,
3418 const Half<
decltype(d_from)> dh_from;
3421 "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
3424 "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
3425 "VFromD<decltype(dh_from)>");
3430template <
class D,
class V,
3434 const Twice<
decltype(d_from)> dt_from;
3436 "The LMUL of VFromD<decltype(dt_from)> must be greater than "
3440 "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
3441 "VFromD<decltype(dt_from)>");
3442 return ChangeLMUL(
d, Ext(dt_from, v));
3445#undef HWY_RVV_IF_SAME_T_DV
3449template <
class DTo,
class VFrom>
3451 const DFromV<
decltype(v)> d_from;
3452 const Repartition<uint8_t,
decltype(d_from)> du8_from;
3454 const Repartition<uint8_t,
decltype(d_to)> du8_to;
3455 return BitCast(d_to, detail::ChangeLMUL(du8_to,
BitCast(du8_from, v)));
3461#ifdef HWY_NATIVE_REVERSE2_8
3462#undef HWY_NATIVE_REVERSE2_8
3464#define HWY_NATIVE_REVERSE2_8
3470template <
class D, HWY_IF_T_SIZE_D(D, 1)>
3472 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint16_t, D>> du16;
3473 return ResizeBitCast(d, RotateRight<8>(ResizeBitCast(du16, v)));
3476template <
class D, HWY_IF_T_SIZE_D(D, 2)>
3478 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
3484template <
class D, HWY_IF_T_SIZE_D(D, 4)>
3486 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
3490template <
class D,
class V = VFromD<D>, HWY_IF_T_SIZE_D(D, 8)>
3492 const V up = detail::Slide1Up(v);
3493 const V down = detail::Slide1Down(v);
3499template <
class D, HWY_IF_T_SIZE_D(D, 1)>
3501 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint16_t, D>> du16;
3502 return ResizeBitCast(d, Reverse2(du16, ResizeBitCast(du16, Reverse2(d, v))));
3505template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3507 const RebindToUnsigned<D> du;
3508 const auto idx = detail::XorS(detail::Iota0(du), 3);
3509 return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
3514template <
class D, HWY_IF_T_SIZE_D(D, 1)>
3516 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
3520template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3522 const RebindToUnsigned<D> du;
3523 const auto idx = detail::XorS(detail::Iota0(du), 7);
3528template <
class D,
class V = VFromD<D>>
3530 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
3531 const size_t N =
Lanes(du64);
3533 detail::ReverseSubS(detail::Iota0(du64),
static_cast<uint64_t
>(N - 1));
3535 const auto idx = detail::XorS(rev, 1);
3542#ifdef HWY_NATIVE_COMPRESS8
3543#undef HWY_NATIVE_COMPRESS8
3545#define HWY_NATIVE_COMPRESS8
3548template <
typename T>
3549struct CompressIsPartition {
3553#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3554 SHIFT, MLEN, NAME, OP) \
3555 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3556 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
3557 return __riscv_v##OP##_vm_##CHAR##SEW##LMUL(v, mask, \
3558 HWY_RVV_AVL(SEW, SHIFT)); \
3562#undef HWY_RVV_COMPRESS
3566#ifdef HWY_NATIVE_EXPAND
3567#undef HWY_NATIVE_EXPAND
3569#define HWY_NATIVE_EXPAND
3573template <
class V,
class M, HWY_IF_NOT_T_SIZE_V(V, 1)>
3577 const auto idx = detail::MaskedIota(du,
RebindMask(du, mask));
3578 const V zero =
Zero(
d);
3579 return detail::MaskedTableLookupLanes(mask, zero, v, idx);
3583template <
class V,
class M, HWY_IF_T_SIZE_V(V, 1),
class D = DFromV<V>,
3584 HWY_IF_POW2_LE_D(D, 2)>
3587 const Rebind<uint16_t,
decltype(
d)> du16;
3588 const auto idx = detail::MaskedIota(du16,
RebindMask(du16, mask));
3589 const V zero =
Zero(
d);
3590 return detail::MaskedTableLookupLanes16(mask, zero, v, idx);
3594template <
class V,
class M, HWY_IF_T_SIZE_V(V, 1),
class D = DFromV<V>,
3595 HWY_IF_POW2_GT_D(DFromV<V>, 2)>
3596HWY_API V Expand(V v,
const M mask) {
3599 const auto v0 = LowerHalf(dh, v);
3601 const V vmask = VecFromMask(d, mask);
3602 const auto m0 = MaskFromVec(LowerHalf(dh, vmask));
3605 const size_t count = CountTrue(dh, m0);
3606 const auto v1 = detail::Trunc(detail::SlideDown(v, count));
3607 const auto m1 = MaskFromVec(UpperHalf(dh, vmask));
3608 return Combine(d, Expand(v1, m1), Expand(v0, m0));
3615 return Expand(LoadU(d, unaligned), mask);
3619template <
class V,
class M>
3621 return Compress(v,
Not(mask));
3625template <
class V,
class M>
3627 return CompressNot(v, mask);
3631template <
class V,
class M,
class D>
3634 StoreU(Compress(v, mask),
d, unaligned);
3635 return CountTrue(
d, mask);
3639template <
class V,
class M,
class D>
3642 const size_t count = CountTrue(
d, mask);
3643 StoreN(Compress(v, mask),
d, unaligned, count);
3652HWY_API intptr_t FindLastTrue(D d, MFromD<D> m) {
3653 const RebindToSigned<
decltype(d)> di;
3654 const intptr_t fft_rev_idx =
3655 FindFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m))));
3656 return (fft_rev_idx >= 0)
3657 ? (
static_cast<intptr_t
>(Lanes(d) - 1) - fft_rev_idx)
3664 const size_t fft_rev_idx =
3666 return Lanes(d) - 1 - fft_rev_idx;
3673#define HWY_RVV_NARROW(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
3675 template <size_t kShift> \
3676 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEWD, LMULD) v) { \
3677 return __riscv_v##OP##_wx_##CHAR##SEW##LMUL(v, kShift, \
3678 HWY_RVV_AVL(SEWD, SHIFT + 1)); \
3684#undef HWY_RVV_NARROW
3689template <
class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
3691 constexpr size_t kBits =
sizeof(
TFromD<D>) * 8;
3692 const Twice<
decltype(
d)> dt;
3695 return BitCast(
d, detail::Narrow<kBits>(hl));
3699template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
3701 const Twice<
decltype(d)> dt;
3702 const VFromD<
decltype(dt)> hl = Combine(dt, hi, lo);
3703 return LowerHalf(d, Compress(hl, detail::IsOdd(dt)));
3707template <
class D, HWY_IF_POW2_GT_D(D, 2)>
3709 const Half<
decltype(d)> dh;
3710 const MFromD<D> is_odd = detail::IsOdd(d);
3711 const VFromD<
decltype(d)> hi_odd = Compress(hi, is_odd);
3712 const VFromD<
decltype(d)> lo_odd = Compress(lo, is_odd);
3713 return Combine(d, LowerHalf(dh, hi_odd), LowerHalf(dh, lo_odd));
3719template <
class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
3721 const Twice<
decltype(
d)> dt;
3724 return BitCast(
d, detail::Narrow<0>(hl));
3728template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
3730 const Twice<
decltype(d)> dt;
3731 const VFromD<
decltype(dt)> hl = Combine(dt, hi, lo);
3732 return LowerHalf(d, Compress(hl, detail::IsEven(dt)));
3736template <
class D, HWY_IF_POW2_GT_D(D, 2)>
3738 const Half<
decltype(d)> dh;
3739 const MFromD<D> is_even = detail::IsEven(d);
3740 const VFromD<
decltype(d)> hi_even = Compress(hi, is_even);
3741 const VFromD<
decltype(d)> lo_even = Compress(lo, is_even);
3742 return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even));
3748template <
size_t kBytes,
class D,
class V = VFromD<D>>
3751 const auto hi8 =
BitCast(d8, hi);
3752 const auto lo8 =
BitCast(d8, lo);
3753 const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
3754 const auto lo_down = detail::SlideDown(lo8, kBytes);
3755 const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
3760template <
size_t kLanes,
class D,
class V = VFromD<D>>
3762 constexpr size_t kLanesUp = 16 /
sizeof(
TFromV<V>) - kLanes;
3763 const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
3764 const auto lo_down = detail::SlideDown(lo, kLanes);
3765 const auto is_lo = detail::FirstNPerBlock<kLanesUp>(
d);
3771HWY_API V Shuffle2301(
const V v) {
3773 static_assert(
sizeof(TFromD<
decltype(d)>) == 4,
"Defined for 32-bit types");
3774 const Repartition<uint64_t,
decltype(d)> du64;
3775 const auto v64 = BitCast(du64, v);
3776 return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
3781HWY_API V Shuffle2103(
const V v) {
3783 static_assert(
sizeof(TFromD<
decltype(d)>) == 4,
"Defined for 32-bit types");
3784 return CombineShiftRightLanes<3>(d, v, v);
3791 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
3792 return CombineShiftRightLanes<1>(d, v, v);
3799 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 4,
"Defined for 32-bit types");
3800 return CombineShiftRightLanes<2>(d, v, v);
3807 static_assert(
sizeof(
TFromD<
decltype(
d)>) == 8,
"Defined for 64-bit types");
3808 return CombineShiftRightLanes<1>(d, v, v);
3819template <
class VT,
class VI>
3829 constexpr int kPow2T = dt8.
Pow2();
3830 constexpr int kPow2I = di8.Pow2();
3832 const auto vmt = detail::ChangeLMUL(dm8,
BitCast(dt8, vt));
3833 const auto vmi = detail::ChangeLMUL(dm8,
BitCast(di8, vi));
3834 auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8));
3837 if (kPow2T < kPow2I) {
3838 offsets = detail::AndS(offsets,
static_cast<uint8_t
>(
Lanes(dt8) - 1));
3841 return BitCast(di, detail::ChangeLMUL(di8, out));
3844template <
class VT,
class VI>
3848 const auto idx8 =
BitCast(di8, idx);
3857template <
class D, HWY_IF_POW2_LE_D(D, 2)>
3860 const Twice<
decltype(
d)> dt;
3862 const auto combined_tbl =
Combine(dt, b, a);
3863 const auto combined_idx =
Combine(dt_u, idx, idx);
3867template <
class D, HWY_IF_POW2_GT_D(D, 2)>
3869 VFromD<RebindToUnsigned<D>> idx) {
3870 const RebindToUnsigned<
decltype(d)> du;
3871 using TU = TFromD<
decltype(du)>;
3873 const size_t num_of_lanes = Lanes(d);
3874 const auto idx_mod = detail::AndS(idx,
static_cast<TU
>(num_of_lanes - 1));
3875 const auto sel_a_mask = Ne(idx, idx_mod);
3877 const auto a_lookup_result = TableLookupLanes(a, idx_mod);
3878 return detail::MaskedTableLookupLanes(sel_a_mask, a_lookup_result, b,
3883HWY_API V TwoTablesLookupLanes(V a, V b,
3884 VFromD<RebindToUnsigned<DFromV<V>>> idx) {
3885 const DFromV<
decltype(a)> d;
3886 return TwoTablesLookupLanes(d, a, b, idx);
3892template <
int kLane,
class V,
class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
3893 HWY_IF_POW2_LE_D(D, 2)>
3896 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(
d));
3898 const Rebind<uint16_t,
decltype(
d)> du16;
3899 VFromD<
decltype(du16)> idx =
3900 detail::OffsetsOf128BitBlocks(
d, detail::Iota0(du16));
3902 idx = detail::AddS(idx, kLane);
3904 return detail::TableLookupLanes16(v, idx);
3908template <
int kLane,
class V,
class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
3909 HWY_IF_POW2_GT_D(D, 2)>
3910HWY_API V Broadcast(
const V v) {
3912 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
3914 const Half<
decltype(d)> dh;
3915 using VH =
VFromD<
decltype(dh)>;
3916 const Rebind<uint16_t,
decltype(dh)> du16;
3917 VFromD<
decltype(du16)> idx =
3918 detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
3920 idx = detail::AddS(idx, kLane);
3922 const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
3923 const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
3924 return Combine(d, hi, lo);
3927template <
int kLane,
class V,
class D = DFromV<V>,
3928 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
3929HWY_API V Broadcast(const V v) {
3931 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
3933 const RebindToUn
signed<decltype(d)> du;
3934 auto
idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
3936 idx = detail::AddS(
idx, kLane);
3938 return TableLookupLanes(v,
idx);
3942#ifdef HWY_NATIVE_BROADCASTLANE
3943#undef HWY_NATIVE_BROADCASTLANE
3945#define HWY_NATIVE_BROADCASTLANE
3950#define HWY_RVV_BROADCAST_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
3951 LMULH, SHIFT, MLEN, NAME, OP) \
3952 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3953 NAME(HWY_RVV_V(BASE, SEW, LMUL) v,
size_t idx) { \
3954 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v,
idx, \
3955 HWY_RVV_AVL(SEW, SHIFT)); \
3958HWY_RVV_FOREACH(HWY_RVV_BROADCAST_LANE, BroadcastLane, rgather, _ALL)
3959#undef HWY_RVV_BROADCAST_LANE
3963template <
int kLane,
class V>
3965 static_assert(0 <= kLane && kLane <
HWY_MAX_LANES_V(V),
"Invalid lane");
3966 return detail::BroadcastLane(v,
static_cast<size_t>(kLane));
3970#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
3971#undef HWY_NATIVE_BLK_INSERT_EXTRACT
3973#define HWY_NATIVE_BLK_INSERT_EXTRACT
3976template <
int kBlockIdx,
class V>
3981 using TIdx =
If<
sizeof(TU) == 1, uint16_t, TU>;
3984 const Rebind<TIdx,
decltype(du)> d_idx;
3985 static_assert(0 <= kBlockIdx && kBlockIdx <
d.MaxBlocks(),
3986 "Invalid block index");
3987 constexpr size_t kMaxLanesPerBlock = 16 /
sizeof(TU);
3989 constexpr size_t kBlkByteOffset =
3990 static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock;
3991 const auto vu =
BitCast(du, v);
3993 const auto vblk_shifted = detail::SlideUp(vblk, vblk, kBlkByteOffset);
3995 du, detail::LtS(detail::SubS(detail::Iota0(d_idx),
3996 static_cast<TIdx
>(kBlkByteOffset)),
3997 static_cast<TIdx
>(kMaxLanesPerBlock)));
4003template <
int kBlockIdx,
class V, HWY_IF_POW2_LE_D(DFromV<V>, -3)>
4004HWY_API V BroadcastBlock(V v) {
4005 const DFromV<
decltype(v)> d;
4006 const Repartition<uint8_t,
decltype(d)> du8;
4007 const Rebind<uint16_t,
decltype(d)> du16;
4009 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
4010 "Invalid block index");
4012 const auto idx = detail::AddS(detail::AndS(detail::Iota0(du16), uint16_t{15}),
4013 static_cast<uint16_t
>(kBlockIdx * 16));
4014 return BitCast(d, detail::TableLookupLanes16(BitCast(du8, v), idx));
4017template <
int kBlockIdx,
class V, HWY_IF_POW2_GT_D(DFromV<V>, -3)>
4019 const DFromV<
decltype(v)> d;
4020 using TU =
If<
sizeof(TFromV<V>) == 1, uint16_t,
MakeUnsigned<TFromV<V>>>;
4023 static_assert(0 <= kBlockIdx && kBlockIdx <
d.MaxBlocks(),
4024 "Invalid block index");
4025 constexpr size_t kMaxLanesPerBlock = 16 /
sizeof(TU);
4027 const auto idx = detail::AddS(
4028 detail::AndS(detail::Iota0(du),
static_cast<TU
>(kMaxLanesPerBlock - 1)),
4029 static_cast<TU
>(
static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock));
4034template <
int kBlockIdx,
class V>
4039 static_assert(0 <= kBlockIdx && kBlockIdx <
d.MaxBlocks(),
4040 "Invalid block index");
4041 constexpr size_t kMaxLanesPerBlock = 16 /
sizeof(
TFromD<
decltype(
d)>);
4042 constexpr size_t kBlkByteOffset =
4043 static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock;
4045 return ResizeBitCast(d_block, detail::SlideDown(v, kBlkByteOffset));
4050template <
size_t kLanes,
class D,
class V = VFromD<D>>
4051HWY_API V ShiftLeftLanes(
const D d,
const V v) {
4052 const RebindToSigned<
decltype(d)> di;
4053 const RebindToUnsigned<
decltype(d)> du;
4054 using TI = TFromD<
decltype(di)>;
4055 const auto shifted = detail::SlideUp(v, v, kLanes);
4057 const auto idx_mod =
4058 detail::AndS(BitCast(di, detail::Iota0(du)),
4059 static_cast<TI
>(detail::LanesPerBlock(di) - 1));
4060 const auto clear = detail::LtS(idx_mod,
static_cast<TI
>(kLanes));
4061 return IfThenZeroElse(clear, shifted);
4064template <
size_t kLanes,
class V>
4065HWY_API V ShiftLeftLanes(
const V v) {
4066 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
4071template <
int kBytes,
class D>
4077template <
int kBytes,
class V>
4079 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
4083template <
size_t kLanes,
typename T,
size_t N,
int kPow2,
4088 using TI =
TFromD<
decltype(di)>;
4090 if (N <= 16 /
sizeof(T)) {
4091 v = detail::SlideUp(v,
Zero(
d), N);
4094 const auto shifted = detail::SlideDown(v, kLanes);
4096 const size_t lpb = detail::LanesPerBlock(di);
4097 const auto idx_mod =
4098 detail::AndS(
BitCast(di, detail::Iota0(du)),
static_cast<TI
>(lpb - 1));
4099 const auto keep = detail::LtS(idx_mod,
static_cast<TI
>(lpb - kLanes));
4104template <
int kBytes,
class D,
class V = VFromD<D>>
4105HWY_API V ShiftRightBytes(
const D d,
const V v) {
4106 const Repartition<uint8_t,
decltype(d)> d8;
4107 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
4111#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
4112#undef HWY_NATIVE_INTERLEAVE_WHOLE
4114#define HWY_NATIVE_INTERLEAVE_WHOLE
4125 const Half<
decltype(du)> duh;
4132template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
4135 const auto idx = ShiftRight<1>(detail::Iota0(du));
4139template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
4142 const Half<
decltype(dh)> dq;
4143 const VFromD<
decltype(dh)> i0 =
4144 InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
4145 const VFromD<
decltype(dh)> i1 =
4146 InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
4147 return Combine(d, i1, i0);
4152template <
class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
4153HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4154 const RebindToUn
signed<decltype(d)> du;
4155 const detail::AdjustSimdTagToMinVecPow2<RepartitionToW
ide<decltype(du)>> dw;
4156 const RepartitionToNarrow<decltype(dw)> du_src;
4158 const VFromD<D> aw =
4159 ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
4160 const VFromD<D> bw =
4161 ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
4162 return Or(aw, detail::Sl
ide1Up(bw));
4165template <
class D, HWY_IF_T_SIZE_D(D, 8)>
4168 const auto idx = ShiftRight<1>(detail::Iota0(du));
4174template <
class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
4175HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4180 const
size_t half_N = Lanes(d) / 2;
4181 return InterleaveWholeLower(d, detail::Sl
ideDown(a, half_N),
4182 detail::Sl
ideDown(b, half_N));
4185template <
class D, HWY_IF_T_SIZE_D(D, 8)>
4191 const size_t half_N =
Lanes(
d) / 2;
4193 const auto idx = detail::AddS(ShiftRight<1>(detail::Iota0(du)),
4194 static_cast<uint64_t
>(half_N));
4204template <
class D,
class V, HWY_IF_POW2_LE_D(D, 2)>
4206 static_assert(IsSame<TFromD<D>,
TFromV<V>>(),
"D/V mismatch");
4209 const VFromD<
decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
4213 const VFromD<
decltype(dt_u)> idx_block =
4214 ShiftRight<kShift>(detail::Iota0(dt_u));
4215 const MFromD<
decltype(dt_u)> is_even =
4216 detail::EqS(detail::AndS(idx_block, 1), 0);
4219template <
class D,
class V, HWY_IF_POW2_GT_D(D, 2)>
4220HWY_INLINE V InterleaveLowerBlocks(D d,
const V a,
const V b) {
4222 const VFromD<
decltype(dh)> i0 =
4224 const VFromD<
decltype(dh)> i1 =
4230template <
class D,
class V, HWY_IF_POW2_LE_D(D, 2)>
4232 static_assert(IsSame<TFromD<D>,
TFromV<V>>(),
"D/V mismatch");
4235 const VFromD<
decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
4239 const VFromD<
decltype(dt_u)> idx_block =
4240 ShiftRight<kShift>(detail::Iota0(dt_u));
4241 const MFromD<
decltype(dt_u)> is_odd =
4242 detail::EqS(detail::AndS(idx_block, 1), 1);
4245template <
class D,
class V, HWY_IF_POW2_GT_D(D, 2)>
4246HWY_INLINE V InterleaveUpperBlocks(D d,
const V a,
const V b) {
4248 const VFromD<
decltype(dh)> i0 =
4250 const VFromD<
decltype(dh)> i1 =
4257template <
typename T,
size_t N,
int kPow2>
4259 return N *
sizeof(T) >= 16 && kPow2 >= 0;
4264template <
typename T,
size_t N,
int kPow2>
4266 return N *
sizeof(T) < 16;
// Overload selectors keyed on the compile-time block-size checks above;
// CAN128 matches descriptors where neither bound is known at compile time.
#define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr
#define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr
#define HWY_RVV_IF_CAN128_D(D) \
  hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr
4276template <
class D,
class V, HWY_RVV_IF_GE128_D(D)>
4278 return detail::InterleaveLowerBlocks(
d, a, b);
4282template <
class D,
class V, HWY_RVV_IF_LT128_D(D)>
4283HWY_API V InterleaveLower(D d,
const V a,
const V b) {
4284 static_assert(IsSame<TFromD<D>, TFromV<V>>(),
"D/V mismatch");
4285 return InterleaveWholeLower(d, a, b);
4289template <
class D,
class V, HWY_RVV_IF_CAN128_D(D)>
4290HWY_API V InterleaveLower(D d,
const V a,
const V b) {
4291 if (Lanes(d) *
sizeof(TFromD<D>) <= 16) {
4292 return InterleaveWholeLower(d, a, b);
4295 const ScalableTag<TFromD<D>,
HWY_MAX(
d.Pow2(), 0)> d1;
4307template <
class D,
class V, HWY_RVV_IF_GE128_D(D)>
4309 return detail::InterleaveUpperBlocks(d, a, b);
4313template <
class D,
class V, HWY_RVV_IF_LT128_D(D)>
4315 static_assert(IsSame<TFromD<D>, TFromV<V>>(),
"D/V mismatch");
4320template <
class D,
class V, HWY_RVV_IF_CAN128_D(D)>
4322 if (
Lanes(d) *
sizeof(TFromD<D>) <= 16) {
4326 const ScalableTag<TFromD<D>,
HWY_MAX(
d.Pow2(), 0)> d1;
4333template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4335 const RepartitionToNarrow<DW> dn;
4336 static_assert(
IsSame<
TFromD<
decltype(dn)>, TFromV<V>>(),
"D/V mismatch");
4340template <
class V,
class DW = RepartitionToW
ide<DFromV<V>>>
4346template <
class DW,
class V>
4348 const RepartitionToNarrow<DW> dn;
4349 static_assert(
IsSame<
TFromD<
decltype(dn)>, TFromV<V>>(),
"D/V mismatch");
4356#ifdef HWY_NATIVE_REDUCE_SCALAR
4357#undef HWY_NATIVE_REDUCE_SCALAR
4359#define HWY_NATIVE_REDUCE_SCALAR
4363#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4365 template <size_t N> \
4366 HWY_API HWY_RVV_T(BASE, SEW) \
4367 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v, \
4368 HWY_RVV_V(BASE, SEW, m1) v0) { \
4369 return GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
4370 v, v0, Lanes(d))); \
4376#undef HWY_IF_REDUCE_D
4377#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
4379#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
4380#undef HWY_NATIVE_REDUCE_SUM_4_UI8
4382#define HWY_NATIVE_REDUCE_SUM_4_UI8
4385#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
4386#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
4388#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
4398template <
class D, HWY_IF_REDUCE_D(D)>
4400 const auto v0 =
Zero(ScalableTag<TFromD<D>>());
4401 return detail::RedSum(d, v, v0);
4411template <
class D,
typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
4413 const ScalableTag<T> d1;
4414 return detail::RedMin(d, v,
Set(d1, HighestValue<T>()));
4424template <
class D,
typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
4426 const ScalableTag<T> d1;
4427 return detail::RedMax(d, v,
Set(d1, LowestValue<T>()));
4430#undef HWY_RVV_REDUCE
4434template <
class D, HWY_IF_LANES_GT_D(D, 1)>
4438template <
class D, HWY_IF_LANES_GT_D(D, 1)>
4442template <
class D, HWY_IF_LANES_GT_D(D, 1)>
4452#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
4453#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
4455#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
4461#define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4463 template <size_t kIndex> \
4464 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
4465 NAME##2(HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup) { \
4466 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x2_##CHAR##SEW##LMUL(tup, \
4469 template <size_t kIndex> \
4470 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
4471 NAME##3(HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup) { \
4472 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x3_##CHAR##SEW##LMUL(tup, \
4475 template <size_t kIndex> \
4476 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
4477 NAME##4(HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup) { \
4478 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x4_##CHAR##SEW##LMUL(tup, \
4485#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4487 template <size_t kIndex> \
4488 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) NAME##2( \
4489 HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \
4490 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x2( \
4493 template <size_t kIndex> \
4494 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3( \
4495 HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \
4496 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x3( \
4499 template <size_t kIndex> \
4500 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) NAME##4( \
4501 HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \
4502 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x4( \
4510#define HWY_RVV_CREATE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4512 template <size_t N> \
4513 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) \
4514 NAME##2(HWY_RVV_D(BASE, SEW, N, SHIFT) , \
4515 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1) { \
4516 HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup{}; \
4517 tup = Set2<0>(tup, v0); \
4518 tup = Set2<1>(tup, v1); \
4521 template <size_t N> \
4522 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3( \
4523 HWY_RVV_D(BASE, SEW, N, SHIFT) , HWY_RVV_V(BASE, SEW, LMUL) v0, \
4524 HWY_RVV_V(BASE, SEW, LMUL) v1, HWY_RVV_V(BASE, SEW, LMUL) v2) { \
4525 HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup{}; \
4526 tup = Set3<0>(tup, v0); \
4527 tup = Set3<1>(tup, v1); \
4528 tup = Set3<2>(tup, v2); \
4531 template <size_t N> \
4532 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) \
4533 NAME##4(HWY_RVV_D(BASE, SEW, N, SHIFT) , \
4534 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
4535 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3) { \
4536 HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup{}; \
4537 tup = Set4<0>(tup, v0); \
4538 tup = Set4<1>(tup, v1); \
4539 tup = Set4<2>(tup, v2); \
4540 tup = Set4<3>(tup, v3); \
4545#undef HWY_RVV_CREATE
4548using Vec2 =
decltype(
Create2(D(), Zero(D()), Zero(D())));
4550using Vec3 =
decltype(
Create3(D(), Zero(D()), Zero(D()), Zero(D())));
4552using Vec4 =
decltype(
Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));
4554#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4556 template <size_t N> \
4557 HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4558 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
4559 HWY_RVV_V(BASE, SEW, LMUL) & v0, \
4560 HWY_RVV_V(BASE, SEW, LMUL) & v1) { \
4561 const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = \
4562 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, Lanes(d)); \
4563 v0 = Get2<0>(tup); \
4564 v1 = Get2<1>(tup); \
4572#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4574 template <size_t N> \
4575 HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4576 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
4577 HWY_RVV_V(BASE, SEW, LMUL) & v0, \
4578 HWY_RVV_V(BASE, SEW, LMUL) & v1, \
4579 HWY_RVV_V(BASE, SEW, LMUL) & v2) { \
4580 const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = \
4581 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, Lanes(d)); \
4582 v0 = Get3<0>(tup); \
4583 v1 = Get3<1>(tup); \
4584 v2 = Get3<2>(tup); \
4592#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4594 template <size_t N> \
4595 HWY_API void NAME( \
4596 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4597 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
4598 HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \
4599 HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \
4600 const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = \
4601 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, Lanes(d)); \
4602 v0 = Get4<0>(tup); \
4603 v1 = Get4<1>(tup); \
4604 v2 = Get4<2>(tup); \
4605 v3 = Get4<3>(tup); \
4613#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4615 template <size_t N> \
4616 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \
4617 HWY_RVV_V(BASE, SEW, LMUL) v1, \
4618 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4619 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
4620 const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = Create2(d, v0, v1); \
4621 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, tup, Lanes(d)); \
4625#undef HWY_RVV_STORE2
4629#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4631 template <size_t N> \
4632 HWY_API void NAME( \
4633 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
4634 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4635 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
4636 const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = Create3(d, v0, v1, v2); \
4637 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, tup, Lanes(d)); \
4641#undef HWY_RVV_STORE3
4645#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4647 template <size_t N> \
4648 HWY_API void NAME( \
4649 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
4650 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \
4651 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4652 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
4653 const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = Create4(d, v0, v1, v2, v3); \
4654 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, tup, Lanes(d)); \
4658#undef HWY_RVV_STORE4
4662template <
class D,
typename T = TFromD<D>>
4665 const VFromD<D> A = LoadU(d, unaligned);
4666 const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
4667 v0 = ConcatEven(d, B, A);
4668 v1 = ConcatOdd(d, B, A);
4672#define HWY_RVV_LOAD_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
4673 SHIFT, MLEN, NAME, OP) \
4674 template <size_t N> \
4675 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
4676 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4677 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \
4678 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
4679 p, static_cast<ptrdiff_t>(stride), Lanes(d)); \
4682#undef HWY_RVV_LOAD_STRIDED
4685template <
class D,
typename T = TFromD<D>>
4689 v0 = detail::LoadStrided(
d, unaligned + 0, 3 *
sizeof(T));
4690 v1 = detail::LoadStrided(
d, unaligned + 1, 3 *
sizeof(T));
4691 v2 = detail::LoadStrided(
d, unaligned + 2, 3 *
sizeof(T));
4694template <
class D,
typename T = TFromD<D>>
4699 v0 = detail::LoadStrided(d, unaligned + 0, 4 *
sizeof(T));
4700 v1 = detail::LoadStrided(d, unaligned + 1, 4 *
sizeof(T));
4701 v2 = detail::LoadStrided(d, unaligned + 2, 4 *
sizeof(T));
4702 v3 = detail::LoadStrided(d, unaligned + 3, 4 *
sizeof(T));
4706template <
class D,
typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
4707 HWY_IF_POW2_LE_D(D, 2)>
4712 const Twice<
decltype(
d)> dt;
4717 StoreU(
Or(w0, detail::Slide1Up(w1)), dt, unaligned);
4721template <
class D,
typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
4722 HWY_IF_POW2_GT_D(D, 2)>
4725 const Half<
decltype(d)> dh;
4726 StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), d, unaligned);
4727 StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), d,
4728 unaligned + Lanes(d));
4732#define HWY_RVV_STORE_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
4733 SHIFT, MLEN, NAME, OP) \
4734 template <size_t N> \
4735 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
4736 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4737 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \
4738 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
4739 p, static_cast<ptrdiff_t>(stride), v, Lanes(d)); \
4742#undef HWY_RVV_STORE_STRIDED
4746template <
class D,
typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8)>
4750 detail::StoreStrided(v0,
d, unaligned + 0, 2 *
sizeof(T));
4751 detail::StoreStrided(v1,
d, unaligned + 1, 2 *
sizeof(T));
4754template <
class D,
typename T = TFromD<D>>
4758 detail::StoreStrided(v0,
d, unaligned + 0, 3 *
sizeof(T));
4759 detail::StoreStrided(v1,
d, unaligned + 1, 3 *
sizeof(T));
4760 detail::StoreStrided(v2,
d, unaligned + 2, 3 *
sizeof(T));
4763template <
class D,
typename T = TFromD<D>>
4767 detail::StoreStrided(v0,
d, unaligned + 0, 4 *
sizeof(T));
4768 detail::StoreStrided(v1,
d, unaligned + 1, 4 *
sizeof(T));
4769 detail::StoreStrided(v2,
d, unaligned + 2, 4 *
sizeof(T));
4770 detail::StoreStrided(v3,
d, unaligned + 3, 4 *
sizeof(T));
4777template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
4782template <
class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
4784 const auto even_lanes = Set(d, t0);
4785#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
4786 if (__builtin_constant_p(BitCastScalar<uint64_t>(t0) ==
4787 BitCastScalar<uint64_t>(t1)) &&
4788 (BitCastScalar<uint64_t>(t0) == BitCastScalar<uint64_t>(t1))) {
4793 const auto odd_lanes =
Set(d, t1);
4794 return OddEven(odd_lanes, even_lanes);
4799#pragma pack(push, 1)
4802struct alignas(8) Vec64ValsWrapper {
4803 static_assert(
sizeof(T) >= 1,
"sizeof(T) >= 1 must be true");
4804 static_assert(
sizeof(T) <= 8,
"sizeof(T) <= 8 must be true");
4805 T vals[8 /
sizeof(T)];
4812template <
class D, HWY_IF_T_SIZE_D(D, 1)>
4814 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
4815 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
4816 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
4817 TFromD<D> t11, TFromD<D> t12,
4818 TFromD<D> t13, TFromD<D> t14,
4820 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4824 BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
4825 {t0, t1, t2, t3, t4, t5, t6, t7}}),
4826 BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
4827 {t8, t9, t10, t11, t12, t13, t14, t15}})));
4830template <
class D, HWY_IF_T_SIZE_D(D, 2)>
4832 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
4833 TFromD<D> t5, TFromD<D> t6,
4835 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4839 BitCastScalar<uint64_t>(
4840 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}}),
4841 BitCastScalar<uint64_t>(
4842 detail::Vec64ValsWrapper<TFromD<D>>{{t4, t5, t6, t7}})));
4845template <
class D, HWY_IF_T_SIZE_D(D, 4)>
4847 TFromD<D> t2, TFromD<D> t3) {
4848 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4852 BitCastScalar<uint64_t>(
4853 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}}),
4854 BitCastScalar<uint64_t>(
4855 detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}})));
4861template <
typename V,
class D = DFromV<V>, HWY_IF_U8_D(D),
4862 hwy::EnableIf<D().Pow2() < 1 || D().MaxLanes() < 16>* =
nullptr>
4863HWY_API V PopulationCount(V v) {
4865 v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55));
4866 v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33));
4867 return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F);
4877 constexpr int kLoadPow2 =
d.Pow2();
4878 constexpr size_t kMaxLanesToLoad =
4880 constexpr size_t kLoadN = D::template NewN<kLoadPow2, kMaxLanesToLoad>();
4881 const Simd<TFromD<D>, kLoadN, kLoadPow2> d_load;
4882 static_assert(d_load.MaxBytes() <= 16,
4883 "d_load.MaxBytes() <= 16 must be true");
4884 static_assert((
d.MaxBytes() < 16) || (d_load.MaxBytes() == 16),
4885 "d_load.MaxBytes() == 16 must be true if d.MaxBytes() >= 16 is "
4887 static_assert((
d.MaxBytes() >= 16) || (d_load.MaxBytes() ==
d.MaxBytes()),
4888 "d_load.MaxBytes() == d.MaxBytes() must be true if "
4889 "d.MaxBytes() < 16 is true");
4892 if (
d.MaxBytes() <= 16)
return loaded;
4895 using TU =
TFromD<
decltype(du)>;
4896 const TU mask =
static_cast<TU
>(detail::LanesPerBlock(d) - 1);
4915 64, detail::ScaleByPower(8 *
sizeof(TFromD<D>), -D().Pow2()))>;
4917#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
4918 HWY_INLINE HWY_RVV_M(MLEN) \
4919 NAME(hwy::SizeTag<MLEN> , const uint8_t* bits, size_t N) { \
4920 return __riscv_v##OP##_v_b##MLEN(bits, N); \
4923#undef HWY_RVV_LOAD_MASK_BITS
4926template <
class D,
class MT = detail::MaskTag<D>>
4928 ->
decltype(detail::LoadMaskBits(MT(), bits,
Lanes(d))) {
4929 return detail::LoadMaskBits(MT(), bits,
Lanes(d));
4933#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
4934 template <class D> \
4935 HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \
4936 const size_t N = Lanes(d); \
4937 __riscv_v##OP##_v_b##MLEN(bits, m, N); \
4940 constexpr bool kLessThan8 = \
4941 detail::ScaleByPower(16 / sizeof(TFromD<D>), d.Pow2()) < 8; \
4942 if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \
4943 const int mask = (1 << N) - 1; \
4944 bits[0] = static_cast<uint8_t>(bits[0] & mask); \
4946 return (N + 7) / 8; \
4949#undef HWY_RVV_STORE_MASK_BITS
4970template <
class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
4972 const RebindToUnsigned<D> du;
4973 using TU =
TFromD<
decltype(du)>;
4974 return RebindMask(d, detail::LtS(detail::Iota0(du),
static_cast<TU
>(n)));
4977template <
class D, HWY_IF_T_SIZE_D(D, 1)>
4979 const auto zero =
Zero(d);
4980 const auto one =
Set(d, 1);
4981 return Eq(detail::SlideUp(one, zero, n), one);
4986#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
4998HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool1_t m) {
4999 return __riscv_vreinterpret_v_b1_u8m1(m);
5002HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool2_t m) {
5003 return __riscv_vreinterpret_v_b2_u8m1(m);
5006HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool4_t m) {
5007 return __riscv_vreinterpret_v_b4_u8m1(m);
5010HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool8_t m) {
5011 return __riscv_vreinterpret_v_b8_u8m1(m);
5014HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool16_t m) {
5015 return __riscv_vreinterpret_v_b16_u8m1(m);
5018HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool32_t m) {
5019 return __riscv_vreinterpret_v_b32_u8m1(m);
5022HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool64_t m) {
5023 return __riscv_vreinterpret_v_b64_u8m1(m);
5026template <
class D, hwy::EnableIf<IsSame<MFromD<D>, v
bool1_t>()>* =
nullptr>
5027HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D , vuint8m1_t v) {
5028 return __riscv_vreinterpret_v_u8m1_b1(v);
5031template <
class D, hwy::EnableIf<IsSame<MFromD<D>, v
bool2_t>()>* =
nullptr>
5032HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D , vuint8m1_t v) {
5033 return __riscv_vreinterpret_v_u8m1_b2(v);
5036template <
class D, hwy::EnableIf<IsSame<MFromD<D>, v
bool4_t>()>* =
nullptr>
5037HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D , vuint8m1_t v) {
5038 return __riscv_vreinterpret_v_u8m1_b4(v);
5041template <
class D, hwy::EnableIf<IsSame<MFromD<D>, v
bool8_t>()>* =
nullptr>
5042HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D , vuint8m1_t v) {
5043 return __riscv_vreinterpret_v_u8m1_b8(v);
5046template <
class D, hwy::EnableIf<IsSame<MFromD<D>, v
bool16_t>()>* =
nullptr>
5047HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D , vuint8m1_t v) {
5048 return __riscv_vreinterpret_v_u8m1_b16(v);
5051template <
class D, hwy::EnableIf<IsSame<MFromD<D>, v
bool32_t>()>* =
nullptr>
5052HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D , vuint8m1_t v) {
5053 return __riscv_vreinterpret_v_u8m1_b32(v);
5056template <
class D, hwy::EnableIf<IsSame<MFromD<D>, v
bool64_t>()>* =
nullptr>
5057HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D , vuint8m1_t v) {
5058 return __riscv_vreinterpret_v_u8m1_b64(v);
5063#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
5064#undef HWY_NATIVE_LOWER_HALF_OF_MASK
5066#define HWY_NATIVE_LOWER_HALF_OF_MASK
5071 return detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(m));
5074#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
5075#undef HWY_NATIVE_UPPER_HALF_OF_MASK
5077#define HWY_NATIVE_UPPER_HALF_OF_MASK
5082 const size_t N =
Lanes(d);
5084 vuint8m1_t mask_bits = detail::MaskToU8MaskBitsVec(m);
5087 mask_bits =
SlideDownLanes(ScalableTag<uint8_t>(), mask_bits, N / 8);
5090 return detail::U8MaskBitsVecToMask(d, mask_bits);
5095#ifdef HWY_NATIVE_COMBINE_MASKS
5096#undef HWY_NATIVE_COMBINE_MASKS
5098#define HWY_NATIVE_COMBINE_MASKS
5103 const Half<
decltype(
d)> dh;
5104 const size_t half_N =
Lanes(dh);
5106 const auto ext_lo_mask =
5107 And(detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(lo)),
5109 vuint8m1_t hi_mask_bits = detail::MaskToU8MaskBitsVec(hi);
5110 hi_mask_bits =
ShiftLeftSame(hi_mask_bits,
static_cast<int>(half_N & 7));
5113 SlideUpLanes(ScalableTag<uint8_t>(), hi_mask_bits, half_N / 8);
5116 return Or(ext_lo_mask, detail::U8MaskBitsVecToMask(d, hi_mask_bits));
5121#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5122#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5124#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5127template <
class DTo,
class DFrom,
5129 class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
5132 MFromD<DFrom> a, MFromD<DFrom> b) {
5143template <
size_t kN, HWY_IF_LANES_LE(kN, 31)>
5144constexpr unsigned MaxMaskBits() {
5145 return (1u << kN) - 1;
5147template <
size_t kN, HWY_IF_LANES_GT(kN, 31)>
5148constexpr unsigned MaxMaskBits() {
5153template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
5156 if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
5158#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5159 return detail::U8MaskBitsVecToMask(
5160 d,
Set(ScalableTag<uint8_t>(),
static_cast<uint8_t
>(mask_bits)));
5163 const detail::AdjustSimdTagToMinVecPow2<
Repartition<uint64_t,
decltype(du8)>>
5169 uint64_t{0x8040201008040201u}));
5170 return detail::NeS(bytes, uint8_t{0});
5174template <
class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
5176#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5177 const ScalableTag<uint8_t> du8;
5178 const ScalableTag<uint16_t> du16;
5180 return detail::U8MaskBitsVecToMask(
5181 d,
BitCast(du8,
Set(du16,
static_cast<uint16_t
>(mask_bits))));
5186 const detail::AdjustSimdTagToMinVecPow2<
Repartition<uint64_t,
decltype(du8)>>
5191 const auto bytes =
BitCast(du8,
Set(du16,
static_cast<uint16_t
>(mask_bits)));
5193 const auto rep8 =
TableLookupLanes(bytes, ShiftRight<3>(detail::Iota0(du8)));
5197 detail::AndS(
ResizeBitCast(du64, rep8), uint64_t{0x8040201008040201u}));
5198 return detail::NeS(masked_out_rep8, uint8_t{0});
5202template <
class D, HWY_IF_T_SIZE_D(D, 2)>
5205 if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
5207#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5208 const ScalableTag<uint8_t> du8;
5210 return detail::U8MaskBitsVecToMask(d,
5211 Set(du8,
static_cast<uint8_t
>(mask_bits)));
5214 const RebindToUnsigned<D> du;
5215 const VFromD<
decltype(du)> bits =
5216 Shl(
Set(du, uint16_t{1}),
Iota(du, uint16_t{0}));
5217 return TestBit(
Set(du,
static_cast<uint16_t
>(mask_bits)), bits);
5221template <
class D, HWY_IF_T_SIZE_D(D, 4)>
5224 if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();
5226#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5227 const ScalableTag<uint8_t> du8;
5228 return detail::U8MaskBitsVecToMask(
5229 d,
Set(du8,
static_cast<uint8_t
>(mask_bits * 0x11)));
5232 const RebindToUnsigned<D> du;
5233 const VFromD<
decltype(du)> bits =
5234 Shl(
Set(du, uint32_t{1}),
Iota(du, uint32_t{0}));
5235 return TestBit(
Set(du,
static_cast<uint32_t
>(mask_bits)), bits);
5239template <
class D, HWY_IF_T_SIZE_D(D, 8)>
5242 if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();
5244#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5245 const ScalableTag<uint8_t> du8;
5246 return detail::U8MaskBitsVecToMask(
5247 d,
Set(du8,
static_cast<uint8_t
>(mask_bits * 0x55)));
5250 const RebindToUnsigned<D> du;
5252 return TestBit(
Set(du,
static_cast<uint64_t
>(mask_bits)), bits);
5258template <
class V, HWY_IF_SIGNED_V(V)>
5260 return detail::ReverseSubS(v, 0);
5264#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
5265 SHIFT, MLEN, NAME, OP) \
5266 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
5267 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \
5268 HWY_RVV_AVL(SEW, SHIFT)); \
5273#if !HWY_HAVE_FLOAT16
5275template <
class V, HWY_IF_U16_D(DFromV<V>)>
5277 const DFromV<
decltype(v)> d;
5279 using TU =
TFromD<
decltype(du)>;
5287template <
class V, HWY_IF_SIGNED_V(V)>
5294#undef HWY_RVV_RETV_ARGV2
5297template <
class V, HWY_IF_FLOAT_V(V)>
5310enum RoundingModes { kNear, kTrunc, kDown, kUp };
5324 const auto int_f =
ConvertTo(df, integer);
5336 const auto int_f =
ConvertTo(df, integer);
5344 asm volatile(
"fsrm %0" ::
"r"(detail::kUp));
5345 const auto ret =
Round(v);
5346 asm volatile(
"fsrm %0" ::
"r"(detail::kNear));
5353 asm volatile(
"fsrm %0" ::
"r"(detail::kDown));
5354 const auto ret =
Round(v);
5355 asm volatile(
"fsrm %0" ::
"r"(detail::kNear));
5371#ifdef HWY_NATIVE_ISINF
5372#undef HWY_NATIVE_ISINF
5374#define HWY_NATIVE_ISINF
5377template <
class V,
class D = DFromV<V>>
5378HWY_API MFromD<D> IsInf(
const V v) {
5380 const RebindToSigned<
decltype(d)> di;
5381 using T = TFromD<D>;
5382 const VFromD<
decltype(di)> vi = BitCast(di, v);
5384 return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
5388template <
class V,
class D = DFromV<V>>
5393 using T = TFromD<D>;
5398 const VFromD<
decltype(di)> exp =
5399 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(
Add(vu, vu)));
5400 return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>()));
5405template <
class D,
typename T2, HWY_IF_UNSIGNED_D(D)>
5407 return detail::AddS(detail::Iota0(d),
static_cast<TFromD<D>
>(first));
5410template <
class D,
typename T2, HWY_IF_SIGNED_D(D)>
5412 const RebindToUnsigned<D> du;
5413 return detail::AddS(
BitCast(d, detail::Iota0(du)),
5414 static_cast<TFromD<D>
>(first));
5417template <
class D,
typename T2, HWY_IF_FLOAT_D(D)>
5419 const RebindToUnsigned<D> du;
5420 const RebindToSigned<D> di;
5428 class D = DFromV<V>,
class DW = RepartitionToWide<D>>
5430 const auto lo = Mul(a, b);
5431 const auto hi =
MulHigh(a, b);
5436 class D = DFromV<V>,
class DW = RepartitionToWide<D>>
5438 const auto lo = Mul(a, b);
5439 const auto hi =
MulHigh(a, b);
5444template <
class V, HWY_IF_T_SIZE_V(V, 8)>
5446 const auto lo = Mul(a, b);
5447 const auto hi =
MulHigh(a, b);
5448 return OddEven(detail::Slide1Up(hi), lo);
5451template <
class V, HWY_IF_T_SIZE_V(V, 8)>
5453 const auto lo = Mul(a, b);
5454 const auto hi =
MulHigh(a, b);
5455 return OddEven(hi, detail::Slide1Down(lo));
5460template <
size_t N,
int kPow2>
5467 const VFromD<
decltype(du32)> b_in_even =
5468 ShiftRight<16>(detail::RoundF32ForDemoteToBF16(b));
5475template <
class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
5476 HWY_IF_POW2_LE_D(DN, 2),
class V, HWY_IF_SIGNED_V(V),
5477 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
5478 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5479 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* =
nullptr>
5491 const Rebind<TFromV<V>, DN> dt;
5492 const VFromD<
decltype(dt)> ab = Combine(dt, b, a);
5493 return DemoteTo(dn, ab);
5497template <
class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
5498 HWY_IF_POW2_GT_D(DN, 2),
class V, HWY_IF_SIGNED_V(V),
5499 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
5500 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5501 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* =
nullptr>
5503 const Half<
decltype(dn)> dnh;
5504 const VFromD<
decltype(dnh)> demoted_a = DemoteTo(dnh, a);
5505 const VFromD<
decltype(dnh)> demoted_b = DemoteTo(dnh, b);
5506 return Combine(dn, demoted_b, demoted_a);
5514 const Half<
decltype(dn)> dnh;
5517 return Combine(dn, demoted_b, demoted_a);
5537 const Half<
decltype(dn)> dnh;
5538 const RebindToUnsigned<
decltype(dn)> dn_u;
5539 const RebindToUnsigned<
decltype(dnh)> dnh_u;
5540 const auto demoted_a = BitCast(dnh_u, DemoteTo(dnh, a));
5541 const auto demoted_b = BitCast(dnh_u, DemoteTo(dnh, b));
5542 return BitCast(dn, Combine(dn_u, demoted_b, demoted_a));
5545template <
class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
class V,
5546 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5547 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
5548 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5549 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* =
nullptr>
5551 return ReorderDemote2To(dn, a, b);
5560 using VU32 =
VFromD<
decltype(du32)>;
5561 const VU32 odd =
Set(du32, 0xFFFF0000u);
5564 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
5566 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
5572template <
class D, HWY_IF_I32_D(D),
class VI16>
5574 using VI32 =
VFromD<
decltype(d32)>;
5576 const VI32 ae = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32, a)));
5577 const VI32 be = ShiftRight<16>(ShiftLeft<16>(
BitCast(d32, b)));
5578 const VI32 ao = ShiftRight<16>(
BitCast(d32, a));
5579 const VI32 bo = ShiftRight<16>(
BitCast(d32, b));
5583template <
class D, HWY_IF_U32_D(D),
class VI16>
5585 using VU32 =
VFromD<
decltype(du32)>;
5587 const VU32 ae = detail::AndS(
BitCast(du32, a), uint32_t{0x0000FFFFu});
5588 const VU32 be = detail::AndS(
BitCast(du32, b), uint32_t{0x0000FFFFu});
5589 const VU32 ao = ShiftRight<16>(
BitCast(du32, a));
5590 const VU32 bo = ShiftRight<16>(
BitCast(du32, b));
5599template <
size_t N,
int kPow2,
class DF32 = Simd<
float, N, kPow2>,
5600 class VF32 = VFromD<DF32>,
5601 class DBF16 = Repartition<hwy::b
float16_t, Simd<
float, N, kPow2>>>
5604 const VF32 sum0, VF32& sum1) {
5606 using VU32 =
VFromD<
decltype(du32)>;
5607 const VU32 odd =
Set(du32, 0xFFFF0000u);
5610 const VU32 ae = ShiftLeft<16>(
BitCast(du32, a));
5612 const VU32 be = ShiftLeft<16>(
BitCast(du32, b));
5618#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
5619 SHIFT, MLEN, NAME, OP) \
5620 template <size_t N> \
5621 HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
5622 HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
5623 HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
5624 return __riscv_v##OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \
5629#undef HWY_RVV_WIDEN_MACC
5632template <
class D32, HWY_IF_POW2_LE_D(D32, 2),
class V32 = VFromD<D32>,
5633 class D16 = RepartitionToNarrow<D32>>
5637 const Twice<
decltype(d32)> d32t;
5638 using V32T =
VFromD<
decltype(d32t)>;
5639 V32T sum =
Combine(d32t, sum1, sum0);
5640 sum = detail::WidenMulAcc(d32t, sum, a, b);
5646template <
class D32, HWY_IF_POW2_GT_D(D32, 2),
class V32 = VFromD<D32>,
5647 class D16 = RepartitionToNarrow<D32>>
5652 using V16H =
VFromD<
decltype(d16h)>;
5657 sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
5658 return detail::WidenMulAcc(d32, sum0, a0, b0);
5662template <
class D32, HWY_IF_POW2_LE_D(D32, 2),
class V32 = VFromD<D32>,
5663 class D16 = RepartitionToNarrow<D32>>
5667 const Twice<
decltype(d32)> d32t;
5668 using V32T =
VFromD<
decltype(d32t)>;
5669 V32T sum =
Combine(d32t, sum1, sum0);
5670 sum = detail::WidenMulAcc(d32t, sum, a, b);
5676template <
class D32, HWY_IF_POW2_GT_D(D32, 2),
class V32 = VFromD<D32>,
5677 class D16 = RepartitionToNarrow<D32>>
5682 using V16H =
VFromD<
decltype(d16h)>;
5687 sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
5688 return detail::WidenMulAcc(d32, sum0, a0, b0);
5693template <
size_t N,
int kPow2,
class VN,
class VW>
5695 const VW sum0, VW& sum1) {
5696 return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
5699template <
size_t N,
int kPow2,
class VN,
class VW>
5701 const VW sum0, VW& sum1) {
5702 return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
5705template <
size_t N,
int kPow2,
class VN,
class VW>
5707 const VW sum0, VW& sum1) {
5708 return detail::ReorderWidenMulAccumulateU16(d32, a, b, sum0, sum1);
5713template <
class VW, HWY_IF_SIGNED_V(VW)>
5721 const Twice<
decltype(di32)> di32x2;
5724 const auto combined =
BitCast(di64x2,
Combine(di32x2, sum1, sum0));
5726 const auto even = ShiftRight<32>(ShiftLeft<32>(combined));
5727 const auto odd = ShiftRight<32>(combined);
5734 const Half<
decltype(
d)> dh;
5735 const vint32m4_t lo =
5737 const vint32m4_t hi =
5742template <
class VW, HWY_IF_UNSIGNED_V(VW)>
5743HWY_API VW RearrangeToOddPlusEven(
const VW sum0,
const VW sum1) {
5748 const DFromV<VW> du32;
5749 const Twice<
decltype(du32)> du32x2;
5750 const RepartitionToWide<
decltype(du32x2)> du64x2;
5751 const auto combined = BitCast(du64x2, Combine(du32x2, sum1, sum0));
5753 const auto even = detail::AndS(combined, uint64_t{0xFFFFFFFFu});
5754 const auto odd = ShiftRight<32>(combined);
5755 return TruncateTo(du32, Add(even, odd));
5761 const Half<
decltype(
d)> dh;
5762 const vuint32m4_t lo =
5764 const vuint32m4_t hi =
5769template <
class VW, HWY_IF_FLOAT_V(VW)>
5770HWY_API VW RearrangeToOddPlusEven(
const VW sum0,
const VW sum1) {
5771 return Add(sum0, sum1);
5777 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
5794 const VFromD<D> ltLx = detail::Slide1Up(ltHL);
5803 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
5805 const VFromD<D> down = detail::Slide1Down(ltHL);
5807 asm volatile(
"" : :
"r,m"(
GetLane(down)) :
"memory");
5815 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
5820 asm volatile(
"" : :
"r,m"(
GetLane(eq)) :
"memory");
5827 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
5836 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
5840 asm volatile(
"" : :
"r,m"(
GetLane(neLH)) :
"memory");
5847 static_assert(IsSame<TFromD<D>, uint64_t>(),
"D must be u64");
5849 const VFromD<D> down = detail::Slide1Down(neHL);
5851 asm volatile(
"" : :
"r,m"(
GetLane(down)) :
"memory");
5860 const VFromD<D> aXH = detail::Slide1Down(a);
5861 const VFromD<D> bXH = detail::Slide1Down(b);
5863 const MFromD<D> ltXH = Lt(aXH, bXH);
5864 const MFromD<D> eqXH = Eq(aXH, bXH);
5866 const VFromD<D> lo = IfThenElse(ltXH, a, b);
5869 return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
5874 const VFromD<D> aXH = detail::Slide1Down(a);
5875 const VFromD<D> bXH = detail::Slide1Down(b);
5877 const MFromD<D> ltXH = Lt(aXH, bXH);
5878 const MFromD<D> eqXH = Eq(aXH, bXH);
5880 const VFromD<D> lo = IfThenElse(ltXH, b, a);
5883 return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
5899#undef HWY_RVV_FOREACH
5900#undef HWY_RVV_FOREACH_08_ALL
5901#undef HWY_RVV_FOREACH_08_ALL_VIRT
5902#undef HWY_RVV_FOREACH_08_DEMOTE
5903#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
5904#undef HWY_RVV_FOREACH_08_EXT
5905#undef HWY_RVV_FOREACH_08_EXT_VIRT
5906#undef HWY_RVV_FOREACH_08_TRUNC
5907#undef HWY_RVV_FOREACH_08_VIRT
5908#undef HWY_RVV_FOREACH_16_ALL
5909#undef HWY_RVV_FOREACH_16_ALL_VIRT
5910#undef HWY_RVV_FOREACH_16_DEMOTE
5911#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
5912#undef HWY_RVV_FOREACH_16_EXT
5913#undef HWY_RVV_FOREACH_16_EXT_VIRT
5914#undef HWY_RVV_FOREACH_16_TRUNC
5915#undef HWY_RVV_FOREACH_16_VIRT
5916#undef HWY_RVV_FOREACH_32_ALL
5917#undef HWY_RVV_FOREACH_32_ALL_VIRT
5918#undef HWY_RVV_FOREACH_32_DEMOTE
5919#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
5920#undef HWY_RVV_FOREACH_32_EXT
5921#undef HWY_RVV_FOREACH_32_EXT_VIRT
5922#undef HWY_RVV_FOREACH_32_TRUNC
5923#undef HWY_RVV_FOREACH_32_VIRT
5924#undef HWY_RVV_FOREACH_64_ALL
5925#undef HWY_RVV_FOREACH_64_ALL_VIRT
5926#undef HWY_RVV_FOREACH_64_DEMOTE
5927#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
5928#undef HWY_RVV_FOREACH_64_EXT
5929#undef HWY_RVV_FOREACH_64_EXT_VIRT
5930#undef HWY_RVV_FOREACH_64_TRUNC
5931#undef HWY_RVV_FOREACH_64_VIRT
5932#undef HWY_RVV_FOREACH_B
5933#undef HWY_RVV_FOREACH_F
5934#undef HWY_RVV_FOREACH_F16
5935#undef HWY_RVV_FOREACH_F32
5936#undef HWY_RVV_FOREACH_F3264
5937#undef HWY_RVV_FOREACH_F64
5938#undef HWY_RVV_FOREACH_I
5939#undef HWY_RVV_FOREACH_I08
5940#undef HWY_RVV_FOREACH_I16
5941#undef HWY_RVV_FOREACH_I163264
5942#undef HWY_RVV_FOREACH_I32
5943#undef HWY_RVV_FOREACH_I64
5944#undef HWY_RVV_FOREACH_U
5945#undef HWY_RVV_FOREACH_U08
5946#undef HWY_RVV_FOREACH_U16
5947#undef HWY_RVV_FOREACH_U163264
5948#undef HWY_RVV_FOREACH_U32
5949#undef HWY_RVV_FOREACH_U64
5950#undef HWY_RVV_FOREACH_UI
5951#undef HWY_RVV_FOREACH_UI08
5952#undef HWY_RVV_FOREACH_UI16
5953#undef HWY_RVV_FOREACH_UI163264
5954#undef HWY_RVV_FOREACH_UI32
5955#undef HWY_RVV_FOREACH_UI3264
5956#undef HWY_RVV_FOREACH_UI64
5957#undef HWY_RVV_IF_EMULATED_D
5958#undef HWY_RVV_IF_CAN128_D
5959#undef HWY_RVV_IF_GE128_D
5960#undef HWY_RVV_IF_LT128_D
5961#undef HWY_RVV_INSERT_VXRM
5963#undef HWY_RVV_RETM_ARGM
5964#undef HWY_RVV_RETV_ARGMVV
5965#undef HWY_RVV_RETV_ARGV
5966#undef HWY_RVV_RETV_ARGVS
5967#undef HWY_RVV_RETV_ARGVV
#define HWY_SPECIALIZE(BASE, CHAR, BITS, HALF, NAME, OP)
Definition arm_sve-inl.h:195
#define HWY_MAX(a, b)
Definition base.h:177
#define HWY_RESTRICT
Definition base.h:95
#define HWY_API
Definition base.h:171
#define HWY_MIN(a, b)
Definition base.h:176
#define HWY_INLINE
Definition base.h:101
#define HWY_DASSERT(condition)
Definition base.h:290
HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd< float, N, kPow2 > df32, VFromD< DBF16 > a, VFromD< DBF16 > b, const VF32 sum0, VF32 &sum1)
Definition rvv-inl.h:5602
HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b)
Definition rvv-inl.h:4231
HWY_API VFromD< D > InterleaveWhole(D d, VFromD< Half< D > > a, VFromD< Half< D > > b)
Definition rvv-inl.h:4121
HWY_INLINE V SlideUpLanes(V v, size_t amt)
Definition arm_neon-inl.h:6201
HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0)
Definition rvv-inl.h:2966
HWY_INLINE Vec128< T, N > Add(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:560
constexpr size_t LanesPerBlock(Simd< T, N, kPow2 > d)
Definition arm_sve-inl.h:3442
HWY_INLINE MFromD< D > IsOdd(D d)
Definition rvv-inl.h:3222
VFromD< D > Ext(D d, VFromD< Half< D > > v)
Definition rvv-inl.h:764
HWY_INLINE MFromD< D > LoadMaskBits(D d, uint64_t mask_bits)
Definition arm_neon-inl.h:8051
HWY_INLINE Vec128< T, N > Sub(hwy::NonFloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:570
HWY_API VFromD< D32 > ReorderWidenMulAccumulateI16(D32 d32, VFromD< D16 > a, VFromD< D16 > b, const V32 sum0, V32 &sum1)
Definition rvv-inl.h:5634
HWY_INLINE VFromD< D > ChangeLMUL(D, VFromD< D > v)
Definition rvv-inl.h:3403
HWY_API void StoreN(size_t count, VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT p)
Definition rvv-inl.h:1926
constexpr bool IsLT128(Simd< T, N, kPow2 >)
Definition rvv-inl.h:4265
HWY_INLINE VFromD< D > Iota0(D d)
Definition arm_neon-inl.h:1239
HWY_INLINE Mask128< float, N > UseInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5005
HWY_INLINE If< IsConst< T >(), const uint16_t *, uint16_t * > U16LanePointer(T *p)
Definition ops/shared-inl.h:139
HWY_INLINE Vec128< uint8_t, N > BitCastToByte(Vec128< uint8_t, N > v)
Definition arm_neon-inl.h:1402
constexpr bool IsGE128(Simd< T, N, kPow2 >)
Definition rvv-inl.h:4258
HWY_API VFromD< D32 > ReorderWidenMulAccumulateU16(D32 d32, VFromD< D16 > a, VFromD< D16 > b, const V32 sum0, V32 &sum1)
Definition rvv-inl.h:5664
typename AdjustSimdTagToMinVecPow2_t< RemoveConst< D > >::type AdjustSimdTagToMinVecPow2
Definition rvv-inl.h:70
HWY_INLINE Vec128< T, N > Compress(Vec128< T, N > v, uint64_t mask_bits)
Definition arm_neon-inl.h:8851
HWY_INLINE Vec128< T, N > Mul(hwy::FloatTag, Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:774
HWY_INLINE MFromD< D > IsEven(D d)
Definition rvv-inl.h:3208
constexpr size_t ScaleByPower(size_t N, int pow2)
Definition ops/shared-inl.h:146
HWY_API Vec128< T > InterleaveUpper(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6086
constexpr bool IsSupportedLMUL(D d)
Definition rvv-inl.h:3074
HWY_INLINE VFromD< D > BitCastFromByte(D, VFromD< D > v)
Definition arm_neon-inl.h:1441
HWY_INLINE MFromD< D > FirstNPerBlock(D)
Definition rvv-inl.h:2972
HWY_INLINE svuint32_t RoundF32ForDemoteToBF16(svfloat32_t v)
Definition arm_sve-inl.h:2690
HWY_INLINE V SlideDownLanes(V v, size_t amt)
Definition arm_neon-inl.h:6346
HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b)
Definition rvv-inl.h:4205
HWY_API void LoadInterleaved4(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2, VFromD< D > &v3)
Definition arm_neon-inl.h:9128
HWY_API void ScatterOffset(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2624
HWY_API Vec128< T, N > NegMulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2618
HWY_API Vec128< T, N > ShiftLeftSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2332
typename D::template Rebind< T > Rebind
Definition ops/shared-inl.h:460
HWY_API Vec128< T, N > OddEvenBlocks(Vec128< T, N >, Vec128< T, N > even)
Definition arm_neon-inl.h:7156
HWY_API VFromD< D > Undefined(D)
Definition arm_neon-inl.h:959
HWY_API VFromD< D > VecFromMask(D d, const MFromD< D > m)
Definition arm_neon-inl.h:2960
HWY_API V MaskedMaxOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1489
HWY_API Vec128< T, N > DupOdd(Vec128< T, N > v)
Definition arm_neon-inl.h:7091
HWY_API VFromD< DW > ZipLower(V a, V b)
Definition arm_neon-inl.h:6113
HWY_API Vec128< T > CombineShiftRightBytes(D d, Vec128< T > hi, Vec128< T > lo)
Definition arm_neon-inl.h:5166
HWY_API V MaskedDivOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1512
HWY_API auto Lt(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7339
HWY_API Vec128< float, N > Round(const Vec128< float, N > v)
Definition arm_neon-inl.h:5023
HWY_API Vec128< int64_t, N > AbsDiff(const Vec128< int64_t, N > a, const Vec128< int64_t, N > b)
Definition arm_neon-inl.h:2823
HWY_API auto Eq(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7331
HWY_API VFromD< D > ZeroExtendVector(D d, VFromD< Half< D > > lo)
Definition arm_neon-inl.h:6867
HWY_API Mask128< T, N > IsNaN(const Vec128< T, N > v)
Definition arm_neon-inl.h:5093
D d
Definition arm_sve-inl.h:1915
HWY_API Vec128< T, N > SaturatedAdd(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:632
HWY_API VFromD< D > LoadNOr(VFromD< D > no, D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1362
HWY_INLINE VFromD< D > Max128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9490
HWY_API Vec128< int8_t > MulHigh(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:2357
HWY_API void StoreN(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_store)
Definition emu128-inl.h:1398
HWY_API svbool_t MaskFalse(const D)
Definition arm_sve-inl.h:372
HWY_API V IfThenElse(MFromD< DFromV< V > > mask, V yes, V no)
Definition arm_neon-inl.h:2992
HWY_API VFromD< D > BitCast(D d, Vec128< FromT, Repartition< FromT, D >().MaxLanes()> v)
Definition arm_neon-inl.h:1581
HWY_API Vec128< T, N > Xor3(Vec128< T, N > x1, Vec128< T, N > x2, Vec128< T, N > x3)
Definition arm_neon-inl.h:2766
HWY_API Vec128< T, N > Sqrt(const Vec128< T, N > v)
Definition arm_neon-inl.h:2654
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< T, N > BroadcastSignBit(const Vec128< T, N > v)
Definition arm_neon-inl.h:2941
V Shl(V a, V b)
Definition generic_ops-inl.h:7322
HWY_INLINE HWY_MAYBE_UNUSED constexpr size_t MaxLanes(D)
Definition ops/shared-inl.h:442
HWY_API VFromD< D > MaxOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3228
HWY_API V MaskedModOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:4666
HWY_API VFromD< D32 > ReorderWidenMulAccumulate(D32 df32, V16 a, V16 b, const VFromD< D32 > sum0, VFromD< D32 > &sum1)
Definition arm_neon-inl.h:6571
HWY_API Vec128< T > Shuffle0321(Vec128< T > v)
Definition arm_neon-inl.h:6018
HWY_API Vec128< T, N > MulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2550
HWY_API Mask128< T, N > MaskFromVec(const Vec128< T, N > v)
Definition arm_neon-inl.h:2951
HWY_API Vec128< T, N > IfThenZeroElse(Mask128< T, N > mask, Vec128< T, N > no)
Definition arm_neon-inl.h:3019
HWY_API void Store(VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3911
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API TFromD< D > ReduceMax(D d, VFromD< D > v)
Definition arm_sve-inl.h:3213
HWY_API Vec32< uint8_t > U8FromU32(Vec128< uint32_t > v)
Definition arm_neon-inl.h:4965
HWY_API void ScatterIndex(VFromD< D > v, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2643
HWY_API Vec128< T, N > CopySignToAbs(Vec128< T, N > abs, Vec128< T, N > sign)
Definition arm_neon-inl.h:2932
Repartition< MakeWide< TFromD< D > >, D > RepartitionToWide
Definition ops/shared-inl.h:474
HWY_INLINE MFromD< D > Ne128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9466
svbool_t m
Definition arm_sve-inl.h:1956
HWY_API svbool_t DemoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1420
HWY_API VFromD< D > MaskedLoadOr(VFromD< D > v, MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3675
HWY_API VFromD< D > ConcatLowerUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6965
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_INLINE MFromD< D > Lt128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9436
HWY_API Vec128< uint64_t, N > Max(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API MFromD< DTo > OrderedDemote2MasksTo(DTo d_to, DFrom, MFromD< DFrom > a, MFromD< DFrom > b)
Definition x86_128-inl.h:1107
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API V MaskedMinOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1484
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API void LoadInterleaved3(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2)
Definition arm_neon-inl.h:9087
HWY_API void StoreInterleaved3(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9253
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API void StoreInterleaved4(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9285
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API void MaskedScatterIndex(VFromD< D > v, MFromD< D > m, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2661
HWY_API size_t CappedLanes(D, size_t cap)
Definition rvv-inl.h:603
HWY_API VFromD< D > MaskedGatherIndexOr(VFromD< D > no, MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2753
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
constexpr size_t MLenFromD(Simd< T, N, kPow2 >)
Definition rvv-inl.h:43
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_API TFromD< D > ReduceMin(D d, VFromD< D > v)
Definition arm_sve-inl.h:3208
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
HWY_API V MaskedSatSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1525
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API V MaskedSatAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1520
HWY_API V MaskedSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1499
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API VFromD< D > GatherIndex(D d, const TFromD< D > *HWY_RESTRICT p, VFromD< RebindToSigned< D > > indices)
Definition arm_sve-inl.h:1963
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:367
HWY_API VFromD< D > GatherOffset(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2694
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API svbool_t LowerHalfOfMask(D, svbool_t m)
Definition arm_sve-inl.h:1456
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_API V Div(V a, V b)
Definition arm_sve-inl.h:4639
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API VFromD< D > LoadN(D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1352
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7353
HWY_API V MaskedAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1494
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > MaskedGatherIndex(MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2731
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
V Shr(V a, V b)
Definition generic_ops-inl.h:7326
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API svbool_t IsNegative(V v)
Definition arm_sve-inl.h:1623
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API V MaskedMulOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1504
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API MFromD< D > UpperHalfOfMask(D, MFromD< Twice< D > > m)
Definition x86_128-inl.h:1051
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_API MFromD< D > CombineMasks(D, MFromD< Half< D > > hi, MFromD< Half< D > > lo)
Definition x86_128-inl.h:959
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API svbool_t PromoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1394
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo)
Definition rvv-inl.h:3761
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API V Mod(V a, V b)
Definition arm_sve-inl.h:4660
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:1582
HWY_API constexpr TTo ConvertScalarTo(const TFrom in)
Definition base.h:2435
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API constexpr bool IsSame()
Definition base.h:499
constexpr size_t FloorLog2(TI x)
Definition base.h:2662
typename IfT< Condition, Then, Else >::type If
Definition base.h:520
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd()
Definition base.h:2307
constexpr size_t CeilLog2(TI x)
Definition base.h:2669
typename EnableIfT< Condition >::type EnableIf
Definition base.h:486
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_POW2_GT_D(D, pow2)
Definition ops/shared-inl.h:574
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_POW2_LE_D(D, pow2)
Definition ops/shared-inl.h:573
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_SPECIAL_FLOAT_D(D)
Definition ops/shared-inl.h:540
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_T_SIZE_GT_D(D, bytes)
Definition ops/shared-inl.h:557
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:555
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:379
#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2025
#define HWY_RVV_LOAD_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:4672
#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2001
#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition rvv-inl.h:2123
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:774
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1437
#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:374
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:706
#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1780
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:329
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:325
#define HWY_RVV_LOADN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1826
#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1977
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:817
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:319
#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:639
#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:401
#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1889
#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition rvv-inl.h:2130
#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:4363
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:833
#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3112
#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2933
#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1913
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:738
#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1239
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1406
#define HWY_RVV_PROMOTE_X4_FROM_U8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition rvv-inl.h:2136
#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2242
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:489
#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2081
#define HWY_RVV_DEMOTE_TO_SHR_16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2854
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1229
#define HWY_RVV_SLIDE_DOWN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2989
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1143
#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:569
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1629
#define HWY_RVV_PROMOTE_X8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition rvv-inl.h:2140
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1539
#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3310
#define HWY_RVV_INSERT_VXRM(vxrm, avl)
Definition rvv-inl.h:1123
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP)
Definition rvv-inl.h:85
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:622
#define HWY_RVV_RETV_AVERAGE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1127
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:327
#define HWY_RVV_MASKED_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3338
#define HWY_RVV_SLIDE_UP(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2980
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:649
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:615
#define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3326
#define HWY_RVV_STORE_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:4732
#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:335
#define HWY_RVV_MASKED_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3350
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:350
#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1695
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:313
#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:923
#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3553
#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:410
#define HWY_RVV_SET_AT_OR_AFTER_FIRST(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3171
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:658
#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:354
#define HWY_RVV_MUL15(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1311
#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2754
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:728
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:348
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:788
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1516
#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:397
#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1753
#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2257
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:317
#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:911
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:849
#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1560
#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1672
#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:752
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:803
#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1869
#define HWY_RVV_NARROW(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3673
#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3149
#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:5618
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1446
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:393
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:406
#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2905
#define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2050
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:630
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:315
#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1707
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:323
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition tuple-inl.h:30
Definition tuple-inl.h:36
Definition tuple-inl.h:43
Definition ops/shared-inl.h:198
constexpr int Pow2() const
Definition ops/shared-inl.h:253
int VFromD
Definition tuple-inl.h:25
HWY_API Vec2< D > Create2(D, VFromD< D > v0, VFromD< D > v1)
Definition tuple-inl.h:52
HWY_API Vec4< D > Create4(D, VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3)
Definition tuple-inl.h:62
HWY_API Vec3< D > Create3(D, VFromD< D > v0, VFromD< D > v1, VFromD< D > v2)
Definition tuple-inl.h:57