rvv-inl.h
1// Copyright 2021 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// RISC-V V vectors (length not known at compile time).
17// External include guard in highway.h - see comment there.
18
19#include <riscv_vector.h>
20
21#include "hwy/ops/shared-inl.h"
22
24namespace hwy {
25namespace HWY_NAMESPACE {
26
27// Support for vfloat16m*_t and PromoteTo/DemoteTo.
28#ifdef __riscv_zvfhmin
29#define HWY_RVV_HAVE_F16C 1
30#else
31#define HWY_RVV_HAVE_F16C 0
32#endif
33
34template <class V>
35struct DFromV_t {}; // specialized in macros
36template <class V>
37using DFromV = typename DFromV_t<RemoveConst<V>>::type;
38
39template <class V>
40using TFromV = TFromD<DFromV<V>>;
41
42template <typename T, size_t N, int kPow2>
43constexpr size_t MLenFromD(Simd<T, N, kPow2> /* tag */) {
44 // Returns divisor = type bits / LMUL. Folding *8 into the ScaleByPower
45 // argument enables fractional LMUL < 1. Limit to 64 because that is the
46 // largest value for which vbool##_t are defined.
47 return HWY_MIN(64, sizeof(T) * 8 * 8 / detail::ScaleByPower(8, kPow2));
48}
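// Worked example (illustrative, not in the original file): for T = uint16_t
// and kPow2 = -1 (LMUL = 1/2), this returns
//   HWY_MIN(64, 2 * 8 * 8 / ScaleByPower(8, -1)) = HWY_MIN(64, 128 / 4) = 32,
// matching MLEN = SEW / LMUL = 32 and thus the vbool32_t mask type.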
49
50namespace detail {
51
52template <class D>
53class AdjustSimdTagToMinVecPow2_t {};
54
55template <typename T, size_t N, int kPow2>
56class AdjustSimdTagToMinVecPow2_t<Simd<T, N, kPow2>> {
57 private:
58 using D = Simd<T, N, kPow2>;
59 static constexpr int kMinVecPow2 =
60 -3 + static_cast<int>(FloorLog2(sizeof(T)));
61 static constexpr size_t kNumMaxLanes = HWY_MAX_LANES_D(D);
62 static constexpr int kNewPow2 = HWY_MAX(kPow2, kMinVecPow2);
63 static constexpr size_t kNewN = D::template NewN<kNewPow2, kNumMaxLanes>();
64
65 public:
66 using type = Simd<T, kNewN, kNewPow2>;
67};
68
69template <class D>
70using AdjustSimdTagToMinVecPow2 =
71 typename AdjustSimdTagToMinVecPow2_t<D>::type;
72
73} // namespace detail
74
75// ================================================== MACROS
76
77// Generate specializations and function definitions using X macros. Although
78// harder to read and debug, writing everything manually is too bulky.
79
80namespace detail { // for code folding
81
82// For all mask sizes MLEN: (1/Nth of a register, one bit per lane)
83// The first three arguments are SEW, SHIFT, MLEN, where SEW and SHIFT are
84// arbitrary values chosen such that SEW >> SHIFT = MLEN.
85#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP) \
86 X_MACRO(64, 0, 64, NAME, OP) \
87 X_MACRO(32, 0, 32, NAME, OP) \
88 X_MACRO(16, 0, 16, NAME, OP) \
89 X_MACRO(8, 0, 8, NAME, OP) \
90 X_MACRO(8, 1, 4, NAME, OP) \
91 X_MACRO(8, 2, 2, NAME, OP) \
92 X_MACRO(8, 3, 1, NAME, OP)
93
94// For given SEW, iterate over one of LMULS: _TRUNC, _EXT, _ALL. This allows
95// reusing type lists such as HWY_RVV_FOREACH_U for _ALL (the usual case) or
96// _EXT (for Combine). To achieve this, we HWY_CONCAT with the LMULS suffix.
97//
98// Precompute SEW/LMUL => MLEN to allow token-pasting the result. For the same
99// reason, also pass the double-width and half SEW and LMUL (suffixed D and H,
100// respectively). "__" means there is no corresponding LMUL (e.g. LMULD for m8).
101// Args: BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP
102
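// Illustrative example of this argument list: an X_MACRO invocation for
// unsigned 32-bit lanes at LMUL=m2 (via HWY_RVV_FOREACH_U32 below) receives
//   BASE=uint, CHAR=u, SEW=32, SEWD=64, SEWH=16, LMUL=m2, LMULD=m4, LMULH=m1,
//   SHIFT=1, MLEN=16
// plus the caller-supplied NAME and OP.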
103// LMULS = _TRUNC: truncatable (not the smallest LMUL)
104#define HWY_RVV_FOREACH_08_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
105 X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
106 X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
107 X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
108 X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
109 X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
110 X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
111
112#define HWY_RVV_FOREACH_16_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
113 X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
114 X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
115 X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
116 X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
117 X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
118
119#define HWY_RVV_FOREACH_32_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
120 X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
121 X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
122 X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
123 X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
124
125#define HWY_RVV_FOREACH_64_TRUNC(X_MACRO, BASE, CHAR, NAME, OP) \
126 X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
127 X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
128 X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
129
130// LMULS = _DEMOTE: can demote from SEW*LMUL to SEWH*LMULH.
131#define HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
132 X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
133 X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
134 X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
135 X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP) \
136 X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP) \
137 X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
138
139#define HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
140 X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
141 X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
142 X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
143 X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP) \
144 X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP) \
145 X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
146
147#define HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
148 X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
149 X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
150 X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP) \
151 X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP) \
152 X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
153
154#define HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
155 X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
156 X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP) \
157 X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP) \
158 X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
159
160// LMULS = _LE2: <= 2
161#define HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
162 X_MACRO(BASE, CHAR, 8, 16, __, mf8, mf4, __, -3, /*MLEN=*/64, NAME, OP) \
163 X_MACRO(BASE, CHAR, 8, 16, __, mf4, mf2, mf8, -2, /*MLEN=*/32, NAME, OP) \
164 X_MACRO(BASE, CHAR, 8, 16, __, mf2, m1, mf4, -1, /*MLEN=*/16, NAME, OP) \
165 X_MACRO(BASE, CHAR, 8, 16, __, m1, m2, mf2, 0, /*MLEN=*/8, NAME, OP) \
166 X_MACRO(BASE, CHAR, 8, 16, __, m2, m4, m1, 1, /*MLEN=*/4, NAME, OP)
167
168#define HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
169 X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -2, /*MLEN=*/64, NAME, OP) \
170 X_MACRO(BASE, CHAR, 16, 32, 8, mf2, m1, mf4, -1, /*MLEN=*/32, NAME, OP) \
171 X_MACRO(BASE, CHAR, 16, 32, 8, m1, m2, mf2, 0, /*MLEN=*/16, NAME, OP) \
172 X_MACRO(BASE, CHAR, 16, 32, 8, m2, m4, m1, 1, /*MLEN=*/8, NAME, OP)
173
174#define HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
175 X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -1, /*MLEN=*/64, NAME, OP) \
176 X_MACRO(BASE, CHAR, 32, 64, 16, m1, m2, mf2, 0, /*MLEN=*/32, NAME, OP) \
177 X_MACRO(BASE, CHAR, 32, 64, 16, m2, m4, m1, 1, /*MLEN=*/16, NAME, OP)
178
179#define HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
180 X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, 0, /*MLEN=*/64, NAME, OP) \
181 X_MACRO(BASE, CHAR, 64, __, 32, m2, m4, m1, 1, /*MLEN=*/32, NAME, OP)
182
183// LMULS = _EXT: not the largest LMUL
184#define HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
185 HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
186 X_MACRO(BASE, CHAR, 8, 16, __, m4, m8, m2, 2, /*MLEN=*/2, NAME, OP)
187
188#define HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
189 HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
190 X_MACRO(BASE, CHAR, 16, 32, 8, m4, m8, m2, 2, /*MLEN=*/4, NAME, OP)
191
192#define HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
193 HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
194 X_MACRO(BASE, CHAR, 32, 64, 16, m4, m8, m2, 2, /*MLEN=*/8, NAME, OP)
195
196#define HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
197 HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
198 X_MACRO(BASE, CHAR, 64, __, 32, m4, m8, m2, 2, /*MLEN=*/16, NAME, OP)
199
200// LMULS = _ALL (2^MinPow2() <= LMUL <= 8)
201#define HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
202 HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
203 X_MACRO(BASE, CHAR, 8, 16, __, m8, __, m4, 3, /*MLEN=*/1, NAME, OP)
204
205#define HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
206 HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
207 X_MACRO(BASE, CHAR, 16, 32, 8, m8, __, m4, 3, /*MLEN=*/2, NAME, OP)
208
209#define HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
210 HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
211 X_MACRO(BASE, CHAR, 32, 64, 16, m8, __, m4, 3, /*MLEN=*/4, NAME, OP)
212
213#define HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
214 HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
215 X_MACRO(BASE, CHAR, 64, __, 32, m8, __, m4, 3, /*MLEN=*/8, NAME, OP)
216
217// 'Virtual' LMUL. This upholds the Highway guarantee that vectors are at least
218// 128 bit and LowerHalf is defined whenever there are at least 2 lanes, even
219// though RISC-V LMUL must be at least SEW/64 (notice that this rules out
220// LMUL=1/2 for SEW=64). To bridge the gap, we add overloads for kPow2 equal to
221// one less than should be supported, with all other parameters (vector type
222// etc.) unchanged. For D with the lowest kPow2 ('virtual LMUL'), Lanes()
223// returns half of what it usually would.
224//
225// Notice that we can only add overloads whenever there is a D argument: those
226// are unique with respect to non-virtual-LMUL overloads because their kPow2
227// template argument differs. Otherwise, there is no actual vuint64mf2_t, and
228// defining another overload with the same LMUL would be an error. Thus we have
229// a separate _VIRT category for HWY_RVV_FOREACH*, and the common case is
230// _ALL_VIRT (meaning the regular LMUL plus the VIRT overloads), used in most
231// functions that take a D.
232
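// Illustrative consequence (sketch, not an additional definition): with the
// 64-bit _VIRT entry below, ScalableTag<uint64_t, -1> reuses the vuint64m1_t
// register type of kPow2=0, but Lanes() reports half of VLMAX, so LowerHalf
// of a two-lane u64 vector remains well-defined.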
233#define HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
234
235#define HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
236 X_MACRO(BASE, CHAR, 16, 32, 8, mf4, mf2, mf8, -3, /*MLEN=*/64, NAME, OP)
237
238#define HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
239 X_MACRO(BASE, CHAR, 32, 64, 16, mf2, m1, mf4, -2, /*MLEN=*/64, NAME, OP)
240
241#define HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
242 X_MACRO(BASE, CHAR, 64, __, 32, m1, m2, mf2, -1, /*MLEN=*/64, NAME, OP)
243
244// ALL + VIRT
245#define HWY_RVV_FOREACH_08_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
246 HWY_RVV_FOREACH_08_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
247 HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
248
249#define HWY_RVV_FOREACH_16_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
250 HWY_RVV_FOREACH_16_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
251 HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
252
253#define HWY_RVV_FOREACH_32_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
254 HWY_RVV_FOREACH_32_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
255 HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
256
257#define HWY_RVV_FOREACH_64_ALL_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
258 HWY_RVV_FOREACH_64_ALL(X_MACRO, BASE, CHAR, NAME, OP) \
259 HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
260
261// LE2 + VIRT
262#define HWY_RVV_FOREACH_08_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
263 HWY_RVV_FOREACH_08_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
264 HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
265
266#define HWY_RVV_FOREACH_16_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
267 HWY_RVV_FOREACH_16_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
268 HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
269
270#define HWY_RVV_FOREACH_32_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
271 HWY_RVV_FOREACH_32_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
272 HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
273
274#define HWY_RVV_FOREACH_64_LE2_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
275 HWY_RVV_FOREACH_64_LE2(X_MACRO, BASE, CHAR, NAME, OP) \
276 HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
277
278// EXT + VIRT
279#define HWY_RVV_FOREACH_08_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
280 HWY_RVV_FOREACH_08_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
281 HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
282
283#define HWY_RVV_FOREACH_16_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
284 HWY_RVV_FOREACH_16_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
285 HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
286
287#define HWY_RVV_FOREACH_32_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
288 HWY_RVV_FOREACH_32_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
289 HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
290
291#define HWY_RVV_FOREACH_64_EXT_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
292 HWY_RVV_FOREACH_64_EXT(X_MACRO, BASE, CHAR, NAME, OP) \
293 HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
294
295// DEMOTE + VIRT
296#define HWY_RVV_FOREACH_08_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
297 HWY_RVV_FOREACH_08_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
298 HWY_RVV_FOREACH_08_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
299
300#define HWY_RVV_FOREACH_16_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
301 HWY_RVV_FOREACH_16_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
302 HWY_RVV_FOREACH_16_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
303
304#define HWY_RVV_FOREACH_32_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
305 HWY_RVV_FOREACH_32_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
306 HWY_RVV_FOREACH_32_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
307
308#define HWY_RVV_FOREACH_64_DEMOTE_VIRT(X_MACRO, BASE, CHAR, NAME, OP) \
309 HWY_RVV_FOREACH_64_DEMOTE(X_MACRO, BASE, CHAR, NAME, OP) \
310 HWY_RVV_FOREACH_64_VIRT(X_MACRO, BASE, CHAR, NAME, OP)
311
312// SEW for unsigned:
313#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
314 HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, uint, u, NAME, OP)
315#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
316 HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, uint, u, NAME, OP)
317#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
318 HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, uint, u, NAME, OP)
319#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
320 HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, uint, u, NAME, OP)
321
322// SEW for signed:
323#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
324 HWY_CONCAT(HWY_RVV_FOREACH_08, LMULS)(X_MACRO, int, i, NAME, OP)
325#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
326 HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, int, i, NAME, OP)
327#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
328 HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, int, i, NAME, OP)
329#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS) \
330 HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, int, i, NAME, OP)
331
332// SEW for float:
333
334// Used for conversion instructions if HWY_RVV_HAVE_F16C.
335#define HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS) \
336 HWY_CONCAT(HWY_RVV_FOREACH_16, LMULS)(X_MACRO, float, f, NAME, OP)
337
338#if HWY_HAVE_FLOAT16
339// Full support for f16 in all ops
340#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
341 HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
342// Only BF16 is emulated.
343#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_BF16_D(D)
344#else
345#define HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS)
346#define HWY_RVV_IF_EMULATED_D(D) HWY_IF_SPECIAL_FLOAT_D(D)
347#endif
348#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
349 HWY_CONCAT(HWY_RVV_FOREACH_32, LMULS)(X_MACRO, float, f, NAME, OP)
350#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS) \
351 HWY_CONCAT(HWY_RVV_FOREACH_64, LMULS)(X_MACRO, float, f, NAME, OP)
352
353// Commonly used type/SEW groups:
354#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS) \
355 HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
356 HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
357
358#define HWY_RVV_FOREACH_UI16(X_MACRO, NAME, OP, LMULS) \
359 HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
360 HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
361
362#define HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
363 HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
364 HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
365
366#define HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS) \
367 HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS) \
368 HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
369
370#define HWY_RVV_FOREACH_UI3264(X_MACRO, NAME, OP, LMULS) \
371 HWY_RVV_FOREACH_UI32(X_MACRO, NAME, OP, LMULS) \
372 HWY_RVV_FOREACH_UI64(X_MACRO, NAME, OP, LMULS)
373
374#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
375 HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS) \
376 HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS) \
377 HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
378
379#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS) \
380 HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS) \
381 HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS) \
382 HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
383
384#define HWY_RVV_FOREACH_UI163264(X_MACRO, NAME, OP, LMULS) \
385 HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS) \
386 HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
387
388#define HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS) \
389 HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS) \
390 HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)
391
392// For all combinations of SEW:
393#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
394 HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS) \
395 HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
396
397#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS) \
398 HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS) \
399 HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
400
401#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS) \
402 HWY_RVV_FOREACH_F16(X_MACRO, NAME, OP, LMULS) \
403 HWY_RVV_FOREACH_F3264(X_MACRO, NAME, OP, LMULS)
404
405// Commonly used type categories:
406#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
407 HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS) \
408 HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
409
410#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS) \
411 HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS) \
412 HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
413
414// Assemble types for use in x-macros
415#define HWY_RVV_T(BASE, SEW) BASE##SEW##_t
416#define HWY_RVV_D(BASE, SEW, N, SHIFT) Simd<HWY_RVV_T(BASE, SEW), N, SHIFT>
417#define HWY_RVV_V(BASE, SEW, LMUL) v##BASE##SEW##LMUL##_t
418#define HWY_RVV_TUP(BASE, SEW, LMUL, TUP) v##BASE##SEW##LMUL##x##TUP##_t
419#define HWY_RVV_M(MLEN) vbool##MLEN##_t
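// Example expansions (illustrative): HWY_RVV_T(uint, 32) -> uint32_t,
// HWY_RVV_D(uint, 8, N, -1) -> Simd<uint8_t, N, -1>,
// HWY_RVV_V(int, 16, m2) -> vint16m2_t, HWY_RVV_M(16) -> vbool16_t.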
420
421} // namespace detail
422
423// Until we have full intrinsic support for fractional LMUL, mixed-precision
424// code can use LMUL 1..8 (adequate unless it needs many registers).
425#define HWY_SPECIALIZE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
426 MLEN, NAME, OP) \
427 template <> \
428 struct DFromV_t<HWY_RVV_V(BASE, SEW, LMUL)> { \
429 using Lane = HWY_RVV_T(BASE, SEW); \
430 using type = ScalableTag<Lane, SHIFT>; \
431 };
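// Illustrative result: once HWY_SPECIALIZE has been applied to every type and
// LMUL combination, DFromV<vint32m2_t> is ScalableTag<int32_t, 1> and
// TFromV<vint32m2_t> is int32_t.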
432
434#undef HWY_SPECIALIZE
435
436// ------------------------------ Lanes
437
438// WARNING: we want to query VLMAX/sizeof(T), but this may actually change VL!
439
440#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
441// HWY_RVV_CAPPED_LANES_SPECIAL_CASES provides some additional optimizations
442// to CappedLanes in non-debug builds.
443#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
444 if (__builtin_constant_p(cap >= kMaxLanes) && (cap >= kMaxLanes)) { \
445 /* If cap is known to be greater than or equal to MaxLanes(d), */ \
446 /* HWY_MIN(cap, Lanes(d)) will be equal to Lanes(d) */ \
447 return Lanes(d); \
448 } \
449 \
450 if ((__builtin_constant_p((cap & (cap - 1)) == 0) && \
451 ((cap & (cap - 1)) == 0)) || \
452 (__builtin_constant_p(cap <= HWY_MAX(kMinLanesPerFullVec, 4)) && \
453 (cap <= HWY_MAX(kMinLanesPerFullVec, 4)))) { \
454 /* If cap is known to be a power of 2, then */ \
455 /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
456 /* result as HWY_MIN(cap, Lanes(d)) as kMaxLanes is a power of 2 and */ \
457 /* as (cap > VLMAX && cap < 2 * VLMAX) can only be true if cap is not a */ \
458 /* power of 2 since VLMAX is always a power of 2 */ \
459 \
460 /* If cap is known to be less than or equal to 4, then */ \
461 /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the same */ \
462 /* result as HWY_MIN(cap, Lanes(d)) as HWY_MIN(cap, kMaxLanes) <= 4 is */ \
463 /* true if cap <= 4 and as vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
464 /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) */ \
465 /* if HWY_MIN(cap, kMaxLanes) <= 4 is true */ \
466 \
467 /* If cap is known to be less than or equal to kMinLanesPerFullVec, */ \
468 /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
469 /* same result as HWY_MIN(cap, Lanes(d)) as */ \
470 /* HWY_MIN(cap, kMaxLanes) <= kMinLanesPerFullVec is true if */ \
471 /* cap <= kMinLanesPerFullVec is true */ \
472 \
473 /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then either */ \
474 /* cap <= 4 or cap <= kMinLanesPerFullVec must be true */ \
475 \
476 /* If cap <= HWY_MAX(kMinLanesPerFullVec, 4) is known to be true, */ \
477 /* then vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return the */ \
478 /* same result as HWY_MIN(cap, Lanes(d)) */ \
479 \
480 /* If no cap, avoid the HWY_MIN. */ \
481 return detail::IsFull(d) \
482 ? __riscv_vsetvl_e##SEW##LMUL(cap) \
483 : __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
484 }
485#else
486#define HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL)
487#endif
488
489#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
490 MLEN, NAME, OP) \
491 template <size_t N> \
492 HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
493 constexpr size_t kFull = HWY_LANES(HWY_RVV_T(BASE, SEW)); \
494 constexpr size_t kCap = MaxLanes(d); \
495 /* If no cap, avoid generating a constant by using VLMAX. */ \
496 return N == kFull ? __riscv_vsetvlmax_e##SEW##LMUL() \
497 : __riscv_vsetvl_e##SEW##LMUL(kCap); \
498 } \
499 template <size_t N> \
500 HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
501 /* NOTE: Section 6.3 of the RVV specification, which can be found at */ \
502 /* https://github.com/riscv/riscv-v-spec/blob/master/v-spec.adoc, */ \
503 /* allows vsetvl to return a result less than Lanes(d) but greater than */ \
504 /* or equal to ((cap + 1) / 2) if */ \
505 /* (Lanes(d) > 2 && cap > HWY_MAX(Lanes(d), 4) && cap < (2 * Lanes(d))) */ \
506 /* is true */ \
507 \
508 /* VLMAX is the number of lanes in a vector of type */ \
509 /* VFromD<decltype(d)>, which is returned by */ \
510 /* Lanes(DFromV<VFromD<decltype(d)>>()) */ \
511 \
512 /* VLMAX is guaranteed to be a power of 2 under Section 2 of the RVV */ \
513 /* specification */ \
514 \
515 /* The VLMAX of a vector of type VFromD<decltype(d)> is at least 2 as */ \
516 /* the HWY_RVV target requires support for the RVV Zvl128b extension, */ \
517 /* which guarantees that vectors with LMUL=1 are at least 16 bytes */ \
518 \
519 /* If VLMAX == 2 is true, then vsetvl(cap) is equal to HWY_MIN(cap, 2) */ \
520 /* as cap == 3 is the only value such that */ \
521 /* (cap > VLMAX && cap < 2 * VLMAX) if VLMAX == 2 and as */ \
522 /* ((3 + 1) / 2) is equal to 2 */ \
523 \
524 /* If cap <= 4 is true, then vsetvl(cap) must be equal to */ \
525 /* HWY_MIN(cap, VLMAX) as cap <= VLMAX is true if VLMAX >= 4 is true */ \
526 /* and as vsetvl(cap) is guaranteed to be equal to HWY_MIN(cap, VLMAX) */ \
527 /* if VLMAX == 2 */ \
528 \
529 /* We want CappedLanes(d, cap) to return Lanes(d) if cap > Lanes(d) as */ \
530 /* LoadN(d, p, cap) expects to load exactly HWY_MIN(cap, Lanes(d)) */ \
531 /* lanes and StoreN(v, d, p, cap) expects to store exactly */ \
532 /* HWY_MIN(cap, Lanes(d)) lanes, even in the case where vsetvl returns */ \
533 /* a result that is less than HWY_MIN(cap, Lanes(d)) */ \
534 \
535 /* kMinLanesPerFullVec is the minimum value of VLMAX for a vector of */ \
536 /* type VFromD<decltype(d)> */ \
537 constexpr size_t kMinLanesPerFullVec = \
538 detail::ScaleByPower(16 / (SEW / 8), SHIFT); \
539 /* kMaxLanes is the maximum number of lanes returned by Lanes(d) */ \
540 constexpr size_t kMaxLanes = MaxLanes(d); \
541 \
542 HWY_RVV_CAPPED_LANES_SPECIAL_CASES(BASE, SEW, LMUL) \
543 \
544 if (kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4)) { \
545 /* If kMaxLanes <= kMinLanesPerFullVec is true, then */ \
546 /* vsetvl(HWY_MIN(cap, kMaxLanes)) is guaranteed to return */ \
547 /* HWY_MIN(cap, Lanes(d)) as */ \
548 /* HWY_MIN(cap, kMaxLanes) <= kMaxLanes <= VLMAX is true if */ \
549 /* kMaxLanes <= kMinLanesPerFullVec is true */ \
550 \
551 /* If kMaxLanes <= 4 is true, then vsetvl(HWY_MIN(cap, kMaxLanes)) is */ \
552 /* guaranteed to return the same result as HWY_MIN(cap, Lanes(d)) as */ \
553 /* HWY_MIN(cap, kMaxLanes) <= 4 is true if kMaxLanes <= 4 is true */ \
554 \
555 /* If kMaxLanes <= HWY_MAX(kMinLanesPerFullVec, 4) is true, then */ \
556 /* either kMaxLanes <= 4 or kMaxLanes <= kMinLanesPerFullVec must be */ \
557 /* true */ \
558 \
559 return __riscv_vsetvl_e##SEW##LMUL(HWY_MIN(cap, kMaxLanes)); \
560 } else { \
561 /* If kMaxLanes > HWY_MAX(kMinLanesPerFullVec, 4) is true, need to */ \
562 /* obtain the actual number of lanes using Lanes(d) and clamp cap to */ \
563 /* the result of Lanes(d) */ \
564 const size_t actual = Lanes(d); \
565 return HWY_MIN(actual, cap); \
566 } \
567 }
568
569#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
570 SHIFT, MLEN, NAME, OP) \
571 template <size_t N> \
572 HWY_API size_t NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
573 constexpr size_t kCap = MaxLanes(d); \
574 /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */ \
575 /* vsetvl may or may not be correct, so do it ourselves. */ \
576 const size_t actual = \
577 detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT); \
578 return HWY_MIN(actual, kCap); \
579 } \
580 template <size_t N> \
581 HWY_API size_t Capped##NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, size_t cap) { \
582 /* In case of virtual LMUL (intrinsics do not provide "uint16mf8_t") */ \
583 /* vsetvl may or may not be correct, so do it ourselves. */ \
584 const size_t actual = \
585 detail::ScaleByPower(__riscv_vlenb() / (SEW / 8), SHIFT); \
586 /* If no cap, avoid an extra HWY_MIN. */ \
587 return detail::IsFull(d) ? HWY_MIN(actual, cap) \
588 : HWY_MIN(HWY_MIN(actual, cap), MaxLanes(d)); \
589 }
590
591HWY_RVV_FOREACH(HWY_RVV_LANES, Lanes, setvlmax_e, _ALL)
593#undef HWY_RVV_LANES
594#undef HWY_RVV_LANES_VIRT
595#undef HWY_RVV_CAPPED_LANES_SPECIAL_CASES
596
597template <class D, HWY_RVV_IF_EMULATED_D(D)>
598HWY_API size_t Lanes(D /* tag*/) {
599 return Lanes(RebindToUnsigned<D>());
600}
601
602template <class D, HWY_RVV_IF_EMULATED_D(D)>
603HWY_API size_t CappedLanes(D /* tag*/, size_t cap) {
604 return CappedLanes(RebindToUnsigned<D>(), cap);
605}
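// Usage sketch (illustrative; `from`, `to`, `count` and `v` stand for caller
// state): clamp the tail of a strip-mined loop so partial loads/stores stay
// within bounds.
//   const ScalableTag<float> d;
//   for (size_t i = 0; i < count; i += Lanes(d)) {
//     const size_t n = CappedLanes(d, count - i);  // HWY_MIN(count - i, Lanes(d))
//     // e.g. LoadN(d, from + i, n) ... StoreN(v, d, to + i, n)
//   }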
606
607// ------------------------------ Common x-macros
608
609// Last argument to most intrinsics. Use when the op has no d arg of its own,
610// which means there is no user-specified cap.
611#define HWY_RVV_AVL(SEW, SHIFT) \
612 Lanes(ScalableTag<HWY_RVV_T(uint, SEW), SHIFT>())
613
614// vector = f(vector), e.g. Not
615#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
616 SHIFT, MLEN, NAME, OP) \
617 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
618 return __riscv_v##OP##_v_##CHAR##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
619 }
620
621// vector = f(vector, scalar), e.g. detail::AddS
622#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
623 SHIFT, MLEN, NAME, OP) \
624 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
625 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
626 return __riscv_v##OP##_##CHAR##SEW##LMUL(a, b, HWY_RVV_AVL(SEW, SHIFT)); \
627 }
628
629// vector = f(vector, vector), e.g. Add
630#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
631 SHIFT, MLEN, NAME, OP) \
632 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
633 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
634 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(a, b, \
635 HWY_RVV_AVL(SEW, SHIFT)); \
636 }
637
638// vector = f(vector, mask, vector, vector), e.g. MaskedAddOr
639#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
640 SHIFT, MLEN, NAME, OP) \
641 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
642 NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
643 HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
644 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(m, no, a, b, \
645 HWY_RVV_AVL(SEW, SHIFT)); \
646 }
647
648// mask = f(mask)
649#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP) \
650 HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) m) { \
651 return __riscv_vm##OP##_m_b##MLEN(m, HWY_RVV_AVL(SEW, SHIFT)); \
652 }
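// Illustrative expansion: instantiating HWY_RVV_RETV_ARGVV via
// HWY_RVV_FOREACH_U32 with NAME=Add and OP=add yields, for LMUL=m1, roughly
//   HWY_API vuint32m1_t Add(vuint32m1_t a, vuint32m1_t b) {
//     return __riscv_vadd_vv_u32m1(a, b, HWY_RVV_AVL(32, 0));
//   }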
653
654// ================================================== INIT
655
656// ------------------------------ Set
657
658#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
659 MLEN, NAME, OP) \
660 template <size_t N> \
661 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
662 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_T(BASE, SEW) arg) { \
663 return __riscv_v##OP##_##CHAR##SEW##LMUL(arg, Lanes(d)); \
664 }
665
666HWY_RVV_FOREACH_UI(HWY_RVV_SET, Set, mv_v_x, _ALL_VIRT)
667HWY_RVV_FOREACH_F(HWY_RVV_SET, Set, fmv_v_f, _ALL_VIRT)
668#undef HWY_RVV_SET
669
670// Treat bfloat16_t as int16_t (using the previously defined Set overloads);
671// required for Zero and VFromD.
672template <size_t N, int kPow2>
673HWY_API auto Set(Simd<bfloat16_t, N, kPow2> d, bfloat16_t arg)
674 -> decltype(Set(RebindToSigned<decltype(d)>(), int16_t())) {
675 return Set(RebindToSigned<decltype(d)>(), BitCastScalar<int16_t>(arg));
676}
677#if !HWY_HAVE_FLOAT16 // Otherwise already defined above.
678// WARNING: returns a different type than emulated bfloat16_t so that we can
679// implement PromoteTo overloads for both bfloat16_t and float16_t, and also
680// provide a Neg(hwy::float16_t) overload that coexists with Neg(int16_t).
681template <size_t N, int kPow2>
682HWY_API auto Set(Simd<hwy::float16_t, N, kPow2> d, hwy::float16_t arg)
683 -> decltype(Set(RebindToUnsigned<decltype(d)>(), uint16_t())) {
684 return Set(RebindToUnsigned<decltype(d)>(), BitCastScalar<uint16_t>(arg));
685}
686#endif
687
688template <class D>
689using VFromD = decltype(Set(D(), TFromD<D>()));
690
691// ------------------------------ Zero
692
693template <class D>
694HWY_API VFromD<D> Zero(D d) {
695 // Cast to support bfloat16_t.
696 const RebindToUnsigned<decltype(d)> du;
697 return BitCast(d, Set(du, 0));
698}
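// Usage sketch (illustrative): VFromD<ScalableTag<uint32_t>> is vuint32m1_t;
// Zero(ScalableTag<uint32_t>()) returns such a vector with all lanes set to 0.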
699
700// ------------------------------ Undefined
701
702// RVV vundefined is 'poisoned' such that even XORing a _variable_ initialized
703// by it gives unpredictable results. It should only be used for maskoff, so
704// keep it internal. For the Highway op, just use Zero (single instruction).
705namespace detail {
706#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
707 SHIFT, MLEN, NAME, OP) \
708 template <size_t N> \
709 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
710 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) /* tag */) { \
711 return __riscv_v##OP##_##CHAR##SEW##LMUL(); /* no AVL */ \
712 }
713
715#undef HWY_RVV_UNDEFINED
716} // namespace detail
717
718template <class D>
719HWY_API VFromD<D> Undefined(D d) {
720 return Zero(d);
721}
722
723// ------------------------------ BitCast
724
725namespace detail {
726
727// Halves LMUL. (Use LMUL arg for the source so we can use _TRUNC.)
728#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
729 MLEN, NAME, OP) \
730 HWY_API HWY_RVV_V(BASE, SEW, LMULH) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
731 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULH( \
732 v); /* no AVL */ \
733 }
734HWY_RVV_FOREACH(HWY_RVV_TRUNC, Trunc, lmul_trunc, _TRUNC)
735#undef HWY_RVV_TRUNC
736
737// Doubles LMUL to `d2` (the arg is only necessary for _VIRT).
738#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
739 MLEN, NAME, OP) \
740 template <size_t N> \
741 HWY_API HWY_RVV_V(BASE, SEW, LMULD) \
742 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
743 HWY_RVV_V(BASE, SEW, LMUL) v) { \
744 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMULD( \
745 v); /* no AVL */ \
746 }
747HWY_RVV_FOREACH(HWY_RVV_EXT, Ext, lmul_ext, _EXT)
748#undef HWY_RVV_EXT
749
750// For virtual LMUL e.g. 'uint32mf4_t', the return type should be mf2, which is
751// the same as the actual input type.
752#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
753 SHIFT, MLEN, NAME, OP) \
754 template <size_t N> \
755 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
756 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT + 1) /* d2 */, \
757 HWY_RVV_V(BASE, SEW, LMUL) v) { \
758 return v; \
759 }
760HWY_RVV_FOREACH(HWY_RVV_EXT_VIRT, Ext, lmul_ext, _VIRT)
761#undef HWY_RVV_EXT_VIRT
762
763template <class D, HWY_RVV_IF_EMULATED_D(D)>
764HWY_API VFromD<D> Ext(D d, VFromD<Half<D>> v) {
765 const RebindToUnsigned<decltype(d)> du;
766 const Half<decltype(du)> duh;
767 return BitCast(d, Ext(du, BitCast(duh, v)));
768}
769
770// For BitCastToByte, the D arg is only to prevent duplicate definitions caused
771// by _ALL_VIRT.
772
773// There is no reinterpret from u8 <-> u8, so just return.
774#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
775 SHIFT, MLEN, NAME, OP) \
776 template <typename T, size_t N> \
777 HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
778 vuint8##LMUL##_t v) { \
779 return v; \
780 } \
781 template <size_t N> \
782 HWY_API vuint8##LMUL##_t BitCastFromByte( \
783 HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
784 return v; \
785 }
786
787// For i8, need a single reinterpret (HWY_RVV_CAST_IF does two).
788#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
789 SHIFT, MLEN, NAME, OP) \
790 template <typename T, size_t N> \
791 HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
792 vint8##LMUL##_t v) { \
793 return __riscv_vreinterpret_v_i8##LMUL##_u8##LMUL(v); \
794 } \
795 template <size_t N> \
796 HWY_API vint8##LMUL##_t BitCastFromByte( \
797 HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
798 return __riscv_vreinterpret_v_u8##LMUL##_i8##LMUL(v); \
799 }
800
801// Separate u/i because clang only provides signed <-> unsigned reinterpret for
802// the same SEW.
803#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
804 MLEN, NAME, OP) \
805 template <typename T, size_t N> \
806 HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
807 HWY_RVV_V(BASE, SEW, LMUL) v) { \
808 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v); \
809 } \
810 template <size_t N> \
811 HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
812 HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
813 return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v); \
814 }
815
816// Signed/Float: first cast to/from unsigned
817#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
818 SHIFT, MLEN, NAME, OP) \
819 template <typename T, size_t N> \
820 HWY_API vuint8##LMUL##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
821 HWY_RVV_V(BASE, SEW, LMUL) v) { \
822 return __riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
823 __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v)); \
824 } \
825 template <size_t N> \
826 HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
827 HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMUL##_t v) { \
828 return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
829 __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v)); \
830 }
831
832// Additional versions for virtual LMUL using LMULH for byte vectors.
833#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
834 SHIFT, MLEN, NAME, OP) \
835 template <typename T, size_t N> \
836 HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
837 HWY_RVV_V(BASE, SEW, LMUL) v) { \
838 return detail::Trunc(__riscv_v##OP##_v_##CHAR##SEW##LMUL##_u8##LMUL(v)); \
839 } \
840 template <size_t N> \
841 HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
842 HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
843 HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
844 const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
845 return __riscv_v##OP##_v_u8##LMUL##_##CHAR##SEW##LMUL(v2); \
846 }
847
848// Signed/Float: first cast to/from unsigned
849#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
850 SHIFT, MLEN, NAME, OP) \
851 template <typename T, size_t N> \
852 HWY_API vuint8##LMULH##_t BitCastToByte(Simd<T, N, SHIFT> /* d */, \
853 HWY_RVV_V(BASE, SEW, LMUL) v) { \
854 return detail::Trunc(__riscv_v##OP##_v_u##SEW##LMUL##_u8##LMUL( \
855 __riscv_v##OP##_v_##CHAR##SEW##LMUL##_u##SEW##LMUL(v))); \
856 } \
857 template <size_t N> \
858 HWY_API HWY_RVV_V(BASE, SEW, LMUL) BitCastFromByte( \
859 HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, vuint8##LMULH##_t v) { \
860 HWY_RVV_D(uint, 8, N, SHIFT + 1) d2; \
861 const vuint8##LMUL##_t v2 = detail::Ext(d2, v); \
862 return __riscv_v##OP##_v_u##SEW##LMUL##_##CHAR##SEW##LMUL( \
863 __riscv_v##OP##_v_u8##LMUL##_u##SEW##LMUL(v2)); \
864 }
865
866HWY_RVV_FOREACH_U08(HWY_RVV_CAST_U8, _, reinterpret, _ALL)
867HWY_RVV_FOREACH_I08(HWY_RVV_CAST_I8, _, reinterpret, _ALL)
868HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_U, _, reinterpret, _ALL)
869HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
870HWY_RVV_FOREACH_U163264(HWY_RVV_CAST_VIRT_U, _, reinterpret, _VIRT)
871HWY_RVV_FOREACH_I163264(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
872HWY_RVV_FOREACH_F(HWY_RVV_CAST_IF, _, reinterpret, _ALL)
873HWY_RVV_FOREACH_F(HWY_RVV_CAST_VIRT_IF, _, reinterpret, _VIRT)
874#if HWY_HAVE_FLOAT16 // HWY_RVV_FOREACH_F already covered float16_
875#elif HWY_RVV_HAVE_F16C // zvfhmin provides reinterpret* intrinsics:
878#else
879template <size_t N, int kPow2>
884#endif
885
886#undef HWY_RVV_CAST_U8
887#undef HWY_RVV_CAST_I8
888#undef HWY_RVV_CAST_U
889#undef HWY_RVV_CAST_IF
890#undef HWY_RVV_CAST_VIRT_U
891#undef HWY_RVV_CAST_VIRT_IF
892
893template <size_t N, int kPow2>
899
900} // namespace detail
901
902template <class D, class FromV>
903HWY_API VFromD<D> BitCast(D d, FromV v) {
904 return detail::BitCastFromByte(d, detail::BitCastToByte(d, v));
905}
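// Usage sketch (illustrative): reinterpret float lanes as their bit pattern.
//   const ScalableTag<float> df;
//   const RebindToUnsigned<decltype(df)> du;       // uint32_t lanes
//   const auto bits = BitCast(du, Set(df, 1.0f));  // each lane 0x3F800000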
906
907// ------------------------------ Iota
908
909namespace detail {
910
911#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
912 MLEN, NAME, OP) \
913 template <size_t N> \
914 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d) { \
915 return __riscv_v##OP##_##CHAR##SEW##LMUL(Lanes(d)); \
916 }
917
918// For i8 lanes, this may well wrap around; providing only unsigned
919// overloads is less error-prone.
919HWY_RVV_FOREACH_U(HWY_RVV_IOTA, Iota0, id_v, _ALL_VIRT)
920#undef HWY_RVV_IOTA
921
922// Used by Expand.
923#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
924 SHIFT, MLEN, NAME, OP) \
925 template <size_t N> \
926 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
927 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) mask) { \
928 return __riscv_v##OP##_##CHAR##SEW##LMUL(mask, Lanes(d)); \
929 }
930
931HWY_RVV_FOREACH_U(HWY_RVV_MASKED_IOTA, MaskedIota, iota_m, _ALL_VIRT)
932#undef HWY_RVV_MASKED_IOTA
933
934} // namespace detail
935
936// ================================================== LOGICAL
937
938// ------------------------------ Not
939
941
942template <class V, HWY_IF_FLOAT_V(V)>
943HWY_API V Not(const V v) {
944 using DF = DFromV<V>;
945 using DU = RebindToUnsigned<DF>;
946 return BitCast(DF(), Not(BitCast(DU(), v)));
947}
948
949// ------------------------------ And
950
951// Non-vector version (ideally immediate) for use with Iota0
952namespace detail {
953HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AndS, and_vx, _ALL)
954} // namespace detail
955
957
958template <class V, HWY_IF_FLOAT_V(V)>
959HWY_API V And(const V a, const V b) {
960 using DF = DFromV<V>;
961 using DU = RebindToUnsigned<DF>;
962 return BitCast(DF(), And(BitCast(DU(), a), BitCast(DU(), b)));
963}
964
965// ------------------------------ Or
966
967// Non-vector version (ideally immediate) for use with RoundF32ForDemoteToBF16
968namespace detail {
969HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, OrS, or_vx, _ALL)
970} // namespace detail
971
973
974template <class V, HWY_IF_FLOAT_V(V)>
975HWY_API V Or(const V a, const V b) {
976 using DF = DFromV<V>;
977 using DU = RebindToUnsigned<DF>;
978 return BitCast(DF(), Or(BitCast(DU(), a), BitCast(DU(), b)));
979}
980
981// ------------------------------ Xor
982
983// Non-vector version (ideally immediate) for use with Iota0
984namespace detail {
985HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, XorS, xor_vx, _ALL)
986} // namespace detail
987
989
990template <class V, HWY_IF_FLOAT_V(V)>
991HWY_API V Xor(const V a, const V b) {
992 using DF = DFromV<V>;
993 using DU = RebindToUnsigned<DF>;
994 return BitCast(DF(), Xor(BitCast(DU(), a), BitCast(DU(), b)));
995}
996
997// ------------------------------ AndNot
998template <class V>
999HWY_API V AndNot(const V not_a, const V b) {
1000 return And(Not(not_a), b);
1001}
1002
1003// ------------------------------ Xor3
1004template <class V>
1005HWY_API V Xor3(V x1, V x2, V x3) {
1006 return Xor(x1, Xor(x2, x3));
1007}
1008
1009// ------------------------------ Or3
1010template <class V>
1011HWY_API V Or3(V o1, V o2, V o3) {
1012 return Or(o1, Or(o2, o3));
1013}
1014
1015// ------------------------------ OrAnd
1016template <class V>
1017HWY_API V OrAnd(const V o, const V a1, const V a2) {
1018 return Or(o, And(a1, a2));
1019}
1020
1021// ------------------------------ CopySign
1022
1024
1025template <class V>
1026HWY_API V CopySignToAbs(const V abs, const V sign) {
1027 // RVV can also handle abs < 0, so no extra action needed.
1028 return CopySign(abs, sign);
1029}
1030
1031// ================================================== ARITHMETIC
1032
1033// Per-target flags to prevent generic_ops-inl.h defining Add etc.
1034#ifdef HWY_NATIVE_OPERATOR_REPLACEMENTS
1035#undef HWY_NATIVE_OPERATOR_REPLACEMENTS
1036#else
1037#define HWY_NATIVE_OPERATOR_REPLACEMENTS
1038#endif
1039
1040// ------------------------------ Add
1041
1042namespace detail {
1043HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, AddS, add_vx, _ALL)
1044HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, AddS, fadd_vf, _ALL)
1045HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, ReverseSubS, rsub_vx, _ALL)
1046HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, ReverseSubS, frsub_vf, _ALL)
1047} // namespace detail
1048
1051
1052// ------------------------------ Sub
1053namespace detail {
1054HWY_RVV_FOREACH_UI(HWY_RVV_RETV_ARGVS, SubS, sub_vx, _ALL)
1055} // namespace detail
1056
1059
1060// ------------------------------ SaturatedAdd
1061
1062#ifdef HWY_NATIVE_I32_SATURATED_ADDSUB
1063#undef HWY_NATIVE_I32_SATURATED_ADDSUB
1064#else
1065#define HWY_NATIVE_I32_SATURATED_ADDSUB
1066#endif
1067
1068#ifdef HWY_NATIVE_U32_SATURATED_ADDSUB
1069#undef HWY_NATIVE_U32_SATURATED_ADDSUB
1070#else
1071#define HWY_NATIVE_U32_SATURATED_ADDSUB
1072#endif
1073
1074#ifdef HWY_NATIVE_I64_SATURATED_ADDSUB
1075#undef HWY_NATIVE_I64_SATURATED_ADDSUB
1076#else
1077#define HWY_NATIVE_I64_SATURATED_ADDSUB
1078#endif
1079
1080#ifdef HWY_NATIVE_U64_SATURATED_ADDSUB
1081#undef HWY_NATIVE_U64_SATURATED_ADDSUB
1082#else
1083#define HWY_NATIVE_U64_SATURATED_ADDSUB
1084#endif
1085
1088
1089// ------------------------------ SaturatedSub
1090
1093
1094// ------------------------------ AverageRound
1095
1096// Define this to opt out of the default behavior, which is AVOID on certain
1097// compiler versions. You can define only this to use VXRM, or define both this
1098// and HWY_RVV_AVOID_VXRM to always avoid VXRM.
1099#ifndef HWY_RVV_CHOOSE_VXRM
1100
1101// Assume that GCC-13 defaults to 'avoid VXRM'. Tested with GCC 13.1.0.
1102#if HWY_COMPILER_GCC_ACTUAL && HWY_COMPILER_GCC_ACTUAL < 1400
1103#define HWY_RVV_AVOID_VXRM
1104// Clang 16 with __riscv_v_intrinsic == 11000 may either require VXRM or avoid.
1105// Assume earlier versions avoid.
1106#elif HWY_COMPILER_CLANG && \
1107 (HWY_COMPILER_CLANG < 1600 || __riscv_v_intrinsic < 11000)
1108#define HWY_RVV_AVOID_VXRM
1109#endif
1110
1111#endif // HWY_RVV_CHOOSE_VXRM
1112
1113// Adding __RISCV_VXRM_* was a backwards-incompatible change and it is not clear
1114// how to detect whether it is supported or required. #ifdef __RISCV_VXRM_RDN
1115// does not work because it seems to be a compiler built-in, but neither does
1116// __has_builtin(__RISCV_VXRM_RDN). The intrinsics version was also not updated,
1117// so we require a macro to opt out of the new intrinsics.
1118#ifdef HWY_RVV_AVOID_VXRM
1119#define HWY_RVV_INSERT_VXRM(vxrm, avl) avl
1120#define __RISCV_VXRM_RNU
1121#define __RISCV_VXRM_RDN
1122#else // default: use new vxrm arguments
1123#define HWY_RVV_INSERT_VXRM(vxrm, avl) vxrm, avl
1124#endif
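// Illustrative expansion: with the new intrinsics,
//   HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, avl) -> __RISCV_VXRM_RNU, avl
// whereas with HWY_RVV_AVOID_VXRM defined, the macro discards the
// rounding-mode token (and __RISCV_VXRM_* are defined as empty so the names
// still compile), yielding just `avl` for the older intrinsic signatures
// without an explicit rounding-mode argument.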
1125
1126// Extra rounding mode = up argument.
1127#define HWY_RVV_RETV_AVERAGE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1128 SHIFT, MLEN, NAME, OP) \
1129 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1130 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
1131 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
1132 a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
1133 }
1134
1137
1138#undef HWY_RVV_RETV_AVERAGE
1139
1140// ------------------------------ ShiftLeft[Same]
1141
1142// Intrinsics do not define .vi forms, so use .vx instead.
1143#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1144 MLEN, NAME, OP) \
1145 template <int kBits> \
1146 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
1147 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, kBits, \
1148 HWY_RVV_AVL(SEW, SHIFT)); \
1149 } \
1150 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1151 NAME##Same(HWY_RVV_V(BASE, SEW, LMUL) v, int bits) { \
1152 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, static_cast<uint8_t>(bits), \
1153 HWY_RVV_AVL(SEW, SHIFT)); \
1154 }
1155
1157
1158// ------------------------------ ShiftRight[Same]
1159
1162
1163#undef HWY_RVV_SHIFT
1164
1165// ------------------------------ SumsOf8 (ShiftRight, Add)
1166template <class VU8, HWY_IF_U8_D(DFromV<VU8>)>
1168 const DFromV<VU8> du8;
1169 const RepartitionToWide<decltype(du8)> du16;
1170 const RepartitionToWide<decltype(du16)> du32;
1171 const RepartitionToWide<decltype(du32)> du64;
1172 using VU16 = VFromD<decltype(du16)>;
1173
1174 const VU16 vFDB97531 = ShiftRight<8>(BitCast(du16, v));
1175 const VU16 vECA86420 = detail::AndS(BitCast(du16, v), 0xFF);
1176 const VU16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
1177
1178 const VU16 szz_FE_zz_BA_zz_76_zz_32 =
1179 BitCast(du16, ShiftRight<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
1180 const VU16 sxx_FC_xx_B8_xx_74_xx_30 =
1181 Add(sFE_DC_BA_98_76_54_32_10, szz_FE_zz_BA_zz_76_zz_32);
1182 const VU16 szz_zz_xx_FC_zz_zz_xx_74 =
1183 BitCast(du16, ShiftRight<32>(BitCast(du64, sxx_FC_xx_B8_xx_74_xx_30)));
1184 const VU16 sxx_xx_xx_F8_xx_xx_xx_70 =
1185 Add(sxx_FC_xx_B8_xx_74_xx_30, szz_zz_xx_FC_zz_zz_xx_74);
1186 return detail::AndS(BitCast(du64, sxx_xx_xx_F8_xx_xx_xx_70), 0xFFFFull);
1187}
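// Worked example (illustrative): if the first eight byte lanes are
// {1, 2, 3, 4, 5, 6, 7, 8}, the first u64 lane of the result is 36.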
1188
1189template <class VI8, HWY_IF_I8_D(DFromV<VI8>)>
1191 const DFromV<VI8> di8;
1192 const RepartitionToWide<decltype(di8)> di16;
1193 const RepartitionToWide<decltype(di16)> di32;
1194 const RepartitionToWide<decltype(di32)> di64;
1195 const RebindToUnsigned<decltype(di32)> du32;
1196 const RebindToUnsigned<decltype(di64)> du64;
1197 using VI16 = VFromD<decltype(di16)>;
1198
1199 const VI16 vFDB97531 = ShiftRight<8>(BitCast(di16, v));
1200 const VI16 vECA86420 = ShiftRight<8>(ShiftLeft<8>(BitCast(di16, v)));
1201 const VI16 sFE_DC_BA_98_76_54_32_10 = Add(vFDB97531, vECA86420);
1202
1203 const VI16 sDC_zz_98_zz_54_zz_10_zz =
1204 BitCast(di16, ShiftLeft<16>(BitCast(du32, sFE_DC_BA_98_76_54_32_10)));
1205 const VI16 sFC_xx_B8_xx_74_xx_30_xx =
1206 Add(sFE_DC_BA_98_76_54_32_10, sDC_zz_98_zz_54_zz_10_zz);
1207 const VI16 sB8_xx_zz_zz_30_xx_zz_zz =
1208 BitCast(di16, ShiftLeft<32>(BitCast(du64, sFC_xx_B8_xx_74_xx_30_xx)));
1209 const VI16 sF8_xx_xx_xx_70_xx_xx_xx =
1210 Add(sFC_xx_B8_xx_74_xx_30_xx, sB8_xx_zz_zz_30_xx_zz_zz);
1211 return ShiftRight<48>(BitCast(di64, sF8_xx_xx_xx_70_xx_xx_xx));
1212}
1213
1214// ------------------------------ RotateRight
1215template <int kBits, class V, HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V)>
1216HWY_API V RotateRight(const V v) {
1217 const DFromV<decltype(v)> d;
1218 const RebindToUnsigned<decltype(d)> du;
1219
1220 constexpr size_t kSizeInBits = sizeof(TFromV<V>) * 8;
1221 static_assert(0 <= kBits && kBits < kSizeInBits, "Invalid shift count");
1222 if (kBits == 0) return v;
1223
1224 return Or(BitCast(d, ShiftRight<kBits>(BitCast(du, v))),
1225 ShiftLeft<HWY_MIN(kSizeInBits - 1, kSizeInBits - kBits)>(v));
1226}
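// Worked example (illustrative): RotateRight<8>(Set(du32, 0x11223344u))
// yields 0x44112233 in every lane: (v >> 8) | (v << 24).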
1227
1228// ------------------------------ Shl
1229#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1230 SHIFT, MLEN, NAME, OP) \
1231 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1232 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
1233 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, bits, \
1234 HWY_RVV_AVL(SEW, SHIFT)); \
1235 }
1236
1238
1239#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1240 SHIFT, MLEN, NAME, OP) \
1241 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1242 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(BASE, SEW, LMUL) bits) { \
1243 const HWY_RVV_D(uint, SEW, HWY_LANES(HWY_RVV_T(BASE, SEW)), SHIFT) du; \
1244 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, BitCast(du, bits), \
1245 HWY_RVV_AVL(SEW, SHIFT)); \
1246 }
1247
1249
1250// ------------------------------ Shr
1251
1254
1255#undef HWY_RVV_SHIFT_II
1256#undef HWY_RVV_SHIFT_VV
1257
1258// ------------------------------ Min
1259
1260namespace detail {
1261
1262HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MinS, minu_vx, _ALL)
1263HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MinS, min_vx, _ALL)
1264HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MinS, fmin_vf, _ALL)
1265
1266} // namespace detail
1267
1271
1272// ------------------------------ Max
1273
1274namespace detail {
1275
1276HWY_RVV_FOREACH_U(HWY_RVV_RETV_ARGVS, MaxS, maxu_vx, _ALL)
1277HWY_RVV_FOREACH_I(HWY_RVV_RETV_ARGVS, MaxS, max_vx, _ALL)
1278HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGVS, MaxS, fmax_vf, _ALL)
1279
1280} // namespace detail
1281
1285
1286// ------------------------------ Mul
1287
1288// Per-target flags to prevent generic_ops-inl.h defining 8/64-bit operator*.
1289#ifdef HWY_NATIVE_MUL_8
1290#undef HWY_NATIVE_MUL_8
1291#else
1292#define HWY_NATIVE_MUL_8
1293#endif
1294#ifdef HWY_NATIVE_MUL_64
1295#undef HWY_NATIVE_MUL_64
1296#else
1297#define HWY_NATIVE_MUL_64
1298#endif
1299
1302
1303// ------------------------------ MulHigh
1304
1307
1308// ------------------------------ MulFixedPoint15
1309
1310// Extra rounding mode = up argument.
1311#define HWY_RVV_MUL15(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1312 MLEN, NAME, OP) \
1313 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1314 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
1315 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL( \
1316 a, b, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RNU, HWY_RVV_AVL(SEW, SHIFT))); \
1317 }
1318
1320
1321#undef HWY_RVV_MUL15
1322
1323// ------------------------------ Div
1324#ifdef HWY_NATIVE_INT_DIV
1325#undef HWY_NATIVE_INT_DIV
1326#else
1327#define HWY_NATIVE_INT_DIV
1328#endif
1329
1333
1336
1337// ------------------------------ MaskedAddOr etc.
1338
1339#ifdef HWY_NATIVE_MASKED_ARITH
1340#undef HWY_NATIVE_MASKED_ARITH
1341#else
1342#define HWY_NATIVE_MASKED_ARITH
1343#endif
1344
1348
1352
1355
1358
1361
1365
1368
1371
1374
1375// ------------------------------ ApproximateReciprocal
1376#ifdef HWY_NATIVE_F64_APPROX_RECIP
1377#undef HWY_NATIVE_F64_APPROX_RECIP
1378#else
1379#define HWY_NATIVE_F64_APPROX_RECIP
1380#endif
1381
1383
1384// ------------------------------ Sqrt
1386
1387// ------------------------------ ApproximateReciprocalSqrt
1388#ifdef HWY_NATIVE_F64_APPROX_RSQRT
1389#undef HWY_NATIVE_F64_APPROX_RSQRT
1390#else
1391#define HWY_NATIVE_F64_APPROX_RSQRT
1392#endif
1393
1395
1396// ------------------------------ MulAdd
1397
1398// Per-target flag to prevent generic_ops-inl.h from defining int MulAdd.
1399#ifdef HWY_NATIVE_INT_FMA
1400#undef HWY_NATIVE_INT_FMA
1401#else
1402#define HWY_NATIVE_INT_FMA
1403#endif
1404
1405// Note: op is still named vv, not vvv.
1406#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1407 MLEN, NAME, OP) \
1408 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1409 NAME(HWY_RVV_V(BASE, SEW, LMUL) mul, HWY_RVV_V(BASE, SEW, LMUL) x, \
1410 HWY_RVV_V(BASE, SEW, LMUL) add) { \
1411 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(add, mul, x, \
1412 HWY_RVV_AVL(SEW, SHIFT)); \
1413 }
1414
1417
1418// ------------------------------ NegMulAdd
1421
1422// ------------------------------ MulSub
1424
1425// ------------------------------ NegMulSub
1427
1428#undef HWY_RVV_FMA
1429
1430// ================================================== COMPARE
1431
1432// Comparisons set a mask bit to 1 if the condition is true, else 0. The XX in
1433// vboolXX_t is a power of two divisor for vector bits. SEW=8 / LMUL=1 = 1/8th
1434// of all bits; SEW=8 / LMUL=4 = half of all bits.
1435
1436// mask = f(vector, vector)
1437#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1438 SHIFT, MLEN, NAME, OP) \
1439 HWY_API HWY_RVV_M(MLEN) \
1440 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
1441 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_b##MLEN( \
1442 a, b, HWY_RVV_AVL(SEW, SHIFT)); \
1443 }
1444
1445// mask = f(vector, scalar)
1446#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1447 SHIFT, MLEN, NAME, OP) \
1448 HWY_API HWY_RVV_M(MLEN) \
1449 NAME(HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_T(BASE, SEW) b) { \
1450 return __riscv_v##OP##_##CHAR##SEW##LMUL##_b##MLEN( \
1451 a, b, HWY_RVV_AVL(SEW, SHIFT)); \
1452 }
1453
1454// ------------------------------ Eq
1457
1458namespace detail {
1459HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, EqS, mseq_vx, _ALL)
1460HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, EqS, mfeq_vf, _ALL)
1461} // namespace detail
1462
1463// ------------------------------ Ne
1466
1467namespace detail {
1468HWY_RVV_FOREACH_UI(HWY_RVV_RETM_ARGVS, NeS, msne_vx, _ALL)
1469HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, NeS, mfne_vf, _ALL)
1470} // namespace detail
1471
1472// ------------------------------ Lt
1476
1477namespace detail {
1478HWY_RVV_FOREACH_I(HWY_RVV_RETM_ARGVS, LtS, mslt_vx, _ALL)
1479HWY_RVV_FOREACH_U(HWY_RVV_RETM_ARGVS, LtS, msltu_vx, _ALL)
1480HWY_RVV_FOREACH_F(HWY_RVV_RETM_ARGVS, LtS, mflt_vf, _ALL)
1481} // namespace detail
1482
1483// ------------------------------ Le
1487
1488#undef HWY_RVV_RETM_ARGVV
1489#undef HWY_RVV_RETM_ARGVS
1490
1491// ------------------------------ Gt/Ge
1492
1493template <class V>
1494HWY_API auto Ge(const V a, const V b) -> decltype(Le(a, b)) {
1495 return Le(b, a);
1496}
1497
1498template <class V>
1499HWY_API auto Gt(const V a, const V b) -> decltype(Lt(a, b)) {
1500 return Lt(b, a);
1501}
1502
1503// ------------------------------ TestBit
1504template <class V>
1505HWY_API auto TestBit(const V a, const V bit) -> decltype(Eq(a, bit)) {
1506 return detail::NeS(And(a, bit), 0);
1507}
1508
1509// ------------------------------ Not
1510// NOLINTNEXTLINE
1512
1513// ------------------------------ And
1514
1515// mask = f(mask_a, mask_b) (note arg2,arg1 order!)
1516#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP) \
1517 HWY_API HWY_RVV_M(MLEN) NAME(HWY_RVV_M(MLEN) a, HWY_RVV_M(MLEN) b) { \
1518 return __riscv_vm##OP##_mm_b##MLEN(b, a, HWY_RVV_AVL(SEW, SHIFT)); \
1519 }
1520
1522
1523// ------------------------------ AndNot
1525
1526// ------------------------------ Or
1528
1529// ------------------------------ Xor
1531
1532// ------------------------------ ExclusiveNeither
1534
1535#undef HWY_RVV_RETM_ARGMM
1536
1537// ------------------------------ IfThenElse
1538
1539#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1540 SHIFT, MLEN, NAME, OP) \
1541 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1542 NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) yes, \
1543 HWY_RVV_V(BASE, SEW, LMUL) no) { \
1544 return __riscv_v##OP##_vvm_##CHAR##SEW##LMUL(no, yes, m, \
1545 HWY_RVV_AVL(SEW, SHIFT)); \
1546 }
1547
1549
1550#undef HWY_RVV_IF_THEN_ELSE
1551
1552// ------------------------------ IfThenElseZero
1553template <class M, class V>
1554HWY_API V IfThenElseZero(const M mask, const V yes) {
1555 return IfThenElse(mask, yes, Zero(DFromV<V>()));
1556}
1557
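// Usage sketch for IfThenElse/IfThenElseZero, assuming hwy/highway.h: a
// branch-free ReLU applied to a float vector `x` loaded by the caller.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<float> d;
//   // relu = (x > 0) ? x : 0, computed per lane without branches.
//   const auto relu = hn::IfThenElseZero(hn::Gt(x, hn::Zero(d)), x);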
1558// ------------------------------ IfThenZeroElse
1559
1560#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
1561 LMULH, SHIFT, MLEN, NAME, OP) \
1562 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1563 NAME(HWY_RVV_M(MLEN) m, HWY_RVV_V(BASE, SEW, LMUL) no) { \
1564 return __riscv_v##OP##_##CHAR##SEW##LMUL(no, 0, m, \
1565 HWY_RVV_AVL(SEW, SHIFT)); \
1566 }
1567
1570
1571#undef HWY_RVV_IF_THEN_ZERO_ELSE
1572
1573// ------------------------------ MaskFromVec
1574
1575template <class D>
1576using MFromD = decltype(Eq(Zero(D()), Zero(D())));
1577
1578template <class V>
1579HWY_API MFromD<DFromV<V>> MaskFromVec(const V v) {
1580 return detail::NeS(v, 0);
1581}
1582
1583// ------------------------------ IsNegative (MFromD)
1584#ifdef HWY_NATIVE_IS_NEGATIVE
1585#undef HWY_NATIVE_IS_NEGATIVE
1586#else
1587#define HWY_NATIVE_IS_NEGATIVE
1588#endif
1589
1590// Generic for all vector lengths
1591template <class V, HWY_IF_NOT_UNSIGNED_V(V)>
1592HWY_API MFromD<DFromV<V>> IsNegative(V v) {
1593 const DFromV<decltype(v)> d;
1594 const RebindToSigned<decltype(d)> di;
1595 using TI = TFromD<decltype(di)>;
1596
1597 return detail::LtS(BitCast(di, v), static_cast<TI>(0));
1598}
1599
1600// ------------------------------ MaskFalse
1601
1602// For mask ops including vmclr, elements past VL are tail-agnostic and cannot
1603// be relied upon, so define a variant of the generic_ops-inl implementation of
1604// MaskFalse that ensures all bits are zero as required by mask_test.
1605#ifdef HWY_NATIVE_MASK_FALSE
1606#undef HWY_NATIVE_MASK_FALSE
1607#else
1608#define HWY_NATIVE_MASK_FALSE
1609#endif
1610
1611template <class D>
1612HWY_API MFromD<D> MaskFalse(D d) {
1613 const DFromV<VFromD<decltype(d)>> d_full;
1614 return MaskFromVec(Zero(d_full));
1615}
1616
1617// ------------------------------ RebindMask
1618template <class D, typename MFrom>
1619HWY_API MFromD<D> RebindMask(const D /*d*/, const MFrom mask) {
1620 // No need to check lane size/LMUL are the same: if not, casting MFrom to
1621 // MFromD<D> would fail.
1622 return mask;
1623}
1624
1625// ------------------------------ VecFromMask
1626
1627// Returns mask ? ~0 : 0. No longer use sub.vx(Zero(), 1, mask) because per the
1628// default mask-agnostic policy, the result of inactive lanes may also be ~0.
1629#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1630 SHIFT, MLEN, NAME, OP) \
1631 template <size_t N> \
1632 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1633 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_M(MLEN) m) { \
1634 /* MaskFalse requires we set all lanes for capped d and virtual LMUL. */ \
1635 const DFromV<VFromD<decltype(d)>> d_full; \
1636 const RebindToSigned<decltype(d_full)> di; \
1637 using TI = TFromD<decltype(di)>; \
1638 return BitCast(d_full, __riscv_v##OP##_i##SEW##LMUL(Zero(di), TI{-1}, m, \
1639 Lanes(d_full))); \
1640 }
1641
1643
1644#undef HWY_RVV_VEC_FROM_MASK
1645
1646template <class D, HWY_IF_FLOAT_D(D)>
1647HWY_API VFromD<D> VecFromMask(D d, MFromD<D> mask) {
1648 return BitCast(d, VecFromMask(RebindToUnsigned<D>(), mask));
1649}
1650
1651// ------------------------------ IfVecThenElse (MaskFromVec)
1652template <class V>
1653HWY_API V IfVecThenElse(const V mask, const V yes, const V no) {
1654 return IfThenElse(MaskFromVec(mask), yes, no);
1655}
1656
1657// ------------------------------ BroadcastSignBit
1658template <class V>
1659HWY_API V BroadcastSignBit(const V v) {
1660 return ShiftRight<sizeof(TFromV<V>) * 8 - 1>(v);
1661}
1662
1663// ------------------------------ IfNegativeThenElse (BroadcastSignBit)
1664template <class V>
1665HWY_API V IfNegativeThenElse(V v, V yes, V no) {
1666 static_assert(IsSigned<TFromV<V>>(), "Only works for signed/float");
1667 return IfThenElse(IsNegative(v), yes, no);
1668}
1669
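// Usage sketch for IfNegativeThenElse, assuming hwy/highway.h: a per-lane
// absolute value of a signed/float vector `v` provided by the caller (Highway
// also offers Abs directly; this merely illustrates the select).
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const auto abs_v = hn::IfNegativeThenElse(v, hn::Neg(v), v);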
1670// ------------------------------ FindFirstTrue
1671
1672#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1673 template <class D> \
1674 HWY_API intptr_t FindFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
1675 static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1676 return __riscv_vfirst_m_b##MLEN(m, Lanes(d)); \
1677 } \
1678 template <class D> \
1679 HWY_API size_t FindKnownFirstTrue(D d, HWY_RVV_M(MLEN) m) { \
1680 static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1681 return static_cast<size_t>(__riscv_vfirst_m_b##MLEN(m, Lanes(d))); \
1682 }
1683
1685#undef HWY_RVV_FIND_FIRST_TRUE
1686
1687// ------------------------------ AllFalse
1688template <class D>
1689HWY_API bool AllFalse(D d, MFromD<D> m) {
1690 return FindFirstTrue(d, m) < 0;
1691}
1692
1693// ------------------------------ AllTrue
1694
1695#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1696 template <class D> \
1697 HWY_API bool AllTrue(D d, HWY_RVV_M(MLEN) m) { \
1698 static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1699 return AllFalse(d, __riscv_vmnot_m_b##MLEN(m, Lanes(d))); \
1700 }
1701
1703#undef HWY_RVV_ALL_TRUE
1704
1705// ------------------------------ CountTrue
1706
1707#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP) \
1708 template <class D> \
1709 HWY_API size_t CountTrue(D d, HWY_RVV_M(MLEN) m) { \
1710 static_assert(MLenFromD(d) == MLEN, "Type mismatch"); \
1711 return __riscv_vcpop_m_b##MLEN(m, Lanes(d)); \
1712 }
1713
1715#undef HWY_RVV_COUNT_TRUE
1716
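// Usage sketch for the mask reductions above, assuming hwy/highway.h: locate
// the first negative lane in a float vector `v` provided by the caller.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<float> d;
//   const auto m = hn::IsNegative(v);
//   if (!hn::AllFalse(d, m)) {
//     const size_t first = hn::FindKnownFirstTrue(d, m);  // index of the lane
//   }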
1717// ------------------------------ PromoteMaskTo
1718
1719#ifdef HWY_NATIVE_PROMOTE_MASK_TO
1720#undef HWY_NATIVE_PROMOTE_MASK_TO
1721#else
1722#define HWY_NATIVE_PROMOTE_MASK_TO
1723#endif
1724
1725template <class DTo, class DFrom,
1726 HWY_IF_T_SIZE_GT_D(DTo, sizeof(TFromD<DFrom>)),
1727 hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
1728HWY_API MFromD<DTo> PromoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
1729 MFromD<DFrom> m) {
1730 return m;
1731}
1732
1733// ------------------------------ DemoteMaskTo
1734
1735#ifdef HWY_NATIVE_DEMOTE_MASK_TO
1736#undef HWY_NATIVE_DEMOTE_MASK_TO
1737#else
1738#define HWY_NATIVE_DEMOTE_MASK_TO
1739#endif
1740
1741template <class DTo, class DFrom,
1742 HWY_IF_T_SIZE_LE_D(DTo, sizeof(TFromD<DFrom>) - 1),
1743 hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DFrom>>()>* = nullptr>
1744HWY_API MFromD<DTo> DemoteMaskTo(DTo /*d_to*/, DFrom /*d_from*/,
1745 MFromD<DFrom> m) {
1746 return m;
1747}
1748
1749// ================================================== MEMORY
1750
1751// ------------------------------ Load
1752
1753#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1754 MLEN, NAME, OP) \
1755 template <size_t N> \
1756 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1757 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1758 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1759 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1760 detail::NativeLanePointer(p), Lanes(d)); \
1761 }
1762HWY_RVV_FOREACH(HWY_RVV_LOAD, Load, le, _ALL_VIRT)
1763#undef HWY_RVV_LOAD
1764
1765template <class D, HWY_RVV_IF_EMULATED_D(D)>
1766HWY_API VFromD<D> Load(D d, const TFromD<D>* HWY_RESTRICT p) {
1767 const RebindToUnsigned<decltype(d)> du;
1768 return BitCast(d, Load(du, detail::U16LanePointer(p)));
1769}
1770
1771// ------------------------------ LoadU
1772template <class D>
1773HWY_API VFromD<D> LoadU(D d, const TFromD<D>* HWY_RESTRICT p) {
1774 // RVV only requires element alignment, not vector alignment.
1775 return Load(d, p);
1776}
1777
1778// ------------------------------ MaskedLoad
1779
1780#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1781 SHIFT, MLEN, NAME, OP) \
1782 template <size_t N> \
1783 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1784 NAME(HWY_RVV_M(MLEN) m, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1785 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1786 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
1787 m, Zero(d), detail::NativeLanePointer(p), Lanes(d)); \
1788 } \
1789 template <size_t N> \
1790 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1791 NAME##Or(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
1792 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1793 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1794 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_mu( \
1795 m, v, detail::NativeLanePointer(p), Lanes(d)); \
1796 }
1797
1799#undef HWY_RVV_MASKED_LOAD
1800
1801template <class D, HWY_RVV_IF_EMULATED_D(D)>
1802HWY_API VFromD<D> MaskedLoad(MFromD<D> m, D d,
1803 const TFromD<D>* HWY_RESTRICT p) {
1804 const RebindToUnsigned<decltype(d)> du;
1805 return BitCast(d,
1806                    MaskedLoad(RebindMask(du, m), du,
1807                               detail::U16LanePointer(p)));
1807}
1808
1809template <class D, HWY_RVV_IF_EMULATED_D(D)>
1810HWY_API VFromD<D> MaskedLoadOr(VFromD<D> no, MFromD<D> m, D d,
1811 const TFromD<D>* HWY_RESTRICT p) {
1812 const RebindToUnsigned<decltype(d)> du;
1813 return BitCast(d, MaskedLoadOr(BitCast(du, no), RebindMask(du, m), du,
1814                                 detail::U16LanePointer(p)));
1815}
1816
1817// ------------------------------ LoadN
1818
1819// Native loads with AVL are faster than the generic_ops-inl.h implementation, which uses FirstN.
1820#ifdef HWY_NATIVE_LOAD_N
1821#undef HWY_NATIVE_LOAD_N
1822#else
1823#define HWY_NATIVE_LOAD_N
1824#endif
1825
1826#define HWY_RVV_LOADN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1827 MLEN, NAME, OP) \
1828 template <size_t N> \
1829 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
1830 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1831 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
1832 /* Use a tail-undisturbed load in LoadN as the tail-undisturbed load */ \
1833 /* operation below will leave any lanes past the first */ \
1834 /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes unchanged */ \
1835 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
1836 Zero(d), detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
1837 } \
1838 template <size_t N> \
1839 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME##Or( \
1840 HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1841 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t num_lanes) { \
1842 /* Use a tail-undisturbed load in LoadNOr as the tail-undisturbed load */ \
1843 /* operation below will set any lanes past the first */ \
1844 /* (lowest-indexed) HWY_MIN(num_lanes, Lanes(d)) lanes to the */ \
1845 /* corresponding lanes in no */ \
1846 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_tu( \
1847 no, detail::NativeLanePointer(p), CappedLanes(d, num_lanes)); \
1848 }
1849
1850HWY_RVV_FOREACH(HWY_RVV_LOADN, LoadN, le, _ALL_VIRT)
1851#undef HWY_RVV_LOADN
1852
1853template <class D, HWY_RVV_IF_EMULATED_D(D)>
1854HWY_API VFromD<D> LoadN(D d, const TFromD<D>* HWY_RESTRICT p,
1855 size_t num_lanes) {
1856 const RebindToUnsigned<D> du;
1857 return BitCast(d, LoadN(du, detail::U16LanePointer(p), num_lanes));
1858}
1859template <class D, HWY_RVV_IF_EMULATED_D(D)>
1860HWY_API VFromD<D> LoadNOr(VFromD<D> v, D d, const TFromD<D>* HWY_RESTRICT p,
1861 size_t num_lanes) {
1862 const RebindToUnsigned<D> du;
1863 return BitCast(
1864 d, LoadNOr(BitCast(du, v), du, detail::U16LanePointer(p), num_lanes));
1865}
1866
1867// ------------------------------ Store
1868
1869#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1870 MLEN, NAME, OP) \
1871 template <size_t N> \
1872 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
1873 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1874 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1875 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1876 detail::NativeLanePointer(p), v, Lanes(d)); \
1877 }
1878HWY_RVV_FOREACH(HWY_RVV_STORE, Store, se, _ALL_VIRT)
1879#undef HWY_RVV_STORE
1880
1881template <class D, HWY_RVV_IF_EMULATED_D(D)>
1882HWY_API void Store(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1883 const RebindToUnsigned<decltype(d)> du;
1884 Store(BitCast(du, v), du, detail::U16LanePointer(p));
1885}
1886
1887// ------------------------------ BlendedStore
1888
1889#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1890 SHIFT, MLEN, NAME, OP) \
1891 template <size_t N> \
1892 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
1893 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1894 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1895 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL##_m( \
1896 m, detail::NativeLanePointer(p), v, Lanes(d)); \
1897 }
1899#undef HWY_RVV_BLENDED_STORE
1900
1901template <class D, HWY_RVV_IF_EMULATED_D(D)>
1902HWY_API void BlendedStore(VFromD<D> v, MFromD<D> m, D d,
1903 TFromD<D>* HWY_RESTRICT p) {
1904 const RebindToUnsigned<decltype(d)> du;
1905 BlendedStore(BitCast(du, v), RebindMask(du, m), du,
1906                detail::U16LanePointer(p));
1907}
1908
1909// ------------------------------ StoreN
1910
1911namespace detail {
1912
1913#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
1914 MLEN, NAME, OP) \
1915 template <size_t N> \
1916 HWY_API void NAME(size_t count, HWY_RVV_V(BASE, SEW, LMUL) v, \
1917 HWY_RVV_D(BASE, SEW, N, SHIFT) /* d */, \
1918 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p) { \
1919 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
1920 detail::NativeLanePointer(p), v, count); \
1921 }
1922HWY_RVV_FOREACH(HWY_RVV_STOREN, StoreN, se, _ALL_VIRT)
1923#undef HWY_RVV_STOREN
1924
1925template <class D, HWY_RVV_IF_EMULATED_D(D)>
1926HWY_API void StoreN(size_t count, VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p) {
1927 const RebindToUnsigned<decltype(d)> du;
1928 StoreN(count, BitCast(du, v), du, detail::U16LanePointer(p));
1929}
1930
1931} // namespace detail
1932
1933#ifdef HWY_NATIVE_STORE_N
1934#undef HWY_NATIVE_STORE_N
1935#else
1936#define HWY_NATIVE_STORE_N
1937#endif
1938
1939template <class D>
1940HWY_API void StoreN(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT p,
1941 size_t max_lanes_to_store) {
1942 // NOTE: Need to clamp max_lanes_to_store to Lanes(d), even if
1943 // MaxLanes(d) >= MaxLanes(DFromV<VFromD<D>>()) is true, as it is possible for
1944 // detail::StoreN(max_lanes_to_store, v, d, p) to store fewer than
1945 // Lanes(DFromV<VFromD<D>>()) lanes to p if
1946 // max_lanes_to_store > Lanes(DFromV<VFromD<D>>()) and
1947 // max_lanes_to_store < 2 * Lanes(DFromV<VFromD<D>>()) are both true.
1948
1949 // Also need to make sure that no more than Lanes(d) lanes are stored to p
1950 // if Lanes(d) < Lanes(DFromV<VFromD<D>>()) is true, which is possible if
1951 // MaxLanes(d) < MaxLanes(DFromV<VFromD<D>>()) or
1952 // d.Pow2() < DFromV<VFromD<D>>().Pow2() is true.
1953 detail::StoreN(CappedLanes(d, max_lanes_to_store), v, d, p);
1954}
1955
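// Usage sketch for LoadN/StoreN, assuming hwy/highway.h: a copy-and-add loop
// whose remainder (fewer than Lanes(d) elements) is handled without a scalar
// fallback; the function and pointer names are illustrative.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   void AddOne(const float* HWY_RESTRICT in, float* HWY_RESTRICT out,
//               size_t count) {
//     const hn::ScalableTag<float> d;
//     const size_t N = hn::Lanes(d);
//     size_t i = 0;
//     for (; i + N <= count; i += N) {
//       hn::StoreU(hn::Add(hn::LoadU(d, in + i), hn::Set(d, 1.0f)), d, out + i);
//     }
//     const size_t remaining = count - i;  // 0 <= remaining < N
//     const auto v = hn::LoadN(d, in + i, remaining);
//     hn::StoreN(hn::Add(v, hn::Set(d, 1.0f)), d, out + i, remaining);
//   }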
1956// ------------------------------ StoreU
1957template <class V, class D>
1958HWY_API void StoreU(const V v, D d, TFromD<D>* HWY_RESTRICT p) {
1959 // RVV only requires element alignment, not vector alignment.
1960 Store(v, d, p);
1961}
1962
1963// ------------------------------ Stream
1964template <class V, class D, typename T>
1965HWY_API void Stream(const V v, D d, T* HWY_RESTRICT aligned) {
1966 Store(v, d, aligned);
1967}
1968
1969// ------------------------------ ScatterOffset
1970
1971#ifdef HWY_NATIVE_SCATTER
1972#undef HWY_NATIVE_SCATTER
1973#else
1974#define HWY_NATIVE_SCATTER
1975#endif
1976
1977#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
1978 SHIFT, MLEN, NAME, OP) \
1979 template <size_t N> \
1980 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
1981 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
1982 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
1983 HWY_RVV_V(int, SEW, LMUL) offset) { \
1984 const RebindToUnsigned<decltype(d)> du; \
1985 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
1986 detail::NativeLanePointer(base), BitCast(du, offset), v, Lanes(d)); \
1987 }
1989#undef HWY_RVV_SCATTER
1990
1991// ------------------------------ ScatterIndex
1992template <class D>
1993HWY_API void ScatterIndex(VFromD<D> v, D d, TFromD<D>* HWY_RESTRICT base,
1994 VFromD<RebindToSigned<D>> indices) {
1995 constexpr size_t kBits = CeilLog2(sizeof(TFromD<D>));
1996 return ScatterOffset(v, d, base, ShiftLeft<kBits>(indices));
1997}
1998
1999// ------------------------------ MaskedScatterIndex
2000
2001#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
2002 LMULH, SHIFT, MLEN, NAME, OP) \
2003 template <size_t N> \
2004 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) m, \
2005 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2006 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
2007 HWY_RVV_V(int, SEW, LMUL) indices) { \
2008 const RebindToUnsigned<decltype(d)> du; \
2009 constexpr size_t kBits = CeilLog2(sizeof(TFromD<decltype(d)>)); \
2010 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_m( \
2011 m, detail::NativeLanePointer(base), \
2012 ShiftLeft<kBits>(BitCast(du, indices)), v, Lanes(d)); \
2013 }
2015#undef HWY_RVV_MASKED_SCATTER
2016
2017// ------------------------------ GatherOffset
2018
2019#ifdef HWY_NATIVE_GATHER
2020#undef HWY_NATIVE_GATHER
2021#else
2022#define HWY_NATIVE_GATHER
2023#endif
2024
2025#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2026 MLEN, NAME, OP) \
2027 template <size_t N> \
2028 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2029 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2030 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
2031 HWY_RVV_V(int, SEW, LMUL) offset) { \
2032 const RebindToUnsigned<decltype(d)> du; \
2033 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL( \
2034 detail::NativeLanePointer(base), BitCast(du, offset), Lanes(d)); \
2035 }
2037#undef HWY_RVV_GATHER
2038
2039// ------------------------------ GatherIndex
2040
2041template <class D>
2042HWY_API VFromD<D> GatherIndex(D d, const TFromD<D>* HWY_RESTRICT base,
2043 const VFromD<RebindToSigned<D>> index) {
2044 constexpr size_t kBits = CeilLog2(sizeof(TFromD<D>));
2045 return GatherOffset(d, base, ShiftLeft<kBits>(index));
2046}
2047
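// Usage sketch for GatherIndex, assuming hwy/highway.h: `table` is a
// hypothetical float array and `idx` holds signed element indices (not byte
// offsets); lane i of the result is table[idx[i]].
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<float> d;
//   const hn::RebindToSigned<decltype(d)> di;
//   const auto idx = hn::Iota(di, 0);  // gathers table[0], table[1], ...
//   const auto gathered = hn::GatherIndex(d, table, idx);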
2048// ------------------------------ MaskedGatherIndexOr
2049
2050#define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2051 SHIFT, MLEN, NAME, OP) \
2052 template <size_t N> \
2053 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2054 NAME(HWY_RVV_V(BASE, SEW, LMUL) no, HWY_RVV_M(MLEN) m, \
2055 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
2056 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT base, \
2057 HWY_RVV_V(int, SEW, LMUL) indices) { \
2058 const RebindToUnsigned<decltype(d)> du; \
2059 const RebindToSigned<decltype(d)> di; \
2060 (void)di; /* for HWY_DASSERT */ \
2061 constexpr size_t kBits = CeilLog2(SEW / 8); \
2062 HWY_DASSERT(AllFalse(di, Lt(indices, Zero(di)))); \
2063 return __riscv_v##OP##ei##SEW##_v_##CHAR##SEW##LMUL##_mu( \
2064 m, no, detail::NativeLanePointer(base), \
2065 ShiftLeft<kBits>(BitCast(du, indices)), Lanes(d)); \
2066 }
2068#undef HWY_RVV_MASKED_GATHER
2069
2070template <class D>
2075
2076// ================================================== CONVERT
2077
2078// ------------------------------ PromoteTo
2079
2080// SEW is for the input.
2081#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2082 SHIFT, MLEN, NAME, OP) \
2083 template <size_t N> \
2084 HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
2085 HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2086 return __riscv_v##OP##CHAR##SEWD##LMULD(v, Lanes(d)); \
2087 }
2088
2089HWY_RVV_FOREACH_U08(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
2090HWY_RVV_FOREACH_U16(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
2091HWY_RVV_FOREACH_U32(HWY_RVV_PROMOTE, PromoteTo, zext_vf2_, _EXT_VIRT)
2092HWY_RVV_FOREACH_I08(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
2093HWY_RVV_FOREACH_I16(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
2094HWY_RVV_FOREACH_I32(HWY_RVV_PROMOTE, PromoteTo, sext_vf2_, _EXT_VIRT)
2095HWY_RVV_FOREACH_F32(HWY_RVV_PROMOTE, PromoteTo, fwcvt_f_f_v_, _EXT_VIRT)
2096
2097#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
2098
2100 _EXT_VIRT)
2101
2102// Per-target flag to prevent generic_ops-inl.h from defining f16 conversions.
2103#ifdef HWY_NATIVE_F16C
2104#undef HWY_NATIVE_F16C
2105#else
2106#define HWY_NATIVE_F16C
2107#endif
2108#endif // HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
2109
2110#undef HWY_RVV_PROMOTE
2111
2112// The above X-macro cannot handle 4x promotion nor type switching.
2113// TODO(janwas): use BASE2 arg to allow the latter.
2114#define HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, LMUL, LMUL_IN, \
2115 SHIFT, ADD) \
2116 template <size_t N> \
2117 HWY_API HWY_RVV_V(BASE, BITS, LMUL) \
2118 PromoteTo(HWY_RVV_D(BASE, BITS, N, SHIFT + ADD) d, \
2119 HWY_RVV_V(BASE_IN, BITS_IN, LMUL_IN) v) { \
2120 return __riscv_v##OP##CHAR##BITS##LMUL(v, Lanes(d)); \
2121 }
2122
2123#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
2124 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -2, 1) \
2125 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf2, -1, 1) \
2126 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, m1, 0, 1) \
2127 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m2, 1, 1) \
2128 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m4, 2, 1)
2129
2130#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
2131 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf4, -2, 2) \
2132 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf2, -1, 2) \
2133 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, m1, 0, 2) \
2134 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m2, 1, 2)
2135
2136#define HWY_RVV_PROMOTE_X4_FROM_U8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
2137 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, mf2, mf8, -3, 2) \
2138 HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
2139
2140#define HWY_RVV_PROMOTE_X8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN) \
2141 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m1, mf8, -3, 3) \
2142 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m2, mf4, -2, 3) \
2143 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m4, mf2, -1, 3) \
2144 HWY_RVV_PROMOTE(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN, m8, m1, 0, 3)
2145
2146HWY_RVV_PROMOTE_X8(zext_vf8_, uint, u, 64, uint, 8)
2147HWY_RVV_PROMOTE_X8(sext_vf8_, int, i, 64, int, 8)
2148
2149HWY_RVV_PROMOTE_X4_FROM_U8(zext_vf4_, uint, u, 32, uint, 8)
2150HWY_RVV_PROMOTE_X4_FROM_U8(sext_vf4_, int, i, 32, int, 8)
2151HWY_RVV_PROMOTE_X4(zext_vf4_, uint, u, 64, uint, 16)
2152HWY_RVV_PROMOTE_X4(sext_vf4_, int, i, 64, int, 16)
2153
2154// i32 to f64
2155HWY_RVV_PROMOTE_X2(fwcvt_f_x_v_, float, f, 64, int, 32)
2156
2157// u32 to f64
2158HWY_RVV_PROMOTE_X2(fwcvt_f_xu_v_, float, f, 64, uint, 32)
2159
2160// f32 to i64
2161HWY_RVV_PROMOTE_X2(fwcvt_rtz_x_f_v_, int, i, 64, float, 32)
2162
2163// f32 to u64
2164HWY_RVV_PROMOTE_X2(fwcvt_rtz_xu_f_v_, uint, u, 64, float, 32)
2165
2166#undef HWY_RVV_PROMOTE_X8
2167#undef HWY_RVV_PROMOTE_X4_FROM_U8
2168#undef HWY_RVV_PROMOTE_X4
2169#undef HWY_RVV_PROMOTE_X2
2170#undef HWY_RVV_PROMOTE
2171
2172// I16->I64 or U16->U64 PromoteTo with virtual LMUL
2173template <size_t N>
2175 VFromD<Rebind<int16_t, decltype(d)>> v)
2176 -> VFromD<decltype(d)> {
2177 return PromoteTo(ScalableTag<int64_t>(), v);
2178}
2179
2180template <size_t N>
2182 VFromD<Rebind<uint16_t, decltype(d)>> v)
2183 -> VFromD<decltype(d)> {
2184 return PromoteTo(ScalableTag<uint64_t>(), v);
2185}
2186
2187// Unsigned to signed: cast for unsigned promote.
2188template <size_t N, int kPow2>
2190 VFromD<Rebind<uint8_t, decltype(d)>> v)
2191 -> VFromD<decltype(d)> {
2192 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
2193}
2194
2195template <size_t N, int kPow2>
2197 VFromD<Rebind<uint8_t, decltype(d)>> v)
2198 -> VFromD<decltype(d)> {
2199 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
2200}
2201
2202template <size_t N, int kPow2>
2204 VFromD<Rebind<uint16_t, decltype(d)>> v)
2205 -> VFromD<decltype(d)> {
2206 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
2207}
2208
2209template <size_t N, int kPow2>
2211 VFromD<Rebind<uint32_t, decltype(d)>> v)
2212 -> VFromD<decltype(d)> {
2213 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
2214}
2215
2216template <size_t N, int kPow2>
2218 VFromD<Rebind<uint16_t, decltype(d)>> v)
2219 -> VFromD<decltype(d)> {
2220 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
2221}
2222
2223template <size_t N, int kPow2>
2225 VFromD<Rebind<uint8_t, decltype(d)>> v)
2226 -> VFromD<decltype(d)> {
2227 return BitCast(d, PromoteTo(RebindToUnsigned<decltype(d)>(), v));
2228}
2229
2230template <size_t N, int kPow2>
2232 VFromD<Rebind<hwy::bfloat16_t, decltype(d)>> v)
2233 -> VFromD<decltype(d)> {
2234 const RebindToSigned<decltype(d)> di32;
2235 const Rebind<uint16_t, decltype(d)> du16;
2236 return BitCast(d, ShiftLeft<16>(PromoteTo(di32, BitCast(du16, v))));
2237}
2238
2239// ------------------------------ DemoteTo U
2240
2241// SEW is for the source so we can use _DEMOTE_VIRT.
2242#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
2243 MLEN, NAME, OP) \
2244 template <size_t N> \
2245 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
2246 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2247 return __riscv_v##OP##CHAR##SEWH##LMULH( \
2248 v, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); \
2249 }
2250
2251// Unsigned -> unsigned
2252HWY_RVV_FOREACH_U16(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
2253HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
2254HWY_RVV_FOREACH_U64(HWY_RVV_DEMOTE, DemoteTo, nclipu_wx_, _DEMOTE_VIRT)
2255
2256// SEW is for the source so we can use _DEMOTE_VIRT.
2257#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2258 SHIFT, MLEN, NAME, OP) \
2259 template <size_t N> \
2260 HWY_API HWY_RVV_V(uint, SEWH, LMULH) NAME( \
2261 HWY_RVV_D(uint, SEWH, N, SHIFT - 1) dn, HWY_RVV_V(int, SEW, LMUL) v) { \
2262 const HWY_RVV_D(uint, SEW, N, SHIFT) du; \
2263 /* First clamp negative numbers to zero to match x86 packus. */ \
2264 return DemoteTo(dn, BitCast(du, detail::MaxS(v, 0))); \
2265 }
2269#undef HWY_RVV_DEMOTE_I_TO_U
2270
2271template <size_t N>
2272HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vint32mf2_t v) {
2273 return __riscv_vnclipu_wx_u8mf8(
2275 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2276}
2277template <size_t N>
2278HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vint32m1_t v) {
2279 return __riscv_vnclipu_wx_u8mf4(
2281 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2282}
2283template <size_t N>
2284HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vint32m2_t v) {
2285 return __riscv_vnclipu_wx_u8mf2(
2287 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2288}
2289template <size_t N>
2290HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vint32m4_t v) {
2291 return __riscv_vnclipu_wx_u8m1(
2293 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2294}
2295template <size_t N>
2296HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vint32m8_t v) {
2297 return __riscv_vnclipu_wx_u8m2(
2299 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2300}
2301
2302template <size_t N>
2303HWY_API vuint8mf8_t DemoteTo(Simd<uint8_t, N, -3> d, const vuint32mf2_t v) {
2304 return __riscv_vnclipu_wx_u8mf8(
2306 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2307}
2308template <size_t N>
2309HWY_API vuint8mf4_t DemoteTo(Simd<uint8_t, N, -2> d, const vuint32m1_t v) {
2310 return __riscv_vnclipu_wx_u8mf4(
2312 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2313}
2314template <size_t N>
2315HWY_API vuint8mf2_t DemoteTo(Simd<uint8_t, N, -1> d, const vuint32m2_t v) {
2316 return __riscv_vnclipu_wx_u8mf2(
2318 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2319}
2320template <size_t N>
2321HWY_API vuint8m1_t DemoteTo(Simd<uint8_t, N, 0> d, const vuint32m4_t v) {
2322 return __riscv_vnclipu_wx_u8m1(
2324 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2325}
2326template <size_t N>
2327HWY_API vuint8m2_t DemoteTo(Simd<uint8_t, N, 1> d, const vuint32m8_t v) {
2328 return __riscv_vnclipu_wx_u8m2(
2330 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d)));
2331}
2332
2333template <size_t N, int kPow2>
2338
2339template <size_t N, int kPow2>
2344
2345template <size_t N, int kPow2>
2350
2351template <size_t N, int kPow2>
2356
2357HWY_API vuint8mf8_t U8FromU32(const vuint32mf2_t v) {
2358 const size_t avl = Lanes(ScalableTag<uint8_t, -3>());
2359 return __riscv_vnclipu_wx_u8mf8(
2360 __riscv_vnclipu_wx_u16mf4(v, 0,
2361 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
2362 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2363}
2364HWY_API vuint8mf4_t U8FromU32(const vuint32m1_t v) {
2365 const size_t avl = Lanes(ScalableTag<uint8_t, -2>());
2366 return __riscv_vnclipu_wx_u8mf4(
2367 __riscv_vnclipu_wx_u16mf2(v, 0,
2368 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
2369 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2370}
2371HWY_API vuint8mf2_t U8FromU32(const vuint32m2_t v) {
2372 const size_t avl = Lanes(ScalableTag<uint8_t, -1>());
2373 return __riscv_vnclipu_wx_u8mf2(
2374 __riscv_vnclipu_wx_u16m1(v, 0,
2375 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
2376 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2377}
2378HWY_API vuint8m1_t U8FromU32(const vuint32m4_t v) {
2379 const size_t avl = Lanes(ScalableTag<uint8_t, 0>());
2380 return __riscv_vnclipu_wx_u8m1(
2381 __riscv_vnclipu_wx_u16m2(v, 0,
2382 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
2383 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2384}
2385HWY_API vuint8m2_t U8FromU32(const vuint32m8_t v) {
2386 const size_t avl = Lanes(ScalableTag<uint8_t, 1>());
2387 return __riscv_vnclipu_wx_u8m2(
2388 __riscv_vnclipu_wx_u16m4(v, 0,
2389 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl)),
2390 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2391}
2392
2393// ------------------------------ Truncations
2394
2395template <size_t N>
2397 const VFromD<Simd<uint64_t, N, 0>> v) {
2398 const size_t avl = Lanes(d);
2399 const vuint64m1_t v1 = __riscv_vand(v, 0xFF, avl);
2400 const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
2401 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2402 const vuint16mf4_t v3 = __riscv_vnclipu_wx_u16mf4(
2403 v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2404 return __riscv_vnclipu_wx_u8mf8(v3, 0,
2405 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2406}
2407
2408template <size_t N>
2410 const VFromD<Simd<uint64_t, N, 1>> v) {
2411 const size_t avl = Lanes(d);
2412 const vuint64m2_t v1 = __riscv_vand(v, 0xFF, avl);
2413 const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1(
2414 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2415 const vuint16mf2_t v3 = __riscv_vnclipu_wx_u16mf2(
2416 v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2417 return __riscv_vnclipu_wx_u8mf4(v3, 0,
2418 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2419}
2420
2421template <size_t N>
2423 const VFromD<Simd<uint64_t, N, 2>> v) {
2424 const size_t avl = Lanes(d);
2425 const vuint64m4_t v1 = __riscv_vand(v, 0xFF, avl);
2426 const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2(
2427 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2428 const vuint16m1_t v3 = __riscv_vnclipu_wx_u16m1(
2429 v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2430 return __riscv_vnclipu_wx_u8mf2(v3, 0,
2431 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2432}
2433
2434template <size_t N>
2436 const VFromD<Simd<uint64_t, N, 3>> v) {
2437 const size_t avl = Lanes(d);
2438 const vuint64m8_t v1 = __riscv_vand(v, 0xFF, avl);
2439 const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4(
2440 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2441 const vuint16m2_t v3 = __riscv_vnclipu_wx_u16m2(
2442 v2, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2443 return __riscv_vnclipu_wx_u8m1(v3, 0,
2444 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2445}
2446
2447template <size_t N>
2449 const VFromD<Simd<uint64_t, N, -1>> v) {
2450 const size_t avl = Lanes(d);
2451 const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
2452 const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
2453 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2454 return __riscv_vnclipu_wx_u16mf4(v2, 0,
2455 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2456}
2457
2458template <size_t N>
2460 const VFromD<Simd<uint64_t, N, 0>> v) {
2461 const size_t avl = Lanes(d);
2462 const vuint64m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
2463 const vuint32mf2_t v2 = __riscv_vnclipu_wx_u32mf2(
2464 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2465 return __riscv_vnclipu_wx_u16mf4(v2, 0,
2466 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2467}
2468
2469template <size_t N>
2471 const VFromD<Simd<uint64_t, N, 1>> v) {
2472 const size_t avl = Lanes(d);
2473 const vuint64m2_t v1 = __riscv_vand(v, 0xFFFF, avl);
2474 const vuint32m1_t v2 = __riscv_vnclipu_wx_u32m1(
2475 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2476 return __riscv_vnclipu_wx_u16mf2(v2, 0,
2477 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2478}
2479
2480template <size_t N>
2482 const VFromD<Simd<uint64_t, N, 2>> v) {
2483 const size_t avl = Lanes(d);
2484 const vuint64m4_t v1 = __riscv_vand(v, 0xFFFF, avl);
2485 const vuint32m2_t v2 = __riscv_vnclipu_wx_u32m2(
2486 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2487 return __riscv_vnclipu_wx_u16m1(v2, 0,
2488 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2489}
2490
2491template <size_t N>
2493 const VFromD<Simd<uint64_t, N, 3>> v) {
2494 const size_t avl = Lanes(d);
2495 const vuint64m8_t v1 = __riscv_vand(v, 0xFFFF, avl);
2496 const vuint32m4_t v2 = __riscv_vnclipu_wx_u32m4(
2497 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2498 return __riscv_vnclipu_wx_u16m2(v2, 0,
2499 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2500}
2501
2502template <size_t N>
2504 const VFromD<Simd<uint64_t, N, -1>> v) {
2505 const size_t avl = Lanes(d);
2506 const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2507 return __riscv_vnclipu_wx_u32mf2(v1, 0,
2508 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2509}
2510
2511template <size_t N>
2513 const VFromD<Simd<uint64_t, N, 0>> v) {
2514 const size_t avl = Lanes(d);
2515 const vuint64m1_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2516 return __riscv_vnclipu_wx_u32mf2(v1, 0,
2517 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2518}
2519
2520template <size_t N>
2522 const VFromD<Simd<uint64_t, N, 1>> v) {
2523 const size_t avl = Lanes(d);
2524 const vuint64m2_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2525 return __riscv_vnclipu_wx_u32m1(v1, 0,
2526 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2527}
2528
2529template <size_t N>
2531 const VFromD<Simd<uint64_t, N, 2>> v) {
2532 const size_t avl = Lanes(d);
2533 const vuint64m4_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2534 return __riscv_vnclipu_wx_u32m2(v1, 0,
2535 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2536}
2537
2538template <size_t N>
2540 const VFromD<Simd<uint64_t, N, 3>> v) {
2541 const size_t avl = Lanes(d);
2542 const vuint64m8_t v1 = __riscv_vand(v, 0xFFFFFFFFu, avl);
2543 return __riscv_vnclipu_wx_u32m4(v1, 0,
2544 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2545}
2546
2547template <size_t N>
2549 const VFromD<Simd<uint32_t, N, -1>> v) {
2550 const size_t avl = Lanes(d);
2551 const vuint32mf2_t v1 = __riscv_vand(v, 0xFF, avl);
2552 const vuint16mf4_t v2 = __riscv_vnclipu_wx_u16mf4(
2553 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2554 return __riscv_vnclipu_wx_u8mf8(v2, 0,
2555 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2556}
2557
2558template <size_t N>
2560 const VFromD<Simd<uint32_t, N, 0>> v) {
2561 const size_t avl = Lanes(d);
2562 const vuint32m1_t v1 = __riscv_vand(v, 0xFF, avl);
2563 const vuint16mf2_t v2 = __riscv_vnclipu_wx_u16mf2(
2564 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2565 return __riscv_vnclipu_wx_u8mf4(v2, 0,
2566 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2567}
2568
2569template <size_t N>
2571 const VFromD<Simd<uint32_t, N, 1>> v) {
2572 const size_t avl = Lanes(d);
2573 const vuint32m2_t v1 = __riscv_vand(v, 0xFF, avl);
2574 const vuint16m1_t v2 = __riscv_vnclipu_wx_u16m1(
2575 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2576 return __riscv_vnclipu_wx_u8mf2(v2, 0,
2577 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2578}
2579
2580template <size_t N>
2582 const VFromD<Simd<uint32_t, N, 2>> v) {
2583 const size_t avl = Lanes(d);
2584 const vuint32m4_t v1 = __riscv_vand(v, 0xFF, avl);
2585 const vuint16m2_t v2 = __riscv_vnclipu_wx_u16m2(
2586 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2587 return __riscv_vnclipu_wx_u8m1(v2, 0,
2588 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2589}
2590
2591template <size_t N>
2593 const VFromD<Simd<uint32_t, N, 3>> v) {
2594 const size_t avl = Lanes(d);
2595 const vuint32m8_t v1 = __riscv_vand(v, 0xFF, avl);
2596 const vuint16m4_t v2 = __riscv_vnclipu_wx_u16m4(
2597 v1, 0, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2598 return __riscv_vnclipu_wx_u8m2(v2, 0,
2599 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2600}
2601
2602template <size_t N>
2604 const VFromD<Simd<uint32_t, N, -2>> v) {
2605 const size_t avl = Lanes(d);
2606 const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl);
2607 return __riscv_vnclipu_wx_u16mf4(v1, 0,
2608 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2609}
2610
2611template <size_t N>
2613 const VFromD<Simd<uint32_t, N, -1>> v) {
2614 const size_t avl = Lanes(d);
2615 const vuint32mf2_t v1 = __riscv_vand(v, 0xFFFF, avl);
2616 return __riscv_vnclipu_wx_u16mf4(v1, 0,
2617 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2618}
2619
2620template <size_t N>
2622 const VFromD<Simd<uint32_t, N, 0>> v) {
2623 const size_t avl = Lanes(d);
2624 const vuint32m1_t v1 = __riscv_vand(v, 0xFFFF, avl);
2625 return __riscv_vnclipu_wx_u16mf2(v1, 0,
2626 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2627}
2628
2629template <size_t N>
2631 const VFromD<Simd<uint32_t, N, 1>> v) {
2632 const size_t avl = Lanes(d);
2633 const vuint32m2_t v1 = __riscv_vand(v, 0xFFFF, avl);
2634 return __riscv_vnclipu_wx_u16m1(v1, 0,
2635 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2636}
2637
2638template <size_t N>
2640 const VFromD<Simd<uint32_t, N, 2>> v) {
2641 const size_t avl = Lanes(d);
2642 const vuint32m4_t v1 = __riscv_vand(v, 0xFFFF, avl);
2643 return __riscv_vnclipu_wx_u16m2(v1, 0,
2644 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2645}
2646
2647template <size_t N>
2649 const VFromD<Simd<uint32_t, N, 3>> v) {
2650 const size_t avl = Lanes(d);
2651 const vuint32m8_t v1 = __riscv_vand(v, 0xFFFF, avl);
2652 return __riscv_vnclipu_wx_u16m4(v1, 0,
2653 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2654}
2655
2656template <size_t N>
2658 const VFromD<Simd<uint16_t, N, -2>> v) {
2659 const size_t avl = Lanes(d);
2660 const vuint16mf4_t v1 = __riscv_vand(v, 0xFF, avl);
2661 return __riscv_vnclipu_wx_u8mf8(v1, 0,
2662 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2663}
2664
2665template <size_t N>
2667 const VFromD<Simd<uint16_t, N, -1>> v) {
2668 const size_t avl = Lanes(d);
2669 const vuint16mf2_t v1 = __riscv_vand(v, 0xFF, avl);
2670 return __riscv_vnclipu_wx_u8mf4(v1, 0,
2671 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2672}
2673
2674template <size_t N>
2676 const VFromD<Simd<uint16_t, N, 0>> v) {
2677 const size_t avl = Lanes(d);
2678 const vuint16m1_t v1 = __riscv_vand(v, 0xFF, avl);
2679 return __riscv_vnclipu_wx_u8mf2(v1, 0,
2680 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2681}
2682
2683template <size_t N>
2685 const VFromD<Simd<uint16_t, N, 1>> v) {
2686 const size_t avl = Lanes(d);
2687 const vuint16m2_t v1 = __riscv_vand(v, 0xFF, avl);
2688 return __riscv_vnclipu_wx_u8m1(v1, 0,
2689 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2690}
2691
2692template <size_t N>
2694 const VFromD<Simd<uint16_t, N, 2>> v) {
2695 const size_t avl = Lanes(d);
2696 const vuint16m4_t v1 = __riscv_vand(v, 0xFF, avl);
2697 return __riscv_vnclipu_wx_u8m2(v1, 0,
2698 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2699}
2700
2701template <size_t N>
2703 const VFromD<Simd<uint16_t, N, 3>> v) {
2704 const size_t avl = Lanes(d);
2705 const vuint16m8_t v1 = __riscv_vand(v, 0xFF, avl);
2706 return __riscv_vnclipu_wx_u8m4(v1, 0,
2707 HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, avl));
2708}
2709
2710// ------------------------------ DemoteTo I
2711
2712HWY_RVV_FOREACH_I16(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT)
2713HWY_RVV_FOREACH_I32(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT)
2714HWY_RVV_FOREACH_I64(HWY_RVV_DEMOTE, DemoteTo, nclip_wx_, _DEMOTE_VIRT)
2715
2716template <size_t N>
2717HWY_API vint8mf8_t DemoteTo(Simd<int8_t, N, -3> d, const vint32mf2_t v) {
2718 return DemoteTo(d, DemoteTo(Simd<int16_t, N, -2>(), v));
2719}
2720template <size_t N>
2721HWY_API vint8mf4_t DemoteTo(Simd<int8_t, N, -2> d, const vint32m1_t v) {
2722 return DemoteTo(d, DemoteTo(Simd<int16_t, N, -1>(), v));
2723}
2724template <size_t N>
2725HWY_API vint8mf2_t DemoteTo(Simd<int8_t, N, -1> d, const vint32m2_t v) {
2726 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 0>(), v));
2727}
2728template <size_t N>
2729HWY_API vint8m1_t DemoteTo(Simd<int8_t, N, 0> d, const vint32m4_t v) {
2730 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 1>(), v));
2731}
2732template <size_t N>
2733HWY_API vint8m2_t DemoteTo(Simd<int8_t, N, 1> d, const vint32m8_t v) {
2734 return DemoteTo(d, DemoteTo(Simd<int16_t, N, 2>(), v));
2735}
2736
2737template <size_t N, int kPow2>
2742
2743template <size_t N, int kPow2>
2748
2749#undef HWY_RVV_DEMOTE
2750
2751// ------------------------------ DemoteTo F
2752
2753// SEW is for the source so we can use _DEMOTE_VIRT.
2754#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2755 SHIFT, MLEN, NAME, OP) \
2756 template <size_t N> \
2757 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
2758 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2759 return __riscv_v##OP##SEWH##LMULH(v, Lanes(d)); \
2760 }
2761
2762#if HWY_HAVE_FLOAT16 || HWY_RVV_HAVE_F16C
2763HWY_RVV_FOREACH_F32(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_rod_f_f_w_f, _DEMOTE_VIRT)
2764#endif
2765HWY_RVV_FOREACH_F64(HWY_RVV_DEMOTE_F, DemoteTo, fncvt_rod_f_f_w_f, _DEMOTE_VIRT)
2766#undef HWY_RVV_DEMOTE_F
2767
2768// TODO(janwas): add BASE2 arg to allow generating this via DEMOTE_F.
2769template <size_t N>
2770HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -2> d, const vfloat64m1_t v) {
2771 return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
2772}
2773template <size_t N>
2774HWY_API vint32mf2_t DemoteTo(Simd<int32_t, N, -1> d, const vfloat64m1_t v) {
2775 return __riscv_vfncvt_rtz_x_f_w_i32mf2(v, Lanes(d));
2776}
2777template <size_t N>
2778HWY_API vint32m1_t DemoteTo(Simd<int32_t, N, 0> d, const vfloat64m2_t v) {
2779 return __riscv_vfncvt_rtz_x_f_w_i32m1(v, Lanes(d));
2780}
2781template <size_t N>
2782HWY_API vint32m2_t DemoteTo(Simd<int32_t, N, 1> d, const vfloat64m4_t v) {
2783 return __riscv_vfncvt_rtz_x_f_w_i32m2(v, Lanes(d));
2784}
2785template <size_t N>
2786HWY_API vint32m4_t DemoteTo(Simd<int32_t, N, 2> d, const vfloat64m8_t v) {
2787 return __riscv_vfncvt_rtz_x_f_w_i32m4(v, Lanes(d));
2788}
2789
2790template <size_t N>
2791HWY_API vuint32mf2_t DemoteTo(Simd<uint32_t, N, -2> d, const vfloat64m1_t v) {
2792 return __riscv_vfncvt_rtz_xu_f_w_u32mf2(v, Lanes(d));
2793}
2794template <size_t N>
2795HWY_API vuint32mf2_t DemoteTo(Simd<uint32_t, N, -1> d, const vfloat64m1_t v) {
2796 return __riscv_vfncvt_rtz_xu_f_w_u32mf2(v, Lanes(d));
2797}
2798template <size_t N>
2799HWY_API vuint32m1_t DemoteTo(Simd<uint32_t, N, 0> d, const vfloat64m2_t v) {
2800 return __riscv_vfncvt_rtz_xu_f_w_u32m1(v, Lanes(d));
2801}
2802template <size_t N>
2803HWY_API vuint32m2_t DemoteTo(Simd<uint32_t, N, 1> d, const vfloat64m4_t v) {
2804 return __riscv_vfncvt_rtz_xu_f_w_u32m2(v, Lanes(d));
2805}
2806template <size_t N>
2807HWY_API vuint32m4_t DemoteTo(Simd<uint32_t, N, 2> d, const vfloat64m8_t v) {
2808 return __riscv_vfncvt_rtz_xu_f_w_u32m4(v, Lanes(d));
2809}
2810
2811template <size_t N>
2812HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -2> d, const vint64m1_t v) {
2813 return __riscv_vfncvt_f_x_w_f32mf2(v, Lanes(d));
2814}
2815template <size_t N>
2816HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -1> d, const vint64m1_t v) {
2817 return __riscv_vfncvt_f_x_w_f32mf2(v, Lanes(d));
2818}
2819template <size_t N>
2820HWY_API vfloat32m1_t DemoteTo(Simd<float, N, 0> d, const vint64m2_t v) {
2821 return __riscv_vfncvt_f_x_w_f32m1(v, Lanes(d));
2822}
2823template <size_t N>
2824HWY_API vfloat32m2_t DemoteTo(Simd<float, N, 1> d, const vint64m4_t v) {
2825 return __riscv_vfncvt_f_x_w_f32m2(v, Lanes(d));
2826}
2827template <size_t N>
2828HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vint64m8_t v) {
2829 return __riscv_vfncvt_f_x_w_f32m4(v, Lanes(d));
2830}
2831
2832template <size_t N>
2833HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -2> d, const vuint64m1_t v) {
2834 return __riscv_vfncvt_f_xu_w_f32mf2(v, Lanes(d));
2835}
2836template <size_t N>
2837HWY_API vfloat32mf2_t DemoteTo(Simd<float, N, -1> d, const vuint64m1_t v) {
2838 return __riscv_vfncvt_f_xu_w_f32mf2(v, Lanes(d));
2839}
2840template <size_t N>
2841HWY_API vfloat32m1_t DemoteTo(Simd<float, N, 0> d, const vuint64m2_t v) {
2842 return __riscv_vfncvt_f_xu_w_f32m1(v, Lanes(d));
2843}
2844template <size_t N>
2845HWY_API vfloat32m2_t DemoteTo(Simd<float, N, 1> d, const vuint64m4_t v) {
2846 return __riscv_vfncvt_f_xu_w_f32m2(v, Lanes(d));
2847}
2848template <size_t N>
2849HWY_API vfloat32m4_t DemoteTo(Simd<float, N, 2> d, const vuint64m8_t v) {
2850 return __riscv_vfncvt_f_xu_w_f32m4(v, Lanes(d));
2851}
2852
2853// SEW is for the source so we can use _DEMOTE_VIRT.
2854#define HWY_RVV_DEMOTE_TO_SHR_16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
2855 LMULH, SHIFT, MLEN, NAME, OP) \
2856 template <size_t N> \
2857 HWY_API HWY_RVV_V(BASE, SEWH, LMULH) NAME( \
2858 HWY_RVV_D(BASE, SEWH, N, SHIFT - 1) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2859 return __riscv_v##OP##CHAR##SEWH##LMULH( \
2860 v, 16, HWY_RVV_INSERT_VXRM(__RISCV_VXRM_RDN, Lanes(d))); \
2861 }
2862namespace detail {
2863HWY_RVV_FOREACH_U32(HWY_RVV_DEMOTE_TO_SHR_16, DemoteToShr16, nclipu_wx_,
2864 _DEMOTE_VIRT)
2865}  // namespace detail
2866#undef HWY_RVV_DEMOTE_TO_SHR_16
2867
2868namespace detail {
2869
2870// Round an F32 value to the nearest BF16 value, with the result returned as
2871// the rounded F32 value bitcast to a U32.
2872
2873// RoundF32ForDemoteToBF16 also converts NaN values to QNaN values to prevent
2874// NaN F32 inputs from being converted to an infinity.
2875template <class V, HWY_IF_F32(TFromV<V>)>
2877 const RebindToUnsigned<DFromV<V>> du32;
2878 const auto is_non_nan = Eq(v, v);
2879 const auto bits32 = BitCast(du32, v);
2880
2881 const auto round_incr =
2882 detail::AddS(detail::AndS(ShiftRight<16>(bits32), 1u), 0x7FFFu);
2883 return MaskedAddOr(detail::OrS(bits32, 0x00400000u), is_non_nan, bits32,
2884 round_incr);
2885}
2886
2887} // namespace detail
2888
2889#ifdef HWY_NATIVE_DEMOTE_F32_TO_BF16
2890#undef HWY_NATIVE_DEMOTE_F32_TO_BF16
2891#else
2892#define HWY_NATIVE_DEMOTE_F32_TO_BF16
2893#endif
2894
2895template <size_t N, int kPow2>
2902
2903// ------------------------------ ConvertTo F
2904
2905#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2906 SHIFT, MLEN, NAME, OP) \
2907 template <size_t N> \
2908 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
2909 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(int, SEW, LMUL) v) { \
2910 return __riscv_vfcvt_f_x_v_f##SEW##LMUL(v, Lanes(d)); \
2911 } \
2912 template <size_t N> \
2913 HWY_API HWY_RVV_V(BASE, SEW, LMUL) ConvertTo( \
2914 HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(uint, SEW, LMUL) v) { \
2915 return __riscv_vfcvt_f_xu_v_f##SEW##LMUL(v, Lanes(d)); \
2916 } \
2917 /* Truncates (rounds toward zero). */ \
2918 template <size_t N> \
2919 HWY_API HWY_RVV_V(int, SEW, LMUL) ConvertTo(HWY_RVV_D(int, SEW, N, SHIFT) d, \
2920 HWY_RVV_V(BASE, SEW, LMUL) v) { \
2921 return __riscv_vfcvt_rtz_x_f_v_i##SEW##LMUL(v, Lanes(d)); \
2922 } \
2923 template <size_t N> \
2924 HWY_API HWY_RVV_V(uint, SEW, LMUL) ConvertTo( \
2925 HWY_RVV_D(uint, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v) { \
2926 return __riscv_vfcvt_rtz_xu_f_v_u##SEW##LMUL(v, Lanes(d)); \
2927 }
2928
2929HWY_RVV_FOREACH_F(HWY_RVV_CONVERT, _, _, _ALL_VIRT)
2930#undef HWY_RVV_CONVERT
2931
2932// Uses default rounding mode. Must be separate because there is no D arg.
2933#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2934 SHIFT, MLEN, NAME, OP) \
2935 HWY_API HWY_RVV_V(int, SEW, LMUL) NearestInt(HWY_RVV_V(BASE, SEW, LMUL) v) { \
2936 return __riscv_vfcvt_x_f_v_i##SEW##LMUL(v, HWY_RVV_AVL(SEW, SHIFT)); \
2937 }
2939#undef HWY_RVV_NEAREST
2940
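// Usage sketch for ConvertTo/NearestInt, assuming hwy/highway.h and the
// default round-to-nearest-even rounding mode:
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<float> df;
//   const hn::RebindToSigned<decltype(df)> di;
//   const auto f = hn::Set(df, 2.7f);
//   const auto truncated = hn::ConvertTo(di, f);  // 2 (rounds toward zero)
//   const auto nearest = hn::NearestInt(f);       // 3 (current rounding mode)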
2941// ================================================== COMBINE
2942
2943namespace detail {
2944
2945// For x86-compatible behaviour mandated by the Highway API: TableLookupBytes
2946// offsets are implicitly relative to the start of their 128-bit block.
2947template <typename T, size_t N, int kPow2>
2949 // kMinVecBytes is the minimum size of VFromD<decltype(d)> in bytes
2950 constexpr size_t kMinVecBytes =
2951 ScaleByPower(16, HWY_MAX(HWY_MIN(kPow2, 3), -3));
2952 // kMinVecLanes is the minimum number of lanes in VFromD<decltype(d)>
2953 constexpr size_t kMinVecLanes = (kMinVecBytes + sizeof(T) - 1) / sizeof(T);
2954 // kMaxLpb is the maximum number of lanes per block
2955 constexpr size_t kMaxLpb = HWY_MIN(16 / sizeof(T), MaxLanes(d));
2956
2957 // If kMaxLpb <= kMinVecLanes is true, then kMaxLpb <= Lanes(d) is true
2958 if (kMaxLpb <= kMinVecLanes) return kMaxLpb;
2959
2960 // Fractional LMUL: Lanes(d) may be smaller than kMaxLpb, so honor that.
2961 const size_t lanes_per_vec = Lanes(d);
2962 return HWY_MIN(lanes_per_vec, kMaxLpb);
2963}
2964
2965template <class D, class V>
2966HWY_INLINE V OffsetsOf128BitBlocks(const D d, const V iota0) {
2967 using T = MakeUnsigned<TFromV<V>>;
2968 return AndS(iota0, static_cast<T>(~(LanesPerBlock(d) - 1)));
2969}
2970
2971template <size_t kLanes, class D>
2973 const RebindToUnsigned<D> du;
2974 const RebindToSigned<D> di;
2975 using TU = TFromD<decltype(du)>;
2976 const auto idx_mod = AndS(Iota0(du), static_cast<TU>(LanesPerBlock(du) - 1));
2977 return LtS(BitCast(di, idx_mod), static_cast<TFromD<decltype(di)>>(kLanes));
2978}
2979
2980#define HWY_RVV_SLIDE_UP(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2981 SHIFT, MLEN, NAME, OP) \
2982 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2983 NAME(HWY_RVV_V(BASE, SEW, LMUL) dst, HWY_RVV_V(BASE, SEW, LMUL) src, \
2984 size_t lanes) { \
2985 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(dst, src, lanes, \
2986 HWY_RVV_AVL(SEW, SHIFT)); \
2987 }
2988
2989#define HWY_RVV_SLIDE_DOWN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
2990 SHIFT, MLEN, NAME, OP) \
2991 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
2992 NAME(HWY_RVV_V(BASE, SEW, LMUL) src, size_t lanes) { \
2993 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(src, lanes, \
2994 HWY_RVV_AVL(SEW, SHIFT)); \
2995 }
2996
2997HWY_RVV_FOREACH(HWY_RVV_SLIDE_UP, SlideUp, slideup, _ALL)
2998HWY_RVV_FOREACH(HWY_RVV_SLIDE_DOWN, SlideDown, slidedown, _ALL)
2999
3000#undef HWY_RVV_SLIDE_UP
3001#undef HWY_RVV_SLIDE_DOWN
3002
3003} // namespace detail
3004
3005// ------------------------------ SlideUpLanes
3006template <class D>
3007HWY_API VFromD<D> SlideUpLanes(D d, VFromD<D> v, size_t amt) {
3008 return detail::SlideUp(Zero(d), v, amt);
3009}
3010
3011// ------------------------------ SlideDownLanes
3012template <class D>
3013HWY_API VFromD<D> SlideDownLanes(D d, VFromD<D> v, size_t amt) {
3014 v = detail::SlideDown(v, amt);
3015 // Zero out upper lanes if v is a partial vector
3016 if (MaxLanes(d) < MaxLanes(DFromV<decltype(v)>())) {
3017 v = detail::SlideUp(v, Zero(d), Lanes(d) - amt);
3018 }
3019 return v;
3020}
3021
3022// ------------------------------ ConcatUpperLower
3023template <class D, class V>
3024HWY_API V ConcatUpperLower(D d, const V hi, const V lo) {
3025 const size_t half = Lanes(d) / 2;
3026 const V hi_down = detail::SlideDown(hi, half);
3027 return detail::SlideUp(lo, hi_down, half);
3028}
3029
3030// ------------------------------ ConcatLowerLower
3031template <class D, class V>
3032HWY_API V ConcatLowerLower(D d, const V hi, const V lo) {
3033 return detail::SlideUp(lo, hi, Lanes(d) / 2);
3034}
3035
3036// ------------------------------ ConcatUpperUpper
3037template <class D, class V>
3038HWY_API V ConcatUpperUpper(D d, const V hi, const V lo) {
3039 const size_t half = Lanes(d) / 2;
3040 const V hi_down = detail::SlideDown(hi, half);
3041 const V lo_down = detail::SlideDown(lo, half);
3042 return detail::SlideUp(lo_down, hi_down, half);
3043}
3044
3045// ------------------------------ ConcatLowerUpper
3046template <class D, class V>
3047HWY_API V ConcatLowerUpper(D d, const V hi, const V lo) {
3048 const size_t half = Lanes(d) / 2;
3049 const V lo_down = detail::SlideDown(lo, half);
3050 return detail::SlideUp(lo_down, hi, half);
3051}
3052
3053// ------------------------------ Combine
3054template <class D2, class V>
3055HWY_API VFromD<D2> Combine(D2 d2, const V hi, const V lo) {
3056 return detail::SlideUp(detail::Ext(d2, lo), detail::Ext(d2, hi),
3057 Lanes(d2) / 2);
3058}
3059
3060// ------------------------------ ZeroExtendVector
3061template <class D2, class V>
3063 return Combine(d2, Xor(lo, lo), lo);
3064}
3065
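// Usage sketch for Combine/ZeroExtendVector, assuming hwy/highway.h: build a
// full vector from two half-length vectors.
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<uint32_t> d;         // full-length descriptor
//   const hn::Half<decltype(d)> dh;            // half-length descriptor
//   const auto lo = hn::Iota(dh, 0);
//   const auto hi = hn::Iota(dh, 100);
//   const auto both = hn::Combine(d, hi, lo);  // lower half = lo, upper = hi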
3066// ------------------------------ Lower/UpperHalf
3067
3068namespace detail {
3069
3070// RVV may only support LMUL >= SEW/64; returns whether that holds for D. Note
3071// that SEW = sizeof(T)*8 and LMUL = 1 << d.Pow2(). Add 3 to Pow2 to avoid
3072// negative shift counts.
3073template <class D>
3074constexpr bool IsSupportedLMUL(D d) {
3075 return (size_t{1} << (d.Pow2() + 3)) >= sizeof(TFromD<D>);
3076}
3077
3078} // namespace detail
3079
3080// If IsSupportedLMUL, just 'truncate' i.e. halve LMUL.
3081template <class DH, hwy::EnableIf<detail::IsSupportedLMUL(DH())>* = nullptr>
3082HWY_API VFromD<DH> LowerHalf(const DH /* tag */, const VFromD<Twice<DH>> v) {
3083 return detail::Trunc(v);
3084}
3085
3086// Otherwise, there is no corresponding intrinsic type (e.g. vuint64mf2_t), and
3087// the hardware may set "vill" if we attempt such an LMUL. However, the V
3088// extension on application processors requires Zvl128b, i.e. VLEN >= 128, so it
3089// still makes sense to have half of an SEW=64 vector. We instead just return
3090// the vector, and rely on the kPow2 in DH to halve the return value of Lanes().
3091template <class DH, class V,
3092 hwy::EnableIf<!detail::IsSupportedLMUL(DH())>* = nullptr>
3093HWY_API V LowerHalf(const DH /* tag */, const V v) {
3094 return v;
3095}
3096
3097// Same, but without D arg
3098template <class V>
3100 return LowerHalf(Half<DFromV<V>>(), v);
3101}
3102
3103template <class DH>
3105 return LowerHalf(d2, detail::SlideDown(v, Lanes(d2)));
3106}
3107
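// Usage sketch for LowerHalf/UpperHalf, assuming hwy/highway.h: split a full
// vector back into halves (the inverse of Combine above).
//
//   namespace hn = hwy::HWY_NAMESPACE;
//   const hn::ScalableTag<uint64_t> d;
//   const hn::Half<decltype(d)> dh;
//   const auto v = hn::Iota(d, 0);
//   const auto lower = hn::LowerHalf(dh, v);  // lanes [0, Lanes(dh))
//   const auto upper = hn::UpperHalf(dh, v);  // lanes [Lanes(dh), Lanes(d))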
3108// ================================================== SWIZZLE
3109
3110namespace detail {
3111// Special instruction for 1 lane is presumably faster?
3112#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
3113 MLEN, NAME, OP) \
3114 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
3115 return __riscv_v##OP##_##CHAR##SEW##LMUL(v, 0, HWY_RVV_AVL(SEW, SHIFT)); \
3116 }
3117
3118HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Up, slide1up_vx, _ALL)
3119HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Up, fslide1up_vf, _ALL)
3120HWY_RVV_FOREACH_UI(HWY_RVV_SLIDE1, Slide1Down, slide1down_vx, _ALL)
3121HWY_RVV_FOREACH_F(HWY_RVV_SLIDE1, Slide1Down, fslide1down_vf, _ALL)
3122#undef HWY_RVV_SLIDE1
3123} // namespace detail
3124
3125// ------------------------------ Slide1Up and Slide1Down
3126#ifdef HWY_NATIVE_SLIDE1_UP_DOWN
3127#undef HWY_NATIVE_SLIDE1_UP_DOWN
3128#else
3129#define HWY_NATIVE_SLIDE1_UP_DOWN
3130#endif
3131
3132template <class D>
3133HWY_API VFromD<D> Slide1Up(D /*d*/, VFromD<D> v) {
3134 return detail::Slide1Up(v);
3135}
3136
3137template <class D>
3139 v = detail::Slide1Down(v);
3140 // Zero out upper lanes if v is a partial vector
3141 if (MaxLanes(d) < MaxLanes(DFromV<decltype(v)>())) {
3142 v = detail::SlideUp(v, Zero(d), Lanes(d) - 1);
3143 }
3144 return v;
3145}
3146
3147// ------------------------------ GetLane
3148
3149#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3150 SHIFT, MLEN, NAME, OP) \
3151 HWY_API HWY_RVV_T(BASE, SEW) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
3152 return __riscv_v##OP##_s_##CHAR##SEW##LMUL##_##CHAR##SEW(v); /* no AVL */ \
3153 }
3154
3157#undef HWY_RVV_GET_LANE
3158
3159// ------------------------------ ExtractLane
3160template <class V>
3161HWY_API TFromV<V> ExtractLane(const V v, size_t i) {
3162 return GetLane(detail::SlideDown(v, i));
3163}
3164
3165// ------------------------------ Additional mask logical operations
3166
3170
3171#define HWY_RVV_SET_AT_OR_AFTER_FIRST(SEW, SHIFT, MLEN, NAME, OP) \
3172 HWY_API HWY_RVV_M(MLEN) SetAtOrAfterFirst(HWY_RVV_M(MLEN) m) { \
3173 return Not(SetBeforeFirst(m)); \
3174 }
3175
3177#undef HWY_RVV_SET_AT_OR_AFTER_FIRST
3178
3179// ------------------------------ InsertLane
3180
3181// T template arg because TFromV<V> might not match the hwy::float16_t argument.
3182template <class V, typename T, HWY_IF_NOT_T_SIZE_V(V, 1)>
3183HWY_API V InsertLane(const V v, size_t i, T t) {
3184 const Rebind<T, DFromV<V>> d;
3185 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
3186 using TU = TFromD<decltype(du)>;
3187 const auto is_i = detail::EqS(detail::Iota0(du), static_cast<TU>(i));
3188 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
3189}
3190
3191// For 8-bit lanes, Iota0 might overflow.
3192template <class V, typename T, HWY_IF_T_SIZE_V(V, 1)>
3193HWY_API V InsertLane(const V v, size_t i, T t) {
3194 const Rebind<T, DFromV<V>> d;
3195 const auto zero = Zero(d);
3196 const auto one = Set(d, 1);
3197 const auto ge_i = Eq(detail::SlideUp(zero, one, i), one);
3198 const auto is_i = SetOnlyFirst(ge_i);
3199 return IfThenElse(RebindMask(d, is_i), Set(d, t), v);
3200}
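// Worked example for the 8-bit path above (illustrative): for i = 2 and 8
// lanes, SlideUp(zero, one, i) gives {0,0,1,1,1,1,1,1}, so ge_i is true for
// lanes >= 2; SetOnlyFirst then yields is_i = {0,0,1,0,0,0,0,0}, selecting
// only lane i.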
3201
3202// ------------------------------ OddEven
3203
3204namespace detail {
3205
3206// Faster version using a wide constant instead of Iota0 + AndS.
3207template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
3208HWY_INLINE MFromD<D> IsEven(D d) {
3209 const RebindToUnsigned<decltype(d)> du;
3210 const RepartitionToWide<decltype(du)> duw;
3211 return RebindMask(d, detail::NeS(BitCast(du, Set(duw, 1)), 0u));
3212}
3213
3214template <class D, HWY_IF_T_SIZE_D(D, 8)>
3215HWY_INLINE MFromD<D> IsEven(D d) {
3216 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
3217 return detail::EqS(detail::AndS(detail::Iota0(du), 1), 0);
3218}
3219
3220// Also provide the negated form because there is no native CompressNot.
3221template <class D, HWY_IF_NOT_T_SIZE_D(D, 8)>
3222HWY_INLINE MFromD<D> IsOdd(D d) {
3223 const RebindToUnsigned<decltype(d)> du;
3224 const RepartitionToWide<decltype(du)> duw;
3225 return RebindMask(d, detail::EqS(BitCast(du, Set(duw, 1)), 0u));
3226}
3227
3228template <class D, HWY_IF_T_SIZE_D(D, 8)>
3229HWY_INLINE MFromD<D> IsOdd(D d) {
3230 const RebindToUnsigned<decltype(d)> du; // Iota0 is unsigned only
3231 return detail::NeS(detail::AndS(detail::Iota0(du), 1), 0);
3232}
3233
3234} // namespace detail
3235
3236template <class V>
3237HWY_API V OddEven(const V a, const V b) {
3238 return IfThenElse(detail::IsEven(DFromV<V>()), b, a);
3239}
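// Semantics sketch (illustrative): OddEven(a, b) takes odd lanes from a and
// even lanes from b, e.g. a = {a0,a1,a2,a3}, b = {b0,b1,b2,b3} yields
// {b0,a1,b2,a3}.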
3240
3241// ------------------------------ DupEven (OddEven)
3242template <class V>
3243HWY_API V DupEven(const V v) {
3244 const V up = detail::Slide1Up(v);
3245 return OddEven(up, v);
3246}
3247
3248// ------------------------------ DupOdd (OddEven)
3249template <class V>
3250HWY_API V DupOdd(const V v) {
3251 const V down = detail::Slide1Down(v);
3252 return OddEven(v, down);
3253}
3254
3255// ------------------------------ InterleaveEven (OddEven)
3256template <class D>
3257HWY_API VFromD<D> InterleaveEven(D /*d*/, VFromD<D> a, VFromD<D> b) {
3258 return OddEven(detail::Slide1Up(b), a);
3259}
3260
3261// ------------------------------ InterleaveOdd (OddEven)
3262template <class D>
3263HWY_API VFromD<D> InterleaveOdd(D /*d*/, VFromD<D> a, VFromD<D> b) {
3264 return OddEven(b, detail::Slide1Down(a));
3265}
3266
3267// ------------------------------ OddEvenBlocks
3268template <class V>
3269HWY_API V OddEvenBlocks(const V a, const V b) {
3270 const RebindToUnsigned<DFromV<V>> du; // Iota0 is unsigned only
3271 constexpr size_t kShift = CeilLog2(16 / sizeof(TFromV<V>));
3272 const auto idx_block = ShiftRight<kShift>(detail::Iota0(du));
3273 const auto is_even = detail::EqS(detail::AndS(idx_block, 1), 0);
3274 return IfThenElse(is_even, b, a);
3275}
3276
3277// ------------------------------ SwapAdjacentBlocks
3278template <class V>
3279HWY_API V SwapAdjacentBlocks(const V v) {
3280 const DFromV<V> d;
3281 const size_t lpb = detail::LanesPerBlock(d);
3282 const V down = detail::SlideDown(v, lpb);
3283 const V up = detail::SlideUp(v, v, lpb);
3284 return OddEvenBlocks(up, down);
3285}
3286
3287// ------------------------------ TableLookupLanes
3288
3289template <class D, class VI>
3290HWY_API VFromD<RebindToUnsigned<D>> IndicesFromVec(D d, VI vec) {
3291 static_assert(sizeof(TFromD<D>) == sizeof(TFromV<VI>), "Index != lane");
3292 const RebindToUnsigned<decltype(d)> du; // instead of <D>: avoids unused d.
3293 const auto indices = BitCast(du, vec);
3294#if HWY_IS_DEBUG_BUILD
3295 using TU = TFromD<decltype(du)>;
3296 const size_t twice_num_of_lanes = Lanes(d) * 2;
3297 HWY_DASSERT(AllTrue(
3298 du, Eq(indices,
3299 detail::AndS(indices, static_cast<TU>(twice_num_of_lanes - 1)))));
3300#endif
3301 return indices;
3302}
3303
3304template <class D, typename TI>
3305HWY_API VFromD<RebindToUnsigned<D>> SetTableIndices(D d, const TI* idx) {
3306 static_assert(sizeof(TFromD<D>) == sizeof(TI), "Index size must match lane");
3307 return IndicesFromVec(d, LoadU(Rebind<TI, D>(), idx));
3308}
3309
3310#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
3311 MLEN, NAME, OP) \
3312 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3313 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
3314 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \
3315 HWY_RVV_AVL(SEW, SHIFT)); \
3316 }
3317
3318// TableLookupLanes is supported for all types, but beware that indices are
3319// likely to wrap around for 8-bit lanes. When using TableLookupLanes inside
3320// this file, ensure that it is safe or use TableLookupLanes16 instead.
3321HWY_RVV_FOREACH(HWY_RVV_TABLE, TableLookupLanes, rgather, _ALL)
3322#undef HWY_RVV_TABLE
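// Illustration of the wrap-around caveat above (assumes VLEN = 512): with
// 8-bit lanes at LMUL=8 there are 512 lanes, but a u8 index can only reach
// lane 255, so plain TableLookupLanes cannot address the upper half;
// TableLookupLanes16 widens the indices to u16 to avoid this.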
3323
3324namespace detail {
3325
3326#define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3327 SHIFT, MLEN, NAME, OP) \
3328 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3329 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \
3330 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, idx, \
3331 HWY_RVV_AVL(SEW, SHIFT)); \
3332 }
3333
3334HWY_RVV_FOREACH_UI08(HWY_RVV_TABLE16, TableLookupLanes16, rgatherei16, _EXT)
3335#undef HWY_RVV_TABLE16
3336
3337// Used by Expand.
3338#define HWY_RVV_MASKED_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3339 SHIFT, MLEN, NAME, OP) \
3340 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3341 NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \
3342 HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEW, LMUL) idx) { \
3343 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \
3344 HWY_RVV_AVL(SEW, SHIFT)); \
3345 }
3346
3347HWY_RVV_FOREACH(HWY_RVV_MASKED_TABLE, MaskedTableLookupLanes, rgather, _ALL)
3348#undef HWY_RVV_MASKED_TABLE
3349
3350#define HWY_RVV_MASKED_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
3351 LMULH, SHIFT, MLEN, NAME, OP) \
3352 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3353 NAME(HWY_RVV_M(MLEN) mask, HWY_RVV_V(BASE, SEW, LMUL) maskedoff, \
3354 HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_V(uint, SEWD, LMULD) idx) { \
3355 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL##_mu(mask, maskedoff, v, idx, \
3356 HWY_RVV_AVL(SEW, SHIFT)); \
3357 }
3358
3359HWY_RVV_FOREACH_UI08(HWY_RVV_MASKED_TABLE16, MaskedTableLookupLanes16,
3360 rgatherei16, _EXT)
3361#undef HWY_RVV_MASKED_TABLE16
3362
3363} // namespace detail
3364
3365// ------------------------------ Reverse (TableLookupLanes)
3366template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_LE_D(D, 2)>
3367HWY_API VFromD<D> Reverse(D d, VFromD<D> v) {
3368 const Rebind<uint16_t, decltype(d)> du16;
3369 const size_t N = Lanes(d);
3370 const auto idx =
3371 detail::ReverseSubS(detail::Iota0(du16), static_cast<uint16_t>(N - 1));
3372 return detail::TableLookupLanes16(v, idx);
3373}
3374
3375template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_POW2_GT_D(D, 2)>
3376HWY_API VFromD<D> Reverse(D d, VFromD<D> v) {
3377 const Half<decltype(d)> dh;
3378 const Rebind<uint16_t, decltype(dh)> du16;
3379 const size_t half_n = Lanes(dh);
3380 const auto idx = detail::ReverseSubS(detail::Iota0(du16),
3381 static_cast<uint16_t>(half_n - 1));
3382 const auto reversed_lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
3383 const auto reversed_hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
3384 return Combine(d, reversed_lo, reversed_hi);
3385}
3386
3387template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
3388HWY_API VFromD<D> Reverse(D /* tag */, VFromD<D> v) {
3389 const RebindToUnsigned<D> du;
3390 using TU = TFromD<decltype(du)>;
3391 const size_t N = Lanes(du);
3392 const auto idx =
3393 detail::ReverseSubS(detail::Iota0(du), static_cast<TU>(N - 1));
3394 return TableLookupLanes(v, idx);
3395}
3396
3397// ------------------------------ ResizeBitCast
3398
3399// Extends or truncates a vector to match the given d.
3400namespace detail {
3401
3402template <class D>
3403HWY_INLINE VFromD<D> ChangeLMUL(D /* d */, VFromD<D> v) {
3404 return v;
3405}
3406
3407 // Sanity check: when calling ChangeLMUL, the caller (ResizeBitCast) has
3408 // already BitCast to the same lane type. Note that V may use the native lane
3409 // type for f16, so convert D to that before checking.
3410#define HWY_RVV_IF_SAME_T_DV(D, V) \
3411 hwy::EnableIf<IsSame<NativeLaneType<TFromD<D>>, TFromV<V>>()>* = nullptr
3412
3413// LMUL of VFromD<D> < LMUL of V: need to truncate v
3414template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
3415 HWY_IF_POW2_LE_D(DFromV<VFromD<D>>, DFromV<V>().Pow2() - 1)>
3416HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
3417 const DFromV<V> d_from;
3418 const Half<decltype(d_from)> dh_from;
3419 static_assert(
3420 DFromV<VFromD<decltype(dh_from)>>().Pow2() < DFromV<V>().Pow2(),
3421 "The LMUL of VFromD<decltype(dh_from)> must be less than the LMUL of V");
3422 static_assert(
3423 DFromV<VFromD<D>>().Pow2() <= DFromV<VFromD<decltype(dh_from)>>().Pow2(),
3424 "The LMUL of VFromD<D> must be less than or equal to the LMUL of "
3425 "VFromD<decltype(dh_from)>");
3426 return ChangeLMUL(d, Trunc(v));
3427}
3428
3429// LMUL of VFromD<D> > LMUL of V: need to extend v
3430template <class D, class V, // HWY_RVV_IF_SAME_T_DV(D, V),
3431 HWY_IF_POW2_GT_D(DFromV<VFromD<D>>, DFromV<V>().Pow2())>
3432HWY_INLINE VFromD<D> ChangeLMUL(D d, V v) {
3433 const DFromV<V> d_from;
3434 const Twice<decltype(d_from)> dt_from;
3435 static_assert(DFromV<VFromD<decltype(dt_from)>>().Pow2() > DFromV<V>().Pow2(),
3436 "The LMUL of VFromD<decltype(dt_from)> must be greater than "
3437 "the LMUL of V");
3438 static_assert(
3439 DFromV<VFromD<D>>().Pow2() >= DFromV<VFromD<decltype(dt_from)>>().Pow2(),
3440 "The LMUL of VFromD<D> must be greater than or equal to the LMUL of "
3441 "VFromD<decltype(dt_from)>");
3442 return ChangeLMUL(d, Ext(dt_from, v));
3443}
3444
3445#undef HWY_RVV_IF_SAME_T_DV
3446
3447} // namespace detail
3448
3449template <class DTo, class VFrom>
3450HWY_API VFromD<DTo> ResizeBitCast(DTo /*dto*/, VFrom v) {
3451 const DFromV<decltype(v)> d_from;
3452 const Repartition<uint8_t, decltype(d_from)> du8_from;
3453 const DFromV<VFromD<DTo>> d_to;
3454 const Repartition<uint8_t, decltype(d_to)> du8_to;
3455 return BitCast(d_to, detail::ChangeLMUL(du8_to, BitCast(du8_from, v)));
3456}
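// Usage sketch (illustrative; the tags are assumptions, not from this file):
//   const ScalableTag<uint32_t> d32;      // LMUL=1
//   const ScalableTag<uint8_t, 1> d8x2;   // LMUL=2
//   const auto v = Iota(d32, 0);
//   const auto r = ResizeBitCast(d8x2, v);  // same low bytes; any additional
//                                           // lanes are unspecified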
3457
3458// ------------------------------ Reverse2 (RotateRight, OddEven)
3459
3460// Per-target flags to prevent generic_ops-inl.h defining 8-bit Reverse2/4/8.
3461#ifdef HWY_NATIVE_REVERSE2_8
3462#undef HWY_NATIVE_REVERSE2_8
3463#else
3464#define HWY_NATIVE_REVERSE2_8
3465#endif
3466
3467// Shifting and adding requires fewer instructions than blending, but casting to
3468// u32 only works for LMUL in [1/2, 8].
3469
3470template <class D, HWY_IF_T_SIZE_D(D, 1)>
3471HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
3472 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint16_t, D>> du16;
3473 return ResizeBitCast(d, RotateRight<8>(ResizeBitCast(du16, v)));
3474}
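// Illustration of the rotate trick above: for 8-bit lanes, rotating each u16
// by 8 swaps adjacent bytes, so {0,1,2,3} becomes {1,0,3,2}, which is Reverse2.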
3475
3476template <class D, HWY_IF_T_SIZE_D(D, 2)>
3477HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
3478 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
3479 return ResizeBitCast(d, RotateRight<16>(ResizeBitCast(du32, v)));
3480}
3481
3482// Shifting and adding requires fewer instructions than blending, but casting to
3483// u64 does not work for LMUL < 1.
3484template <class D, HWY_IF_T_SIZE_D(D, 4)>
3485HWY_API VFromD<D> Reverse2(D d, const VFromD<D> v) {
3486 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
3487 return ResizeBitCast(d, RotateRight<32>(ResizeBitCast(du64, v)));
3488}
3489
3490template <class D, class V = VFromD<D>, HWY_IF_T_SIZE_D(D, 8)>
3491HWY_API V Reverse2(D /* tag */, const V v) {
3492 const V up = detail::Slide1Up(v);
3493 const V down = detail::Slide1Down(v);
3494 return OddEven(up, down);
3495}
3496
3497// ------------------------------ Reverse4 (TableLookupLanes)
3498
3499template <class D, HWY_IF_T_SIZE_D(D, 1)>
3500HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
3501 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint16_t, D>> du16;
3502 return ResizeBitCast(d, Reverse2(du16, ResizeBitCast(du16, Reverse2(d, v))));
3503}
3504
3505template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3506HWY_API VFromD<D> Reverse4(D d, const VFromD<D> v) {
3507 const RebindToUnsigned<D> du;
3508 const auto idx = detail::XorS(detail::Iota0(du), 3);
3509 return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
3510}
3511
3512// ------------------------------ Reverse8 (TableLookupLanes)
3513
3514template <class D, HWY_IF_T_SIZE_D(D, 1)>
3515HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
3516 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint32_t, D>> du32;
3517 return ResizeBitCast(d, Reverse2(du32, ResizeBitCast(du32, Reverse4(d, v))));
3518}
3519
3520template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
3521HWY_API VFromD<D> Reverse8(D d, const VFromD<D> v) {
3522 const RebindToUnsigned<D> du;
3523 const auto idx = detail::XorS(detail::Iota0(du), 7);
3524 return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
3525}
3526
3527// ------------------------------ ReverseBlocks (Reverse, Shuffle01)
3528template <class D, class V = VFromD<D>>
3529HWY_API V ReverseBlocks(D d, V v) {
3530 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
3531 const size_t N = Lanes(du64);
3532 const auto rev =
3533 detail::ReverseSubS(detail::Iota0(du64), static_cast<uint64_t>(N - 1));
3534 // Swap lo/hi u64 within each block
3535 const auto idx = detail::XorS(rev, 1);
3536 return ResizeBitCast(d, TableLookupLanes(ResizeBitCast(du64, v), idx));
3537}
3538
3539// ------------------------------ Compress
3540
3541// RVV supports all lane types natively.
3542#ifdef HWY_NATIVE_COMPRESS8
3543#undef HWY_NATIVE_COMPRESS8
3544#else
3545#define HWY_NATIVE_COMPRESS8
3546#endif
3547
3548template <typename T>
3549struct CompressIsPartition {
3550 enum { value = 0 };
3551};
3552
3553#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
3554 SHIFT, MLEN, NAME, OP) \
3555 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3556 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, HWY_RVV_M(MLEN) mask) { \
3557 return __riscv_v##OP##_vm_##CHAR##SEW##LMUL(v, mask, \
3558 HWY_RVV_AVL(SEW, SHIFT)); \
3559 }
3560
3561HWY_RVV_FOREACH(HWY_RVV_COMPRESS, Compress, compress, _ALL)
3562#undef HWY_RVV_COMPRESS
3563
3564// ------------------------------ Expand
3565
3566#ifdef HWY_NATIVE_EXPAND
3567#undef HWY_NATIVE_EXPAND
3568#else
3569#define HWY_NATIVE_EXPAND
3570#endif
3571
3572// >= 2-byte lanes: idx lanes will not overflow.
3573template <class V, class M, HWY_IF_NOT_T_SIZE_V(V, 1)>
3574HWY_API V Expand(V v, const M mask) {
3575 const DFromV<V> d;
3576 const RebindToUnsigned<decltype(d)> du;
3577 const auto idx = detail::MaskedIota(du, RebindMask(du, mask));
3578 const V zero = Zero(d);
3579 return detail::MaskedTableLookupLanes(mask, zero, v, idx);
3580}
3581
3582// 1-byte lanes, LMUL < 8: promote idx to u16.
3583template <class V, class M, HWY_IF_T_SIZE_V(V, 1), class D = DFromV<V>,
3584 HWY_IF_POW2_LE_D(D, 2)>
3585HWY_API V Expand(V v, const M mask) {
3586 const D d;
3587 const Rebind<uint16_t, decltype(d)> du16;
3588 const auto idx = detail::MaskedIota(du16, RebindMask(du16, mask));
3589 const V zero = Zero(d);
3590 return detail::MaskedTableLookupLanes16(mask, zero, v, idx);
3591}
3592
3593// 1-byte lanes, max LMUL: unroll 2x.
3594template <class V, class M, HWY_IF_T_SIZE_V(V, 1), class D = DFromV<V>,
3595 HWY_IF_POW2_GT_D(DFromV<V>, 2)>
3596HWY_API V Expand(V v, const M mask) {
3597 const D d;
3598 const Half<D> dh;
3599 const auto v0 = LowerHalf(dh, v);
3600 // TODO(janwas): skip vec<->mask if we can cast masks.
3601 const V vmask = VecFromMask(d, mask);
3602 const auto m0 = MaskFromVec(LowerHalf(dh, vmask));
3603
3604 // Cannot just use UpperHalf, must shift by the number of inputs consumed.
3605 const size_t count = CountTrue(dh, m0);
3606 const auto v1 = detail::Trunc(detail::SlideDown(v, count));
3607 const auto m1 = MaskFromVec(UpperHalf(dh, vmask));
3608 return Combine(d, Expand(v1, m1), Expand(v0, m0));
3609}
3610
3611// ------------------------------ LoadExpand
3612template <class D>
3613HWY_API VFromD<D> LoadExpand(MFromD<D> mask, D d,
3614 const TFromD<D>* HWY_RESTRICT unaligned) {
3615 return Expand(LoadU(d, unaligned), mask);
3616}
3617
3618// ------------------------------ CompressNot
3619template <class V, class M>
3620HWY_API V CompressNot(V v, const M mask) {
3621 return Compress(v, Not(mask));
3622}
3623
3624// ------------------------------ CompressBlocksNot
3625template <class V, class M>
3626HWY_API V CompressBlocksNot(V v, const M mask) {
3627 return CompressNot(v, mask);
3628}
3629
3630// ------------------------------ CompressStore
3631template <class V, class M, class D>
3632HWY_API size_t CompressStore(const V v, const M mask, const D d,
3633 TFromD<D>* HWY_RESTRICT unaligned) {
3634 StoreU(Compress(v, mask), d, unaligned);
3635 return CountTrue(d, mask);
3636}
3637
3638// ------------------------------ CompressBlendedStore
3639template <class V, class M, class D>
3640HWY_API size_t CompressBlendedStore(const V v, const M mask, const D d,
3641 TFromD<D>* HWY_RESTRICT unaligned) {
3642 const size_t count = CountTrue(d, mask);
3643 StoreN(Compress(v, mask), d, unaligned, count);
3644 return count;
3645}
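// Usage sketch (illustrative; in, out and num are assumed caller-provided and
// num is a multiple of Lanes(d)):
//   const ScalableTag<float> d;
//   size_t written = 0;
//   for (size_t i = 0; i < num; i += Lanes(d)) {
//     const auto v = LoadU(d, in + i);
//     written += CompressStore(v, Gt(v, Zero(d)), d, out + written);
//   }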
3646
3647// ================================================== COMPARE (2)
3648
3649// ------------------------------ FindLastTrue
3650
3651template <class D>
3652HWY_API intptr_t FindLastTrue(D d, MFromD<D> m) {
3653 const RebindToSigned<decltype(d)> di;
3654 const intptr_t fft_rev_idx =
3655 FindFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m))));
3656 return (fft_rev_idx >= 0)
3657 ? (static_cast<intptr_t>(Lanes(d) - 1) - fft_rev_idx)
3658 : intptr_t{-1};
3659}
3660
3661template <class D>
3662HWY_API size_t FindKnownLastTrue(D d, MFromD<D> m) {
3663 const RebindToSigned<decltype(d)> di;
3664 const size_t fft_rev_idx =
3665 FindKnownFirstTrue(d, MaskFromVec(Reverse(di, VecFromMask(di, m))));
3666 return Lanes(d) - 1 - fft_rev_idx;
3667}
3668
3669// ------------------------------ ConcatOdd (Compress)
3670
3671namespace detail {
3672
3673#define HWY_RVV_NARROW(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
3674 MLEN, NAME, OP) \
3675 template <size_t kShift> \
3676 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEWD, LMULD) v) { \
3677 return __riscv_v##OP##_wx_##CHAR##SEW##LMUL(v, kShift, \
3678 HWY_RVV_AVL(SEWD, SHIFT + 1)); \
3679 }
3680
3681HWY_RVV_FOREACH_U08(HWY_RVV_NARROW, Narrow, nsrl, _EXT)
3682HWY_RVV_FOREACH_U16(HWY_RVV_NARROW, Narrow, nsrl, _EXT)
3683HWY_RVV_FOREACH_U32(HWY_RVV_NARROW, Narrow, nsrl, _EXT)
3684#undef HWY_RVV_NARROW
3685
3686} // namespace detail
3687
3688// Casting to wider and narrowing is the fastest for < 64-bit lanes.
3689template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
3690HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
3691 constexpr size_t kBits = sizeof(TFromD<D>) * 8;
3692 const Twice<decltype(d)> dt;
3693 const RepartitionToWide<RebindToUnsigned<decltype(dt)>> dtuw;
3694 const VFromD<decltype(dtuw)> hl = BitCast(dtuw, Combine(dt, hi, lo));
3695 return BitCast(d, detail::Narrow<kBits>(hl));
3696}
3697
3698// 64-bit: Combine+Compress.
3699template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
3700HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
3701 const Twice<decltype(d)> dt;
3702 const VFromD<decltype(dt)> hl = Combine(dt, hi, lo);
3703 return LowerHalf(d, Compress(hl, detail::IsOdd(dt)));
3704}
3705
3706// Any type, max LMUL: Compress both, then Combine.
3707template <class D, HWY_IF_POW2_GT_D(D, 2)>
3708HWY_API VFromD<D> ConcatOdd(D d, VFromD<D> hi, VFromD<D> lo) {
3709 const Half<decltype(d)> dh;
3710 const MFromD<D> is_odd = detail::IsOdd(d);
3711 const VFromD<decltype(d)> hi_odd = Compress(hi, is_odd);
3712 const VFromD<decltype(d)> lo_odd = Compress(lo, is_odd);
3713 return Combine(d, LowerHalf(dh, hi_odd), LowerHalf(dh, lo_odd));
3714}
3715
3716// ------------------------------ ConcatEven (Compress)
3717
3718// Casting to wider and narrowing is the fastest for < 64-bit lanes.
3719template <class D, HWY_IF_NOT_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
3720HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
3721 const Twice<decltype(d)> dt;
3722 const RepartitionToWide<RebindToUnsigned<decltype(dt)>> dtuw;
3723 const VFromD<decltype(dtuw)> hl = BitCast(dtuw, Combine(dt, hi, lo));
3724 return BitCast(d, detail::Narrow<0>(hl));
3725}
3726
3727// 64-bit: Combine+Compress.
3728template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
3729HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
3730 const Twice<decltype(d)> dt;
3731 const VFromD<decltype(dt)> hl = Combine(dt, hi, lo);
3732 return LowerHalf(d, Compress(hl, detail::IsEven(dt)));
3733}
3734
3735// Any type, max LMUL: Compress both, then Combine.
3736template <class D, HWY_IF_POW2_GT_D(D, 2)>
3737HWY_API VFromD<D> ConcatEven(D d, VFromD<D> hi, VFromD<D> lo) {
3738 const Half<decltype(d)> dh;
3739 const MFromD<D> is_even = detail::IsEven(d);
3740 const VFromD<decltype(d)> hi_even = Compress(hi, is_even);
3741 const VFromD<decltype(d)> lo_even = Compress(lo, is_even);
3742 return Combine(d, LowerHalf(dh, hi_even), LowerHalf(dh, lo_even));
3743}
3744
3745// ================================================== BLOCKWISE
3746
3747// ------------------------------ CombineShiftRightBytes
3748template <size_t kBytes, class D, class V = VFromD<D>>
3749HWY_API V CombineShiftRightBytes(const D d, const V hi, V lo) {
3750 const Repartition<uint8_t, decltype(d)> d8;
3751 const auto hi8 = BitCast(d8, hi);
3752 const auto lo8 = BitCast(d8, lo);
3753 const auto hi_up = detail::SlideUp(hi8, hi8, 16 - kBytes);
3754 const auto lo_down = detail::SlideDown(lo8, kBytes);
3755 const auto is_lo = detail::FirstNPerBlock<16 - kBytes>(d8);
3756 return BitCast(d, IfThenElse(is_lo, lo_down, hi_up));
3757}
3758
3759// ------------------------------ CombineShiftRightLanes
3760template <size_t kLanes, class D, class V = VFromD<D>>
3761HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo) {
3762 constexpr size_t kLanesUp = 16 / sizeof(TFromV<V>) - kLanes;
3763 const auto hi_up = detail::SlideUp(hi, hi, kLanesUp);
3764 const auto lo_down = detail::SlideDown(lo, kLanes);
3765 const auto is_lo = detail::FirstNPerBlock<kLanesUp>(d);
3766 return IfThenElse(is_lo, lo_down, hi_up);
3767}
3768
3769// ------------------------------ Shuffle2301 (ShiftLeft)
3770template <class V>
3771HWY_API V Shuffle2301(const V v) {
3772 const DFromV<V> d;
3773 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
3774 const Repartition<uint64_t, decltype(d)> du64;
3775 const auto v64 = BitCast(du64, v);
3776 return BitCast(d, Or(ShiftRight<32>(v64), ShiftLeft<32>(v64)));
3777}
3778
3779// ------------------------------ Shuffle2103
3780template <class V>
3781HWY_API V Shuffle2103(const V v) {
3782 const DFromV<V> d;
3783 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
3784 return CombineShiftRightLanes<3>(d, v, v);
3785}
3786
3787// ------------------------------ Shuffle0321
3788template <class V>
3789HWY_API V Shuffle0321(const V v) {
3790 const DFromV<V> d;
3791 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
3792 return CombineShiftRightLanes<1>(d, v, v);
3793}
3794
3795// ------------------------------ Shuffle1032
3796template <class V>
3797HWY_API V Shuffle1032(const V v) {
3798 const DFromV<V> d;
3799 static_assert(sizeof(TFromD<decltype(d)>) == 4, "Defined for 32-bit types");
3800 return CombineShiftRightLanes<2>(d, v, v);
3801}
3802
3803// ------------------------------ Shuffle01
3804template <class V>
3805HWY_API V Shuffle01(const V v) {
3806 const DFromV<V> d;
3807 static_assert(sizeof(TFromD<decltype(d)>) == 8, "Defined for 64-bit types");
3808 return CombineShiftRightLanes<1>(d, v, v);
3809}
3810
3811// ------------------------------ Shuffle0123
3812template <class V>
3813HWY_API V Shuffle0123(const V v) {
3814 return Shuffle2301(Shuffle1032(v));
3815}
3816
3817// ------------------------------ TableLookupBytes
3818
3819template <class VT, class VI>
3820HWY_API VI TableLookupBytes(const VT vt, const VI vi) {
3821 const DFromV<VT> dt; // T=table, I=index.
3822 const DFromV<VI> di;
3823 const Repartition<uint8_t, decltype(dt)> dt8;
3824 const Repartition<uint8_t, decltype(di)> di8;
3825 // Required for producing half-vectors with table lookups from a full vector.
3826 // If we instead run at the LMUL of the index vector, lookups into the table
3827 // would be truncated. Thus we run at the larger of the two LMULs and truncate
3828 // the result vector to the original index LMUL.
3829 constexpr int kPow2T = dt8.Pow2();
3830 constexpr int kPow2I = di8.Pow2();
3831 const Simd<uint8_t, MaxLanes(di8), HWY_MAX(kPow2T, kPow2I)> dm8; // m=max
3832 const auto vmt = detail::ChangeLMUL(dm8, BitCast(dt8, vt));
3833 const auto vmi = detail::ChangeLMUL(dm8, BitCast(di8, vi));
3834 auto offsets = detail::OffsetsOf128BitBlocks(dm8, detail::Iota0(dm8));
3835 // If the table is shorter, wrap around offsets so they do not reference
3836 // undefined lanes in the newly extended vmt.
3837 if (kPow2T < kPow2I) {
3838 offsets = detail::AndS(offsets, static_cast<uint8_t>(Lanes(dt8) - 1));
3839 }
3840 const auto out = TableLookupLanes(vmt, Add(vmi, offsets));
3841 return BitCast(di, detail::ChangeLMUL(di8, out));
3842}
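// Note on the code above (added for clarity): unlike TableLookupLanes, which
// indexes the whole vector, TableLookupBytes follows the x86 convention of
// independent 128-bit blocks, hence OffsetsOf128BitBlocks is added so each
// block of indices selects from the corresponding block of the table.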
3843
3844template <class VT, class VI>
3845HWY_API VI TableLookupBytesOr0(const VT vt, const VI idx) {
3846 const DFromV<VI> di;
3847 const Repartition<int8_t, decltype(di)> di8;
3848 const auto idx8 = BitCast(di8, idx);
3849 const auto lookup = TableLookupBytes(vt, idx8);
3850 return BitCast(di, IfThenZeroElse(detail::LtS(idx8, 0), lookup));
3851}
3852
3853// ------------------------------ TwoTablesLookupLanes
3854
3855// WARNING: 8-bit lanes may lead to unexpected results because idx is the same
3856// size and may overflow.
3857template <class D, HWY_IF_POW2_LE_D(D, 2)>
3858HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
3859 VFromD<RebindToUnsigned<D>> idx) {
3860 const Twice<decltype(d)> dt;
3861 const RebindToUnsigned<decltype(dt)> dt_u;
3862 const auto combined_tbl = Combine(dt, b, a);
3863 const auto combined_idx = Combine(dt_u, idx, idx);
3864 return LowerHalf(d, TableLookupLanes(combined_tbl, combined_idx));
3865}
3866
3867template <class D, HWY_IF_POW2_GT_D(D, 2)>
3868HWY_API VFromD<D> TwoTablesLookupLanes(D d, VFromD<D> a, VFromD<D> b,
3869 VFromD<RebindToUnsigned<D>> idx) {
3870 const RebindToUnsigned<decltype(d)> du;
3871 using TU = TFromD<decltype(du)>;
3872
3873 const size_t num_of_lanes = Lanes(d);
3874 const auto idx_mod = detail::AndS(idx, static_cast<TU>(num_of_lanes - 1));
3875 const auto sel_a_mask = Ne(idx, idx_mod); // FALSE if a
3876
3877 const auto a_lookup_result = TableLookupLanes(a, idx_mod);
3878 return detail::MaskedTableLookupLanes(sel_a_mask, a_lookup_result, b,
3879 idx_mod);
3880}
3881
3882template <class V>
3883HWY_API V TwoTablesLookupLanes(V a, V b,
3884 VFromD<RebindToUnsigned<DFromV<V>>> idx) {
3885 const DFromV<decltype(a)> d;
3886 return TwoTablesLookupLanes(d, a, b, idx);
3887}
3888
3889// ------------------------------ Broadcast
3890
3891// 8-bit requires 16-bit tables.
3892template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
3893 HWY_IF_POW2_LE_D(D, 2)>
3894HWY_API V Broadcast(const V v) {
3895 const D d;
3896 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
3897
3898 const Rebind<uint16_t, decltype(d)> du16;
3899 VFromD<decltype(du16)> idx =
3900 detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
3901 if (kLane != 0) {
3902 idx = detail::AddS(idx, kLane);
3903 }
3904 return detail::TableLookupLanes16(v, idx);
3905}
3906
3907// 8-bit and max LMUL: split into halves.
3908template <int kLane, class V, class D = DFromV<V>, HWY_IF_T_SIZE_D(D, 1),
3909 HWY_IF_POW2_GT_D(D, 2)>
3910HWY_API V Broadcast(const V v) {
3911 const D d;
3912 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
3913
3914 const Half<decltype(d)> dh;
3915 using VH = VFromD<decltype(dh)>;
3916 const Rebind<uint16_t, decltype(dh)> du16;
3917 VFromD<decltype(du16)> idx =
3918 detail::OffsetsOf128BitBlocks(d, detail::Iota0(du16));
3919 if (kLane != 0) {
3920 idx = detail::AddS(idx, kLane);
3921 }
3922 const VH lo = detail::TableLookupLanes16(LowerHalf(dh, v), idx);
3923 const VH hi = detail::TableLookupLanes16(UpperHalf(dh, v), idx);
3924 return Combine(d, hi, lo);
3925}
3926
3927template <int kLane, class V, class D = DFromV<V>,
3928 HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 2) | (1 << 4) | (1 << 8))>
3929HWY_API V Broadcast(const V v) {
3930 const D d;
3931 HWY_DASSERT(0 <= kLane && kLane < detail::LanesPerBlock(d));
3932
3933 const RebindToUnsigned<decltype(d)> du;
3934 auto idx = detail::OffsetsOf128BitBlocks(d, detail::Iota0(du));
3935 if (kLane != 0) {
3936 idx = detail::AddS(idx, kLane);
3937 }
3938 return TableLookupLanes(v, idx);
3939}
3940
3941// ------------------------------ BroadcastLane
3942#ifdef HWY_NATIVE_BROADCASTLANE
3943#undef HWY_NATIVE_BROADCASTLANE
3944#else
3945#define HWY_NATIVE_BROADCASTLANE
3946#endif
3947
3948namespace detail {
3949
3950#define HWY_RVV_BROADCAST_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, \
3951 LMULH, SHIFT, MLEN, NAME, OP) \
3952 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
3953 NAME(HWY_RVV_V(BASE, SEW, LMUL) v, size_t idx) { \
3954 return __riscv_v##OP##_vx_##CHAR##SEW##LMUL(v, idx, \
3955 HWY_RVV_AVL(SEW, SHIFT)); \
3956 }
3957
3958HWY_RVV_FOREACH(HWY_RVV_BROADCAST_LANE, BroadcastLane, rgather, _ALL)
3959#undef HWY_RVV_BROADCAST_LANE
3960
3961} // namespace detail
3962
3963template <int kLane, class V>
3964HWY_API V BroadcastLane(V v) {
3965 static_assert(0 <= kLane && kLane < HWY_MAX_LANES_V(V), "Invalid lane");
3966 return detail::BroadcastLane(v, static_cast<size_t>(kLane));
3967}
3968
3969// ------------------------------ InsertBlock
3970#ifdef HWY_NATIVE_BLK_INSERT_EXTRACT
3971#undef HWY_NATIVE_BLK_INSERT_EXTRACT
3972#else
3973#define HWY_NATIVE_BLK_INSERT_EXTRACT
3974#endif
3975
3976template <int kBlockIdx, class V>
3978 const DFromV<decltype(v)> d;
3979 using TU = If<(sizeof(TFromV<V>) == 1 && DFromV<V>().Pow2() >= -2), uint16_t,
3980 MakeUnsigned<TFromV<V>>>;
3981 using TIdx = If<sizeof(TU) == 1, uint16_t, TU>;
3982
3983 const Repartition<TU, decltype(d)> du;
3984 const Rebind<TIdx, decltype(du)> d_idx;
3985 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
3986 "Invalid block index");
3987 constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TU);
3988
3989 constexpr size_t kBlkByteOffset =
3990 static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock;
3991 const auto vu = BitCast(du, v);
3992 const auto vblk = ResizeBitCast(du, blk_to_insert);
3993 const auto vblk_shifted = detail::SlideUp(vblk, vblk, kBlkByteOffset);
3994 const auto insert_mask = RebindMask(
3995 du, detail::LtS(detail::SubS(detail::Iota0(d_idx),
3996 static_cast<TIdx>(kBlkByteOffset)),
3997 static_cast<TIdx>(kMaxLanesPerBlock)));
3998
3999 return BitCast(d, IfThenElse(insert_mask, vblk_shifted, vu));
4000}
4001
4002// ------------------------------ BroadcastBlock
4003template <int kBlockIdx, class V, HWY_IF_POW2_LE_D(DFromV<V>, -3)>
4004HWY_API V BroadcastBlock(V v) {
4005 const DFromV<decltype(v)> d;
4006 const Repartition<uint8_t, decltype(d)> du8;
4007 const Rebind<uint16_t, decltype(d)> du16;
4008
4009 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
4010 "Invalid block index");
4011
4012 const auto idx = detail::AddS(detail::AndS(detail::Iota0(du16), uint16_t{15}),
4013 static_cast<uint16_t>(kBlockIdx * 16));
4014 return BitCast(d, detail::TableLookupLanes16(BitCast(du8, v), idx));
4015}
4016
4017template <int kBlockIdx, class V, HWY_IF_POW2_GT_D(DFromV<V>, -3)>
4018HWY_API V BroadcastBlock(V v) {
4019 const DFromV<decltype(v)> d;
4020 using TU = If<sizeof(TFromV<V>) == 1, uint16_t, MakeUnsigned<TFromV<V>>>;
4021 const Repartition<TU, decltype(d)> du;
4022
4023 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
4024 "Invalid block index");
4025 constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TU);
4026
4027 const auto idx = detail::AddS(
4028 detail::AndS(detail::Iota0(du), static_cast<TU>(kMaxLanesPerBlock - 1)),
4029 static_cast<TU>(static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock));
4030 return BitCast(d, TableLookupLanes(BitCast(du, v), idx));
4031}
4032
4033// ------------------------------ ExtractBlock
4034template <int kBlockIdx, class V>
4036 const DFromV<decltype(v)> d;
4037 const BlockDFromD<decltype(d)> d_block;
4038
4039 static_assert(0 <= kBlockIdx && kBlockIdx < d.MaxBlocks(),
4040 "Invalid block index");
4041 constexpr size_t kMaxLanesPerBlock = 16 / sizeof(TFromD<decltype(d)>);
4042 constexpr size_t kBlkByteOffset =
4043 static_cast<size_t>(kBlockIdx) * kMaxLanesPerBlock;
4044
4045 return ResizeBitCast(d_block, detail::SlideDown(v, kBlkByteOffset));
4046}
4047
4048// ------------------------------ ShiftLeftLanes
4049
4050template <size_t kLanes, class D, class V = VFromD<D>>
4051HWY_API V ShiftLeftLanes(const D d, const V v) {
4052 const RebindToSigned<decltype(d)> di;
4053 const RebindToUnsigned<decltype(d)> du;
4054 using TI = TFromD<decltype(di)>;
4055 const auto shifted = detail::SlideUp(v, v, kLanes);
4056 // Match x86 semantics by zeroing lower lanes in 128-bit blocks
4057 const auto idx_mod =
4058 detail::AndS(BitCast(di, detail::Iota0(du)),
4059 static_cast<TI>(detail::LanesPerBlock(di) - 1));
4060 const auto clear = detail::LtS(idx_mod, static_cast<TI>(kLanes));
4061 return IfThenZeroElse(clear, shifted);
4062}
4063
4064template <size_t kLanes, class V>
4065HWY_API V ShiftLeftLanes(const V v) {
4066 return ShiftLeftLanes<kLanes>(DFromV<V>(), v);
4067}
4068
4069// ------------------------------ ShiftLeftBytes
4070
4071template <int kBytes, class D>
4072HWY_API VFromD<D> ShiftLeftBytes(D d, const VFromD<D> v) {
4073 const Repartition<uint8_t, decltype(d)> d8;
4074 return BitCast(d, ShiftLeftLanes<kBytes>(BitCast(d8, v)));
4075}
4076
4077template <int kBytes, class V>
4078HWY_API V ShiftLeftBytes(const V v) {
4079 return ShiftLeftBytes<kBytes>(DFromV<V>(), v);
4080}
4081
4082// ------------------------------ ShiftRightLanes
4083template <size_t kLanes, typename T, size_t N, int kPow2,
4084 class V = VFromD<Simd<T, N, kPow2>>>
4085HWY_API V ShiftRightLanes(const Simd<T, N, kPow2> d, V v) {
4086 const RebindToSigned<decltype(d)> di;
4087 const RebindToUnsigned<decltype(d)> du;
4088 using TI = TFromD<decltype(di)>;
4089 // For partial vectors, clear upper lanes so we shift in zeros.
4090 if (N <= 16 / sizeof(T)) {
4091 v = detail::SlideUp(v, Zero(d), N);
4092 }
4093
4094 const auto shifted = detail::SlideDown(v, kLanes);
4095 // Match x86 semantics by zeroing upper lanes in 128-bit blocks
4096 const size_t lpb = detail::LanesPerBlock(di);
4097 const auto idx_mod =
4098 detail::AndS(BitCast(di, detail::Iota0(du)), static_cast<TI>(lpb - 1));
4099 const auto keep = detail::LtS(idx_mod, static_cast<TI>(lpb - kLanes));
4100 return IfThenElseZero(keep, shifted);
4101}
4102
4103// ------------------------------ ShiftRightBytes
4104template <int kBytes, class D, class V = VFromD<D>>
4105HWY_API V ShiftRightBytes(const D d, const V v) {
4106 const Repartition<uint8_t, decltype(d)> d8;
4107 return BitCast(d, ShiftRightLanes<kBytes>(d8, BitCast(d8, v)));
4108}
4109
4110// ------------------------------ InterleaveWholeLower
4111#ifdef HWY_NATIVE_INTERLEAVE_WHOLE
4112#undef HWY_NATIVE_INTERLEAVE_WHOLE
4113#else
4114#define HWY_NATIVE_INTERLEAVE_WHOLE
4115#endif
4116
4117namespace detail {
4118// Returns double-length vector with interleaved lanes.
4119template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4)),
4120 HWY_IF_POW2_GT_D(D, -3)>
4121HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
4122 const RebindToUnsigned<decltype(d)> du;
4123 using TW = MakeWide<TFromD<decltype(du)>>;
4124 const Rebind<TW, Half<decltype(du)>> dw;
4125 const Half<decltype(du)> duh; // cast inputs to unsigned so we zero-extend
4126
4127 const VFromD<decltype(dw)> aw = PromoteTo(dw, BitCast(duh, a));
4128 const VFromD<decltype(dw)> bw = PromoteTo(dw, BitCast(duh, b));
4129 return BitCast(d, Or(aw, BitCast(dw, detail::Slide1Up(BitCast(du, bw)))));
4130}
4131// 64-bit: cannot PromoteTo, but can Ext.
4132template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_LE_D(D, 2)>
4133HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
4134 const RebindToUnsigned<decltype(d)> du;
4135 const auto idx = ShiftRight<1>(detail::Iota0(du));
4136 return OddEven(TableLookupLanes(detail::Ext(d, b), idx),
4137 TableLookupLanes(detail::Ext(d, a), idx));
4138}
4139template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_POW2_GT_D(D, 2)>
4140HWY_API VFromD<D> InterleaveWhole(D d, VFromD<Half<D>> a, VFromD<Half<D>> b) {
4141 const Half<D> dh;
4142 const Half<decltype(dh)> dq;
4143 const VFromD<decltype(dh)> i0 =
4144 InterleaveWhole(dh, LowerHalf(dq, a), LowerHalf(dq, b));
4145 const VFromD<decltype(dh)> i1 =
4146 InterleaveWhole(dh, UpperHalf(dq, a), UpperHalf(dq, b));
4147 return Combine(d, i1, i0);
4148}
4149
4150} // namespace detail
4151
4152template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
4153HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4154 const RebindToUnsigned<decltype(d)> du;
4155 const detail::AdjustSimdTagToMinVecPow2<RepartitionToWide<decltype(du)>> dw;
4156 const RepartitionToNarrow<decltype(dw)> du_src;
4157
4158 const VFromD<D> aw =
4159 ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, a)));
4160 const VFromD<D> bw =
4161 ResizeBitCast(d, PromoteLowerTo(dw, ResizeBitCast(du_src, b)));
4162 return Or(aw, detail::Slide1Up(bw));
4163}
4164
4165template <class D, HWY_IF_T_SIZE_D(D, 8)>
4166HWY_API VFromD<D> InterleaveWholeLower(D d, VFromD<D> a, VFromD<D> b) {
4167 const RebindToUnsigned<decltype(d)> du;
4168 const auto idx = ShiftRight<1>(detail::Iota0(du));
4169 return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
4170}
4171
4172// ------------------------------ InterleaveWholeUpper
4173
4174template <class D, HWY_IF_T_SIZE_ONE_OF_D(D, (1 << 1) | (1 << 2) | (1 << 4))>
4175HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4176 // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
4177 // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
4178 // true, and as the results of InterleaveWholeUpper are
4179 // implementation-defined if Lanes(d) is less than 2.
4180 const size_t half_N = Lanes(d) / 2;
4181 return InterleaveWholeLower(d, detail::SlideDown(a, half_N),
4182 detail::SlideDown(b, half_N));
4183}
4184
4185template <class D, HWY_IF_T_SIZE_D(D, 8)>
4186HWY_API VFromD<D> InterleaveWholeUpper(D d, VFromD<D> a, VFromD<D> b) {
4187 // Use Lanes(d) / 2 instead of Lanes(Half<D>()) as Lanes(Half<D>()) can only
4188 // be called if (d.Pow2() >= -2 && d.Pow2() == DFromV<VFromD<D>>().Pow2()) is
4189 // true and as the results of InterleaveWholeUpper are implementation-defined
4190 // if Lanes(d) is less than 2.
4191 const size_t half_N = Lanes(d) / 2;
4192 const RebindToUnsigned<decltype(d)> du;
4193 const auto idx = detail::AddS(ShiftRight<1>(detail::Iota0(du)),
4194 static_cast<uint64_t>(half_N));
4195 return OddEven(TableLookupLanes(b, idx), TableLookupLanes(a, idx));
4196}
4197
4198// ------------------------------ InterleaveLower (InterleaveWholeLower)
4199
4200namespace detail {
4201
4202// Definitely at least 128 bit: match x86 semantics (independent blocks). Using
4203// InterleaveWhole and 64-bit Compress avoids 8-bit overflow.
4204template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
4205HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
4206 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
4207 const Twice<D> dt;
4208 const RebindToUnsigned<decltype(dt)> dt_u;
4209 const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
4210 // Keep only even 128-bit blocks. This is faster than u64 ConcatEven
4211 // because we only have a single vector.
4212 constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
4213 const VFromD<decltype(dt_u)> idx_block =
4214 ShiftRight<kShift>(detail::Iota0(dt_u));
4215 const MFromD<decltype(dt_u)> is_even =
4216 detail::EqS(detail::AndS(idx_block, 1), 0);
4217 return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_even)));
4218}
4219template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
4220HWY_INLINE V InterleaveLowerBlocks(D d, const V a, const V b) {
4221 const Half<D> dh;
4222 const VFromD<decltype(dh)> i0 =
4223 InterleaveLowerBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
4224 const VFromD<decltype(dh)> i1 =
4225 InterleaveLowerBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
4226 return Combine(d, i1, i0);
4227}
4228
4229// As above, for the upper half of blocks.
4230template <class D, class V, HWY_IF_POW2_LE_D(D, 2)>
4231HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
4232 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
4233 const Twice<D> dt;
4234 const RebindToUnsigned<decltype(dt)> dt_u;
4235 const VFromD<decltype(dt)> interleaved = detail::InterleaveWhole(dt, a, b);
4236 // Keep only odd 128-bit blocks. This is faster than u64 ConcatOdd
4237 // because we only have a single vector.
4238 constexpr size_t kShift = CeilLog2(16 / sizeof(TFromD<D>));
4239 const VFromD<decltype(dt_u)> idx_block =
4240 ShiftRight<kShift>(detail::Iota0(dt_u));
4241 const MFromD<decltype(dt_u)> is_odd =
4242 detail::EqS(detail::AndS(idx_block, 1), 1);
4243 return BitCast(d, LowerHalf(Compress(BitCast(dt_u, interleaved), is_odd)));
4244}
4245template <class D, class V, HWY_IF_POW2_GT_D(D, 2)>
4246HWY_INLINE V InterleaveUpperBlocks(D d, const V a, const V b) {
4247 const Half<D> dh;
4248 const VFromD<decltype(dh)> i0 =
4249 InterleaveUpperBlocks(dh, LowerHalf(dh, a), LowerHalf(dh, b));
4250 const VFromD<decltype(dh)> i1 =
4251 InterleaveUpperBlocks(dh, UpperHalf(dh, a), UpperHalf(dh, b));
4252 return Combine(d, i1, i0);
4253}
4254
4255// RVV vectors are at least 128 bit when there is no fractional LMUL nor cap.
4256// Used by functions with per-block behavior such as InterleaveLower.
4257template <typename T, size_t N, int kPow2>
4258constexpr bool IsGE128(Simd<T, N, kPow2> /* d */) {
4259 return N * sizeof(T) >= 16 && kPow2 >= 0;
4260}
4261
4262// Definitely less than 128-bit only if there is a small cap; fractional LMUL
4263// might not be enough if vectors are large.
4264template <typename T, size_t N, int kPow2>
4265constexpr bool IsLT128(Simd<T, N, kPow2> /* d */) {
4266 return N * sizeof(T) < 16;
4267}
4268
4269} // namespace detail
4270
4271#define HWY_RVV_IF_GE128_D(D) hwy::EnableIf<detail::IsGE128(D())>* = nullptr
4272#define HWY_RVV_IF_LT128_D(D) hwy::EnableIf<detail::IsLT128(D())>* = nullptr
4273#define HWY_RVV_IF_CAN128_D(D) \
4274 hwy::EnableIf<!detail::IsLT128(D()) && !detail::IsGE128(D())>* = nullptr
4275
4276template <class D, class V, HWY_RVV_IF_GE128_D(D)>
4277HWY_API V InterleaveLower(D d, const V a, const V b) {
4278 return detail::InterleaveLowerBlocks(d, a, b);
4279}
4280
4281// Single block: interleave without extra Compress.
4282template <class D, class V, HWY_RVV_IF_LT128_D(D)>
4283HWY_API V InterleaveLower(D d, const V a, const V b) {
4284 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
4285 return InterleaveWholeLower(d, a, b);
4286}
4287
4288// Could be either; branch at runtime.
4289template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
4290HWY_API V InterleaveLower(D d, const V a, const V b) {
4291 if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
4292 return InterleaveWholeLower(d, a, b);
4293 }
4294 // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
4295 const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
4296 return ResizeBitCast(d, detail::InterleaveLowerBlocks(
4297 d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
4298}
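// Note on the runtime branch above (added for clarity): for capped or
// fractional-LMUL tags the vector may or may not span a full 128-bit block,
// so this is only known at runtime; detouring through an LMUL >= 1 tag
// guarantees the u64 casts inside InterleaveLowerBlocks are valid.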
4299
4300template <class V>
4301HWY_API V InterleaveLower(const V a, const V b) {
4302 return InterleaveLower(DFromV<V>(), a, b);
4303}
4304
4305// ------------------------------ InterleaveUpper (Compress)
4306
4307template <class D, class V, HWY_RVV_IF_GE128_D(D)>
4308HWY_API V InterleaveUpper(D d, const V a, const V b) {
4309 return detail::InterleaveUpperBlocks(d, a, b);
4310}
4311
4312// Single block: interleave without extra Compress.
4313template <class D, class V, HWY_RVV_IF_LT128_D(D)>
4314HWY_API V InterleaveUpper(D d, const V a, const V b) {
4315 static_assert(IsSame<TFromD<D>, TFromV<V>>(), "D/V mismatch");
4316 return InterleaveWholeUpper(d, a, b);
4317}
4318
4319// Could be either; branch at runtime.
4320template <class D, class V, HWY_RVV_IF_CAN128_D(D)>
4321HWY_API V InterleaveUpper(D d, const V a, const V b) {
4322 if (Lanes(d) * sizeof(TFromD<D>) <= 16) {
4323 return InterleaveWholeUpper(d, a, b);
4324 }
4325 // Fractional LMUL: use LMUL=1 to ensure we can cast to u64.
4326 const ScalableTag<TFromD<D>, HWY_MAX(d.Pow2(), 0)> d1;
4327 return ResizeBitCast(d, detail::InterleaveUpperBlocks(
4328 d1, ResizeBitCast(d1, a), ResizeBitCast(d1, b)));
4329}
4330
4331// ------------------------------ ZipLower
4332
4333template <class V, class DW = RepartitionToWide<DFromV<V>>>
4334HWY_API VFromD<DW> ZipLower(DW dw, V a, V b) {
4335 const RepartitionToNarrow<DW> dn;
4336 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
4337 return BitCast(dw, InterleaveLower(dn, a, b));
4338}
4339
4340template <class V, class DW = RepartitionToWide<DFromV<V>>>
4341HWY_API VFromD<DW> ZipLower(V a, V b) {
4342 return BitCast(DW(), InterleaveLower(a, b));
4343}
4344
4345// ------------------------------ ZipUpper
4346template <class DW, class V>
4347HWY_API VFromD<DW> ZipUpper(DW dw, V a, V b) {
4348 const RepartitionToNarrow<DW> dn;
4349 static_assert(IsSame<TFromD<decltype(dn)>, TFromV<V>>(), "D/V mismatch");
4350 return BitCast(dw, InterleaveUpper(dn, a, b));
4351}
4352
4353// ================================================== REDUCE
4354
4355// We have ReduceSum, generic_ops-inl.h defines SumOfLanes via Set.
4356#ifdef HWY_NATIVE_REDUCE_SCALAR
4357#undef HWY_NATIVE_REDUCE_SCALAR
4358#else
4359#define HWY_NATIVE_REDUCE_SCALAR
4360#endif
4361
4362// scalar = f(vector, zero_m1)
4363#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4364 MLEN, NAME, OP) \
4365 template <size_t N> \
4366 HWY_API HWY_RVV_T(BASE, SEW) \
4367 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, HWY_RVV_V(BASE, SEW, LMUL) v, \
4368 HWY_RVV_V(BASE, SEW, m1) v0) { \
4369 return GetLane(__riscv_v##OP##_vs_##CHAR##SEW##LMUL##_##CHAR##SEW##m1( \
4370 v, v0, Lanes(d))); \
4371 }
4372
4373 // detail::RedSum, detail::RedMin, and detail::RedMax are more efficient
4374 // for N=4 I8/U8 reductions on RVV than the default implementations of the
4375 // N=4 I8/U8 ReduceSum/ReduceMin/ReduceMax operations in generic_ops-inl.h.
4376#undef HWY_IF_REDUCE_D
4377#define HWY_IF_REDUCE_D(D) hwy::EnableIf<HWY_MAX_LANES_D(D) != 1>* = nullptr
4378
4379#ifdef HWY_NATIVE_REDUCE_SUM_4_UI8
4380#undef HWY_NATIVE_REDUCE_SUM_4_UI8
4381#else
4382#define HWY_NATIVE_REDUCE_SUM_4_UI8
4383#endif
4384
4385#ifdef HWY_NATIVE_REDUCE_MINMAX_4_UI8
4386#undef HWY_NATIVE_REDUCE_MINMAX_4_UI8
4387#else
4388#define HWY_NATIVE_REDUCE_MINMAX_4_UI8
4389#endif
4390
4391// ------------------------------ ReduceSum
4392
4393namespace detail {
4394HWY_RVV_FOREACH_UI(HWY_RVV_REDUCE, RedSum, redsum, _ALL_VIRT)
4395HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedSum, fredusum, _ALL_VIRT)
4396} // namespace detail
4397
4398template <class D, HWY_IF_REDUCE_D(D)>
4399HWY_API TFromD<D> ReduceSum(D d, const VFromD<D> v) {
4400 const auto v0 = Zero(ScalableTag<TFromD<D>>()); // always m1
4401 return detail::RedSum(d, v, v0);
4402}
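// Example (illustrative): for an int32_t tag d with N = Lanes(d),
// ReduceSum(d, Iota(d, 1)) returns 1 + 2 + ... + N = N * (N + 1) / 2.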
4403
4404// ------------------------------ ReduceMin
4405namespace detail {
4406HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMin, redminu, _ALL_VIRT)
4407HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMin, redmin, _ALL_VIRT)
4408HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMin, fredmin, _ALL_VIRT)
4409} // namespace detail
4410
4411template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
4412HWY_API T ReduceMin(D d, const VFromD<D> v) {
4413 const ScalableTag<T> d1; // always m1
4414 return detail::RedMin(d, v, Set(d1, HighestValue<T>()));
4415}
4416
4417// ------------------------------ ReduceMax
4418namespace detail {
4419HWY_RVV_FOREACH_U(HWY_RVV_REDUCE, RedMax, redmaxu, _ALL_VIRT)
4420HWY_RVV_FOREACH_I(HWY_RVV_REDUCE, RedMax, redmax, _ALL_VIRT)
4421HWY_RVV_FOREACH_F(HWY_RVV_REDUCE, RedMax, fredmax, _ALL_VIRT)
4422} // namespace detail
4423
4424template <class D, typename T = TFromD<D>, HWY_IF_REDUCE_D(D)>
4425HWY_API T ReduceMax(D d, const VFromD<D> v) {
4426 const ScalableTag<T> d1; // always m1
4427 return detail::RedMax(d, v, Set(d1, LowestValue<T>()));
4428}
4429
4430#undef HWY_RVV_REDUCE
4431
4432// ------------------------------ SumOfLanes
4433
4434template <class D, HWY_IF_LANES_GT_D(D, 1)>
4435HWY_API VFromD<D> SumOfLanes(D d, VFromD<D> v) {
4436 return Set(d, ReduceSum(d, v));
4437}
4438template <class D, HWY_IF_LANES_GT_D(D, 1)>
4439HWY_API VFromD<D> MinOfLanes(D d, VFromD<D> v) {
4440 return Set(d, ReduceMin(d, v));
4441}
4442template <class D, HWY_IF_LANES_GT_D(D, 1)>
4443HWY_API VFromD<D> MaxOfLanes(D d, VFromD<D> v) {
4444 return Set(d, ReduceMax(d, v));
4445}
4446
4447// ================================================== Ops with dependencies
4448
4449// ------------------------------ LoadInterleaved2
4450
4451// Per-target flag to prevent generic_ops-inl.h from defining LoadInterleaved2.
4452#ifdef HWY_NATIVE_LOAD_STORE_INTERLEAVED
4453#undef HWY_NATIVE_LOAD_STORE_INTERLEAVED
4454#else
4455#define HWY_NATIVE_LOAD_STORE_INTERLEAVED
4456#endif
4457
4458// Requires Clang 16+, GCC 14+; otherwise emulated in generic_ops-inl.h.
4459#if HWY_HAVE_TUPLE
4460
4461#define HWY_RVV_GET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4462 MLEN, NAME, OP) \
4463 template <size_t kIndex> \
4464 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
4465 NAME##2(HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup) { \
4466 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x2_##CHAR##SEW##LMUL(tup, \
4467 kIndex); \
4468 } \
4469 template <size_t kIndex> \
4470 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
4471 NAME##3(HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup) { \
4472 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x3_##CHAR##SEW##LMUL(tup, \
4473 kIndex); \
4474 } \
4475 template <size_t kIndex> \
4476 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
4477 NAME##4(HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup) { \
4478 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##x4_##CHAR##SEW##LMUL(tup, \
4479 kIndex); \
4480 }
4481
4482HWY_RVV_FOREACH(HWY_RVV_GET, Get, get, _LE2)
4483#undef HWY_RVV_GET
4484
4485#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4486 MLEN, NAME, OP) \
4487 template <size_t kIndex> \
4488 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) NAME##2( \
4489 HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \
4490 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x2( \
4491 tup, kIndex, v); \
4492 } \
4493 template <size_t kIndex> \
4494 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3( \
4495 HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \
4496 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x3( \
4497 tup, kIndex, v); \
4498 } \
4499 template <size_t kIndex> \
4500 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) NAME##4( \
4501 HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup, HWY_RVV_V(BASE, SEW, LMUL) v) { \
4502 return __riscv_v##OP##_v_##CHAR##SEW##LMUL##_##CHAR##SEW##LMUL##x4( \
4503 tup, kIndex, v); \
4504 }
4505
4506HWY_RVV_FOREACH(HWY_RVV_SET, Set, set, _LE2)
4507#undef HWY_RVV_SET
4508
4509// RVV does not provide vcreate, so implement using Set.
4510#define HWY_RVV_CREATE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4511 MLEN, NAME, OP) \
4512 template <size_t N> \
4513 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 2) \
4514 NAME##2(HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, \
4515 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1) { \
4516 HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup{}; \
4517 tup = Set2<0>(tup, v0); \
4518 tup = Set2<1>(tup, v1); \
4519 return tup; \
4520 } \
4521 template <size_t N> \
4522 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 3) NAME##3( \
4523 HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, HWY_RVV_V(BASE, SEW, LMUL) v0, \
4524 HWY_RVV_V(BASE, SEW, LMUL) v1, HWY_RVV_V(BASE, SEW, LMUL) v2) { \
4525 HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup{}; \
4526 tup = Set3<0>(tup, v0); \
4527 tup = Set3<1>(tup, v1); \
4528 tup = Set3<2>(tup, v2); \
4529 return tup; \
4530 } \
4531 template <size_t N> \
4532 HWY_API HWY_RVV_TUP(BASE, SEW, LMUL, 4) \
4533 NAME##4(HWY_RVV_D(BASE, SEW, N, SHIFT) /*d*/, \
4534 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
4535 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3) { \
4536 HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup{}; \
4537 tup = Set4<0>(tup, v0); \
4538 tup = Set4<1>(tup, v1); \
4539 tup = Set4<2>(tup, v2); \
4540 tup = Set4<3>(tup, v3); \
4541 return tup; \
4542 }
4543
4544HWY_RVV_FOREACH(HWY_RVV_CREATE, Create, xx, _LE2_VIRT)
4545#undef HWY_RVV_CREATE
4546
4547template <class D>
4548using Vec2 = decltype(Create2(D(), Zero(D()), Zero(D())));
4549template <class D>
4550using Vec3 = decltype(Create3(D(), Zero(D()), Zero(D()), Zero(D())));
4551template <class D>
4552using Vec4 = decltype(Create4(D(), Zero(D()), Zero(D()), Zero(D()), Zero(D())));
4553
4554#define HWY_RVV_LOAD2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4555 MLEN, NAME, OP) \
4556 template <size_t N> \
4557 HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4558 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
4559 HWY_RVV_V(BASE, SEW, LMUL) & v0, \
4560 HWY_RVV_V(BASE, SEW, LMUL) & v1) { \
4561 const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = \
4562 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, Lanes(d)); \
4563 v0 = Get2<0>(tup); \
4564 v1 = Get2<1>(tup); \
4565 }
4566// Segments are limited to 8 registers, so we can only go up to LMUL=2.
4567HWY_RVV_FOREACH(HWY_RVV_LOAD2, LoadInterleaved2, lseg2, _LE2_VIRT)
4568#undef HWY_RVV_LOAD2
4569
4570// ------------------------------ LoadInterleaved3
4571
4572#define HWY_RVV_LOAD3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4573 MLEN, NAME, OP) \
4574 template <size_t N> \
4575 HWY_API void NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4576 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
4577 HWY_RVV_V(BASE, SEW, LMUL) & v0, \
4578 HWY_RVV_V(BASE, SEW, LMUL) & v1, \
4579 HWY_RVV_V(BASE, SEW, LMUL) & v2) { \
4580 const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = \
4581 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, Lanes(d)); \
4582 v0 = Get3<0>(tup); \
4583 v1 = Get3<1>(tup); \
4584 v2 = Get3<2>(tup); \
4585 }
4586// Segments are limited to 8 registers, so we can only go up to LMUL=2.
4587HWY_RVV_FOREACH(HWY_RVV_LOAD3, LoadInterleaved3, lseg3, _LE2_VIRT)
4588#undef HWY_RVV_LOAD3
4589
4590// ------------------------------ LoadInterleaved4
4591
4592#define HWY_RVV_LOAD4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4593 MLEN, NAME, OP) \
4594 template <size_t N> \
4595 HWY_API void NAME( \
4596 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4597 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned, \
4598 HWY_RVV_V(BASE, SEW, LMUL) & v0, HWY_RVV_V(BASE, SEW, LMUL) & v1, \
4599 HWY_RVV_V(BASE, SEW, LMUL) & v2, HWY_RVV_V(BASE, SEW, LMUL) & v3) { \
4600 const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = \
4601 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, Lanes(d)); \
4602 v0 = Get4<0>(tup); \
4603 v1 = Get4<1>(tup); \
4604 v2 = Get4<2>(tup); \
4605 v3 = Get4<3>(tup); \
4606 }
4607// Segments are limited to 8 registers, so we can only go up to LMUL=2.
4608HWY_RVV_FOREACH(HWY_RVV_LOAD4, LoadInterleaved4, lseg4, _LE2_VIRT)
4609#undef HWY_RVV_LOAD4
4610
4611// ------------------------------ StoreInterleaved2
4612
4613#define HWY_RVV_STORE2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4614 MLEN, NAME, OP) \
4615 template <size_t N> \
4616 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v0, \
4617 HWY_RVV_V(BASE, SEW, LMUL) v1, \
4618 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4619 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
4620 const HWY_RVV_TUP(BASE, SEW, LMUL, 2) tup = Create2(d, v0, v1); \
4621 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x2(unaligned, tup, Lanes(d)); \
4622 }
4623// Segments are limited to 8 registers, so we can only go up to LMUL=2.
4624HWY_RVV_FOREACH(HWY_RVV_STORE2, StoreInterleaved2, sseg2, _LE2_VIRT)
4625#undef HWY_RVV_STORE2
4626
4627// ------------------------------ StoreInterleaved3
4628
4629#define HWY_RVV_STORE3(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4630 MLEN, NAME, OP) \
4631 template <size_t N> \
4632 HWY_API void NAME( \
4633 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
4634 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4635 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
4636 const HWY_RVV_TUP(BASE, SEW, LMUL, 3) tup = Create3(d, v0, v1, v2); \
4637 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x3(unaligned, tup, Lanes(d)); \
4638 }
4639// Segments are limited to 8 registers, so we can only go up to LMUL=2.
4640HWY_RVV_FOREACH(HWY_RVV_STORE3, StoreInterleaved3, sseg3, _LE2_VIRT)
4641#undef HWY_RVV_STORE3
4642
4643// ------------------------------ StoreInterleaved4
4644
4645#define HWY_RVV_STORE4(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, \
4646 MLEN, NAME, OP) \
4647 template <size_t N> \
4648 HWY_API void NAME( \
4649 HWY_RVV_V(BASE, SEW, LMUL) v0, HWY_RVV_V(BASE, SEW, LMUL) v1, \
4650 HWY_RVV_V(BASE, SEW, LMUL) v2, HWY_RVV_V(BASE, SEW, LMUL) v3, \
4651 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4652 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT unaligned) { \
4653 const HWY_RVV_TUP(BASE, SEW, LMUL, 4) tup = Create4(d, v0, v1, v2, v3); \
4654 __riscv_v##OP##e##SEW##_v_##CHAR##SEW##LMUL##x4(unaligned, tup, Lanes(d)); \
4655 }
4656// Segments are limited to 8 registers, so we can only go up to LMUL=2.
4657HWY_RVV_FOREACH(HWY_RVV_STORE4, StoreInterleaved4, sseg4, _LE2_VIRT)
4658#undef HWY_RVV_STORE4
4659
4660#else // !HWY_HAVE_TUPLE
4661
4662template <class D, typename T = TFromD<D>>
4663HWY_API void LoadInterleaved2(D d, const T* HWY_RESTRICT unaligned,
4664 VFromD<D>& v0, VFromD<D>& v1) {
4665 const VFromD<D> A = LoadU(d, unaligned); // v1[1] v0[1] v1[0] v0[0]
4666 const VFromD<D> B = LoadU(d, unaligned + Lanes(d));
4667 v0 = ConcatEven(d, B, A);
4668 v1 = ConcatOdd(d, B, A);
4669}
4670
4671namespace detail {
4672#define HWY_RVV_LOAD_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
4673 SHIFT, MLEN, NAME, OP) \
4674 template <size_t N> \
4675 HWY_API HWY_RVV_V(BASE, SEW, LMUL) \
4676 NAME(HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4677 const HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \
4678 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
4679 p, static_cast<ptrdiff_t>(stride), Lanes(d)); \
4680 }
4681HWY_RVV_FOREACH(HWY_RVV_LOAD_STRIDED, LoadStrided, lse, _ALL_VIRT)
4682#undef HWY_RVV_LOAD_STRIDED
4683} // namespace detail
4684
4685template <class D, typename T = TFromD<D>>
4686HWY_API void LoadInterleaved3(D d, const TFromD<D>* HWY_RESTRICT unaligned,
4687 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2) {
4688 // The stride is in bytes, not lanes, which is not clearly documented.
4689 v0 = detail::LoadStrided(d, unaligned + 0, 3 * sizeof(T));
4690 v1 = detail::LoadStrided(d, unaligned + 1, 3 * sizeof(T));
4691 v2 = detail::LoadStrided(d, unaligned + 2, 3 * sizeof(T));
4692}
4693
4694template <class D, typename T = TFromD<D>>
4695HWY_API void LoadInterleaved4(D d, const TFromD<D>* HWY_RESTRICT unaligned,
4696 VFromD<D>& v0, VFromD<D>& v1, VFromD<D>& v2,
4697 VFromD<D>& v3) {
4698 // The stride is in bytes, not lanes, which is not clearly documented.
4699 v0 = detail::LoadStrided(d, unaligned + 0, 4 * sizeof(T));
4700 v1 = detail::LoadStrided(d, unaligned + 1, 4 * sizeof(T));
4701 v2 = detail::LoadStrided(d, unaligned + 2, 4 * sizeof(T));
4702 v3 = detail::LoadStrided(d, unaligned + 3, 4 * sizeof(T));
4703}
4704
4705 // Neither 64-bit nor max LMUL: interleave via promote, slide, Or.
4706template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
4707 HWY_IF_POW2_LE_D(D, 2)>
4708 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
4709 T* HWY_RESTRICT unaligned) {
4710 const RebindToUnsigned<D> du;
4711 const Twice<RepartitionToWide<decltype(du)>> duw;
4712 const Twice<decltype(d)> dt;
4713 // Interleave with zero by promoting to wider (unsigned) type.
4714 const VFromD<decltype(dt)> w0 = BitCast(dt, PromoteTo(duw, BitCast(du, v0)));
4715 const VFromD<decltype(dt)> w1 = BitCast(dt, PromoteTo(duw, BitCast(du, v1)));
4716 // OR second vector into the zero-valued lanes (faster than OddEven).
4717 StoreU(Or(w0, detail::Slide1Up(w1)), dt, unaligned);
4718}
4719
4720 // Can promote (not 64-bit), but max LMUL: recurse on two half-length stores.
4721template <class D, typename T = TFromD<D>, HWY_IF_NOT_T_SIZE_D(D, 8),
4722 HWY_IF_POW2_GT_D(D, 2)>
4723HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
4724 T* HWY_RESTRICT unaligned) {
4725 const Half<decltype(d)> dh;
4726 StoreInterleaved2(LowerHalf(dh, v0), LowerHalf(dh, v1), d, unaligned);
4727 StoreInterleaved2(UpperHalf(dh, v0), UpperHalf(dh, v1), d,
4728 unaligned + Lanes(d));
4729}
4730
4731namespace detail {
4732#define HWY_RVV_STORE_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
4733 SHIFT, MLEN, NAME, OP) \
4734 template <size_t N> \
4735 HWY_API void NAME(HWY_RVV_V(BASE, SEW, LMUL) v, \
4736 HWY_RVV_D(BASE, SEW, N, SHIFT) d, \
4737 HWY_RVV_T(BASE, SEW) * HWY_RESTRICT p, size_t stride) { \
4738 return __riscv_v##OP##SEW##_v_##CHAR##SEW##LMUL( \
4739 p, static_cast<ptrdiff_t>(stride), v, Lanes(d)); \
4740 }
4741HWY_RVV_FOREACH(HWY_RVV_STORE_STRIDED, StoreStrided, sse, _ALL_VIRT)
4742#undef HWY_RVV_STORE_STRIDED
4743} // namespace detail
4744
4745// 64-bit: strided
4746template <class D, typename T = TFromD<D>, HWY_IF_T_SIZE_D(D, 8)>
4747 HWY_API void StoreInterleaved2(VFromD<D> v0, VFromD<D> v1, D d,
4748 T* HWY_RESTRICT unaligned) {
4749 // The stride is in bytes, not lanes, which is not clearly documented.
4750 detail::StoreStrided(v0, d, unaligned + 0, 2 * sizeof(T));
4751 detail::StoreStrided(v1, d, unaligned + 1, 2 * sizeof(T));
4752}
4753
4754template <class D, typename T = TFromD<D>>
4755 HWY_API void StoreInterleaved3(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2, D d,
4756 T* HWY_RESTRICT unaligned) {
4757 // The stride is in bytes, not lanes, which is not clearly documented.
4758 detail::StoreStrided(v0, d, unaligned + 0, 3 * sizeof(T));
4759 detail::StoreStrided(v1, d, unaligned + 1, 3 * sizeof(T));
4760 detail::StoreStrided(v2, d, unaligned + 2, 3 * sizeof(T));
4761}
4762
4763template <class D, typename T = TFromD<D>>
4764 HWY_API void StoreInterleaved4(VFromD<D> v0, VFromD<D> v1, VFromD<D> v2,
4765 VFromD<D> v3, D d, T* HWY_RESTRICT unaligned) {
4766 // The stride is in bytes, not lanes, which is not clearly documented.
4767 detail::StoreStrided(v0, d, unaligned + 0, 4 * sizeof(T));
4768 detail::StoreStrided(v1, d, unaligned + 1, 4 * sizeof(T));
4769 detail::StoreStrided(v2, d, unaligned + 2, 4 * sizeof(T));
4770 detail::StoreStrided(v3, d, unaligned + 3, 4 * sizeof(T));
4771}
4772
4773#endif // HWY_HAVE_TUPLE
4774
4775// ------------------------------ Dup128VecFromValues (ResizeBitCast)
4776
4777template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_D(D, 1)>
4778HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> /*t1*/) {
4779 return Set(d, t0);
4780}
4781
4782template <class D, HWY_IF_T_SIZE_D(D, 8), HWY_IF_LANES_GT_D(D, 1)>
4783HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1) {
4784 const auto even_lanes = Set(d, t0);
4785#if HWY_COMPILER_GCC && !HWY_IS_DEBUG_BUILD
4786 if (__builtin_constant_p(BitCastScalar<uint64_t>(t0) ==
4787 BitCastScalar<uint64_t>(t1)) &&
4788 (BitCastScalar<uint64_t>(t0) == BitCastScalar<uint64_t>(t1))) {
4789 return even_lanes;
4790 }
4791#endif
4792
4793 const auto odd_lanes = Set(d, t1);
4794 return OddEven(odd_lanes, even_lanes);
4795}
4796
4797namespace detail {
4798
4799#pragma pack(push, 1)
4800
4801template <class T>
4802struct alignas(8) Vec64ValsWrapper {
4803 static_assert(sizeof(T) >= 1, "sizeof(T) >= 1 must be true");
4804 static_assert(sizeof(T) <= 8, "sizeof(T) <= 8 must be true");
4805 T vals[8 / sizeof(T)];
4806};
4807
4808#pragma pack(pop)
4809
4810} // namespace detail
4811
4812template <class D, HWY_IF_T_SIZE_D(D, 1)>
4813HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
4814 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
4815 TFromD<D> t5, TFromD<D> t6, TFromD<D> t7,
4816 TFromD<D> t8, TFromD<D> t9, TFromD<D> t10,
4817 TFromD<D> t11, TFromD<D> t12,
4818 TFromD<D> t13, TFromD<D> t14,
4819 TFromD<D> t15) {
4820 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4821 return ResizeBitCast(
4822 d, Dup128VecFromValues(
4823 du64,
4824 BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
4825 {t0, t1, t2, t3, t4, t5, t6, t7}}),
4826 BitCastScalar<uint64_t>(detail::Vec64ValsWrapper<TFromD<D>>{
4827 {t8, t9, t10, t11, t12, t13, t14, t15}})));
4828}
4829
4830template <class D, HWY_IF_T_SIZE_D(D, 2)>
4831HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
4832 TFromD<D> t2, TFromD<D> t3, TFromD<D> t4,
4833 TFromD<D> t5, TFromD<D> t6,
4834 TFromD<D> t7) {
4835 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4836 return ResizeBitCast(
4837 d, Dup128VecFromValues(
4838 du64,
4839 BitCastScalar<uint64_t>(
4840 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1, t2, t3}}),
4841 BitCastScalar<uint64_t>(
4842 detail::Vec64ValsWrapper<TFromD<D>>{{t4, t5, t6, t7}})));
4843}
4844
4845template <class D, HWY_IF_T_SIZE_D(D, 4)>
4846HWY_API VFromD<D> Dup128VecFromValues(D d, TFromD<D> t0, TFromD<D> t1,
4847 TFromD<D> t2, TFromD<D> t3) {
4848 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, D>> du64;
4849 return ResizeBitCast(
4850 d,
4851 Dup128VecFromValues(du64,
4852 BitCastScalar<uint64_t>(
4853 detail::Vec64ValsWrapper<TFromD<D>>{{t0, t1}}),
4854 BitCastScalar<uint64_t>(
4855 detail::Vec64ValsWrapper<TFromD<D>>{{t2, t3}})));
4856}
4857
4858// ------------------------------ PopulationCount (ShiftRight)
4859
4860// Handles LMUL < 2 or capped vectors, which generic_ops-inl cannot.
4861template <typename V, class D = DFromV<V>, HWY_IF_U8_D(D),
4862 hwy::EnableIf<D().Pow2() < 1 || D().MaxLanes() < 16>* = nullptr>
4863HWY_API V PopulationCount(V v) {
4864 // See https://arxiv.org/pdf/1611.07612.pdf, Figure 3
4865 v = Sub(v, detail::AndS(ShiftRight<1>(v), 0x55));
4866 v = Add(detail::AndS(ShiftRight<2>(v), 0x33), detail::AndS(v, 0x33));
4867 return detail::AndS(Add(v, ShiftRight<4>(v)), 0x0F);
4868}
4869
4870// ------------------------------ LoadDup128
4871
4872template <class D>
4873HWY_API VFromD<D> LoadDup128(D d, const TFromD<D>* const HWY_RESTRICT p) {
4874 const RebindToUnsigned<decltype(d)> du;
4875
4876 // Make sure that no more than 16 bytes are loaded from p
4877 constexpr int kLoadPow2 = d.Pow2();
4878 constexpr size_t kMaxLanesToLoad =
4879 HWY_MIN(HWY_MAX_LANES_D(D), 16 / sizeof(TFromD<D>));
4880 constexpr size_t kLoadN = D::template NewN<kLoadPow2, kMaxLanesToLoad>();
4881 const Simd<TFromD<D>, kLoadN, kLoadPow2> d_load;
4882 static_assert(d_load.MaxBytes() <= 16,
4883 "d_load.MaxBytes() <= 16 must be true");
4884 static_assert((d.MaxBytes() < 16) || (d_load.MaxBytes() == 16),
4885 "d_load.MaxBytes() == 16 must be true if d.MaxBytes() >= 16 is "
4886 "true");
4887 static_assert((d.MaxBytes() >= 16) || (d_load.MaxBytes() == d.MaxBytes()),
4888 "d_load.MaxBytes() == d.MaxBytes() must be true if "
4889 "d.MaxBytes() < 16 is true");
4890
4891 const VFromD<D> loaded = Load(d_load, p);
4892 if (d.MaxBytes() <= 16) return loaded;
4893
4894 // idx must be unsigned for TableLookupLanes.
4895 using TU = TFromD<decltype(du)>;
4896 const TU mask = static_cast<TU>(detail::LanesPerBlock(d) - 1);
4897 // Broadcast the first block.
4898 const VFromD<RebindToUnsigned<D>> idx = detail::AndS(detail::Iota0(du), mask);
4899 // Safe even for 8-bit lanes because indices never exceed 15.
4900 return TableLookupLanes(loaded, idx);
4901}
4902
4903// ------------------------------ LoadMaskBits
4904
4905// Support all combinations of T and SHIFT(LMUL) without explicit overloads for
4906// each. First overload for MLEN=1..64.
4907namespace detail {
4908
4909// Maps D to MLEN (wrapped in SizeTag), such that #mask_bits = VLEN/MLEN. MLEN
4910// increases with lane size and decreases for increasing LMUL. Cap at 64, the
4911// largest supported by HWY_RVV_FOREACH_B (and intrinsics), for virtual LMUL
4912// e.g. vuint16mf8_t: (8*2 << 3) == 128.
4913template <class D>
4914using MaskTag = hwy::SizeTag<HWY_MIN(
4915 64, detail::ScaleByPower(8 * sizeof(TFromD<D>), -D().Pow2()))>;
4916
4917#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
4918 HWY_INLINE HWY_RVV_M(MLEN) \
4919 NAME(hwy::SizeTag<MLEN> /* tag */, const uint8_t* bits, size_t N) { \
4920 return __riscv_v##OP##_v_b##MLEN(bits, N); \
4921 }
4922 HWY_RVV_FOREACH_B(HWY_RVV_LOAD_MASK_BITS, LoadMaskBits, lm)
4923 #undef HWY_RVV_LOAD_MASK_BITS
4924} // namespace detail
4925
4926template <class D, class MT = detail::MaskTag<D>>
4927HWY_API auto LoadMaskBits(D d, const uint8_t* bits)
4928 -> decltype(detail::LoadMaskBits(MT(), bits, Lanes(d))) {
4929 return detail::LoadMaskBits(MT(), bits, Lanes(d));
4930}
4931
4932// ------------------------------ StoreMaskBits
4933#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP) \
4934 template <class D> \
4935 HWY_API size_t NAME(D d, HWY_RVV_M(MLEN) m, uint8_t* bits) { \
4936 const size_t N = Lanes(d); \
4937 __riscv_v##OP##_v_b##MLEN(bits, m, N); \
4938 /* Non-full byte, need to clear the undefined upper bits. */ \
4939 /* Use MaxLanes and sizeof(T) to move some checks to compile-time. */ \
4940 constexpr bool kLessThan8 = \
4941 detail::ScaleByPower(16 / sizeof(TFromD<D>), d.Pow2()) < 8; \
4942 if (MaxLanes(d) < 8 || (kLessThan8 && N < 8)) { \
4943 const int mask = (1 << N) - 1; \
4944 bits[0] = static_cast<uint8_t>(bits[0] & mask); \
4945 } \
4946 return (N + 7) / 8; \
4947 }
4948 HWY_RVV_FOREACH_B(HWY_RVV_STORE_MASK_BITS, StoreMaskBits, sm)
4949 #undef HWY_RVV_STORE_MASK_BITS
4950
4951// ------------------------------ CompressBits, CompressBitsStore (LoadMaskBits)
4952
4953template <class V>
4954HWY_INLINE V CompressBits(V v, const uint8_t* HWY_RESTRICT bits) {
4955 return Compress(v, LoadMaskBits(DFromV<V>(), bits));
4956}
4957
4958template <class D>
4959HWY_API size_t CompressBitsStore(VFromD<D> v, const uint8_t* HWY_RESTRICT bits,
4960 D d, TFromD<D>* HWY_RESTRICT unaligned) {
4961 return CompressStore(v, LoadMaskBits(d, bits), d, unaligned);
4962}
4963
4964// ------------------------------ FirstN (Iota0, Lt, RebindMask, SlideUp)
4965
4966// NOTE: do not use this as a building block within rvv-inl - it is likely more
4967// efficient to use avl or detail::SlideUp.
4968
4969// Disallow for 8-bit because Iota is likely to overflow.
4970template <class D, HWY_IF_NOT_T_SIZE_D(D, 1)>
4971HWY_API MFromD<D> FirstN(const D d, const size_t n) {
4972 const RebindToUnsigned<D> du;
4973 using TU = TFromD<decltype(du)>;
4974 return RebindMask(d, detail::LtS(detail::Iota0(du), static_cast<TU>(n)));
4975}
4976
4977template <class D, HWY_IF_T_SIZE_D(D, 1)>
4978HWY_API MFromD<D> FirstN(const D d, const size_t n) {
4979 const auto zero = Zero(d);
4980 const auto one = Set(d, 1);
4981 return Eq(detail::SlideUp(one, zero, n), one);
4982}
4983
4984// ------------------------------ LowerHalfOfMask/UpperHalfOfMask
4985
4986#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
4987
4988// Target-specific implementations of LowerHalfOfMask, UpperHalfOfMask,
4989// CombineMasks, OrderedDemote2MasksTo, and Dup128MaskFromMaskBits are possible
4990// on RVV if the __riscv_vreinterpret_v_b*_u8m1 and
4991// __riscv_vreinterpret_v_u8m1_b* intrinsics are available.
4992
4993// The __riscv_vreinterpret_v_b*_u8m1 and __riscv_vreinterpret_v_u8m1_b*
4994 // intrinsics are available with Clang 17 and later and GCC 14 and later.
4995
4996namespace detail {
4997
4998HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool1_t m) {
4999 return __riscv_vreinterpret_v_b1_u8m1(m);
5000}
5001
5002HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool2_t m) {
5003 return __riscv_vreinterpret_v_b2_u8m1(m);
5004}
5005
5006HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool4_t m) {
5007 return __riscv_vreinterpret_v_b4_u8m1(m);
5008}
5009
5010HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool8_t m) {
5011 return __riscv_vreinterpret_v_b8_u8m1(m);
5012}
5013
5014HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool16_t m) {
5015 return __riscv_vreinterpret_v_b16_u8m1(m);
5016}
5017
5018HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool32_t m) {
5019 return __riscv_vreinterpret_v_b32_u8m1(m);
5020}
5021
5022HWY_INLINE vuint8m1_t MaskToU8MaskBitsVec(vbool64_t m) {
5023 return __riscv_vreinterpret_v_b64_u8m1(m);
5024}
5025
5026template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool1_t>()>* = nullptr>
5027HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5028 return __riscv_vreinterpret_v_u8m1_b1(v);
5029}
5030
5031template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool2_t>()>* = nullptr>
5032HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5033 return __riscv_vreinterpret_v_u8m1_b2(v);
5034}
5035
5036template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool4_t>()>* = nullptr>
5037HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5038 return __riscv_vreinterpret_v_u8m1_b4(v);
5039}
5040
5041template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool8_t>()>* = nullptr>
5042HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5043 return __riscv_vreinterpret_v_u8m1_b8(v);
5044}
5045
5046template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool16_t>()>* = nullptr>
5047HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5048 return __riscv_vreinterpret_v_u8m1_b16(v);
5049}
5050
5051template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool32_t>()>* = nullptr>
5052HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5053 return __riscv_vreinterpret_v_u8m1_b32(v);
5054}
5055
5056template <class D, hwy::EnableIf<IsSame<MFromD<D>, vbool64_t>()>* = nullptr>
5057HWY_INLINE MFromD<D> U8MaskBitsVecToMask(D /*d*/, vuint8m1_t v) {
5058 return __riscv_vreinterpret_v_u8m1_b64(v);
5059}
5060
5061} // namespace detail
5062
5063#ifdef HWY_NATIVE_LOWER_HALF_OF_MASK
5064#undef HWY_NATIVE_LOWER_HALF_OF_MASK
5065#else
5066#define HWY_NATIVE_LOWER_HALF_OF_MASK
5067#endif
5068
5069template <class D>
5070HWY_API MFromD<D> LowerHalfOfMask(D d, MFromD<Twice<D>> m) {
5071 return detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(m));
5072}
5073
5074#ifdef HWY_NATIVE_UPPER_HALF_OF_MASK
5075#undef HWY_NATIVE_UPPER_HALF_OF_MASK
5076#else
5077#define HWY_NATIVE_UPPER_HALF_OF_MASK
5078#endif
5079
5080template <class D>
5081HWY_API MFromD<D> UpperHalfOfMask(D d, MFromD<Twice<D>> m) {
5082 const size_t N = Lanes(d);
5083
5084 vuint8m1_t mask_bits = detail::MaskToU8MaskBitsVec(m);
5085 mask_bits = ShiftRightSame(mask_bits, static_cast<int>(N & 7));
5086 if (HWY_MAX_LANES_D(D) >= 8) {
5087 mask_bits = SlideDownLanes(ScalableTag<uint8_t>(), mask_bits, N / 8);
5088 }
5089
5090 return detail::U8MaskBitsVecToMask(d, mask_bits);
5091}
5092
5093// ------------------------------ CombineMasks
5094
5095#ifdef HWY_NATIVE_COMBINE_MASKS
5096#undef HWY_NATIVE_COMBINE_MASKS
5097#else
5098#define HWY_NATIVE_COMBINE_MASKS
5099#endif
5100
5101template <class D>
5102HWY_API MFromD<D> CombineMasks(D d, MFromD<Half<D>> hi, MFromD<Half<D>> lo) {
5103 const Half<decltype(d)> dh;
5104 const size_t half_N = Lanes(dh);
5105
5106 const auto ext_lo_mask =
5107 And(detail::U8MaskBitsVecToMask(d, detail::MaskToU8MaskBitsVec(lo)),
5108 FirstN(d, half_N));
5109 vuint8m1_t hi_mask_bits = detail::MaskToU8MaskBitsVec(hi);
5110 hi_mask_bits = ShiftLeftSame(hi_mask_bits, static_cast<int>(half_N & 7));
5111 if (HWY_MAX_LANES_D(D) >= 8) {
5112 hi_mask_bits =
5113 SlideUpLanes(ScalableTag<uint8_t>(), hi_mask_bits, half_N / 8);
5114 }
5115
5116 return Or(ext_lo_mask, detail::U8MaskBitsVecToMask(d, hi_mask_bits));
5117}
5118
5119// ------------------------------ OrderedDemote2MasksTo
5120
5121#ifdef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5122#undef HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5123#else
5124#define HWY_NATIVE_ORDERED_DEMOTE_2_MASKS_TO
5125#endif
5126
5127template <class DTo, class DFrom,
5128 HWY_IF_T_SIZE_D(DTo, sizeof(TFromD<DFrom>) / 2),
5129 class DTo_2 = Repartition<TFromD<DTo>, DFrom>,
5130 hwy::EnableIf<IsSame<MFromD<DTo>, MFromD<DTo_2>>()>* = nullptr>
5131HWY_API MFromD<DTo> OrderedDemote2MasksTo(DTo d_to, DFrom /*d_from*/,
5132 MFromD<DFrom> a, MFromD<DFrom> b) {
5133 return CombineMasks(d_to, b, a);
5134}
5135
5136#endif // HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5137
5138// ------------------------------ Dup128MaskFromMaskBits
5139
5140namespace detail {
5141// Even though this is only used after checking if (kN < X), this helper
5142// function prevents "shift count exceeded" errors.
5143template <size_t kN, HWY_IF_LANES_LE(kN, 31)>
5144constexpr unsigned MaxMaskBits() {
5145 return (1u << kN) - 1;
5146}
5147template <size_t kN, HWY_IF_LANES_GT(kN, 31)>
5148constexpr unsigned MaxMaskBits() {
5149 return ~0u;
5150}
5151} // namespace detail
5152
5153template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_LE_D(D, 8)>
5154HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
5155 constexpr size_t kN = MaxLanes(d);
5156 if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
5157
5158#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5159 return detail::U8MaskBitsVecToMask(
5160 d, Set(ScalableTag<uint8_t>(), static_cast<uint8_t>(mask_bits)));
5161#else
5162 const RebindToUnsigned<decltype(d)> du8;
5163 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
5164 du64;
5165
5166 const auto bytes = ResizeBitCast(
5167 du8, detail::AndS(
5168 ResizeBitCast(du64, Set(du8, static_cast<uint8_t>(mask_bits))),
5169 uint64_t{0x8040201008040201u}));
5170 return detail::NeS(bytes, uint8_t{0});
5171#endif
5172}
5173
5174template <class D, HWY_IF_T_SIZE_D(D, 1), HWY_IF_LANES_GT_D(D, 8)>
5175HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
5176#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5177 const ScalableTag<uint8_t> du8;
5178 const ScalableTag<uint16_t> du16;
5179 // There are exactly 16 mask bits for 128 vector bits of 8-bit lanes.
5180 return detail::U8MaskBitsVecToMask(
5181 d, BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits))));
5182#else
5183 // Slow fallback for completeness; the above bits to mask cast is preferred.
5184 const RebindToUnsigned<decltype(d)> du8;
5185 const Repartition<uint16_t, decltype(du8)> du16;
5186 const detail::AdjustSimdTagToMinVecPow2<Repartition<uint64_t, decltype(du8)>>
5187 du64;
5188
5189 // Replicate the lower 16 bits of mask_bits to each u16 lane of a u16 vector,
5190 // and then bitcast the replicated mask_bits to a u8 vector
5191 const auto bytes = BitCast(du8, Set(du16, static_cast<uint16_t>(mask_bits)));
5192 // Replicate bytes 8x such that each byte contains the bit that governs it.
5193 const auto rep8 = TableLookupLanes(bytes, ShiftRight<3>(detail::Iota0(du8)));
5194
5195 const auto masked_out_rep8 = ResizeBitCast(
5196 du8,
5197 detail::AndS(ResizeBitCast(du64, rep8), uint64_t{0x8040201008040201u}));
5198 return detail::NeS(masked_out_rep8, uint8_t{0});
5199#endif
5200}
5201
5202template <class D, HWY_IF_T_SIZE_D(D, 2)>
5203HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
5204 constexpr size_t kN = MaxLanes(d);
5205 if (kN < 8) mask_bits &= detail::MaxMaskBits<kN>();
5206
5207#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5208 const ScalableTag<uint8_t> du8;
5209 // There are exactly 8 mask bits for 128 vector bits of 16-bit lanes.
5210 return detail::U8MaskBitsVecToMask(d,
5211 Set(du8, static_cast<uint8_t>(mask_bits)));
5212#else
5213 // Slow fallback for completeness; the above bits to mask cast is preferred.
5214 const RebindToUnsigned<D> du;
5215 const VFromD<decltype(du)> bits =
5216 Shl(Set(du, uint16_t{1}), Iota(du, uint16_t{0}));
5217 return TestBit(Set(du, static_cast<uint16_t>(mask_bits)), bits);
5218#endif
5219}
5220
5221template <class D, HWY_IF_T_SIZE_D(D, 4)>
5222HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
5223 constexpr size_t kN = MaxLanes(d);
5224 if (kN < 4) mask_bits &= detail::MaxMaskBits<kN>();
5225
5226#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5227 const ScalableTag<uint8_t> du8;
5228 return detail::U8MaskBitsVecToMask(
5229 d, Set(du8, static_cast<uint8_t>(mask_bits * 0x11)));
5230#else
5231 // Slow fallback for completeness; the above bits to mask cast is preferred.
5232 const RebindToUnsigned<D> du;
5233 const VFromD<decltype(du)> bits =
5234 Shl(Set(du, uint32_t{1}), Iota(du, uint32_t{0}));
5235 return TestBit(Set(du, static_cast<uint32_t>(mask_bits)), bits);
5236#endif
5237}
5238
5239template <class D, HWY_IF_T_SIZE_D(D, 8)>
5240HWY_API MFromD<D> Dup128MaskFromMaskBits(D d, unsigned mask_bits) {
5241 constexpr size_t kN = MaxLanes(d);
5242 if (kN < 2) mask_bits &= detail::MaxMaskBits<kN>();
5243
5244#if HWY_COMPILER_CLANG >= 1700 || HWY_COMPILER_GCC_ACTUAL >= 1400
5245 const ScalableTag<uint8_t> du8;
5246 return detail::U8MaskBitsVecToMask(
5247 d, Set(du8, static_cast<uint8_t>(mask_bits * 0x55)));
5248#else
5249 // Slow fallback for completeness; the above bits to mask cast is preferred.
5250 const RebindToUnsigned<D> du;
5251 const VFromD<decltype(du)> bits = Dup128VecFromValues(du, 0, 1);
5252 return TestBit(Set(du, static_cast<uint64_t>(mask_bits)), bits);
5253#endif
5254}
5255
5256// ------------------------------ Neg (Sub)
5257
5258template <class V, HWY_IF_SIGNED_V(V)>
5259HWY_API V Neg(const V v) {
5260 return detail::ReverseSubS(v, 0);
5261}
5262
5263// vector = f(vector), but argument is repeated
5264#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
5265 SHIFT, MLEN, NAME, OP) \
5266 HWY_API HWY_RVV_V(BASE, SEW, LMUL) NAME(HWY_RVV_V(BASE, SEW, LMUL) v) { \
5267 return __riscv_v##OP##_vv_##CHAR##SEW##LMUL(v, v, \
5268 HWY_RVV_AVL(SEW, SHIFT)); \
5269 }
5270
5271HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Neg, fsgnjn, _ALL)
5272
5273#if !HWY_HAVE_FLOAT16
5274
5275template <class V, HWY_IF_U16_D(DFromV<V>)> // hwy::float16_t
5276HWY_API V Neg(V v) {
5277 const DFromV<decltype(v)> d;
5278 const RebindToUnsigned<decltype(d)> du;
5279 using TU = TFromD<decltype(du)>;
5280 return BitCast(d, Xor(BitCast(du, v), Set(du, SignMask<TU>())));
5281}
5282
5283#endif // !HWY_HAVE_FLOAT16
5284
5285// ------------------------------ Abs (Max, Neg)
5286
5287template <class V, HWY_IF_SIGNED_V(V)>
5288HWY_API V Abs(const V v) {
5289 return Max(v, Neg(v));
5290}
5291
5292HWY_RVV_FOREACH_F(HWY_RVV_RETV_ARGV2, Abs, fsgnjx, _ALL)
5293
5294#undef HWY_RVV_RETV_ARGV2
5295
5296// ------------------------------ AbsDiff (Abs, Sub)
5297template <class V, HWY_IF_FLOAT_V(V)>
5298HWY_API V AbsDiff(const V a, const V b) {
5299 return Abs(Sub(a, b));
5300}
5301
5302// ------------------------------ Round (NearestInt, ConvertTo, CopySign)
5303
5304// IEEE-754 roundToIntegralTiesToEven returns floating-point, but we do not have
5305// a dedicated instruction for that. Rounding to integer and converting back to
5306// float is correct except when the input magnitude is large, in which case the
5307 // input was already an integer (the exponent is at least the mantissa width).
5308
5309namespace detail {
5310enum RoundingModes { kNear, kTrunc, kDown, kUp };
5311
5312template <class V>
5313HWY_INLINE auto UseInt(const V v) -> decltype(MaskFromVec(v)) {
5314 return detail::LtS(Abs(v), MantissaEnd<TFromV<V>>());
5315}
5316
5317} // namespace detail
5318
5319template <class V>
5320HWY_API V Round(const V v) {
5321 const DFromV<V> df;
5322
5323 const auto integer = NearestInt(v); // round using current mode
5324 const auto int_f = ConvertTo(df, integer);
5325
5326 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
5327}
5328
5329// ------------------------------ Trunc (ConvertTo)
5330template <class V>
5331HWY_API V Trunc(const V v) {
5332 const DFromV<V> df;
5333 const RebindToSigned<decltype(df)> di;
5334
5335 const auto integer = ConvertTo(di, v); // round toward 0
5336 const auto int_f = ConvertTo(df, integer);
5337
5338 return IfThenElse(detail::UseInt(v), CopySign(int_f, v), v);
5339}
5340
5341// ------------------------------ Ceil
5342template <class V>
5343HWY_API V Ceil(const V v) {
5344 asm volatile("fsrm %0" ::"r"(detail::kUp));
5345 const auto ret = Round(v);
5346 asm volatile("fsrm %0" ::"r"(detail::kNear));
5347 return ret;
5348}
5349
5350// ------------------------------ Floor
5351template <class V>
5352HWY_API V Floor(const V v) {
5353 asm volatile("fsrm %0" ::"r"(detail::kDown));
5354 const auto ret = Round(v);
5355 asm volatile("fsrm %0" ::"r"(detail::kNear));
5356 return ret;
5357}
5358
5359// ------------------------------ Floating-point classification (Ne)
5360
5361// vfclass does not help because it would require 3 instructions (to AND and
5362// then compare the bits), whereas these are just 1-3 integer instructions.
5363
5364template <class V>
5365 HWY_API MFromD<DFromV<V>> IsNaN(const V v) {
5366 return Ne(v, v);
5367}
5368
5369// Per-target flag to prevent generic_ops-inl.h from defining IsInf / IsFinite.
5370// We use a fused Set/comparison for IsFinite.
5371#ifdef HWY_NATIVE_ISINF
5372#undef HWY_NATIVE_ISINF
5373#else
5374#define HWY_NATIVE_ISINF
5375#endif
5376
5377template <class V, class D = DFromV<V>>
5378HWY_API MFromD<D> IsInf(const V v) {
5379 const D d;
5380 const RebindToSigned<decltype(d)> di;
5381 using T = TFromD<D>;
5382 const VFromD<decltype(di)> vi = BitCast(di, v);
5383 // 'Shift left' to clear the sign bit, check for exponent=max and mantissa=0.
5384 return RebindMask(d, detail::EqS(Add(vi, vi), hwy::MaxExponentTimes2<T>()));
5385}
5386
5387// Returns whether normal/subnormal/zero.
5388template <class V, class D = DFromV<V>>
5389HWY_API MFromD<D> IsFinite(const V v) {
5390 const D d;
5391 const RebindToUnsigned<decltype(d)> du;
5392 const RebindToSigned<decltype(d)> di; // cheaper than unsigned comparison
5393 using T = TFromD<D>;
5394 const VFromD<decltype(du)> vu = BitCast(du, v);
5395 // 'Shift left' to clear the sign bit, then right so we can compare with the
5396 // max exponent (cannot compare with MaxExponentTimes2 directly because it is
5397 // negative and non-negative floats would be greater).
5398 const VFromD<decltype(di)> exp =
5399 BitCast(di, ShiftRight<hwy::MantissaBits<T>() + 1>(Add(vu, vu)));
5400 return RebindMask(d, detail::LtS(exp, hwy::MaxExponentField<T>()));
5401}
5402
5403// ------------------------------ Iota (ConvertTo)
5404
5405template <class D, typename T2, HWY_IF_UNSIGNED_D(D)>
5406HWY_API VFromD<D> Iota(const D d, T2 first) {
5407 return detail::AddS(detail::Iota0(d), static_cast<TFromD<D>>(first));
5408}
5409
5410template <class D, typename T2, HWY_IF_SIGNED_D(D)>
5411HWY_API VFromD<D> Iota(const D d, T2 first) {
5412 const RebindToUnsigned<D> du;
5413 return detail::AddS(BitCast(d, detail::Iota0(du)),
5414 static_cast<TFromD<D>>(first));
5415}
5416
5417template <class D, typename T2, HWY_IF_FLOAT_D(D)>
5418HWY_API VFromD<D> Iota(const D d, T2 first) {
5419 const RebindToUnsigned<D> du;
5420 const RebindToSigned<D> di;
5421 return detail::AddS(ConvertTo(d, BitCast(di, detail::Iota0(du))),
5422 ConvertScalarTo<TFromD<D>>(first));
5423}
5424
5425// ------------------------------ MulEven/Odd (Mul, OddEven)
5426
5427template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
5428 class D = DFromV<V>, class DW = RepartitionToWide<D>>
5429HWY_API VFromD<DW> MulEven(const V a, const V b) {
5430 const auto lo = Mul(a, b);
5431 const auto hi = MulHigh(a, b);
5432 return BitCast(DW(), OddEven(detail::Slide1Up(hi), lo));
5433}
5434
5435template <class V, HWY_IF_T_SIZE_ONE_OF_V(V, (1 << 1) | (1 << 2) | (1 << 4)),
5436 class D = DFromV<V>, class DW = RepartitionToWide<D>>
5437HWY_API VFromD<DW> MulOdd(const V a, const V b) {
5438 const auto lo = Mul(a, b);
5439 const auto hi = MulHigh(a, b);
5440 return BitCast(DW(), OddEven(hi, detail::Slide1Down(lo)));
5441}
5442
5443// There is no 64x64 vwmul.
5444template <class V, HWY_IF_T_SIZE_V(V, 8)>
5445HWY_INLINE V MulEven(const V a, const V b) {
5446 const auto lo = Mul(a, b);
5447 const auto hi = MulHigh(a, b);
5448 return OddEven(detail::Slide1Up(hi), lo);
5449}
5450
5451template <class V, HWY_IF_T_SIZE_V(V, 8)>
5452HWY_INLINE V MulOdd(const V a, const V b) {
5453 const auto lo = Mul(a, b);
5454 const auto hi = MulHigh(a, b);
5455 return OddEven(hi, detail::Slide1Down(lo));
5456}
5457
5458// ------------------------------ ReorderDemote2To (OddEven, Combine)
5459
5460template <size_t N, int kPow2>
5461 HWY_API VFromD<Simd<bfloat16_t, N, kPow2>> ReorderDemote2To(
5462 Simd<bfloat16_t, N, kPow2> dbf16,
5463 VFromD<RepartitionToWide<decltype(dbf16)>> a,
5464 VFromD<RepartitionToWide<decltype(dbf16)>> b) {
5465 const RebindToUnsigned<decltype(dbf16)> du16;
5466 const RebindToUnsigned<DFromV<decltype(a)>> du32;
5467 const VFromD<decltype(du32)> b_in_even =
5468 ShiftRight<16>(detail::RoundF32ForDemoteToBF16(b));
5469 return BitCast(dbf16,
5470 OddEven(BitCast(du16, detail::RoundF32ForDemoteToBF16(a)),
5471 BitCast(du16, b_in_even)));
5472}
5473
5474// If LMUL is not the max, Combine first to avoid another DemoteTo.
5475template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
5476 HWY_IF_POW2_LE_D(DN, 2), class V, HWY_IF_SIGNED_V(V),
5477 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
5478 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5479 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
5480 HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
5481 const Rebind<TFromV<V>, DN> dt;
5482 const VFromD<decltype(dt)> ab = Combine(dt, b, a);
5483 return DemoteTo(dn, ab);
5484}
5485
5486template <class DN, HWY_IF_UNSIGNED_D(DN), HWY_IF_POW2_LE_D(DN, 2), class V,
5487 HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
5488 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5489 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
5490HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
5491 const Rebind<TFromV<V>, DN> dt;
5492 const VFromD<decltype(dt)> ab = Combine(dt, b, a);
5493 return DemoteTo(dn, ab);
5494}
5495
5496// Max LMUL: must DemoteTo first, then Combine.
5497template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>),
5498 HWY_IF_POW2_GT_D(DN, 2), class V, HWY_IF_SIGNED_V(V),
5499 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
5500 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5501 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
5502HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
5503 const Half<decltype(dn)> dnh;
5504 const VFromD<decltype(dnh)> demoted_a = DemoteTo(dnh, a);
5505 const VFromD<decltype(dnh)> demoted_b = DemoteTo(dnh, b);
5506 return Combine(dn, demoted_b, demoted_a);
5507}
5508
5509template <class DN, HWY_IF_UNSIGNED_D(DN), HWY_IF_POW2_GT_D(DN, 2), class V,
5510 HWY_IF_UNSIGNED_V(V), HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
5511 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5512 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
5513HWY_API VFromD<DN> ReorderDemote2To(DN dn, V a, V b) {
5514 const Half<decltype(dn)> dnh;
5515 const VFromD<decltype(dnh)> demoted_a = DemoteTo(dnh, a);
5516 const VFromD<decltype(dnh)> demoted_b = DemoteTo(dnh, b);
5517 return Combine(dn, demoted_b, demoted_a);
5518}
5519
5520// If LMUL is not the max, Combine first to avoid another DemoteTo.
5521template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_LE_D(DN, 2),
5522 class V, HWY_IF_F32_D(DFromV<V>),
5523 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5524 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
5525 HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
5526 const Rebind<TFromV<V>, DN> dt;
5527 const VFromD<decltype(dt)> ab = Combine(dt, b, a);
5528 return DemoteTo(dn, ab);
5529}
5530
5531// Max LMUL: must DemoteTo first, then Combine.
5532template <class DN, HWY_IF_SPECIAL_FLOAT_D(DN), HWY_IF_POW2_GT_D(DN, 2),
5533 class V, HWY_IF_F32_D(DFromV<V>),
5534 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5535 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
5536HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
5537 const Half<decltype(dn)> dnh;
5538 const RebindToUnsigned<decltype(dn)> dn_u;
5539 const RebindToUnsigned<decltype(dnh)> dnh_u;
5540 const auto demoted_a = BitCast(dnh_u, DemoteTo(dnh, a));
5541 const auto demoted_b = BitCast(dnh_u, DemoteTo(dnh, b));
5542 return BitCast(dn, Combine(dn_u, demoted_b, demoted_a));
5543}
5544
5545template <class DN, HWY_IF_NOT_FLOAT_NOR_SPECIAL(TFromD<DN>), class V,
5546 HWY_IF_NOT_FLOAT_NOR_SPECIAL_V(V),
5547 HWY_IF_T_SIZE_V(V, sizeof(TFromD<DN>) * 2),
5548 class V2 = VFromD<Repartition<TFromV<V>, DN>>,
5549 hwy::EnableIf<DFromV<V>().Pow2() == DFromV<V2>().Pow2()>* = nullptr>
5550HWY_API VFromD<DN> OrderedDemote2To(DN dn, V a, V b) {
5551 return ReorderDemote2To(dn, a, b);
5552}
5553
5554// ------------------------------ WidenMulPairwiseAdd
5555
5556template <class D32, HWY_IF_F32_D(D32),
5557 class V16 = VFromD<Repartition<bfloat16_t, D32>>>
5558 HWY_API VFromD<D32> WidenMulPairwiseAdd(D32 df32, V16 a, V16 b) {
5559 const RebindToUnsigned<decltype(df32)> du32;
5560 using VU32 = VFromD<decltype(du32)>;
5561 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
5562 // Using shift/and instead of Zip leads to the odd/even order that
5563 // RearrangeToOddPlusEven prefers.
5564 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5565 const VU32 ao = And(BitCast(du32, a), odd);
5566 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5567 const VU32 bo = And(BitCast(du32, b), odd);
5568 return MulAdd(BitCast(df32, ae), BitCast(df32, be),
5569 Mul(BitCast(df32, ao), BitCast(df32, bo)));
5570}
5571
5572template <class D, HWY_IF_I32_D(D), class VI16>
5573HWY_API VFromD<D> WidenMulPairwiseAdd(D d32, VI16 a, VI16 b) {
5574 using VI32 = VFromD<decltype(d32)>;
5575 // Manual sign extension requires two shifts for even lanes.
5576 const VI32 ae = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, a)));
5577 const VI32 be = ShiftRight<16>(ShiftLeft<16>(BitCast(d32, b)));
5578 const VI32 ao = ShiftRight<16>(BitCast(d32, a));
5579 const VI32 bo = ShiftRight<16>(BitCast(d32, b));
5580 return Add(Mul(ae, be), Mul(ao, bo));
5581}
5582
5583template <class D, HWY_IF_U32_D(D), class VI16>
5584HWY_API VFromD<D> WidenMulPairwiseAdd(D du32, VI16 a, VI16 b) {
5585 using VU32 = VFromD<decltype(du32)>;
5586 // No sign extension needed: even lanes are isolated via AND, odd via shift.
5587 const VU32 ae = detail::AndS(BitCast(du32, a), uint32_t{0x0000FFFFu});
5588 const VU32 be = detail::AndS(BitCast(du32, b), uint32_t{0x0000FFFFu});
5589 const VU32 ao = ShiftRight<16>(BitCast(du32, a));
5590 const VU32 bo = ShiftRight<16>(BitCast(du32, b));
5591 return Add(Mul(ae, be), Mul(ao, bo));
5592}
5593
5594// ------------------------------ ReorderWidenMulAccumulate (MulAdd, ZipLower)
5595
5596namespace detail {
5597
5598// Non-overloaded wrapper function so we can define DF32 in template args.
5599template <size_t N, int kPow2, class DF32 = Simd<float, N, kPow2>,
5600 class VF32 = VFromD<DF32>,
5601 class DBF16 = Repartition<hwy::bfloat16_t, Simd<float, N, kPow2>>>
5602 HWY_API VF32 ReorderWidenMulAccumulateBF16(Simd<float, N, kPow2> df32,
5603 VFromD<DBF16> a, VFromD<DBF16> b,
5604 const VF32 sum0, VF32& sum1) {
5605 const RebindToUnsigned<DF32> du32;
5606 using VU32 = VFromD<decltype(du32)>;
5607 const VU32 odd = Set(du32, 0xFFFF0000u); // bfloat16 is the upper half of f32
5608 // Using shift/and instead of Zip leads to the odd/even order that
5609 // RearrangeToOddPlusEven prefers.
5610 const VU32 ae = ShiftLeft<16>(BitCast(du32, a));
5611 const VU32 ao = And(BitCast(du32, a), odd);
5612 const VU32 be = ShiftLeft<16>(BitCast(du32, b));
5613 const VU32 bo = And(BitCast(du32, b), odd);
5614 sum1 = MulAdd(BitCast(df32, ao), BitCast(df32, bo), sum1);
5615 return MulAdd(BitCast(df32, ae), BitCast(df32, be), sum0);
5616}
5617
5618#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, \
5619 SHIFT, MLEN, NAME, OP) \
5620 template <size_t N> \
5621 HWY_API HWY_RVV_V(BASE, SEWD, LMULD) NAME( \
5622 HWY_RVV_D(BASE, SEWD, N, SHIFT + 1) d, HWY_RVV_V(BASE, SEWD, LMULD) sum, \
5623 HWY_RVV_V(BASE, SEW, LMUL) a, HWY_RVV_V(BASE, SEW, LMUL) b) { \
5624 return __riscv_v##OP##CHAR##SEWD##LMULD(sum, a, b, Lanes(d)); \
5625 }
5626
5627HWY_RVV_FOREACH_I16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmacc_vv_, _EXT_VIRT)
5628HWY_RVV_FOREACH_U16(HWY_RVV_WIDEN_MACC, WidenMulAcc, wmaccu_vv_, _EXT_VIRT)
5629#undef HWY_RVV_WIDEN_MACC
5630
5631// If LMUL is not the max, we can WidenMul first (3 instructions).
5632template <class D32, HWY_IF_POW2_LE_D(D32, 2), class V32 = VFromD<D32>,
5633 class D16 = RepartitionToNarrow<D32>>
5634 HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(D32 d32, VFromD<D16> a,
5635 VFromD<D16> b, const V32 sum0,
5636 V32& sum1) {
5637 const Twice<decltype(d32)> d32t;
5638 using V32T = VFromD<decltype(d32t)>;
5639 V32T sum = Combine(d32t, sum1, sum0);
5640 sum = detail::WidenMulAcc(d32t, sum, a, b);
5641 sum1 = UpperHalf(d32, sum);
5642 return LowerHalf(d32, sum);
5643}
5644
5645// Max LMUL: must LowerHalf first (4 instructions).
5646template <class D32, HWY_IF_POW2_GT_D(D32, 2), class V32 = VFromD<D32>,
5647 class D16 = RepartitionToNarrow<D32>>
5648HWY_API VFromD<D32> ReorderWidenMulAccumulateI16(D32 d32, VFromD<D16> a,
5649 VFromD<D16> b, const V32 sum0,
5650 V32& sum1) {
5651 const Half<D16> d16h;
5652 using V16H = VFromD<decltype(d16h)>;
5653 const V16H a0 = LowerHalf(d16h, a);
5654 const V16H a1 = UpperHalf(d16h, a);
5655 const V16H b0 = LowerHalf(d16h, b);
5656 const V16H b1 = UpperHalf(d16h, b);
5657 sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
5658 return detail::WidenMulAcc(d32, sum0, a0, b0);
5659}
5660
5661// If LMUL is not the max, we can WidenMul first (3 instructions).
5662template <class D32, HWY_IF_POW2_LE_D(D32, 2), class V32 = VFromD<D32>,
5663 class D16 = RepartitionToNarrow<D32>>
5664 HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,
5665 VFromD<D16> b, const V32 sum0,
5666 V32& sum1) {
5667 const Twice<decltype(d32)> d32t;
5668 using V32T = VFromD<decltype(d32t)>;
5669 V32T sum = Combine(d32t, sum1, sum0);
5670 sum = detail::WidenMulAcc(d32t, sum, a, b);
5671 sum1 = UpperHalf(d32, sum);
5672 return LowerHalf(d32, sum);
5673}
5674
5675// Max LMUL: must LowerHalf first (4 instructions).
5676template <class D32, HWY_IF_POW2_GT_D(D32, 2), class V32 = VFromD<D32>,
5677 class D16 = RepartitionToNarrow<D32>>
5678HWY_API VFromD<D32> ReorderWidenMulAccumulateU16(D32 d32, VFromD<D16> a,
5679 VFromD<D16> b, const V32 sum0,
5680 V32& sum1) {
5681 const Half<D16> d16h;
5682 using V16H = VFromD<decltype(d16h)>;
5683 const V16H a0 = LowerHalf(d16h, a);
5684 const V16H a1 = UpperHalf(d16h, a);
5685 const V16H b0 = LowerHalf(d16h, b);
5686 const V16H b1 = UpperHalf(d16h, b);
5687 sum1 = detail::WidenMulAcc(d32, sum1, a1, b1);
5688 return detail::WidenMulAcc(d32, sum0, a0, b0);
5689}
5690
5691} // namespace detail
5692
5693template <size_t N, int kPow2, class VN, class VW>
5694 HWY_API VW ReorderWidenMulAccumulate(Simd<float, N, kPow2> d32, VN a, VN b,
5695 const VW sum0, VW& sum1) {
5696 return detail::ReorderWidenMulAccumulateBF16(d32, a, b, sum0, sum1);
5697}
5698
5699template <size_t N, int kPow2, class VN, class VW>
5700 HWY_API VW ReorderWidenMulAccumulate(Simd<int32_t, N, kPow2> d32, VN a, VN b,
5701 const VW sum0, VW& sum1) {
5702 return detail::ReorderWidenMulAccumulateI16(d32, a, b, sum0, sum1);
5703}
5704
5705template <size_t N, int kPow2, class VN, class VW>
5706 HWY_API VW ReorderWidenMulAccumulate(Simd<uint32_t, N, kPow2> d32, VN a, VN b,
5707 const VW sum0, VW& sum1) {
5708 return detail::ReorderWidenMulAccumulateU16(d32, a, b, sum0, sum1);
5709}
5710
5711// ------------------------------ RearrangeToOddPlusEven
5712
5713template <class VW, HWY_IF_SIGNED_V(VW)> // vint32_t*
5714HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
5715 // vwmacc doubles LMUL, so we require a pairwise sum here. This op is
5716 // expected to be less frequent than ReorderWidenMulAccumulate, hence it's
5717 // preferable to do the extra work here rather than do manual odd/even
5718 // extraction there.
5719 const DFromV<VW> di32;
5720 const RebindToUnsigned<decltype(di32)> du32;
5721 const Twice<decltype(di32)> di32x2;
5722 const RepartitionToWide<decltype(di32x2)> di64x2;
5723 const RebindToUnsigned<decltype(di64x2)> du64x2;
5724 const auto combined = BitCast(di64x2, Combine(di32x2, sum1, sum0));
5725 // Isolate odd/even int32 in int64 lanes.
5726 const auto even = ShiftRight<32>(ShiftLeft<32>(combined)); // sign extend
5727 const auto odd = ShiftRight<32>(combined);
5728 return BitCast(di32, TruncateTo(du32, BitCast(du64x2, Add(even, odd))));
5729}
5730
5731// For max LMUL, we cannot Combine again and instead manually unroll.
5732HWY_API vint32m8_t RearrangeToOddPlusEven(vint32m8_t sum0, vint32m8_t sum1) {
5733 const DFromV<vint32m8_t> d;
5734 const Half<decltype(d)> dh;
5735 const vint32m4_t lo =
5736 RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0));
5737 const vint32m4_t hi =
5738 RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1));
5739 return Combine(d, hi, lo);
5740}
5741
5742template <class VW, HWY_IF_UNSIGNED_V(VW)> // vuint32_t*
5743HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
5744 // vwmacc doubles LMUL, so we require a pairwise sum here. This op is
5745 // expected to be less frequent than ReorderWidenMulAccumulate, hence it's
5746 // preferable to do the extra work here rather than do manual odd/even
5747 // extraction there.
5748 const DFromV<VW> du32;
5749 const Twice<decltype(du32)> du32x2;
5750 const RepartitionToWide<decltype(du32x2)> du64x2;
5751 const auto combined = BitCast(du64x2, Combine(du32x2, sum1, sum0));
5752 // Isolate odd/even u32 in u64 lanes.
5753 const auto even = detail::AndS(combined, uint64_t{0xFFFFFFFFu});
5754 const auto odd = ShiftRight<32>(combined);
5755 return TruncateTo(du32, Add(even, odd));
5756}
5757
5758// For max LMUL, we cannot Combine again and instead manually unroll.
5759HWY_API vuint32m8_t RearrangeToOddPlusEven(vuint32m8_t sum0, vuint32m8_t sum1) {
5760 const DFromV<vuint32m8_t> d;
5761 const Half<decltype(d)> dh;
5762 const vuint32m4_t lo =
5763 RearrangeToOddPlusEven(LowerHalf(sum0), UpperHalf(dh, sum0));
5764 const vuint32m4_t hi =
5765 RearrangeToOddPlusEven(LowerHalf(sum1), UpperHalf(dh, sum1));
5766 return Combine(d, hi, lo);
5767}
5768
5769template <class VW, HWY_IF_FLOAT_V(VW)> // vfloat*
5770HWY_API VW RearrangeToOddPlusEven(const VW sum0, const VW sum1) {
5771 return Add(sum0, sum1); // invariant already holds
5772}
5773
5774// ------------------------------ Lt128
5775template <class D>
5776 HWY_INLINE MFromD<D> Lt128(D d, const VFromD<D> a, const VFromD<D> b) {
5777 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
5778 // Truth table of Eq and Compare for Hi and Lo u64.
5779 // (removed lines with (=H && cH) or (=L && cL) - cannot both be true)
5780 // =H =L cH cL | out = cH | (=H & cL)
5781 // 0 0 0 0 | 0
5782 // 0 0 0 1 | 0
5783 // 0 0 1 0 | 1
5784 // 0 0 1 1 | 1
5785 // 0 1 0 0 | 0
5786 // 0 1 0 1 | 0
5787 // 0 1 1 0 | 1
5788 // 1 0 0 0 | 0
5789 // 1 0 0 1 | 1
5790 // 1 1 0 0 | 0
5791 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
5792 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
5793 // Shift leftward so L can influence H.
5794 const VFromD<D> ltLx = detail::Slide1Up(ltHL);
5795 const VFromD<D> vecHx = OrAnd(ltHL, eqHL, ltLx);
5796 // Replicate H to its neighbor.
5797 return MaskFromVec(OddEven(vecHx, detail::Slide1Down(vecHx)));
5798}
5799
5800// ------------------------------ Lt128Upper
5801template <class D>
5802 HWY_INLINE MFromD<D> Lt128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
5803 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
5804 const VFromD<D> ltHL = VecFromMask(d, Lt(a, b));
5805 const VFromD<D> down = detail::Slide1Down(ltHL);
5806 // b(267743505): Clang compiler bug, workaround is DoNotOptimize
5807 asm volatile("" : : "r,m"(GetLane(down)) : "memory");
5808 // Replicate H to its neighbor.
5809 return MaskFromVec(OddEven(ltHL, down));
5810}
5811
5812// ------------------------------ Eq128
5813template <class D>
5814 HWY_INLINE MFromD<D> Eq128(D d, const VFromD<D> a, const VFromD<D> b) {
5815 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
5816 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
5817 const VFromD<D> eqLH = Reverse2(d, eqHL);
5818 const VFromD<D> eq = And(eqHL, eqLH);
5819 // b(267743505): Clang compiler bug, workaround is DoNotOptimize
5820 asm volatile("" : : "r,m"(GetLane(eq)) : "memory");
5821 return MaskFromVec(eq);
5822}
5823
5824// ------------------------------ Eq128Upper
5825template <class D>
5826 HWY_INLINE MFromD<D> Eq128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
5827 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
5828 const VFromD<D> eqHL = VecFromMask(d, Eq(a, b));
5829 // Replicate H to its neighbor.
5830 return MaskFromVec(OddEven(eqHL, detail::Slide1Down(eqHL)));
5831}
5832
5833// ------------------------------ Ne128
5834template <class D>
5835 HWY_INLINE MFromD<D> Ne128(D d, const VFromD<D> a, const VFromD<D> b) {
5836 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
5837 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
5838 const VFromD<D> neLH = Reverse2(d, neHL);
5839 // b(267743505): Clang compiler bug, workaround is DoNotOptimize
5840 asm volatile("" : : "r,m"(GetLane(neLH)) : "memory");
5841 return MaskFromVec(Or(neHL, neLH));
5842}
5843
5844// ------------------------------ Ne128Upper
5845template <class D>
5846 HWY_INLINE MFromD<D> Ne128Upper(D d, const VFromD<D> a, const VFromD<D> b) {
5847 static_assert(IsSame<TFromD<D>, uint64_t>(), "D must be u64");
5848 const VFromD<D> neHL = VecFromMask(d, Ne(a, b));
5849 const VFromD<D> down = detail::Slide1Down(neHL);
5850 // b(267743505): Clang compiler bug, workaround is DoNotOptimize
5851 asm volatile("" : : "r,m"(GetLane(down)) : "memory");
5852 // Replicate H to its neighbor.
5853 return MaskFromVec(OddEven(neHL, down));
5854}
5855
5856// ------------------------------ Min128, Max128 (Lt128)
5857
5858template <class D>
5859HWY_INLINE VFromD<D> Min128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
5860 const VFromD<D> aXH = detail::Slide1Down(a);
5861 const VFromD<D> bXH = detail::Slide1Down(b);
5862 const VFromD<D> minHL = Min(a, b);
5863 const MFromD<D> ltXH = Lt(aXH, bXH);
5864 const MFromD<D> eqXH = Eq(aXH, bXH);
5865 // If the upper lane is the decider, take lo from the same reg.
5866 const VFromD<D> lo = IfThenElse(ltXH, a, b);
5867 // The upper lane is just minHL; if they are equal, we also need to use the
5868 // actual min of the lower lanes.
5869 return OddEven(minHL, IfThenElse(eqXH, minHL, lo));
5870}
5871
5872template <class D>
5873HWY_INLINE VFromD<D> Max128(D /* tag */, const VFromD<D> a, const VFromD<D> b) {
5874 const VFromD<D> aXH = detail::Slide1Down(a);
5875 const VFromD<D> bXH = detail::Slide1Down(b);
5876 const VFromD<D> maxHL = Max(a, b);
5877 const MFromD<D> ltXH = Lt(aXH, bXH);
5878 const MFromD<D> eqXH = Eq(aXH, bXH);
5879 // If the upper lane is the decider, take lo from the same reg.
5880 const VFromD<D> lo = IfThenElse(ltXH, b, a);
5881 // The upper lane is just maxHL; if they are equal, we also need to use the
5882 // actual max of the lower lanes.
5883 return OddEven(maxHL, IfThenElse(eqXH, maxHL, lo));
5884}
5885
5886template <class D>
5887 HWY_INLINE VFromD<D> Min128Upper(D d, VFromD<D> a, VFromD<D> b) {
5888 return IfThenElse(Lt128Upper(d, a, b), a, b);
5889}
5890
5891template <class D>
5892 HWY_INLINE VFromD<D> Max128Upper(D d, VFromD<D> a, VFromD<D> b) {
5893 return IfThenElse(Lt128Upper(d, b, a), a, b);
5894}
5895
5896// ================================================== END MACROS
5897#undef HWY_RVV_AVL
5898#undef HWY_RVV_D
5899#undef HWY_RVV_FOREACH
5900#undef HWY_RVV_FOREACH_08_ALL
5901#undef HWY_RVV_FOREACH_08_ALL_VIRT
5902#undef HWY_RVV_FOREACH_08_DEMOTE
5903#undef HWY_RVV_FOREACH_08_DEMOTE_VIRT
5904#undef HWY_RVV_FOREACH_08_EXT
5905#undef HWY_RVV_FOREACH_08_EXT_VIRT
5906#undef HWY_RVV_FOREACH_08_TRUNC
5907#undef HWY_RVV_FOREACH_08_VIRT
5908#undef HWY_RVV_FOREACH_16_ALL
5909#undef HWY_RVV_FOREACH_16_ALL_VIRT
5910#undef HWY_RVV_FOREACH_16_DEMOTE
5911#undef HWY_RVV_FOREACH_16_DEMOTE_VIRT
5912#undef HWY_RVV_FOREACH_16_EXT
5913#undef HWY_RVV_FOREACH_16_EXT_VIRT
5914#undef HWY_RVV_FOREACH_16_TRUNC
5915#undef HWY_RVV_FOREACH_16_VIRT
5916#undef HWY_RVV_FOREACH_32_ALL
5917#undef HWY_RVV_FOREACH_32_ALL_VIRT
5918#undef HWY_RVV_FOREACH_32_DEMOTE
5919#undef HWY_RVV_FOREACH_32_DEMOTE_VIRT
5920#undef HWY_RVV_FOREACH_32_EXT
5921#undef HWY_RVV_FOREACH_32_EXT_VIRT
5922#undef HWY_RVV_FOREACH_32_TRUNC
5923#undef HWY_RVV_FOREACH_32_VIRT
5924#undef HWY_RVV_FOREACH_64_ALL
5925#undef HWY_RVV_FOREACH_64_ALL_VIRT
5926#undef HWY_RVV_FOREACH_64_DEMOTE
5927#undef HWY_RVV_FOREACH_64_DEMOTE_VIRT
5928#undef HWY_RVV_FOREACH_64_EXT
5929#undef HWY_RVV_FOREACH_64_EXT_VIRT
5930#undef HWY_RVV_FOREACH_64_TRUNC
5931#undef HWY_RVV_FOREACH_64_VIRT
5932#undef HWY_RVV_FOREACH_B
5933#undef HWY_RVV_FOREACH_F
5934#undef HWY_RVV_FOREACH_F16
5935#undef HWY_RVV_FOREACH_F32
5936#undef HWY_RVV_FOREACH_F3264
5937#undef HWY_RVV_FOREACH_F64
5938#undef HWY_RVV_FOREACH_I
5939#undef HWY_RVV_FOREACH_I08
5940#undef HWY_RVV_FOREACH_I16
5941#undef HWY_RVV_FOREACH_I163264
5942#undef HWY_RVV_FOREACH_I32
5943#undef HWY_RVV_FOREACH_I64
5944#undef HWY_RVV_FOREACH_U
5945#undef HWY_RVV_FOREACH_U08
5946#undef HWY_RVV_FOREACH_U16
5947#undef HWY_RVV_FOREACH_U163264
5948#undef HWY_RVV_FOREACH_U32
5949#undef HWY_RVV_FOREACH_U64
5950#undef HWY_RVV_FOREACH_UI
5951#undef HWY_RVV_FOREACH_UI08
5952#undef HWY_RVV_FOREACH_UI16
5953#undef HWY_RVV_FOREACH_UI163264
5954#undef HWY_RVV_FOREACH_UI32
5955#undef HWY_RVV_FOREACH_UI3264
5956#undef HWY_RVV_FOREACH_UI64
5957#undef HWY_RVV_IF_EMULATED_D
5958#undef HWY_RVV_IF_CAN128_D
5959#undef HWY_RVV_IF_GE128_D
5960#undef HWY_RVV_IF_LT128_D
5961#undef HWY_RVV_INSERT_VXRM
5962#undef HWY_RVV_M
5963#undef HWY_RVV_RETM_ARGM
5964#undef HWY_RVV_RETV_ARGMVV
5965#undef HWY_RVV_RETV_ARGV
5966#undef HWY_RVV_RETV_ARGVS
5967#undef HWY_RVV_RETV_ARGVV
5968#undef HWY_RVV_T
5969#undef HWY_RVV_V
5970// NOLINTNEXTLINE(google-readability-namespace-comments)
5971} // namespace HWY_NAMESPACE
5972} // namespace hwy
Definition arm_neon-inl.h:3377
HWY_API Vec128< T > Shuffle1032(Vec128< T > v)
Definition arm_neon-inl.h:6008
HWY_API Vec128< T, N > MulSub(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > sub)
Definition arm_neon-inl.h:2612
HWY_API MFromD< DTo > OrderedDemote2MasksTo(DTo d_to, DFrom, MFromD< DFrom > a, MFromD< DFrom > b)
Definition x86_128-inl.h:1107
HWY_API VFromD< D > Zero(D d)
Definition arm_neon-inl.h:947
HWY_API Vec128< float, N > ApproximateReciprocal(Vec128< float, N > v)
Definition emu128-inl.h:900
HWY_API Vec128< float > ConvertTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:3971
HWY_API Vec128< T, N > SaturatedSub(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:645
HWY_API VFromD< D > OrderedDemote2To(D d, V a, V b)
Definition arm_neon-inl.h:7394
HWY_API Vec64< uint8_t > UpperHalf(D, Vec128< uint8_t > v)
Definition arm_neon-inl.h:5313
HWY_API Vec128< TTo, 1 > TruncateTo(DTo, Vec128< TFrom, 1 > v)
Definition arm_neon-inl.h:7477
HWY_API Vec128< T, 1 > Reverse(D, Vec128< T, 1 > v)
Definition arm_neon-inl.h:5959
HWY_API VFromD< D > Slide1Up(D d, VFromD< D > v)
Definition arm_sve-inl.h:3636
D TFromD< D > *HWY_RESTRICT p
Definition arm_sve-inl.h:1915
HWY_API Vec128< uint64_t, N > Min(Vec128< uint64_t, N > a, Vec128< uint64_t, N > b)
Definition arm_neon-inl.h:3311
HWY_API Vec128< uint64_t > CompressBlocksNot(Vec128< uint64_t > v, Mask128< uint64_t >)
Definition arm_neon-inl.h:8924
HWY_API Vec128< int64_t > Abs(const Vec128< int64_t > v)
Definition arm_neon-inl.h:3271
HWY_API V MaskedMinOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1484
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
HWY_API Vec128< float, N > ApproximateReciprocalSqrt(Vec128< float, N > v)
Definition emu128-inl.h:945
typename D::T TFromD
Definition ops/shared-inl.h:426
HWY_API Vec128< T, N > IfVecThenElse(Vec128< T, N > mask, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:2785
HWY_API size_t FindKnownLastTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8385
HWY_API VFromD< D > ConcatLowerLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6875
HWY_API VFromD< D > Load(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3664
HWY_API void LoadInterleaved3(D d, const T *HWY_RESTRICT unaligned, VFromD< D > &v0, VFromD< D > &v1, VFromD< D > &v2)
Definition arm_neon-inl.h:9087
HWY_API void StoreInterleaved3(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9253
HWY_API VFromD< D > MinOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3224
HWY_API Vec128< T, N > DupEven(Vec128< T, N > v)
Definition arm_neon-inl.h:7074
HWY_API void StoreInterleaved4(VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9285
HWY_API Vec128< T, N > TwoTablesLookupLanes(Vec128< T, N > a, Vec128< T, N > b, Indices128< T, N > idx)
Definition arm_neon-inl.h:5783
HWY_API VFromD< D > SlideDownLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6367
HWY_API V Add(V a, V b)
Definition generic_ops-inl.h:7300
HWY_API Vec128< uint8_t > Combine(D, Vec64< uint8_t > hi, Vec64< uint8_t > lo)
Definition arm_neon-inl.h:1314
HWY_API Mask128< uint64_t, N > TestBit(Vec128< uint64_t, N > v, Vec128< uint64_t, N > bit)
Definition arm_neon-inl.h:3254
HWY_API VFromD< D > InterleaveWholeLower(D, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2883
HWY_API Vec128< T, N > NegMulAdd(Vec128< T, N > mul, Vec128< T, N > x, Vec128< T, N > add)
Definition arm_neon-inl.h:2556
HWY_API Vec128< T, N > RotateRight(const Vec128< T, N > v)
Definition arm_neon-inl.h:2158
HWY_API Mask128< T, 1 > SetOnlyFirst(Mask128< T, 1 > mask)
Definition arm_neon-inl.h:9356
HWY_API Vec128< T, N > TableLookupLanes(Vec128< T, N > v, Indices128< T, N > idx)
Definition arm_neon-inl.h:5775
HWY_API Vec128< T, N > Xor(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2739
HWY_API Vec128< float, N > Floor(const Vec128< float, N > v)
Definition arm_neon-inl.h:5054
HWY_API intptr_t FindFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8377
HWY_API void MaskedScatterIndex(VFromD< D > v, MFromD< D > m, D d, T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2661
HWY_API size_t CappedLanes(D, size_t cap)
Definition rvv-inl.h:603
HWY_API VFromD< D > MaskedGatherIndexOr(VFromD< D > no, MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2753
HWY_API size_t CompressStore(VFromD< D > v, MFromD< D > mask, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8946
HWY_API Vec128< T, N > AverageRound(Vec128< T, N > a, Vec128< T, N > b)
Definition emu128-inl.h:657
constexpr size_t MLenFromD(Simd< T, N, kPow2 >)
Definition rvv-inl.h:43
HWY_API Vec64< uint16_t > DemoteTo(D, Vec128< int32_t > v)
Definition arm_neon-inl.h:4629
HWY_API Vec128< float, N > Ceil(const Vec128< float, N > v)
Definition arm_neon-inl.h:5040
HWY_API Vec128< T, N > SwapAdjacentBlocks(Vec128< T, N > v)
Definition arm_neon-inl.h:7162
HWY_API VFromD< D > ConcatUpperLower(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6989
HWY_API Vec64< uint32_t > Shuffle2301(const Vec64< uint32_t > v)
Definition arm_neon-inl.h:3084
HWY_API Vec128< int16_t > MulOdd(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7645
HWY_API TFromD< D > ReduceMin(D d, VFromD< D > v)
Definition arm_sve-inl.h:3208
HWY_INLINE MFromD< D > Eq128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9444
HWY_API Mask128< T, N > ExclusiveNeither(const Mask128< T, N > a, Mask128< T, N > b)
Definition arm_neon-inl.h:3072
HWY_API V MaskedSatSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1525
Rebind< MakeUnsigned< TFromD< D > >, D > RebindToUnsigned
Definition ops/shared-inl.h:465
HWY_INLINE Vec128< T, N > CompressBits(Vec128< T, N > v, const uint8_t *HWY_RESTRICT bits)
Definition arm_neon-inl.h:8932
HWY_API VFromD< D > InterleaveWholeUpper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_sve-inl.h:2890
Simd< typename V::PrivateT, V::kPrivateN, 0 > DFromV
Definition arm_neon-inl.h:885
HWY_API VFromD< D > ReverseBlocks(D, VFromD< D > v)
Definition arm_neon-inl.h:7169
HWY_API Vec128< T, N > OrAnd(Vec128< T, N > o, Vec128< T, N > a1, Vec128< T, N > a2)
Definition arm_neon-inl.h:2779
HWY_API Vec128< T, N > IfNegativeThenElse(Vec128< T, N > v, Vec128< T, N > yes, Vec128< T, N > no)
Definition arm_neon-inl.h:3030
HWY_API VFromD< D > LoadDup128(D d, const TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3682
HWY_API bool AllTrue(D d, Mask128< T > m)
Definition arm_neon-inl.h:8416
HWY_API T ExtractLane(const Vec128< T, 1 > v, size_t i)
Definition arm_neon-inl.h:1657
HWY_API Vec128< T > InterleaveLower(Vec128< T > a, Vec128< T > b)
Definition arm_neon-inl.h:6046
HWY_API V MaskedSatAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1520
HWY_API V MaskedSubOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1499
HWY_API VFromD< D > Iota(D d, const T2 first)
Definition arm_neon-inl.h:1297
HWY_API Vec128< int16_t > MulEven(Vec128< int8_t > a, Vec128< int8_t > b)
Definition arm_neon-inl.h:7538
HWY_API void BlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT p)
Definition arm_neon-inl.h:3918
HWY_API VFromD< D > GatherIndex(D d, const TFromD< D > *HWY_RESTRICT p, VFromD< RebindToSigned< D > > indices)
Definition arm_sve-inl.h:1963
HWY_API Vec128< T > Shuffle01(Vec128< T > v)
Definition arm_neon-inl.h:6012
HWY_API svbool_t Gt(const V a, const V b)
Definition arm_sve-inl.h:1578
HWY_API Indices128< TFromD< D >, MaxLanes(D())> IndicesFromVec(D d, Vec128< TI, MaxLanes(D())> vec)
Definition arm_neon-inl.h:5727
HWY_API VFromD< D > SumOfLanes(D d, VFromD< D > v)
Definition arm_sve-inl.h:3220
HWY_API VFromD< D > ShiftRightLanes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5286
HWY_API Vec128< uint16_t > PromoteTo(D, Vec64< uint8_t > v)
Definition arm_neon-inl.h:4252
HWY_API VFromD< DW > ZipUpper(DW dw, V a, V b)
Definition arm_neon-inl.h:6122
HWY_API Vec128< T, N > ShiftRight(Vec128< T, N > v)
Definition emu128-inl.h:488
HWY_API Vec128< T > Shuffle0123(Vec128< T > v)
Definition arm_neon-inl.h:6030
D TFromD< D > *HWY_RESTRICT VFromD< RebindToSigned< D > > indices
Definition arm_sve-inl.h:1916
HWY_API V Sub(V a, V b)
Definition generic_ops-inl.h:7304
Rebind< MakeSigned< TFromD< D > >, D > RebindToSigned
Definition ops/shared-inl.h:463
HWY_API MFromD< DTo > RebindMask(DTo, Mask128< TFrom, NFrom > m)
Definition arm_neon-inl.h:2969
HWY_API Indices128< TFromD< D >, MaxLanes(D())> SetTableIndices(D d, const TI *idx)
Definition arm_neon-inl.h:5768
typename detail::ScalableTagChecker< T, kPow2 >::type ScalableTag
Definition ops/shared-inl.h:367
HWY_API VFromD< D > GatherOffset(D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > offset)
Definition generic_ops-inl.h:2694
HWY_API size_t CompressBitsStore(VFromD< D > v, const uint8_t *HWY_RESTRICT bits, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8970
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
HWY_API svbool_t LowerHalfOfMask(D, svbool_t m)
Definition arm_sve-inl.h:1456
HWY_API VFromD< D32 > WidenMulPairwiseAdd(D32 df32, VFromD< Repartition< bfloat16_t, D32 > > a, VFromD< Repartition< bfloat16_t, D32 > > b)
Definition arm_neon-inl.h:6776
HWY_API Vec128< T, N > ShiftRightSame(const Vec128< T, N > v, int bits)
Definition arm_neon-inl.h:2336
HWY_INLINE MFromD< D > Lt128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9409
HWY_API VI TableLookupBytesOr0(V bytes, VI from)
Definition arm_neon-inl.h:7806
HWY_API VFromD< D > Reverse8(D d, VFromD< D > v)
Definition arm_neon-inl.h:5935
HWY_API V Div(V a, V b)
Definition arm_sve-inl.h:4639
HWY_API Vec128< T, N > IfThenElseZero(Mask128< T, N > mask, Vec128< T, N > yes)
Definition arm_neon-inl.h:3007
HWY_API V ExtractBlock(V v)
Definition generic_ops-inl.h:6967
typename D::Half Half
Definition ops/shared-inl.h:487
HWY_API Vec128< T, N > Or(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2727
HWY_API void Stream(const VFromD< D > v, D d, TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3932
typename V::PrivateT TFromV
Definition arm_neon-inl.h:891
HWY_API VFromD< D > LoadN(D d, const TFromD< D > *HWY_RESTRICT p, size_t max_lanes_to_load)
Definition emu128-inl.h:1352
HWY_API auto Le(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7353
HWY_API V MaskedAddOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1494
HWY_API Vec128< int32_t, N > NearestInt(const Vec128< float, N > v)
Definition arm_neon-inl.h:5084
HWY_API TFromV< V > GetLane(const V v)
Definition arm_neon-inl.h:1648
HWY_API bool AllFalse(D d, MFromD< D > m)
Definition arm_neon-inl.h:8410
HWY_API VFromD< D > MaskedGatherIndex(MFromD< D > m, D d, const T *HWY_RESTRICT base, VFromD< RebindToSigned< D > > index)
Definition generic_ops-inl.h:2731
HWY_API VFromD< D > Reverse4(D d, VFromD< D > v)
Definition arm_neon-inl.h:5900
HWY_API Vec128< T, N > Or3(Vec128< T, N > o1, Vec128< T, N > o2, Vec128< T, N > o3)
Definition arm_neon-inl.h:2773
V Shr(V a, V b)
Definition generic_ops-inl.h:7326
HWY_API Vec128< T, N > OddEven(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:7107
HWY_API svbool_t IsNegative(V v)
Definition arm_sve-inl.h:1623
HWY_API Vec128< int16_t > MulFixedPoint15(Vec128< int16_t > a, Vec128< int16_t > b)
Definition arm_neon-inl.h:2473
HWY_API VFromD< D > Dup128VecFromValues(D d, TFromD< D > t0, TFromD< D > t1, TFromD< D > t2, TFromD< D > t3, TFromD< D > t4, TFromD< D > t5, TFromD< D > t6, TFromD< D > t7, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >, TFromD< D >)
Definition arm_neon-inl.h:984
HWY_API Vec128< float, N > Trunc(const Vec128< float, N > v)
Definition arm_neon-inl.h:5012
decltype(Set(D(), TFromD< D >())) VFromD
Definition arm_neon-inl.h:944
HWY_API Vec128< TI > TableLookupBytes(Vec128< T > bytes, Vec128< TI > from)
Definition arm_neon-inl.h:7754
HWY_API V BroadcastBlock(V v)
Definition generic_ops-inl.h:6973
HWY_API VFromD< D > Slide1Down(D d, VFromD< D > v)
Definition arm_sve-inl.h:3653
HWY_INLINE MFromD< D > Eq128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9451
HWY_API V MaskedMulOr(V no, M m, V a, V b)
Definition arm_sve-inl.h:1504
HWY_API VFromD< D > InterleaveEven(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7126
HWY_API Vec128< float, N > RearrangeToOddPlusEven(Vec128< float, N > sum0, Vec128< float, N > sum1)
Definition arm_neon-inl.h:6687
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API Mask128< T, N > SetBeforeFirst(Mask128< T, N > mask)
Definition arm_neon-inl.h:9351
HWY_API VFromD< D > ConcatUpperUpper(D d, VFromD< D > hi, VFromD< D > lo)
Definition arm_neon-inl.h:6940
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API void StoreInterleaved2(VFromD< D > v0, VFromD< D > v1, D d, T *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:9221
HWY_INLINE MFromD< D > Ne128(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9459
HWY_API auto Ne(V a, V b) -> decltype(a==b)
Definition generic_ops-inl.h:7335
HWY_API Mask128< T, 1 > SetAtOrBeforeFirst(Mask128< T, 1 >)
Definition arm_neon-inl.h:9393
HWY_API TFromD< D > ReduceSum(D, VFromD< D > v)
Definition arm_neon-inl.h:8027
HWY_API VFromD< D > SlideUpLanes(D, VFromD< D > v, size_t)
Definition arm_neon-inl.h:6221
HWY_API Vec128< T, N > AndNot(const Vec128< T, N > not_mask, const Vec128< T, N > mask)
Definition arm_neon-inl.h:2705
HWY_API Vec128< uint64_t > SumsOf8(const Vec128< uint8_t > v)
Definition arm_neon-inl.h:1942
HWY_API MFromD< D > Dup128MaskFromMaskBits(D d, unsigned mask_bits)
Definition arm_neon-inl.h:8103
HWY_API Vec128< int32_t > ReorderDemote2To(D d32, Vec128< int64_t > a, Vec128< int64_t > b)
Definition arm_neon-inl.h:7185
HWY_API VFromD< D > InterleaveOdd(D, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:7141
typename D::Twice Twice
Definition ops/shared-inl.h:491
HWY_API svbool_t IsFinite(const V v)
Definition arm_sve-inl.h:1725
HWY_API MFromD< D > UpperHalfOfMask(D, MFromD< Twice< D > > m)
Definition x86_128-inl.h:1051
HWY_API V Mul(V a, V b)
Definition generic_ops-inl.h:7309
HWY_API Vec128< T, 1 > Broadcast(Vec128< T, 1 > v)
Definition arm_neon-inl.h:5387
HWY_API V Expand(V v, svbool_t mask)
Definition arm_sve-inl.h:5240
HWY_API MFromD< D > CombineMasks(D, MFromD< Half< D > > hi, MFromD< Half< D > > lo)
Definition x86_128-inl.h:959
HWY_INLINE VFromD< D > Min128Upper(D d, VFromD< D > a, VFromD< D > b)
Definition arm_neon-inl.h:9485
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API Vec128< T, 1 > InsertLane(const Vec128< T, 1 > v, size_t i, T t)
Definition arm_neon-inl.h:1806
HWY_API Vec128< float16_t, N > Neg(const Vec128< float16_t, N > v)
Definition arm_neon-inl.h:2079
typename D::template Repartition< T > Repartition
Definition ops/shared-inl.h:471
decltype(MaskFromVec(VFromD< D >())) MFromD
Definition arm_neon-inl.h:2957
HWY_API Vec128< T, N > CopySign(Vec128< T, N > magn, Vec128< T, N > sign)
Definition arm_neon-inl.h:2924
HWY_API Vec128< T, N > ShiftLeft(Vec128< T, N > v)
Definition emu128-inl.h:476
HWY_API svbool_t PromoteMaskTo(DTo, DFrom, svbool_t m)
Definition arm_sve-inl.h:1394
HWY_API VFromD< D > ShiftLeftBytes(D d, VFromD< D > v)
Definition arm_neon-inl.h:5258
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
HWY_API VFromD< D > Reverse2(D d, VFromD< D > v)
Definition arm_neon-inl.h:5860
HWY_API V InsertBlock(V, V blk_to_insert)
Definition generic_ops-inl.h:6961
HWY_API V CombineShiftRightLanes(const D d, const V hi, V lo)
Definition rvv-inl.h:3761
HWY_API Vec128< T, N/2 > LowerHalf(Vec128< T, N > v)
Definition arm_neon-inl.h:5103
HWY_API VFromD< D > ResizeBitCast(D d, FromV v)
Definition arm_neon-inl.h:1591
HWY_API size_t FindKnownFirstTrue(D d, MFromD< D > mask)
Definition arm_neon-inl.h:8370
HWY_API V Mod(V a, V b)
Definition arm_sve-inl.h:4660
HWY_API svbool_t Ge(const V a, const V b)
Definition arm_sve-inl.h:1582
Definition abort.h:8
HWY_API constexpr TTo ConvertScalarTo(const TFrom in)
Definition base.h:2435
typename detail::Relations< T >::Unsigned MakeUnsigned
Definition base.h:2078
HWY_API constexpr bool IsSame()
Definition base.h:499
constexpr size_t FloorLog2(TI x)
Definition base.h:2662
typename IfT< Condition, Then, Else >::type If
Definition base.h:520
HWY_INLINE HWY_BITCASTSCALAR_CONSTEXPR T MantissaEnd()
Definition base.h:2307
constexpr size_t CeilLog2(TI x)
Definition base.h:2669
typename EnableIfT< Condition >::type EnableIf
Definition base.h:486
typename detail::Relations< T >::Wide MakeWide
Definition base.h:2086
#define HWY_IF_T_SIZE_D(D, bytes)
Definition ops/shared-inl.h:549
#define HWY_IF_UNSIGNED_D(D)
Definition ops/shared-inl.h:531
#define HWY_IF_POW2_GT_D(D, pow2)
Definition ops/shared-inl.h:574
#define HWY_IF_T_SIZE_ONE_OF_V(V, bit_array)
Definition ops/shared-inl.h:628
#define HWY_IF_POW2_LE_D(D, pow2)
Definition ops/shared-inl.h:573
#define HWY_IF_T_SIZE_V(V, bytes)
Definition ops/shared-inl.h:624
#define HWY_IF_SPECIAL_FLOAT_D(D)
Definition ops/shared-inl.h:540
#define HWY_MAX_LANES_V(V)
Definition ops/shared-inl.h:631
#define HWY_IF_F32_D(D)
Definition ops/shared-inl.h:600
#define HWY_IF_T_SIZE_GT_D(D, bytes)
Definition ops/shared-inl.h:557
#define HWY_IF_UNSIGNED_V(V)
Definition ops/shared-inl.h:613
#define HWY_MAX_LANES_D(D)
Definition ops/shared-inl.h:432
#define HWY_IF_T_SIZE_LE_D(D, bytes)
Definition ops/shared-inl.h:555
#define HWY_IF_T_SIZE_ONE_OF_D(D, bit_array)
Definition ops/shared-inl.h:553
#define HWY_RVV_FOREACH_I163264(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:379
#define HWY_RVV_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2025
HWY_AFTER_NAMESPACE()
#define HWY_RVV_LOAD_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:4672
#define HWY_RVV_MASKED_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2001
#define HWY_RVV_PROMOTE_X2(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition rvv-inl.h:2123
#define HWY_RVV_CAST_U8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:774
#define HWY_RVV_RETM_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1437
#define HWY_RVV_FOREACH_U163264(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:374
#define HWY_RVV_UNDEFINED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:706
#define HWY_RVV_MASKED_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1780
#define HWY_RVV_FOREACH_I64(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:329
#define HWY_RVV_FOREACH_I16(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:325
#define HWY_RVV_LOADN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1826
#define HWY_RVV_SCATTER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1977
#define HWY_RVV_CAST_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:817
#define HWY_RVV_FOREACH_U64(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:319
#define HWY_RVV_RETV_ARGMVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:639
#define HWY_RVV_FOREACH_F(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:401
#define HWY_RVV_BLENDED_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1889
#define HWY_RVV_PROMOTE_X4(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition rvv-inl.h:2130
#define HWY_RVV_REDUCE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:4363
#define HWY_RVV_CAST_VIRT_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:833
#define HWY_RVV_SLIDE1(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3112
#define HWY_RVV_NEAREST(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2933
#define HWY_RVV_STOREN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1913
#define HWY_RVV_EXT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:738
#define HWY_RVV_SHIFT_II(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1239
#define HWY_RVV_FMA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1406
#define HWY_RVV_PROMOTE_X4_FROM_U8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition rvv-inl.h:2136
#define HWY_RVV_DEMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2242
#define HWY_RVV_LANES(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:489
#define HWY_RVV_PROMOTE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2081
#define HWY_RVV_DEMOTE_TO_SHR_16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2854
#define HWY_RVV_SHIFT_VV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1229
#define HWY_RVV_SLIDE_DOWN(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2989
#define HWY_RVV_SHIFT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1143
#define HWY_RVV_LANES_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:569
#define HWY_RVV_VEC_FROM_MASK(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1629
#define HWY_RVV_PROMOTE_X8(OP, BASE, CHAR, BITS, BASE_IN, BITS_IN)
Definition rvv-inl.h:2140
#define HWY_RVV_IF_THEN_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1539
#define HWY_RVV_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3310
#define HWY_RVV_INSERT_VXRM(vxrm, avl)
Definition rvv-inl.h:1123
#define HWY_RVV_FOREACH_B(X_MACRO, NAME, OP)
Definition rvv-inl.h:85
#define HWY_RVV_RETV_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:622
#define HWY_RVV_RETV_AVERAGE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1127
#define HWY_RVV_FOREACH_I32(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:327
#define HWY_RVV_MASKED_TABLE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3338
#define HWY_RVV_SLIDE_UP(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2980
#define HWY_RVV_RETM_ARGM(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:649
#define HWY_RVV_RETV_ARGV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:615
#define HWY_RVV_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3326
#define HWY_RVV_STORE_STRIDED(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:4732
#define HWY_RVV_LOAD_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_FOREACH_F16_UNCONDITIONAL(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:335
#define HWY_RVV_MASKED_TABLE16(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3350
#define HWY_RVV_FOREACH_F64(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:350
#define HWY_RVV_ALL_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1695
#define HWY_RVV_FOREACH_U08(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:313
#define HWY_RVV_MASKED_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:923
#define HWY_RVV_COMPRESS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3553
HWY_BEFORE_NAMESPACE()
#define HWY_RVV_FOREACH(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:410
#define HWY_RVV_SET_AT_OR_AFTER_FIRST(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3171
#define HWY_RVV_SET(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:658
#define HWY_RVV_FOREACH_UI08(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:354
#define HWY_RVV_MUL15(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1311
#define HWY_RVV_DEMOTE_F(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2754
#define HWY_RVV_TRUNC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:728
#define HWY_RVV_FOREACH_F32(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:348
#define HWY_RVV_CAST_I8(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:788
#define HWY_RVV_RETM_ARGMM(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1516
#define HWY_RVV_FOREACH_I(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:397
#define HWY_RVV_LOAD(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1753
#define HWY_RVV_DEMOTE_I_TO_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2257
#define HWY_RVV_FOREACH_U32(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:317
#define HWY_RVV_IOTA(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:911
#define HWY_RVV_CAST_VIRT_IF(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:849
#define HWY_RVV_IF_THEN_ZERO_ELSE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1560
#define HWY_RVV_STORE_MASK_BITS(SEW, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_FIND_FIRST_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1672
#define HWY_RVV_RETV_ARGV2(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
#define HWY_RVV_EXT_VIRT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:752
#define HWY_RVV_CAST_U(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:803
#define HWY_RVV_STORE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1869
#define HWY_RVV_NARROW(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3673
#define HWY_RVV_GET_LANE(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:3149
#define HWY_RVV_WIDEN_MACC(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:5618
#define HWY_RVV_RETM_ARGVS(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1446
#define HWY_RVV_FOREACH_U(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:393
#define HWY_RVV_FOREACH_UI(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:406
#define HWY_RVV_CONVERT(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2905
#define HWY_RVV_MASKED_GATHER(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:2050
#define HWY_RVV_RETV_ARGVV(BASE, CHAR, SEW, SEWD, SEWH, LMUL, LMULD, LMULH, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:630
#define HWY_RVV_FOREACH_U16(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:315
#define HWY_RVV_COUNT_TRUE(SEW, SHIFT, MLEN, NAME, OP)
Definition rvv-inl.h:1707
#define HWY_RVV_FOREACH_I08(X_MACRO, NAME, OP, LMULS)
Definition rvv-inl.h:323
#define HWY_NAMESPACE
Definition set_macros-inl.h:166
Definition tuple-inl.h:30
Definition tuple-inl.h:36
Definition tuple-inl.h:43
Definition ops/shared-inl.h:198
constexpr int Pow2() const
Definition ops/shared-inl.h:253
Definition base.h:694
Definition base.h:1594
Definition base.h:1117
int VFromD
Definition tuple-inl.h:25
HWY_API Vec2< D > Create2(D, VFromD< D > v0, VFromD< D > v1)
Definition tuple-inl.h:52
HWY_API Vec4< D > Create4(D, VFromD< D > v0, VFromD< D > v1, VFromD< D > v2, VFromD< D > v3)
Definition tuple-inl.h:62
HWY_API Vec3< D > Create3(D, VFromD< D > v0, VFromD< D > v1, VFromD< D > v2)
Definition tuple-inl.h:57