bit_pack-inl.h
1// Copyright 2022 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target include guard
17#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == defined(HWY_TARGET_TOGGLE)
18#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
19#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
20#else
21#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
22#endif
23
24#include "hwy/highway.h"
25
26HWY_BEFORE_NAMESPACE();
27namespace hwy {
28namespace HWY_NAMESPACE {
29
30// The entry points are class templates specialized below for each number of
31// bits. Each provides Pack and Unpack member functions which load (Pack) or
32// store (Unpack) B raw vectors, and store (Pack) or load (Unpack) a number of
33// packed vectors equal to kBits. B denotes the bits per lane: 8 for Pack8, 16
34// for Pack16, which is also the upper bound for kBits.
35template <size_t kBits> // <= 8
36struct Pack8 {};
37template <size_t kBits> // <= 16
38struct Pack16 {};
39
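// Example (illustrative sketch, not part of the original header): one call to
// Pack8<3>().Pack consumes 8 * Lanes(d8) raw bytes (each value assumed to fit
// in 3 bits) and produces 3 * Lanes(d8) packed bytes; Unpack is the inverse.
// A caller would loop over a buffer in such blocks, roughly:
//   const ScalableTag<uint8_t> d8;
//   const size_t N8 = Lanes(d8);
//   for (size_t i = 0, o = 0; i + 8 * N8 <= num; i += 8 * N8, o += 3 * N8) {
//     Pack8<3>().Pack(d8, raw + i, packed + o);
//   }
// Pack16<kBits> follows the same pattern with 16 uint16_t vectors per block.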
40template <>
41struct Pack8<1> {
42 template <class D8>
43 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
44 uint8_t* HWY_RESTRICT packed_out) const {
45 const RepartitionToWide<decltype(d8)> d16;
46 using VU16 = Vec<decltype(d16)>;
47 const size_t N8 = Lanes(d8);
48 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
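    // Scalar equivalent (each input byte is 0 or 1): for every byte index i,
    // packed[i] = raw[0*N8+i] | (raw[1*N8+i] << 1) | ... | (raw[7*N8+i] << 7).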
49 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
50 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
51 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
52 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
53 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
54 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
55 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
56 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
57
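    // The shifted operands occupy disjoint bit positions, so Xor3 acts as a
    // three-input Or; on targets with a ternary-logic op it is one instruction.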
58 const VU16 packed =
59 Xor3(Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)),
60 Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)),
61 Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0));
62 StoreU(BitCast(d8, packed), d8, packed_out);
63 }
64
65 template <class D8>
66 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
67 uint8_t* HWY_RESTRICT raw) const {
68 const RepartitionToWide<decltype(d8)> d16;
69 using VU16 = Vec<decltype(d16)>;
70 const size_t N8 = Lanes(d8);
71 const VU16 mask = Set(d16, 0x0101u); // LSB in each byte
72
73 const VU16 packed = BitCast(d16, LoadU(d8, packed_in));
74
75 const VU16 raw0 = And(packed, mask);
76 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
77
78 const VU16 raw1 = And(ShiftRight<1>(packed), mask);
79 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
80
81 const VU16 raw2 = And(ShiftRight<2>(packed), mask);
82 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
83
84 const VU16 raw3 = And(ShiftRight<3>(packed), mask);
85 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
86
87 const VU16 raw4 = And(ShiftRight<4>(packed), mask);
88 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
89
90 const VU16 raw5 = And(ShiftRight<5>(packed), mask);
91 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
92
93 const VU16 raw6 = And(ShiftRight<6>(packed), mask);
94 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
95
96 const VU16 raw7 = And(ShiftRight<7>(packed), mask);
97 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
98 }
99}; // Pack8<1>
100
101template <>
102struct Pack8<2> {
103 template <class D8>
104 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
105 uint8_t* HWY_RESTRICT packed_out) const {
106 const RepartitionToWide<decltype(d8)> d16;
107 using VU16 = Vec<decltype(d16)>;
108 const size_t N8 = Lanes(d8);
109 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
110 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
111 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
112 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
113 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
114 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
115 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
116 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
117 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
118
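    // Layout per byte, MSB to LSB: packed0 = raw6:raw4:raw2:raw0 and
    // packed1 = raw7:raw5:raw3:raw1 (2 bits per field).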
119 const VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4),
120 Or(ShiftLeft<2>(raw2), raw0));
121 const VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5),
122 Or(ShiftLeft<2>(raw3), raw1));
123 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
124 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
125 }
126
127 template <class D8>
128 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
129 uint8_t* HWY_RESTRICT raw) const {
130 const RepartitionToWide<decltype(d8)> d16;
131 using VU16 = Vec<decltype(d16)>;
132 const size_t N8 = Lanes(d8);
133 const VU16 mask = Set(d16, 0x0303u); // Lowest 2 bits per byte
134
135 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
136 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
137
138 const VU16 raw0 = And(packed0, mask);
139 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
140
141 const VU16 raw1 = And(packed1, mask);
142 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
143
144 const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
145 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
146
147 const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
148 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
149
150 const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
151 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
152
153 const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
154 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
155
156 const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
157 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
158
159 const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
160 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
161 }
162}; // Pack8<2>
163
164template <>
165struct Pack8<3> {
166 template <class D8>
167 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
168 uint8_t* HWY_RESTRICT packed_out) const {
169 const RepartitionToWide<decltype(d8)> d16;
170 using VU16 = Vec<decltype(d16)>;
171 const size_t N8 = Lanes(d8);
172 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
173 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
174 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
175 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
176 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
177 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
178 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
179 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
180
181 // The upper two bits of these three will be filled with packed3 (6 bits).
182 VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0);
183 VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1);
184 VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2);
185 const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3);
186
187 const VU16 hi2 = Set(d16, 0xC0C0u);
188 packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2);
189 packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2);
190 packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2);
191 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
192 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
193 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
194 }
195
196 template <class D8>
197 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
198 uint8_t* HWY_RESTRICT raw) const {
199 const RepartitionToWide<decltype(d8)> d16;
200 using VU16 = Vec<decltype(d16)>;
201 const size_t N8 = Lanes(d8);
202 const VU16 mask = Set(d16, 0x0707u); // Lowest 3 bits per byte
203
204 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
205 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
206 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
207
208 const VU16 raw0 = And(packed0, mask);
209 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
210
211 const VU16 raw1 = And(packed1, mask);
212 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
213
214 const VU16 raw2 = And(packed2, mask);
215 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
216
217 const VU16 raw4 = And(ShiftRight<3>(packed0), mask);
218 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
219
220 const VU16 raw5 = And(ShiftRight<3>(packed1), mask);
221 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
222
223 const VU16 raw6 = And(ShiftRight<3>(packed2), mask);
224 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
225
226 // raw73 is the concatenation of the upper two bits in packed0..2.
227 const VU16 hi2 = Set(d16, 0xC0C0u);
228 const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)), //
229 ShiftRight<4>(And(packed1, hi2)),
230 ShiftRight<2>(And(packed0, hi2)));
231
232 const VU16 raw3 = And(mask, raw73);
233 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
234
235 const VU16 raw7 = And(mask, ShiftRight<3>(raw73));
236 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
237 }
238}; // Pack8<3>
239
240template <>
241struct Pack8<4> {
242 template <class D8>
243 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
244 uint8_t* HWY_RESTRICT packed_out) const {
245 const RepartitionToWide<decltype(d8)> d16;
246 using VU16 = Vec<decltype(d16)>;
247 const size_t N8 = Lanes(d8);
248 // 16-bit shifts avoid masking (bits will not cross 8-bit lanes).
249 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
250 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
251 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
252 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
253 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
254 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
255 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
256 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
257
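    // Each packed byte holds two nibbles (high:low): packed0 = raw2:raw0,
    // packed1 = raw3:raw1, packed2 = raw6:raw4, packed3 = raw7:raw5.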
258 const VU16 packed0 = Or(ShiftLeft<4>(raw2), raw0);
259 const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw1);
260 const VU16 packed2 = Or(ShiftLeft<4>(raw6), raw4);
261 const VU16 packed3 = Or(ShiftLeft<4>(raw7), raw5);
262
263 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
264 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
265 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
266 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
267 }
268
269 template <class D8>
270 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
271 uint8_t* HWY_RESTRICT raw) const {
272 const RepartitionToWide<decltype(d8)> d16;
273 using VU16 = Vec<decltype(d16)>;
274 const size_t N8 = Lanes(d8);
275 const VU16 mask = Set(d16, 0x0F0Fu); // Lowest 4 bits per byte
276
277 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
278 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
279 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
280 const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
281
282 const VU16 raw0 = And(packed0, mask);
283 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
284
285 const VU16 raw1 = And(packed1, mask);
286 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
287
288 const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
289 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
290
291 const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
292 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
293
294 const VU16 raw4 = And(packed2, mask);
295 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
296
297 const VU16 raw5 = And(packed3, mask);
298 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
299
300 const VU16 raw6 = And(ShiftRight<4>(packed2), mask);
301 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
302
303 const VU16 raw7 = And(ShiftRight<4>(packed3), mask);
304 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
305 }
306}; // Pack8<4>
307
308template <>
309struct Pack8<5> {
310 template <class D8>
311 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
312 uint8_t* HWY_RESTRICT packed_out) const {
313 const RepartitionToWide<decltype(d8)> d16;
314 using VU16 = Vec<decltype(d16)>;
315 const size_t N8 = Lanes(d8);
316 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
317 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
318 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
319 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
320 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
321 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
322 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
323 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
324
325 // Fill upper three bits with upper bits from raw4..7.
326 const VU16 hi3 = Set(d16, 0xE0E0u);
327 const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3);
328 const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3);
329 const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3);
330 const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3);
331
332 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
333 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
334 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
335 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
336
337 // Combine lower two bits of raw4..7 into packed4.
338 const VU16 lo2 = Set(d16, 0x0303u);
339 const VU16 packed4 = Or(And(raw4, lo2), Xor3(ShiftLeft<2>(And(raw5, lo2)),
340 ShiftLeft<4>(And(raw6, lo2)),
341 ShiftLeft<6>(And(raw7, lo2))));
342 StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
343 }
344
345 template <class D8>
346 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
347 uint8_t* HWY_RESTRICT raw) const {
348 const RepartitionToWide<decltype(d8)> d16;
349 using VU16 = Vec<decltype(d16)>;
350 const size_t N8 = Lanes(d8);
351
352 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
353 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
354 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
355 const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
356 const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
357
358 const VU16 mask = Set(d16, 0x1F1Fu); // Lowest 5 bits per byte
359
360 const VU16 raw0 = And(packed0, mask);
361 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
362
363 const VU16 raw1 = And(packed1, mask);
364 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
365
366 const VU16 raw2 = And(packed2, mask);
367 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
368
369 const VU16 raw3 = And(packed3, mask);
370 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
371
372 // The top 3 bits of packed0..3 hold the upper 3 bits of raw4..7; shift
373 // them back down into bits 2..4.
373 const VU16 top4 = ShiftRight<3>(AndNot(mask, packed0));
374 const VU16 top5 = ShiftRight<3>(AndNot(mask, packed1));
375 const VU16 top6 = ShiftRight<3>(AndNot(mask, packed2));
376 const VU16 top7 = ShiftRight<3>(AndNot(mask, packed3));
377
378 // Insert the lower 2 bits, which were concatenated into a byte.
379 const VU16 lo2 = Set(d16, 0x0303u);
380 const VU16 raw4 = OrAnd(top4, lo2, packed4);
381 const VU16 raw5 = OrAnd(top5, lo2, ShiftRight<2>(packed4));
382 const VU16 raw6 = OrAnd(top6, lo2, ShiftRight<4>(packed4));
383 const VU16 raw7 = OrAnd(top7, lo2, ShiftRight<6>(packed4));
384
385 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
386 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
387 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
388 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
389 }
390}; // Pack8<5>
391
392template <>
393struct Pack8<6> {
394 template <class D8>
395 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
396 uint8_t* HWY_RESTRICT packed_out) const {
397 const RepartitionToWide<decltype(d8)> d16;
398 using VU16 = Vec<decltype(d16)>;
399 const size_t N8 = Lanes(d8);
400 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
401 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
402 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
403 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
404 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
405 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
406 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
407 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
408
409 const VU16 hi2 = Set(d16, 0xC0C0u);
410 // Each triplet of these stores raw3/raw7 (6 bits) in the upper 2 bits.
411 const VU16 packed0 = OrAnd(raw0, ShiftLeft<2>(raw3), hi2);
412 const VU16 packed1 = OrAnd(raw1, ShiftLeft<4>(raw3), hi2);
413 const VU16 packed2 = OrAnd(raw2, ShiftLeft<6>(raw3), hi2);
414 const VU16 packed3 = OrAnd(raw4, ShiftLeft<2>(raw7), hi2);
415 const VU16 packed4 = OrAnd(raw5, ShiftLeft<4>(raw7), hi2);
416 const VU16 packed5 = OrAnd(raw6, ShiftLeft<6>(raw7), hi2);
417
418 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
419 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
420 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
421 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
422 StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
423 StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
424 }
425
426 template <class D8>
427 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
428 uint8_t* HWY_RESTRICT raw) const {
429 const RepartitionToWide<decltype(d8)> d16;
430 using VU16 = Vec<decltype(d16)>;
431 const size_t N8 = Lanes(d8);
432 const VU16 mask = Set(d16, 0x3F3Fu); // Lowest 6 bits per byte
433
434 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
435 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
436 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
437 const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
438 const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
439 const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
440
441 const VU16 raw0 = And(packed0, mask);
442 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
443
444 const VU16 raw1 = And(packed1, mask);
445 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
446
447 const VU16 raw2 = And(packed2, mask);
448 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
449
450 const VU16 raw4 = And(packed3, mask);
451 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
452
453 const VU16 raw5 = And(packed4, mask);
454 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
455
456 const VU16 raw6 = And(packed5, mask);
457 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
458
459 // raw3/raw7 are the concatenations of the upper two bits in packed0..2
460 // and packed3..5, respectively.
460 const VU16 raw3 = Xor3(ShiftRight<6>(AndNot(mask, packed2)),
461 ShiftRight<4>(AndNot(mask, packed1)),
462 ShiftRight<2>(AndNot(mask, packed0)));
463 const VU16 raw7 = Xor3(ShiftRight<6>(AndNot(mask, packed5)),
464 ShiftRight<4>(AndNot(mask, packed4)),
465 ShiftRight<2>(AndNot(mask, packed3)));
466 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
467 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
468 }
469}; // Pack8<6>
470
471template <>
472struct Pack8<7> {
473 template <class D8>
474 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
475 uint8_t* HWY_RESTRICT packed_out) const {
476 const RepartitionToWide<decltype(d8)> d16;
477 using VU16 = Vec<decltype(d16)>;
478 const size_t N8 = Lanes(d8);
479 const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
480 const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
481 const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
482 const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
483 const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
484 const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
485 const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
486 // Inserted into top bit of packed0..6.
487 const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));
488
489 const VU16 hi1 = Set(d16, 0x8080u);
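    // Add(raw7, raw7) doubles the value, i.e. it is equivalent to ShiftLeft<1>.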
490 const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1);
491 const VU16 packed1 = OrAnd(raw1, ShiftLeft<2>(raw7), hi1);
492 const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw7), hi1);
493 const VU16 packed3 = OrAnd(raw3, ShiftLeft<4>(raw7), hi1);
494 const VU16 packed4 = OrAnd(raw4, ShiftLeft<5>(raw7), hi1);
495 const VU16 packed5 = OrAnd(raw5, ShiftLeft<6>(raw7), hi1);
496 const VU16 packed6 = OrAnd(raw6, ShiftLeft<7>(raw7), hi1);
497
498 StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
499 StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
500 StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
501 StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
502 StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
503 StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
504 StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8);
505 }
506
507 template <class D8>
508 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
509 uint8_t* HWY_RESTRICT raw) const {
510 const RepartitionToWide<decltype(d8)> d16;
511 using VU16 = Vec<decltype(d16)>;
512 const size_t N8 = Lanes(d8);
513
514 const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
515 const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
516 const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
517 const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
518 const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
519 const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
520 const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8));
521
522 const VU16 mask = Set(d16, 0x7F7Fu); // Lowest 7 bits per byte
523
524 const VU16 raw0 = And(packed0, mask);
525 StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
526
527 const VU16 raw1 = And(packed1, mask);
528 StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
529
530 const VU16 raw2 = And(packed2, mask);
531 StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
532
533 const VU16 raw3 = And(packed3, mask);
534 StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
535
536 const VU16 raw4 = And(packed4, mask);
537 StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
538
539 const VU16 raw5 = And(packed5, mask);
540 StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
541
542 const VU16 raw6 = And(packed6, mask);
543 StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
544
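    // raw7 is reassembled from the top bit of each of packed0..6.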
545 const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)),
546 ShiftRight<6>(AndNot(mask, packed5)),
547 ShiftRight<5>(AndNot(mask, packed4)));
548 const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)),
549 ShiftRight<3>(AndNot(mask, packed2)),
550 ShiftRight<2>(AndNot(mask, packed1)));
551 const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1);
552 StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
553 }
554}; // Pack8<7>
555
556template <>
557struct Pack8<8> {
558 template <class D8>
559 HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
560 uint8_t* HWY_RESTRICT packed_out) const {
561 using VU8 = Vec<decltype(d8)>;
562 const size_t N8 = Lanes(d8);
563 const VU8 raw0 = LoadU(d8, raw + 0 * N8);
564 const VU8 raw1 = LoadU(d8, raw + 1 * N8);
565 const VU8 raw2 = LoadU(d8, raw + 2 * N8);
566 const VU8 raw3 = LoadU(d8, raw + 3 * N8);
567 const VU8 raw4 = LoadU(d8, raw + 4 * N8);
568 const VU8 raw5 = LoadU(d8, raw + 5 * N8);
569 const VU8 raw6 = LoadU(d8, raw + 6 * N8);
570 const VU8 raw7 = LoadU(d8, raw + 7 * N8);
571
572 StoreU(raw0, d8, packed_out + 0 * N8);
573 StoreU(raw1, d8, packed_out + 1 * N8);
574 StoreU(raw2, d8, packed_out + 2 * N8);
575 StoreU(raw3, d8, packed_out + 3 * N8);
576 StoreU(raw4, d8, packed_out + 4 * N8);
577 StoreU(raw5, d8, packed_out + 5 * N8);
578 StoreU(raw6, d8, packed_out + 6 * N8);
579 StoreU(raw7, d8, packed_out + 7 * N8);
580 }
581
582 template <class D8>
583 HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
584 uint8_t* HWY_RESTRICT raw) const {
585 using VU8 = Vec<decltype(d8)>;
586 const size_t N8 = Lanes(d8);
587 const VU8 raw0 = LoadU(d8, packed_in + 0 * N8);
588 const VU8 raw1 = LoadU(d8, packed_in + 1 * N8);
589 const VU8 raw2 = LoadU(d8, packed_in + 2 * N8);
590 const VU8 raw3 = LoadU(d8, packed_in + 3 * N8);
591 const VU8 raw4 = LoadU(d8, packed_in + 4 * N8);
592 const VU8 raw5 = LoadU(d8, packed_in + 5 * N8);
593 const VU8 raw6 = LoadU(d8, packed_in + 6 * N8);
594 const VU8 raw7 = LoadU(d8, packed_in + 7 * N8);
595
596 StoreU(raw0, d8, raw + 0 * N8);
597 StoreU(raw1, d8, raw + 1 * N8);
598 StoreU(raw2, d8, raw + 2 * N8);
599 StoreU(raw3, d8, raw + 3 * N8);
600 StoreU(raw4, d8, raw + 4 * N8);
601 StoreU(raw5, d8, raw + 5 * N8);
602 StoreU(raw6, d8, raw + 6 * N8);
603 StoreU(raw7, d8, raw + 7 * N8);
604 }
605}; // Pack8<8>
606
607template <>
608struct Pack16<1> {
609 template <class D>
610 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
611 uint16_t* HWY_RESTRICT packed_out) const {
612 using VU16 = Vec<decltype(d)>;
613 const size_t N = Lanes(d);
614 const VU16 raw0 = LoadU(d, raw + 0 * N);
615 const VU16 raw1 = LoadU(d, raw + 1 * N);
616 const VU16 raw2 = LoadU(d, raw + 2 * N);
617 const VU16 raw3 = LoadU(d, raw + 3 * N);
618 const VU16 raw4 = LoadU(d, raw + 4 * N);
619 const VU16 raw5 = LoadU(d, raw + 5 * N);
620 const VU16 raw6 = LoadU(d, raw + 6 * N);
621 const VU16 raw7 = LoadU(d, raw + 7 * N);
622 const VU16 raw8 = LoadU(d, raw + 8 * N);
623 const VU16 raw9 = LoadU(d, raw + 9 * N);
624 const VU16 rawA = LoadU(d, raw + 0xA * N);
625 const VU16 rawB = LoadU(d, raw + 0xB * N);
626 const VU16 rawC = LoadU(d, raw + 0xC * N);
627 const VU16 rawD = LoadU(d, raw + 0xD * N);
628 const VU16 rawE = LoadU(d, raw + 0xE * N);
629 const VU16 rawF = LoadU(d, raw + 0xF * N);
630
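    // Combining via a tree of Xor3 (p0..p4, then a final merge) keeps the
    // dependency chain short; the terms are disjoint, so Xor equals Or.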
631 const VU16 p0 = Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0);
632 const VU16 p1 =
633 Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3));
634 const VU16 p2 =
635 Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6));
636 const VU16 p3 =
637 Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9));
638 const VU16 p4 =
639 Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC));
640 const VU16 packed =
641 Or(Xor3(ShiftLeft<0xF>(rawF), p0, p1), Xor3(p2, p3, p4));
642 StoreU(packed, d, packed_out);
643 }
644
645 template <class D>
646 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
647 uint16_t* HWY_RESTRICT raw) const {
648 using VU16 = Vec<decltype(d)>;
649 const size_t N = Lanes(d);
650 const VU16 mask = Set(d, 1u); // Lowest bit
651
652 const VU16 packed = LoadU(d, packed_in);
653
654 const VU16 raw0 = And(packed, mask);
655 StoreU(raw0, d, raw + 0 * N);
656
657 const VU16 raw1 = And(ShiftRight<1>(packed), mask);
658 StoreU(raw1, d, raw + 1 * N);
659
660 const VU16 raw2 = And(ShiftRight<2>(packed), mask);
661 StoreU(raw2, d, raw + 2 * N);
662
663 const VU16 raw3 = And(ShiftRight<3>(packed), mask);
664 StoreU(raw3, d, raw + 3 * N);
665
666 const VU16 raw4 = And(ShiftRight<4>(packed), mask);
667 StoreU(raw4, d, raw + 4 * N);
668
669 const VU16 raw5 = And(ShiftRight<5>(packed), mask);
670 StoreU(raw5, d, raw + 5 * N);
671
672 const VU16 raw6 = And(ShiftRight<6>(packed), mask);
673 StoreU(raw6, d, raw + 6 * N);
674
675 const VU16 raw7 = And(ShiftRight<7>(packed), mask);
676 StoreU(raw7, d, raw + 7 * N);
677
678 const VU16 raw8 = And(ShiftRight<8>(packed), mask);
679 StoreU(raw8, d, raw + 8 * N);
680
681 const VU16 raw9 = And(ShiftRight<9>(packed), mask);
682 StoreU(raw9, d, raw + 9 * N);
683
684 const VU16 rawA = And(ShiftRight<0xA>(packed), mask);
685 StoreU(rawA, d, raw + 0xA * N);
686
687 const VU16 rawB = And(ShiftRight<0xB>(packed), mask);
688 StoreU(rawB, d, raw + 0xB * N);
689
690 const VU16 rawC = And(ShiftRight<0xC>(packed), mask);
691 StoreU(rawC, d, raw + 0xC * N);
692
693 const VU16 rawD = And(ShiftRight<0xD>(packed), mask);
694 StoreU(rawD, d, raw + 0xD * N);
695
696 const VU16 rawE = And(ShiftRight<0xE>(packed), mask);
697 StoreU(rawE, d, raw + 0xE * N);
698
699 const VU16 rawF = ShiftRight<0xF>(packed);
700 StoreU(rawF, d, raw + 0xF * N);
701 }
702}; // Pack16<1>
703
704template <>
705struct Pack16<2> {
706 template <class D>
707 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
708 uint16_t* HWY_RESTRICT packed_out) const {
709 using VU16 = Vec<decltype(d)>;
710 const size_t N = Lanes(d);
711 const VU16 raw0 = LoadU(d, raw + 0 * N);
712 const VU16 raw1 = LoadU(d, raw + 1 * N);
713 const VU16 raw2 = LoadU(d, raw + 2 * N);
714 const VU16 raw3 = LoadU(d, raw + 3 * N);
715 const VU16 raw4 = LoadU(d, raw + 4 * N);
716 const VU16 raw5 = LoadU(d, raw + 5 * N);
717 const VU16 raw6 = LoadU(d, raw + 6 * N);
718 const VU16 raw7 = LoadU(d, raw + 7 * N);
719 const VU16 raw8 = LoadU(d, raw + 8 * N);
720 const VU16 raw9 = LoadU(d, raw + 9 * N);
721 const VU16 rawA = LoadU(d, raw + 0xA * N);
722 const VU16 rawB = LoadU(d, raw + 0xB * N);
723 const VU16 rawC = LoadU(d, raw + 0xC * N);
724 const VU16 rawD = LoadU(d, raw + 0xD * N);
725 const VU16 rawE = LoadU(d, raw + 0xE * N);
726 const VU16 rawF = LoadU(d, raw + 0xF * N);
727
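    // packed0 gathers the even-indexed raw vectors (raw0, raw2, .., rawE) and
    // packed1 the odd-indexed ones, two bits per field from LSB upward.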
728 VU16 packed0 = Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0);
729 VU16 packed1 = Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1);
730 packed0 = Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6));
731 packed1 = Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7));
732
733 packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA));
734 packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB));
735
736 packed0 = Or(packed0, ShiftLeft<14>(rawE));
737 packed1 = Or(packed1, ShiftLeft<14>(rawF));
738 StoreU(packed0, d, packed_out + 0 * N);
739 StoreU(packed1, d, packed_out + 1 * N);
740 }
741
742 template <class D>
743 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
744 uint16_t* HWY_RESTRICT raw) const {
745 using VU16 = Vec<decltype(d)>;
746 const size_t N = Lanes(d);
747 const VU16 mask = Set(d, 0x3u); // Lowest 2 bits
748
749 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
750 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
751
752 const VU16 raw0 = And(packed0, mask);
753 StoreU(raw0, d, raw + 0 * N);
754
755 const VU16 raw1 = And(packed1, mask);
756 StoreU(raw1, d, raw + 1 * N);
757
758 const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
759 StoreU(raw2, d, raw + 2 * N);
760
761 const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
762 StoreU(raw3, d, raw + 3 * N);
763
764 const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
765 StoreU(raw4, d, raw + 4 * N);
766
767 const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
768 StoreU(raw5, d, raw + 5 * N);
769
770 const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
771 StoreU(raw6, d, raw + 6 * N);
772
773 const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
774 StoreU(raw7, d, raw + 7 * N);
775
776 const VU16 raw8 = And(ShiftRight<8>(packed0), mask);
777 StoreU(raw8, d, raw + 8 * N);
778
779 const VU16 raw9 = And(ShiftRight<8>(packed1), mask);
780 StoreU(raw9, d, raw + 9 * N);
781
782 const VU16 rawA = And(ShiftRight<0xA>(packed0), mask);
783 StoreU(rawA, d, raw + 0xA * N);
784
785 const VU16 rawB = And(ShiftRight<0xA>(packed1), mask);
786 StoreU(rawB, d, raw + 0xB * N);
787
788 const VU16 rawC = And(ShiftRight<0xC>(packed0), mask);
789 StoreU(rawC, d, raw + 0xC * N);
790
791 const VU16 rawD = And(ShiftRight<0xC>(packed1), mask);
792 StoreU(rawD, d, raw + 0xD * N);
793
794 const VU16 rawE = ShiftRight<0xE>(packed0);
795 StoreU(rawE, d, raw + 0xE * N);
796
797 const VU16 rawF = ShiftRight<0xE>(packed1);
798 StoreU(rawF, d, raw + 0xF * N);
799 }
800}; // Pack16<2>
801
802template <>
803struct Pack16<3> {
804 template <class D>
805 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
806 uint16_t* HWY_RESTRICT packed_out) const {
807 using VU16 = Vec<decltype(d)>;
808 const size_t N = Lanes(d);
809 const VU16 raw0 = LoadU(d, raw + 0 * N);
810 const VU16 raw1 = LoadU(d, raw + 1 * N);
811 const VU16 raw2 = LoadU(d, raw + 2 * N);
812 const VU16 raw3 = LoadU(d, raw + 3 * N);
813 const VU16 raw4 = LoadU(d, raw + 4 * N);
814 const VU16 raw5 = LoadU(d, raw + 5 * N);
815 const VU16 raw6 = LoadU(d, raw + 6 * N);
816 const VU16 raw7 = LoadU(d, raw + 7 * N);
817 const VU16 raw8 = LoadU(d, raw + 8 * N);
818 const VU16 raw9 = LoadU(d, raw + 9 * N);
819 const VU16 rawA = LoadU(d, raw + 0xA * N);
820 const VU16 rawB = LoadU(d, raw + 0xB * N);
821 const VU16 rawC = LoadU(d, raw + 0xC * N);
822 const VU16 rawD = LoadU(d, raw + 0xD * N);
823 const VU16 rawE = LoadU(d, raw + 0xE * N);
824 const VU16 rawF = LoadU(d, raw + 0xF * N);
825
826 // We can fit 15 raw vectors in three packed vectors (five each).
827 VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0);
828 VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1);
829 VU16 packed2 = Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2);
830
831 // rawF will be scattered into the upper bit of these three.
832 packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9));
833 packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA));
834 packed2 = Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB));
835
836 const VU16 hi1 = Set(d, 0x8000u);
837 packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask
838 packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
839 packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
840 StoreU(packed0, d, packed_out + 0 * N);
841 StoreU(packed1, d, packed_out + 1 * N);
842 StoreU(packed2, d, packed_out + 2 * N);
843 }
844
845 template <class D>
846 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
847 uint16_t* HWY_RESTRICT raw) const {
848 using VU16 = Vec<decltype(d)>;
849 const size_t N = Lanes(d);
850 const VU16 mask = Set(d, 0x7u); // Lowest 3 bits
851
852 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
853 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
854 const VU16 packed2 = LoadU(d, packed_in + 2 * N);
855
856 const VU16 raw0 = And(mask, packed0);
857 StoreU(raw0, d, raw + 0 * N);
858
859 const VU16 raw1 = And(mask, packed1);
860 StoreU(raw1, d, raw + 1 * N);
861
862 const VU16 raw2 = And(mask, packed2);
863 StoreU(raw2, d, raw + 2 * N);
864
865 const VU16 raw3 = And(mask, ShiftRight<3>(packed0));
866 StoreU(raw3, d, raw + 3 * N);
867
868 const VU16 raw4 = And(mask, ShiftRight<3>(packed1));
869 StoreU(raw4, d, raw + 4 * N);
870
871 const VU16 raw5 = And(mask, ShiftRight<3>(packed2));
872 StoreU(raw5, d, raw + 5 * N);
873
874 const VU16 raw6 = And(mask, ShiftRight<6>(packed0));
875 StoreU(raw6, d, raw + 6 * N);
876
877 const VU16 raw7 = And(mask, ShiftRight<6>(packed1));
878 StoreU(raw7, d, raw + 7 * N);
879
880 const VU16 raw8 = And(mask, ShiftRight<6>(packed2));
881 StoreU(raw8, d, raw + 8 * N);
882
883 const VU16 raw9 = And(mask, ShiftRight<9>(packed0));
884 StoreU(raw9, d, raw + 9 * N);
885
886 const VU16 rawA = And(mask, ShiftRight<9>(packed1));
887 StoreU(rawA, d, raw + 0xA * N);
888
889 const VU16 rawB = And(mask, ShiftRight<9>(packed2));
890 StoreU(rawB, d, raw + 0xB * N);
891
892 const VU16 rawC = And(mask, ShiftRight<12>(packed0));
893 StoreU(rawC, d, raw + 0xC * N);
894
895 const VU16 rawD = And(mask, ShiftRight<12>(packed1));
896 StoreU(rawD, d, raw + 0xD * N);
897
898 const VU16 rawE = And(mask, ShiftRight<12>(packed2));
899 StoreU(rawE, d, raw + 0xE * N);
900
901 // rawF is the concatenation of the upper bit of packed0..2.
902 const VU16 down0 = ShiftRight<15>(packed0);
903 const VU16 down1 = ShiftRight<15>(packed1);
904 const VU16 down2 = ShiftRight<15>(packed2);
905 const VU16 rawF = Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0);
906 StoreU(rawF, d, raw + 0xF * N);
907 }
908}; // Pack16<3>
909
910template <>
911struct Pack16<4> {
912 template <class D>
913 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
914 uint16_t* HWY_RESTRICT packed_out) const {
915 using VU16 = Vec<decltype(d)>;
916 const size_t N = Lanes(d);
917 const VU16 raw0 = LoadU(d, raw + 0 * N);
918 const VU16 raw1 = LoadU(d, raw + 1 * N);
919 const VU16 raw2 = LoadU(d, raw + 2 * N);
920 const VU16 raw3 = LoadU(d, raw + 3 * N);
921 const VU16 raw4 = LoadU(d, raw + 4 * N);
922 const VU16 raw5 = LoadU(d, raw + 5 * N);
923 const VU16 raw6 = LoadU(d, raw + 6 * N);
924 const VU16 raw7 = LoadU(d, raw + 7 * N);
925 const VU16 raw8 = LoadU(d, raw + 8 * N);
926 const VU16 raw9 = LoadU(d, raw + 9 * N);
927 const VU16 rawA = LoadU(d, raw + 0xA * N);
928 const VU16 rawB = LoadU(d, raw + 0xB * N);
929 const VU16 rawC = LoadU(d, raw + 0xC * N);
930 const VU16 rawD = LoadU(d, raw + 0xD * N);
931 const VU16 rawE = LoadU(d, raw + 0xE * N);
932 const VU16 rawF = LoadU(d, raw + 0xF * N);
933
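    // Nibble layout, MSB to LSB: packed0 = raw6:raw4:raw2:raw0,
    // packed1 = raw7:raw5:raw3:raw1, packed2 = rawE:rawC:rawA:raw8,
    // packed3 = rawF:rawD:rawB:raw9.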
934 VU16 packed0 = Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0);
935 VU16 packed1 = Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1);
936 packed0 = Or(packed0, ShiftLeft<12>(raw6));
937 packed1 = Or(packed1, ShiftLeft<12>(raw7));
938 VU16 packed2 = Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8);
939 VU16 packed3 = Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9);
940 packed2 = Or(packed2, ShiftLeft<12>(rawE));
941 packed3 = Or(packed3, ShiftLeft<12>(rawF));
942
943 StoreU(packed0, d, packed_out + 0 * N);
944 StoreU(packed1, d, packed_out + 1 * N);
945 StoreU(packed2, d, packed_out + 2 * N);
946 StoreU(packed3, d, packed_out + 3 * N);
947 }
948
949 template <class D>
950 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
951 uint16_t* HWY_RESTRICT raw) const {
952 using VU16 = Vec<decltype(d)>;
953 const size_t N = Lanes(d);
954 const VU16 mask = Set(d, 0xFu); // Lowest 4 bits
955
956 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
957 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
958 const VU16 packed2 = LoadU(d, packed_in + 2 * N);
959 const VU16 packed3 = LoadU(d, packed_in + 3 * N);
960
961 const VU16 raw0 = And(packed0, mask);
962 StoreU(raw0, d, raw + 0 * N);
963
964 const VU16 raw1 = And(packed1, mask);
965 StoreU(raw1, d, raw + 1 * N);
966
967 const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
968 StoreU(raw2, d, raw + 2 * N);
969
970 const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
971 StoreU(raw3, d, raw + 3 * N);
972
973 const VU16 raw4 = And(ShiftRight<8>(packed0), mask);
974 StoreU(raw4, d, raw + 4 * N);
975
976 const VU16 raw5 = And(ShiftRight<8>(packed1), mask);
977 StoreU(raw5, d, raw + 5 * N);
978
979 const VU16 raw6 = ShiftRight<12>(packed0); // no mask required
980 StoreU(raw6, d, raw + 6 * N);
981
982 const VU16 raw7 = ShiftRight<12>(packed1); // no mask required
983 StoreU(raw7, d, raw + 7 * N);
984
985 const VU16 raw8 = And(packed2, mask);
986 StoreU(raw8, d, raw + 8 * N);
987
988 const VU16 raw9 = And(packed3, mask);
989 StoreU(raw9, d, raw + 9 * N);
990
991 const VU16 rawA = And(ShiftRight<4>(packed2), mask);
992 StoreU(rawA, d, raw + 0xA * N);
993
994 const VU16 rawB = And(ShiftRight<4>(packed3), mask);
995 StoreU(rawB, d, raw + 0xB * N);
996
997 const VU16 rawC = And(ShiftRight<8>(packed2), mask);
998 StoreU(rawC, d, raw + 0xC * N);
999
1000 const VU16 rawD = And(ShiftRight<8>(packed3), mask);
1001 StoreU(rawD, d, raw + 0xD * N);
1002
1003 const VU16 rawE = ShiftRight<12>(packed2); // no mask required
1004 StoreU(rawE, d, raw + 0xE * N);
1005
1006 const VU16 rawF = ShiftRight<12>(packed3); // no mask required
1007 StoreU(rawF, d, raw + 0xF * N);
1008 }
1009}; // Pack16<4>
1010
1011template <>
1012struct Pack16<5> {
1013 template <class D>
1014 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1015 uint16_t* HWY_RESTRICT packed_out) const {
1016 using VU16 = Vec<decltype(d)>;
1017 const size_t N = Lanes(d);
1018 const VU16 raw0 = LoadU(d, raw + 0 * N);
1019 const VU16 raw1 = LoadU(d, raw + 1 * N);
1020 const VU16 raw2 = LoadU(d, raw + 2 * N);
1021 const VU16 raw3 = LoadU(d, raw + 3 * N);
1022 const VU16 raw4 = LoadU(d, raw + 4 * N);
1023 const VU16 raw5 = LoadU(d, raw + 5 * N);
1024 const VU16 raw6 = LoadU(d, raw + 6 * N);
1025 const VU16 raw7 = LoadU(d, raw + 7 * N);
1026 const VU16 raw8 = LoadU(d, raw + 8 * N);
1027 const VU16 raw9 = LoadU(d, raw + 9 * N);
1028 const VU16 rawA = LoadU(d, raw + 0xA * N);
1029 const VU16 rawB = LoadU(d, raw + 0xB * N);
1030 const VU16 rawC = LoadU(d, raw + 0xC * N);
1031 const VU16 rawD = LoadU(d, raw + 0xD * N);
1032 const VU16 rawE = LoadU(d, raw + 0xE * N);
1033 const VU16 rawF = LoadU(d, raw + 0xF * N);
1034
1035 // We can fit 15 raw vectors in five packed vectors (three each).
1036 VU16 packed0 = Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0);
1037 VU16 packed1 = Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1);
1038 VU16 packed2 = Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2);
1039 VU16 packed3 = Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3);
1040 VU16 packed4 = Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4);
1041
1042 // rawF will be scattered into the upper bits of these five.
1043 const VU16 hi1 = Set(d, 0x8000u);
1044 packed0 = Or(packed0, ShiftLeft<15>(rawF)); // MSB only, no mask
1045 packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
1046 packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
1047 packed3 = OrAnd(packed3, ShiftLeft<12>(rawF), hi1);
1048 packed4 = OrAnd(packed4, ShiftLeft<11>(rawF), hi1);
1049
1050 StoreU(packed0, d, packed_out + 0 * N);
1051 StoreU(packed1, d, packed_out + 1 * N);
1052 StoreU(packed2, d, packed_out + 2 * N);
1053 StoreU(packed3, d, packed_out + 3 * N);
1054 StoreU(packed4, d, packed_out + 4 * N);
1055 }
1056
1057 template <class D>
1058 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1059 uint16_t* HWY_RESTRICT raw) const {
1060 using VU16 = Vec<decltype(d)>;
1061 const size_t N = Lanes(d);
1062
1063 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
1064 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
1065 const VU16 packed2 = LoadU(d, packed_in + 2 * N);
1066 const VU16 packed3 = LoadU(d, packed_in + 3 * N);
1067 const VU16 packed4 = LoadU(d, packed_in + 4 * N);
1068
1069 const VU16 mask = Set(d, 0x1Fu); // Lowest 5 bits
1070
1071 const VU16 raw0 = And(packed0, mask);
1072 StoreU(raw0, d, raw + 0 * N);
1073
1074 const VU16 raw1 = And(packed1, mask);
1075 StoreU(raw1, d, raw + 1 * N);
1076
1077 const VU16 raw2 = And(packed2, mask);
1078 StoreU(raw2, d, raw + 2 * N);
1079
1080 const VU16 raw3 = And(packed3, mask);
1081 StoreU(raw3, d, raw + 3 * N);
1082
1083 const VU16 raw4 = And(packed4, mask);
1084 StoreU(raw4, d, raw + 4 * N);
1085
1086 const VU16 raw5 = And(ShiftRight<5>(packed0), mask);
1087 StoreU(raw5, d, raw + 5 * N);
1088
1089 const VU16 raw6 = And(ShiftRight<5>(packed1), mask);
1090 StoreU(raw6, d, raw + 6 * N);
1091
1092 const VU16 raw7 = And(ShiftRight<5>(packed2), mask);
1093 StoreU(raw7, d, raw + 7 * N);
1094
1095 const VU16 raw8 = And(ShiftRight<5>(packed3), mask);
1096 StoreU(raw8, d, raw + 8 * N);
1097
1098 const VU16 raw9 = And(ShiftRight<5>(packed4), mask);
1099 StoreU(raw9, d, raw + 9 * N);
1100
1101 const VU16 rawA = And(ShiftRight<10>(packed0), mask);
1102 StoreU(rawA, d, raw + 0xA * N);
1103
1104 const VU16 rawB = And(ShiftRight<10>(packed1), mask);
1105 StoreU(rawB, d, raw + 0xB * N);
1106
1107 const VU16 rawC = And(ShiftRight<10>(packed2), mask);
1108 StoreU(rawC, d, raw + 0xC * N);
1109
1110 const VU16 rawD = And(ShiftRight<10>(packed3), mask);
1111 StoreU(rawD, d, raw + 0xD * N);
1112
1113 const VU16 rawE = And(ShiftRight<10>(packed4), mask);
1114 StoreU(rawE, d, raw + 0xE * N);
1115
1116 // rawF is the concatenation of the upper bit of packed0..4.
1117 const VU16 down0 = ShiftRight<15>(packed0);
1118 const VU16 down1 = ShiftRight<15>(packed1);
1119 const VU16 hi1 = Set(d, 0x8000u);
1120 const VU16 p0 =
1121 Xor3(ShiftRight<13>(And(packed2, hi1)), Add(down1, down1), down0);
1122 const VU16 rawF = Xor3(ShiftRight<11>(And(packed4, hi1)),
1123 ShiftRight<12>(And(packed3, hi1)), p0);
1124 StoreU(rawF, d, raw + 0xF * N);
1125 }
1126}; // Pack16<5>
1127
1128template <>
1129struct Pack16<6> {
1130 template <class D>
1131 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1132 uint16_t* HWY_RESTRICT packed_out) const {
1133 using VU16 = Vec<decltype(d)>;
1134 const size_t N = Lanes(d);
1135 const VU16 raw0 = LoadU(d, raw + 0 * N);
1136 const VU16 raw1 = LoadU(d, raw + 1 * N);
1137 const VU16 raw2 = LoadU(d, raw + 2 * N);
1138 const VU16 raw3 = LoadU(d, raw + 3 * N);
1139 const VU16 raw4 = LoadU(d, raw + 4 * N);
1140 const VU16 raw5 = LoadU(d, raw + 5 * N);
1141 const VU16 raw6 = LoadU(d, raw + 6 * N);
1142 const VU16 raw7 = LoadU(d, raw + 7 * N);
1143 const VU16 raw8 = LoadU(d, raw + 8 * N);
1144 const VU16 raw9 = LoadU(d, raw + 9 * N);
1145 const VU16 rawA = LoadU(d, raw + 0xA * N);
1146 const VU16 rawB = LoadU(d, raw + 0xB * N);
1147 const VU16 rawC = LoadU(d, raw + 0xC * N);
1148 const VU16 rawD = LoadU(d, raw + 0xD * N);
1149 const VU16 rawE = LoadU(d, raw + 0xE * N);
1150 const VU16 rawF = LoadU(d, raw + 0xF * N);
1151
1152 const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3);
1153 const VU16 packed7 = Or(ShiftLeft<6>(rawF), rawB);
1154 // Three vectors, two 6-bit raw each; packed3 (12 bits) is spread over the
1155 // four remainder bits at the top of each vector.
1156 const VU16 packed0 = Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0);
1157 VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1);
1158 VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2);
1159 const VU16 packed4 = Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8);
1160 VU16 packed5 = Or(ShiftLeft<6>(rawD), raw9);
1161 VU16 packed6 = Or(ShiftLeft<6>(rawE), rawA);
1162
1163 const VU16 hi4 = Set(d, 0xF000u);
1164 packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4);
1165 packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4);
1166 packed5 = OrAnd(packed5, ShiftLeft<8>(packed7), hi4);
1167 packed6 = OrAnd(packed6, ShiftLeft<4>(packed7), hi4);
1168
1169 StoreU(packed0, d, packed_out + 0 * N);
1170 StoreU(packed1, d, packed_out + 1 * N);
1171 StoreU(packed2, d, packed_out + 2 * N);
1172 StoreU(packed4, d, packed_out + 3 * N);
1173 StoreU(packed5, d, packed_out + 4 * N);
1174 StoreU(packed6, d, packed_out + 5 * N);
1175 }
1176
1177 template <class D>
1178 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1179 uint16_t* HWY_RESTRICT raw) const {
1180 using VU16 = Vec<decltype(d)>;
1181 const size_t N = Lanes(d);
1182 const VU16 mask = Set(d, 0x3Fu); // Lowest 6 bits
1183
1184 const VU16 packed0 = LoadU(d, packed_in + 0 * N);
1185 const VU16 packed1 = LoadU(d, packed_in + 1 * N);
1186 const VU16 packed2 = LoadU(d, packed_in + 2 * N);
1187 const VU16 packed4 = LoadU(d, packed_in + 3 * N);
1188 const VU16 packed5 = LoadU(d, packed_in + 4 * N);
1189 const VU16 packed6 = LoadU(d, packed_in + 5 * N);
1190
1191 const VU16 raw0 = And(packed0, mask);
1192 StoreU(raw0, d, raw + 0 * N);
1193
1194 const VU16 raw1 = And(packed1, mask);
1195 StoreU(raw1, d, raw + 1 * N);
1196
1197 const VU16 raw2 = And(packed2, mask);
1198 StoreU(raw2, d, raw + 2 * N);
1199
1200 const VU16 raw4 = And(ShiftRight<6>(packed0), mask);
1201 StoreU(raw4, d, raw + 4 * N);
1202
1203 const VU16 raw5 = And(ShiftRight<6>(packed1), mask);
1204 StoreU(raw5, d, raw + 5 * N);
1205
1206 const VU16 raw6 = And(ShiftRight<6>(packed2), mask);
1207 StoreU(raw6, d, raw + 6 * N);
1208
1209 const VU16 raw8 = And(packed4, mask);
1210 StoreU(raw8, d, raw + 8 * N);
1211
1212 const VU16 raw9 = And(packed5, mask);
1213 StoreU(raw9, d, raw + 9 * N);
1214
1215 const VU16 rawA = And(packed6, mask);
1216 StoreU(rawA, d, raw + 0xA * N);
1217
1218 const VU16 rawC = And(ShiftRight<6>(packed4), mask);
1219 StoreU(rawC, d, raw + 0xC * N);
1220
1221 const VU16 rawD = And(ShiftRight<6>(packed5), mask);
1222 StoreU(rawD, d, raw + 0xD * N);
1223
1224 const VU16 rawE = And(ShiftRight<6>(packed6), mask);
1225 StoreU(rawE, d, raw + 0xE * N);
1226
1227 // packed3/packed7 are the concatenations of the four upper bits in
1228 // packed0..2 and packed4..6, respectively.
1228 const VU16 down0 = ShiftRight<12>(packed0);
1229 const VU16 down4 = ShiftRight<12>(packed4);
1230 const VU16 hi4 = Set(d, 0xF000u);
1231 const VU16 packed3 = Xor3(ShiftRight<4>(And(packed2, hi4)),
1232 ShiftRight<8>(And(packed1, hi4)), down0);
1233 const VU16 packed7 = Xor3(ShiftRight<4>(And(packed6, hi4)),
1234 ShiftRight<8>(And(packed5, hi4)), down4);
1235 const VU16 raw3 = And(packed3, mask);
1236 StoreU(raw3, d, raw + 3 * N);
1237
1238 const VU16 rawB = And(packed7, mask);
1239 StoreU(rawB, d, raw + 0xB * N);
1240
1241 const VU16 raw7 = ShiftRight<6>(packed3); // upper bits already zero
1242 StoreU(raw7, d, raw + 7 * N);
1243
1244 const VU16 rawF = ShiftRight<6>(packed7); // upper bits already zero
1245 StoreU(rawF, d, raw + 0xF * N);
1246 }
1247}; // Pack16<6>
1248
1249template <>
1250struct Pack16<7> {
1251 template <class D>
1252 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1253 uint16_t* HWY_RESTRICT packed_out) const {
1254 using VU16 = Vec<decltype(d)>;
1255 const size_t N = Lanes(d);
1256 const VU16 raw0 = LoadU(d, raw + 0 * N);
1257 const VU16 raw1 = LoadU(d, raw + 1 * N);
1258 const VU16 raw2 = LoadU(d, raw + 2 * N);
1259 const VU16 raw3 = LoadU(d, raw + 3 * N);
1260 const VU16 raw4 = LoadU(d, raw + 4 * N);
1261 const VU16 raw5 = LoadU(d, raw + 5 * N);
1262 const VU16 raw6 = LoadU(d, raw + 6 * N);
1263 const VU16 raw7 = LoadU(d, raw + 7 * N);
1264 const VU16 raw8 = LoadU(d, raw + 8 * N);
1265 const VU16 raw9 = LoadU(d, raw + 9 * N);
1266 const VU16 rawA = LoadU(d, raw + 0xA * N);
1267 const VU16 rawB = LoadU(d, raw + 0xB * N);
1268 const VU16 rawC = LoadU(d, raw + 0xC * N);
1269 const VU16 rawD = LoadU(d, raw + 0xD * N);
1270 const VU16 rawE = LoadU(d, raw + 0xE * N);
1271 const VU16 rawF = LoadU(d, raw + 0xF * N);
1272
1273 const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7);
1274 // Seven vectors, two 7-bit raw each; packed7 (14 bits) is spread over the
1275 // two remainder bits at the top of each vector.
1276 const VU16 packed0 = Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0);
1277 VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1);
1278 VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2);
1279 VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3);
1280 VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4);
1281 VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5);
1282 VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6);
1283
1284 const VU16 hi2 = Set(d, 0xC000u);
1285 packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2);
1286 packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2);
1287 packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2);
1288 packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2);
1289 packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2);
1290 packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2);
1291
1292 StoreU(packed0, d, packed_out + 0 * N);
1293 StoreU(packed1, d, packed_out + 1 * N);
1294 StoreU(packed2, d, packed_out + 2 * N);
1295 StoreU(packed3, d, packed_out + 3 * N);
1296 StoreU(packed4, d, packed_out + 4 * N);
1297 StoreU(packed5, d, packed_out + 5 * N);
1298 StoreU(packed6, d, packed_out + 6 * N);
1299 }
1300
1301 template <class D>
1302 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1303 uint16_t* HWY_RESTRICT raw) const {
1304 using VU16 = Vec<decltype(d)>;
1305 const size_t N = Lanes(d);
1306
1307 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1308 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1309 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1310 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1311 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1312 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1313 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1314
1315 const VU16 mask = Set(d, 0x7Fu); // Lowest 7 bits
1316
1317 const VU16 raw0 = And(packed0, mask);
1318 StoreU(raw0, d, raw + 0 * N);
1319
1320 const VU16 raw1 = And(packed1, mask);
1321 StoreU(raw1, d, raw + 1 * N);
1322
1323 const VU16 raw2 = And(packed2, mask);
1324 StoreU(raw2, d, raw + 2 * N);
1325
1326 const VU16 raw3 = And(packed3, mask);
1327 StoreU(raw3, d, raw + 3 * N);
1328
1329 const VU16 raw4 = And(packed4, mask);
1330 StoreU(raw4, d, raw + 4 * N);
1331
1332 const VU16 raw5 = And(packed5, mask);
1333 StoreU(raw5, d, raw + 5 * N);
1334
1335 const VU16 raw6 = And(packed6, mask);
1336 StoreU(raw6, d, raw + 6 * N);
1337
1338 const VU16 raw8 = And(ShiftRight<7>(packed0), mask);
1339 StoreU(raw8, d, raw + 8 * N);
1340
1341 const VU16 raw9 = And(ShiftRight<7>(packed1), mask);
1342 StoreU(raw9, d, raw + 9 * N);
1343
1344 const VU16 rawA = And(ShiftRight<7>(packed2), mask);
1345 StoreU(rawA, d, raw + 0xA * N);
1346
1347 const VU16 rawB = And(ShiftRight<7>(packed3), mask);
1348 StoreU(rawB, d, raw + 0xB * N);
1349
1350 const VU16 rawC = And(ShiftRight<7>(packed4), mask);
1351 StoreU(rawC, d, raw + 0xC * N);
1352
1353 const VU16 rawD = And(ShiftRight<7>(packed5), mask);
1354 StoreU(rawD, d, raw + 0xD * N);
1355
1356 const VU16 rawE = And(ShiftRight<7>(packed6), mask);
1357 StoreU(rawE, d, raw + 0xE * N);
1358
1359 // packed7 is the concatenation of the two upper bits in packed0..6.
1360 const VU16 down0 = ShiftRight<14>(packed0);
1361 const VU16 hi2 = Set(d, 0xC000u);
1362 const VU16 p0 = Xor3(ShiftRight<12>(And(packed1, hi2)),
1363 ShiftRight<10>(And(packed2, hi2)), down0);
1364 const VU16 p1 = Xor3(ShiftRight<8>(And(packed3, hi2)), //
1365 ShiftRight<6>(And(packed4, hi2)),
1366 ShiftRight<4>(And(packed5, hi2)));
1367 const VU16 packed7 = Xor3(ShiftRight<2>(And(packed6, hi2)), p1, p0);
1368
1369 const VU16 raw7 = And(packed7, mask);
1370 StoreU(raw7, d, raw + 7 * N);
1371
1372 const VU16 rawF = ShiftRight<7>(packed7); // upper bits already zero
1373 StoreU(rawF, d, raw + 0xF * N);
1374 }
1375}; // Pack16<7>
1376
1377template <>
1378struct Pack16<8> {
1379 template <class D>
1380 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1381 uint16_t* HWY_RESTRICT packed_out) const {
1382 using VU16 = Vec<decltype(d)>;
1383 const size_t N = Lanes(d);
1384 const VU16 raw0 = LoadU(d, raw + 0 * N);
1385 const VU16 raw1 = LoadU(d, raw + 1 * N);
1386 const VU16 raw2 = LoadU(d, raw + 2 * N);
1387 const VU16 raw3 = LoadU(d, raw + 3 * N);
1388 const VU16 raw4 = LoadU(d, raw + 4 * N);
1389 const VU16 raw5 = LoadU(d, raw + 5 * N);
1390 const VU16 raw6 = LoadU(d, raw + 6 * N);
1391 const VU16 raw7 = LoadU(d, raw + 7 * N);
1392 const VU16 raw8 = LoadU(d, raw + 8 * N);
1393 const VU16 raw9 = LoadU(d, raw + 9 * N);
1394 const VU16 rawA = LoadU(d, raw + 0xA * N);
1395 const VU16 rawB = LoadU(d, raw + 0xB * N);
1396 const VU16 rawC = LoadU(d, raw + 0xC * N);
1397 const VU16 rawD = LoadU(d, raw + 0xD * N);
1398 const VU16 rawE = LoadU(d, raw + 0xE * N);
1399 const VU16 rawF = LoadU(d, raw + 0xF * N);
1400
1401 // This is equivalent to ConcatEven with 8-bit lanes, but much more
1402 // efficient on RVV and slightly less efficient on SVE2.
1403 const VU16 packed0 = Or(ShiftLeft<8>(raw2), raw0);
1404 const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw1);
1405 const VU16 packed2 = Or(ShiftLeft<8>(raw6), raw4);
1406 const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw5);
1407 const VU16 packed4 = Or(ShiftLeft<8>(rawA), raw8);
1408 const VU16 packed5 = Or(ShiftLeft<8>(rawB), raw9);
1409 const VU16 packed6 = Or(ShiftLeft<8>(rawE), rawC);
1410 const VU16 packed7 = Or(ShiftLeft<8>(rawF), rawD);
1411
1412 StoreU(packed0, d, packed_out + 0 * N);
1413 StoreU(packed1, d, packed_out + 1 * N);
1414 StoreU(packed2, d, packed_out + 2 * N);
1415 StoreU(packed3, d, packed_out + 3 * N);
1416 StoreU(packed4, d, packed_out + 4 * N);
1417 StoreU(packed5, d, packed_out + 5 * N);
1418 StoreU(packed6, d, packed_out + 6 * N);
1419 StoreU(packed7, d, packed_out + 7 * N);
1420 }
1421
1422 template <class D>
1423 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1424 uint16_t* HWY_RESTRICT raw) const {
1425 using VU16 = Vec<decltype(d)>;
1426 const size_t N = Lanes(d);
1427
1428 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1429 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1430 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1431 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1432 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1433 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1434 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1435 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1436 const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
1437
1438 const VU16 raw0 = And(packed0, mask);
1439 StoreU(raw0, d, raw + 0 * N);
1440
1441 const VU16 raw1 = And(packed1, mask);
1442 StoreU(raw1, d, raw + 1 * N);
1443
1444 const VU16 raw2 = ShiftRight<8>(packed0); // upper bits already zero
1445 StoreU(raw2, d, raw + 2 * N);
1446
1447 const VU16 raw3 = ShiftRight<8>(packed1); // upper bits already zero
1448 StoreU(raw3, d, raw + 3 * N);
1449
1450 const VU16 raw4 = And(packed2, mask);
1451 StoreU(raw4, d, raw + 4 * N);
1452
1453 const VU16 raw5 = And(packed3, mask);
1454 StoreU(raw5, d, raw + 5 * N);
1455
1456 const VU16 raw6 = ShiftRight<8>(packed2); // upper bits already zero
1457 StoreU(raw6, d, raw + 6 * N);
1458
1459 const VU16 raw7 = ShiftRight<8>(packed3); // upper bits already zero
1460 StoreU(raw7, d, raw + 7 * N);
1461
1462 const VU16 raw8 = And(packed4, mask);
1463 StoreU(raw8, d, raw + 8 * N);
1464
1465 const VU16 raw9 = And(packed5, mask);
1466 StoreU(raw9, d, raw + 9 * N);
1467
1468 const VU16 rawA = ShiftRight<8>(packed4); // upper bits already zero
1469 StoreU(rawA, d, raw + 0xA * N);
1470
1471 const VU16 rawB = ShiftRight<8>(packed5); // upper bits already zero
1472 StoreU(rawB, d, raw + 0xB * N);
1473
1474 const VU16 rawC = And(packed6, mask);
1475 StoreU(rawC, d, raw + 0xC * N);
1476
1477 const VU16 rawD = And(packed7, mask);
1478 StoreU(rawD, d, raw + 0xD * N);
1479
1480 const VU16 rawE = ShiftRight<8>(packed6); // upper bits already zero
1481 StoreU(rawE, d, raw + 0xE * N);
1482
1483 const VU16 rawF = ShiftRight<8>(packed7); // upper bits already zero
1484 StoreU(rawF, d, raw + 0xF * N);
1485 }
1486}; // Pack16<8>
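Per lane, the 8-bit case reduces to placing one raw value in each byte of a 16-bit lane, so a round trip is a shift plus a mask. The following scalar sketch is illustrative only (the helper names are not part of this header):

// Scalar model of one lane of Pack16<8> (illustrative, values assumed < 256).
static inline uint16_t Model8Pack(uint16_t lo, uint16_t hi) {  // e.g. raw0, raw2
  return static_cast<uint16_t>((hi << 8) | lo);  // matches Or(ShiftLeft<8>(raw2), raw0)
}
static inline void Model8Unpack(uint16_t packed, uint16_t* lo, uint16_t* hi) {
  *lo = static_cast<uint16_t>(packed & 0xFFu);  // And(packed0, mask)
  *hi = static_cast<uint16_t>(packed >> 8);     // ShiftRight<8>(packed0)
}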
1487
1488template <>
1489struct Pack16<9> {
1490 template <class D>
1491 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1492 uint16_t* HWY_RESTRICT packed_out) const {
1493 using VU16 = Vec<decltype(d)>;
1494 const size_t N = Lanes(d);
1495 const VU16 raw0 = LoadU(d, raw + 0 * N);
1496 const VU16 raw1 = LoadU(d, raw + 1 * N);
1497 const VU16 raw2 = LoadU(d, raw + 2 * N);
1498 const VU16 raw3 = LoadU(d, raw + 3 * N);
1499 const VU16 raw4 = LoadU(d, raw + 4 * N);
1500 const VU16 raw5 = LoadU(d, raw + 5 * N);
1501 const VU16 raw6 = LoadU(d, raw + 6 * N);
1502 const VU16 raw7 = LoadU(d, raw + 7 * N);
1503 const VU16 raw8 = LoadU(d, raw + 8 * N);
1504 const VU16 raw9 = LoadU(d, raw + 9 * N);
1505 const VU16 rawA = LoadU(d, raw + 0xA * N);
1506 const VU16 rawB = LoadU(d, raw + 0xB * N);
1507 const VU16 rawC = LoadU(d, raw + 0xC * N);
1508 const VU16 rawD = LoadU(d, raw + 0xD * N);
1509 const VU16 rawE = LoadU(d, raw + 0xE * N);
1510 const VU16 rawF = LoadU(d, raw + 0xF * N);
1511 // 8 vectors, each with 9+7 bits; top 2 bits are concatenated into packed8.
1512 const VU16 packed0 = Or(ShiftLeft<9>(raw8), raw0);
1513 const VU16 packed1 = Or(ShiftLeft<9>(raw9), raw1);
1514 const VU16 packed2 = Or(ShiftLeft<9>(rawA), raw2);
1515 const VU16 packed3 = Or(ShiftLeft<9>(rawB), raw3);
1516 const VU16 packed4 = Or(ShiftLeft<9>(rawC), raw4);
1517 const VU16 packed5 = Or(ShiftLeft<9>(rawD), raw5);
1518 const VU16 packed6 = Or(ShiftLeft<9>(rawE), raw6);
1519 const VU16 packed7 = Or(ShiftLeft<9>(rawF), raw7);
1520
1521 // We could shift down, OR and shift up, but two shifts are typically more
1522 // expensive than AND, shift into position, and OR (which can be further
1523 // reduced via Xor3).
1524 const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9
1525 const VU16 part8 = ShiftRight<7>(And(raw8, mid2));
1526 const VU16 part9 = ShiftRight<5>(And(raw9, mid2));
1527 const VU16 partA = ShiftRight<3>(And(rawA, mid2));
1528 const VU16 partB = ShiftRight<1>(And(rawB, mid2));
1529 const VU16 partC = ShiftLeft<1>(And(rawC, mid2));
1530 const VU16 partD = ShiftLeft<3>(And(rawD, mid2));
1531 const VU16 partE = ShiftLeft<5>(And(rawE, mid2));
1532 const VU16 partF = ShiftLeft<7>(And(rawF, mid2));
1533 const VU16 packed8 = Xor3(Xor3(part8, part9, partA),
1534 Xor3(partB, partC, partD), Or(partE, partF));
1535
1536 StoreU(packed0, d, packed_out + 0 * N);
1537 StoreU(packed1, d, packed_out + 1 * N);
1538 StoreU(packed2, d, packed_out + 2 * N);
1539 StoreU(packed3, d, packed_out + 3 * N);
1540 StoreU(packed4, d, packed_out + 4 * N);
1541 StoreU(packed5, d, packed_out + 5 * N);
1542 StoreU(packed6, d, packed_out + 6 * N);
1543 StoreU(packed7, d, packed_out + 7 * N);
1544 StoreU(packed8, d, packed_out + 8 * N);
1545 }
1546
1547 template <class D>
1548 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1549 uint16_t* HWY_RESTRICT raw) const {
1550 using VU16 = Vec<decltype(d)>;
1551 const size_t N = Lanes(d);
1552
1553 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1554 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1555 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1556 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1557 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1558 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1559 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1560 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1561 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
1562
1563 const VU16 mask = Set(d, 0x1FFu); // Lowest 9 bits
1564
1565 const VU16 raw0 = And(packed0, mask);
1566 StoreU(raw0, d, raw + 0 * N);
1567
1568 const VU16 raw1 = And(packed1, mask);
1569 StoreU(raw1, d, raw + 1 * N);
1570
1571 const VU16 raw2 = And(packed2, mask);
1572 StoreU(raw2, d, raw + 2 * N);
1573
1574 const VU16 raw3 = And(packed3, mask);
1575 StoreU(raw3, d, raw + 3 * N);
1576
1577 const VU16 raw4 = And(packed4, mask);
1578 StoreU(raw4, d, raw + 4 * N);
1579
1580 const VU16 raw5 = And(packed5, mask);
1581 StoreU(raw5, d, raw + 5 * N);
1582
1583 const VU16 raw6 = And(packed6, mask);
1584 StoreU(raw6, d, raw + 6 * N);
1585
1586 const VU16 raw7 = And(packed7, mask);
1587 StoreU(raw7, d, raw + 7 * N);
1588
1589 const VU16 mid2 = Set(d, 0x180u); // top 2 in lower 9
1590 const VU16 raw8 =
1591 OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2);
1592 const VU16 raw9 =
1593 OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2);
1594 const VU16 rawA =
1595 OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2);
1596 const VU16 rawB =
1597 OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2);
1598 const VU16 rawC =
1599 OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2);
1600 const VU16 rawD =
1601 OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2);
1602 const VU16 rawE =
1603 OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2);
1604 const VU16 rawF =
1605 OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2);
1606
1607 StoreU(raw8, d, raw + 8 * N);
1608 StoreU(raw9, d, raw + 9 * N);
1609 StoreU(rawA, d, raw + 0xA * N);
1610 StoreU(rawB, d, raw + 0xB * N);
1611 StoreU(rawC, d, raw + 0xC * N);
1612 StoreU(rawD, d, raw + 0xD * N);
1613 StoreU(rawE, d, raw + 0xE * N);
1614 StoreU(rawF, d, raw + 0xF * N);
1615 }
1616}; // Pack16<9>
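Per lane, the 9-bit layout can be written as a scalar formula: packed0..7 each hold one full 9-bit value plus the low 7 bits of another, while packed8 concatenates the top 2 bits of raw8..rawF as eight 2-bit fields, lowest field first. A sketch under that reading; the helper names are illustrative and values are assumed to fit in 9 bits:

// Scalar model of the 9-bit layout (illustrative, values assumed < 1u << 9).
static inline uint16_t Model9Main(const uint16_t raw[16], int i) {  // i < 8
  // Low 7 bits of raw[8 + i] survive the shift, as with ShiftLeft<9>.
  return static_cast<uint16_t>((raw[8 + i] << 9) | raw[i]);
}
static inline uint16_t Model9Top2(const uint16_t raw[16]) {  // = packed8
  uint16_t p = 0;
  for (int i = 0; i < 8; ++i) {
    p = static_cast<uint16_t>(p | (((raw[8 + i] >> 7) & 0x3u) << (2 * i)));
  }
  return p;
}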
1617
1618template <>
1619struct Pack16<10> {
1620 template <class D>
1621 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1622 uint16_t* HWY_RESTRICT packed_out) const {
1623 using VU16 = Vec<decltype(d)>;
1624 const size_t N = Lanes(d);
1625 const VU16 raw0 = LoadU(d, raw + 0 * N);
1626 const VU16 raw1 = LoadU(d, raw + 1 * N);
1627 const VU16 raw2 = LoadU(d, raw + 2 * N);
1628 const VU16 raw3 = LoadU(d, raw + 3 * N);
1629 const VU16 raw4 = LoadU(d, raw + 4 * N);
1630 const VU16 raw5 = LoadU(d, raw + 5 * N);
1631 const VU16 raw6 = LoadU(d, raw + 6 * N);
1632 const VU16 raw7 = LoadU(d, raw + 7 * N);
1633 const VU16 raw8 = LoadU(d, raw + 8 * N);
1634 const VU16 raw9 = LoadU(d, raw + 9 * N);
1635 const VU16 rawA = LoadU(d, raw + 0xA * N);
1636 const VU16 rawB = LoadU(d, raw + 0xB * N);
1637 const VU16 rawC = LoadU(d, raw + 0xC * N);
1638 const VU16 rawD = LoadU(d, raw + 0xD * N);
1639 const VU16 rawE = LoadU(d, raw + 0xE * N);
1640 const VU16 rawF = LoadU(d, raw + 0xF * N);
1641
1642 // 8 vectors, each with 10+6 bits; top 4 bits are concatenated into
1643 // packed8 and packed9.
1644 const VU16 packed0 = Or(ShiftLeft<10>(raw8), raw0);
1645 const VU16 packed1 = Or(ShiftLeft<10>(raw9), raw1);
1646 const VU16 packed2 = Or(ShiftLeft<10>(rawA), raw2);
1647 const VU16 packed3 = Or(ShiftLeft<10>(rawB), raw3);
1648 const VU16 packed4 = Or(ShiftLeft<10>(rawC), raw4);
1649 const VU16 packed5 = Or(ShiftLeft<10>(rawD), raw5);
1650 const VU16 packed6 = Or(ShiftLeft<10>(rawE), raw6);
1651 const VU16 packed7 = Or(ShiftLeft<10>(rawF), raw7);
1652
1653 // We could shift down, OR and shift up, but two shifts are typically more
1654 // expensive than AND, shift into position, and OR (which can be further
1655 // reduced via Xor3).
1656 const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10
1657 const VU16 part8 = ShiftRight<6>(And(raw8, mid4));
1658 const VU16 part9 = ShiftRight<2>(And(raw9, mid4));
1659 const VU16 partA = ShiftLeft<2>(And(rawA, mid4));
1660 const VU16 partB = ShiftLeft<6>(And(rawB, mid4));
1661 const VU16 partC = ShiftRight<6>(And(rawC, mid4));
1662 const VU16 partD = ShiftRight<2>(And(rawD, mid4));
1663 const VU16 partE = ShiftLeft<2>(And(rawE, mid4));
1664 const VU16 partF = ShiftLeft<6>(And(rawF, mid4));
1665 const VU16 packed8 = Or(Xor3(part8, part9, partA), partB);
1666 const VU16 packed9 = Or(Xor3(partC, partD, partE), partF);
1667
1668 StoreU(packed0, d, packed_out + 0 * N);
1669 StoreU(packed1, d, packed_out + 1 * N);
1670 StoreU(packed2, d, packed_out + 2 * N);
1671 StoreU(packed3, d, packed_out + 3 * N);
1672 StoreU(packed4, d, packed_out + 4 * N);
1673 StoreU(packed5, d, packed_out + 5 * N);
1674 StoreU(packed6, d, packed_out + 6 * N);
1675 StoreU(packed7, d, packed_out + 7 * N);
1676 StoreU(packed8, d, packed_out + 8 * N);
1677 StoreU(packed9, d, packed_out + 9 * N);
1678 }
1679
1680 template <class D>
1681 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1682 uint16_t* HWY_RESTRICT raw) const {
1683 using VU16 = Vec<decltype(d)>;
1684 const size_t N = Lanes(d);
1685
1686 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1687 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1688 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1689 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1690 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1691 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1692 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1693 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1694 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
1695 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
1696
1697 const VU16 mask = Set(d, 0x3FFu); // Lowest 10 bits
1698
1699 const VU16 raw0 = And(packed0, mask);
1700 StoreU(raw0, d, raw + 0 * N);
1701
1702 const VU16 raw1 = And(packed1, mask);
1703 StoreU(raw1, d, raw + 1 * N);
1704
1705 const VU16 raw2 = And(packed2, mask);
1706 StoreU(raw2, d, raw + 2 * N);
1707
1708 const VU16 raw3 = And(packed3, mask);
1709 StoreU(raw3, d, raw + 3 * N);
1710
1711 const VU16 raw4 = And(packed4, mask);
1712 StoreU(raw4, d, raw + 4 * N);
1713
1714 const VU16 raw5 = And(packed5, mask);
1715 StoreU(raw5, d, raw + 5 * N);
1716
1717 const VU16 raw6 = And(packed6, mask);
1718 StoreU(raw6, d, raw + 6 * N);
1719
1720 const VU16 raw7 = And(packed7, mask);
1721 StoreU(raw7, d, raw + 7 * N);
1722
1723 const VU16 mid4 = Set(d, 0x3C0u); // top 4 in lower 10
1724 const VU16 raw8 =
1725 OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4);
1726 const VU16 raw9 =
1727 OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4);
1728 const VU16 rawA =
1729 OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4);
1730 const VU16 rawB =
1731 OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4);
1732 const VU16 rawC =
1733 OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4);
1734 const VU16 rawD =
1735 OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4);
1736 const VU16 rawE =
1737 OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4);
1738 const VU16 rawF =
1739 OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4);
1740
1741 StoreU(raw8, d, raw + 8 * N);
1742 StoreU(raw9, d, raw + 9 * N);
1743 StoreU(rawA, d, raw + 0xA * N);
1744 StoreU(rawB, d, raw + 0xB * N);
1745 StoreU(rawC, d, raw + 0xC * N);
1746 StoreU(rawD, d, raw + 0xD * N);
1747 StoreU(rawE, d, raw + 0xE * N);
1748 StoreU(rawF, d, raw + 0xF * N);
1749 }
1750}; // Pack16<10>
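Per lane, packed8 gathers the top 4 bits of raw8..rawB as four nibbles (lowest first) and packed9 does the same for rawC..rawF. A scalar sketch; the helper name is illustrative and values are assumed to fit in 10 bits:

// Scalar model of packed8/packed9 for the 10-bit case (j is 0 or 1).
static inline uint16_t Model10Top4(const uint16_t raw[16], int j) {
  uint16_t p = 0;
  for (int i = 0; i < 4; ++i) {
    p = static_cast<uint16_t>(p | (((raw[8 + 4 * j + i] >> 6) & 0xFu) << (4 * i)));
  }
  return p;
}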
1751
1752template <>
1753struct Pack16<11> {
1754 template <class D>
1755 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1756 uint16_t* HWY_RESTRICT packed_out) const {
1757 using VU16 = Vec<decltype(d)>;
1758 const size_t N = Lanes(d);
1759 const VU16 raw0 = LoadU(d, raw + 0 * N);
1760 const VU16 raw1 = LoadU(d, raw + 1 * N);
1761 const VU16 raw2 = LoadU(d, raw + 2 * N);
1762 const VU16 raw3 = LoadU(d, raw + 3 * N);
1763 const VU16 raw4 = LoadU(d, raw + 4 * N);
1764 const VU16 raw5 = LoadU(d, raw + 5 * N);
1765 const VU16 raw6 = LoadU(d, raw + 6 * N);
1766 const VU16 raw7 = LoadU(d, raw + 7 * N);
1767 const VU16 raw8 = LoadU(d, raw + 8 * N);
1768 const VU16 raw9 = LoadU(d, raw + 9 * N);
1769 const VU16 rawA = LoadU(d, raw + 0xA * N);
1770 const VU16 rawB = LoadU(d, raw + 0xB * N);
1771 const VU16 rawC = LoadU(d, raw + 0xC * N);
1772 const VU16 rawD = LoadU(d, raw + 0xD * N);
1773 const VU16 rawE = LoadU(d, raw + 0xE * N);
1774 const VU16 rawF = LoadU(d, raw + 0xF * N);
1775
1776 // It is not obvious what the optimal partitioning looks like. To reduce the
1777 // number of constants, we want to minimize the number of distinct bit
1778 // lengths. 11+5 also requires 6-bit remnants with 4-bit leftovers.
1779 // 8+3 seems better: it is easier to scatter 3 bits into the MSBs.
1780 const VU16 lo8 = Set(d, 0xFFu);
1781
1782 // Lower 8 bits of all raw
1783 const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
1784 const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
1785 const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
1786 const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
1787 const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
1788 const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
1789 const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
1790 const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
1791
1792 StoreU(packed0, d, packed_out + 0 * N);
1793 StoreU(packed1, d, packed_out + 1 * N);
1794 StoreU(packed2, d, packed_out + 2 * N);
1795 StoreU(packed3, d, packed_out + 3 * N);
1796 StoreU(packed4, d, packed_out + 4 * N);
1797 StoreU(packed5, d, packed_out + 5 * N);
1798 StoreU(packed6, d, packed_out + 6 * N);
1799 StoreU(packed7, d, packed_out + 7 * N);
1800
1801 // Three vectors, each with five 3-bit remnants plus one bit of rawF in the MSB.
1802 const VU16 top0 = ShiftRight<8>(raw0);
1803 const VU16 top1 = ShiftRight<8>(raw1);
1804 const VU16 top2 = ShiftRight<8>(raw2);
1805 // Insert top raw bits into 3-bit groups within packed8..A. Moving the
1806 // mask along avoids masking each of raw0..E and enables OrAnd.
1807 VU16 next = Set(d, 0x38u); // 0x7 << 3
1808 VU16 packed8 = OrAnd(top0, ShiftRight<5>(raw3), next);
1809 VU16 packed9 = OrAnd(top1, ShiftRight<5>(raw4), next);
1810 VU16 packedA = OrAnd(top2, ShiftRight<5>(raw5), next);
1811 next = ShiftLeft<3>(next);
1812 packed8 = OrAnd(packed8, ShiftRight<2>(raw6), next);
1813 packed9 = OrAnd(packed9, ShiftRight<2>(raw7), next);
1814 packedA = OrAnd(packedA, ShiftRight<2>(raw8), next);
1815 next = ShiftLeft<3>(next);
1816 packed8 = OrAnd(packed8, Add(raw9, raw9), next);
1817 packed9 = OrAnd(packed9, Add(rawA, rawA), next);
1818 packedA = OrAnd(packedA, Add(rawB, rawB), next);
1819 next = ShiftLeft<3>(next);
1820 packed8 = OrAnd(packed8, ShiftLeft<4>(rawC), next);
1821 packed9 = OrAnd(packed9, ShiftLeft<4>(rawD), next);
1822 packedA = OrAnd(packedA, ShiftLeft<4>(rawE), next);
1823
1824 // Scatter upper 3 bits of rawF into the upper bits.
1825 next = ShiftLeft<3>(next); // = 0x8000u
1826 packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
1827 packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
1828 packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
1829
1830 StoreU(packed8, d, packed_out + 8 * N);
1831 StoreU(packed9, d, packed_out + 9 * N);
1832 StoreU(packedA, d, packed_out + 0xA * N);
1833 }
1834
1835 template <class D>
1836 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1837 uint16_t* HWY_RESTRICT raw) const {
1838 using VU16 = Vec<decltype(d)>;
1839 const size_t N = Lanes(d);
1840
1841 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1842 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1843 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1844 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1845 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1846 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1847 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1848 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1849 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
1850 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
1851 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
1852
1853 const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
1854
1855 const VU16 down0 = And(packed0, mask);
1856 const VU16 down1 = ShiftRight<8>(packed0);
1857 const VU16 down2 = And(packed1, mask);
1858 const VU16 down3 = ShiftRight<8>(packed1);
1859 const VU16 down4 = And(packed2, mask);
1860 const VU16 down5 = ShiftRight<8>(packed2);
1861 const VU16 down6 = And(packed3, mask);
1862 const VU16 down7 = ShiftRight<8>(packed3);
1863 const VU16 down8 = And(packed4, mask);
1864 const VU16 down9 = ShiftRight<8>(packed4);
1865 const VU16 downA = And(packed5, mask);
1866 const VU16 downB = ShiftRight<8>(packed5);
1867 const VU16 downC = And(packed6, mask);
1868 const VU16 downD = ShiftRight<8>(packed6);
1869 const VU16 downE = And(packed7, mask);
1870 const VU16 downF = ShiftRight<8>(packed7);
1871
1872 // Three bits from packed8..A, eight bits from down0..F.
1873 const VU16 hi3 = Set(d, 0x700u);
1874 const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3);
1875 const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3);
1876 const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3);
1877
1878 const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3);
1879 const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3);
1880 const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3);
1881
1882 const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3);
1883 const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3);
1884 const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3);
1885
1886 const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3);
1887 const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3);
1888 const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3);
1889
1890 const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3);
1891 const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3);
1892 const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3);
1893
1894 // Isolate each MSB of packed8..A, then shift it into the top 3-of-11 bits.
1895 const VU16 hi1 = Set(d, 0x8000u);
1896 const VU16 rawF = Or(downF, Xor3(ShiftRight<7>(And(packed8, hi1)),
1897 ShiftRight<6>(And(packed9, hi1)), ShiftRight<5>(And(packedA, hi1))));
1898
1899 StoreU(raw0, d, raw + 0 * N);
1900 StoreU(raw1, d, raw + 1 * N);
1901 StoreU(raw2, d, raw + 2 * N);
1902 StoreU(raw3, d, raw + 3 * N);
1903 StoreU(raw4, d, raw + 4 * N);
1904 StoreU(raw5, d, raw + 5 * N);
1905 StoreU(raw6, d, raw + 6 * N);
1906 StoreU(raw7, d, raw + 7 * N);
1907 StoreU(raw8, d, raw + 8 * N);
1908 StoreU(raw9, d, raw + 9 * N);
1909 StoreU(rawA, d, raw + 0xA * N);
1910 StoreU(rawB, d, raw + 0xB * N);
1911 StoreU(rawC, d, raw + 0xC * N);
1912 StoreU(rawD, d, raw + 0xD * N);
1913 StoreU(rawE, d, raw + 0xE * N);
1914 StoreU(rawF, d, raw + 0xF * N);
1915 }
1916}; // Pack16<11>
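The moving-mask packing above is easier to follow as a per-lane formula: packed8..A each collect five 3-bit remnants (from raw0..rawE, in steps of three) in ascending 3-bit fields, plus one of rawF's top three bits in the MSB. A scalar sketch; the helper name is illustrative and values are assumed to fit in 11 bits:

// Scalar model of packed8..packedA for the 11-bit case (j selects packed 8+j).
static inline uint16_t Model11Remnant(const uint16_t raw[16], int j) {  // j < 3
  uint16_t p = 0;
  for (int g = 0; g < 5; ++g) {  // remnants of raw[j], raw[j+3], ..., raw[j+12]
    p = static_cast<uint16_t>(p | ((raw[3 * g + j] >> 8) << (3 * g)));
  }
  return static_cast<uint16_t>(p | (((raw[15] >> (8 + j)) & 0x1u) << 15));
}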
1917
1918template <>
1919struct Pack16<12> {
1920 template <class D>
1921 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
1922 uint16_t* HWY_RESTRICT packed_out) const {
1923 using VU16 = Vec<decltype(d)>;
1924 const size_t N = Lanes(d);
1925 const VU16 raw0 = LoadU(d, raw + 0 * N);
1926 const VU16 raw1 = LoadU(d, raw + 1 * N);
1927 const VU16 raw2 = LoadU(d, raw + 2 * N);
1928 const VU16 raw3 = LoadU(d, raw + 3 * N);
1929 const VU16 raw4 = LoadU(d, raw + 4 * N);
1930 const VU16 raw5 = LoadU(d, raw + 5 * N);
1931 const VU16 raw6 = LoadU(d, raw + 6 * N);
1932 const VU16 raw7 = LoadU(d, raw + 7 * N);
1933 const VU16 raw8 = LoadU(d, raw + 8 * N);
1934 const VU16 raw9 = LoadU(d, raw + 9 * N);
1935 const VU16 rawA = LoadU(d, raw + 0xA * N);
1936 const VU16 rawB = LoadU(d, raw + 0xB * N);
1937 const VU16 rawC = LoadU(d, raw + 0xC * N);
1938 const VU16 rawD = LoadU(d, raw + 0xD * N);
1939 const VU16 rawE = LoadU(d, raw + 0xE * N);
1940 const VU16 rawF = LoadU(d, raw + 0xF * N);
1941
1942 // 8 vectors, each with 12+4 bits; top 8 bits are concatenated into
1943 // packed8 to packedB.
1944 const VU16 packed0 = Or(ShiftLeft<12>(raw8), raw0);
1945 const VU16 packed1 = Or(ShiftLeft<12>(raw9), raw1);
1946 const VU16 packed2 = Or(ShiftLeft<12>(rawA), raw2);
1947 const VU16 packed3 = Or(ShiftLeft<12>(rawB), raw3);
1948 const VU16 packed4 = Or(ShiftLeft<12>(rawC), raw4);
1949 const VU16 packed5 = Or(ShiftLeft<12>(rawD), raw5);
1950 const VU16 packed6 = Or(ShiftLeft<12>(rawE), raw6);
1951 const VU16 packed7 = Or(ShiftLeft<12>(rawF), raw7);
1952
1953 // Masking after shifting left enables OrAnd.
1954 const VU16 hi8 = Set(d, 0xFF00u);
1955 const VU16 packed8 = OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8);
1956 const VU16 packed9 = OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8);
1957 const VU16 packedA = OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8);
1958 const VU16 packedB = OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8);
1959 StoreU(packed0, d, packed_out + 0 * N);
1960 StoreU(packed1, d, packed_out + 1 * N);
1961 StoreU(packed2, d, packed_out + 2 * N);
1962 StoreU(packed3, d, packed_out + 3 * N);
1963 StoreU(packed4, d, packed_out + 4 * N);
1964 StoreU(packed5, d, packed_out + 5 * N);
1965 StoreU(packed6, d, packed_out + 6 * N);
1966 StoreU(packed7, d, packed_out + 7 * N);
1967 StoreU(packed8, d, packed_out + 8 * N);
1968 StoreU(packed9, d, packed_out + 9 * N);
1969 StoreU(packedA, d, packed_out + 0xA * N);
1970 StoreU(packedB, d, packed_out + 0xB * N);
1971 }
1972
1973 template <class D>
1974 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
1975 uint16_t* HWY_RESTRICT raw) const {
1976 using VU16 = Vec<decltype(d)>;
1977 const size_t N = Lanes(d);
1978
1979 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
1980 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
1981 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
1982 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
1983 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
1984 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
1985 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
1986 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
1987 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
1988 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
1989 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
1990 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
1991
1992 const VU16 mask = Set(d, 0xFFFu); // Lowest 12 bits
1993
1994 const VU16 raw0 = And(packed0, mask);
1995 StoreU(raw0, d, raw + 0 * N);
1996
1997 const VU16 raw1 = And(packed1, mask);
1998 StoreU(raw1, d, raw + 1 * N);
1999
2000 const VU16 raw2 = And(packed2, mask);
2001 StoreU(raw2, d, raw + 2 * N);
2002
2003 const VU16 raw3 = And(packed3, mask);
2004 StoreU(raw3, d, raw + 3 * N);
2005
2006 const VU16 raw4 = And(packed4, mask);
2007 StoreU(raw4, d, raw + 4 * N);
2008
2009 const VU16 raw5 = And(packed5, mask);
2010 StoreU(raw5, d, raw + 5 * N);
2011
2012 const VU16 raw6 = And(packed6, mask);
2013 StoreU(raw6, d, raw + 6 * N);
2014
2015 const VU16 raw7 = And(packed7, mask);
2016 StoreU(raw7, d, raw + 7 * N);
2017
2018 const VU16 mid8 = Set(d, 0xFF0u); // upper 8 in lower 12
2019 const VU16 raw8 =
2020 OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8);
2021 const VU16 raw9 =
2022 OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8);
2023 const VU16 rawA =
2024 OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8);
2025 const VU16 rawB =
2026 OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8);
2027 const VU16 rawC =
2028 OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8);
2029 const VU16 rawD =
2030 OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8);
2031 const VU16 rawE =
2032 OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8);
2033 const VU16 rawF =
2034 OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8);
2035 StoreU(raw8, d, raw + 8 * N);
2036 StoreU(raw9, d, raw + 9 * N);
2037 StoreU(rawA, d, raw + 0xA * N);
2038 StoreU(rawB, d, raw + 0xB * N);
2039 StoreU(rawC, d, raw + 0xC * N);
2040 StoreU(rawD, d, raw + 0xD * N);
2041 StoreU(rawE, d, raw + 0xE * N);
2042 StoreU(rawF, d, raw + 0xF * N);
2043 }
2044}; // Pack16<12>
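Twelve bits is the simplest non-power-of-two case: per lane, packed0..7 carry one full 12-bit value plus one nibble, and packed8..B each carry two 8-bit remnants. A scalar sketch; the helper names are illustrative and values are assumed to fit in 12 bits:

// Scalar model of the 12-bit layout (illustrative, values assumed < 1u << 12).
static inline uint16_t Model12Main(const uint16_t raw[16], int i) {  // i < 8
  return static_cast<uint16_t>(((raw[8 + i] & 0xFu) << 12) | raw[i]);
}
static inline uint16_t Model12Hi(const uint16_t raw[16], int i) {  // i < 4
  // Upper 8 bits (bits 4..11) of two consecutive values from raw8..rawF.
  return static_cast<uint16_t>(((raw[9 + 2 * i] >> 4) << 8) | (raw[8 + 2 * i] >> 4));
}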
2045
2046template <>
2047struct Pack16<13> {
2048 template <class D>
2049 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
2050 uint16_t* HWY_RESTRICT packed_out) const {
2051 using VU16 = Vec<decltype(d)>;
2052 const size_t N = Lanes(d);
2053 const VU16 raw0 = LoadU(d, raw + 0 * N);
2054 const VU16 raw1 = LoadU(d, raw + 1 * N);
2055 const VU16 raw2 = LoadU(d, raw + 2 * N);
2056 const VU16 raw3 = LoadU(d, raw + 3 * N);
2057 const VU16 raw4 = LoadU(d, raw + 4 * N);
2058 const VU16 raw5 = LoadU(d, raw + 5 * N);
2059 const VU16 raw6 = LoadU(d, raw + 6 * N);
2060 const VU16 raw7 = LoadU(d, raw + 7 * N);
2061 const VU16 raw8 = LoadU(d, raw + 8 * N);
2062 const VU16 raw9 = LoadU(d, raw + 9 * N);
2063 const VU16 rawA = LoadU(d, raw + 0xA * N);
2064 const VU16 rawB = LoadU(d, raw + 0xB * N);
2065 const VU16 rawC = LoadU(d, raw + 0xC * N);
2066 const VU16 rawD = LoadU(d, raw + 0xD * N);
2067 const VU16 rawE = LoadU(d, raw + 0xE * N);
2068 const VU16 rawF = LoadU(d, raw + 0xF * N);
2069
2070 // As with 11 bits, it is not obvious what the optimal partitioning looks
2071 // like. We similarly go with an 8+5 split.
2072 const VU16 lo8 = Set(d, 0xFFu);
2073
2074 // Lower 8 bits of all raw
2075 const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
2076 const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
2077 const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
2078 const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
2079 const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
2080 const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
2081 const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
2082 const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);
2083
2084 StoreU(packed0, d, packed_out + 0 * N);
2085 StoreU(packed1, d, packed_out + 1 * N);
2086 StoreU(packed2, d, packed_out + 2 * N);
2087 StoreU(packed3, d, packed_out + 3 * N);
2088 StoreU(packed4, d, packed_out + 4 * N);
2089 StoreU(packed5, d, packed_out + 5 * N);
2090 StoreU(packed6, d, packed_out + 6 * N);
2091 StoreU(packed7, d, packed_out + 7 * N);
2092
2093 // Five vectors, each with three 5-bit remnants plus one bit of rawF in the MSB.
2094 const VU16 top0 = ShiftRight<8>(raw0);
2095 const VU16 top1 = ShiftRight<8>(raw1);
2096 const VU16 top2 = ShiftRight<8>(raw2);
2097 const VU16 top3 = ShiftRight<8>(raw3);
2098 const VU16 top4 = ShiftRight<8>(raw4);
2099
2100 // Insert top raw bits into 5-bit groups within packed8..C. Moving the
2101 // mask along avoids masking each of raw0..E and enables OrAnd.
2102 VU16 next = Set(d, 0x3E0u); // 0x1F << 5
2103 VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next);
2104 VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next);
2105 VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next);
2106 VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next);
2107 VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next);
2108 next = ShiftLeft<5>(next);
2109 packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next);
2110 packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next);
2111 packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next);
2112 packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next);
2113 packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next);
2114
2115 // Scatter upper 5 bits of rawF into the upper bits.
2116 next = ShiftLeft<5>(next); // = 0x8000u
2117 packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
2118 packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
2119 packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
2120 packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next);
2121 packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next);
2122
2123 StoreU(packed8, d, packed_out + 8 * N);
2124 StoreU(packed9, d, packed_out + 9 * N);
2125 StoreU(packedA, d, packed_out + 0xA * N);
2126 StoreU(packedB, d, packed_out + 0xB * N);
2127 StoreU(packedC, d, packed_out + 0xC * N);
2128 }
2129
2130 template <class D>
2131 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
2132 uint16_t* HWY_RESTRICT raw) const {
2133 using VU16 = Vec<decltype(d)>;
2134 const size_t N = Lanes(d);
2135
2136 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
2137 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
2138 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
2139 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
2140 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
2141 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
2142 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
2143 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
2144 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
2145 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
2146 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
2147 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
2148 const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
2149
2150 const VU16 mask = Set(d, 0xFFu); // Lowest 8 bits
2151
2152 const VU16 down0 = And(packed0, mask);
2153 const VU16 down1 = ShiftRight<8>(packed0);
2154 const VU16 down2 = And(packed1, mask);
2155 const VU16 down3 = ShiftRight<8>(packed1);
2156 const VU16 down4 = And(packed2, mask);
2157 const VU16 down5 = ShiftRight<8>(packed2);
2158 const VU16 down6 = And(packed3, mask);
2159 const VU16 down7 = ShiftRight<8>(packed3);
2160 const VU16 down8 = And(packed4, mask);
2161 const VU16 down9 = ShiftRight<8>(packed4);
2162 const VU16 downA = And(packed5, mask);
2163 const VU16 downB = ShiftRight<8>(packed5);
2164 const VU16 downC = And(packed6, mask);
2165 const VU16 downD = ShiftRight<8>(packed6);
2166 const VU16 downE = And(packed7, mask);
2167 const VU16 downF = ShiftRight<8>(packed7);
2168
2169 // Upper five bits from packed8..C, eight bits from down0..F.
2170 const VU16 hi5 = Set(d, 0x1F00u);
2171 const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5);
2172 const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5);
2173 const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5);
2174 const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5);
2175 const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5);
2176
2177 const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5);
2178 const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5);
2179 const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5);
2180 const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packedB), hi5);
2181 const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedC), hi5);
2182
2183 const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5);
2184 const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5);
2185 const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5);
2186 const VU16 rawD = OrAnd(downD, ShiftRight<2>(packedB), hi5);
2187 const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedC), hi5);
2188
2189 // Isolate each MSB of packed8..C, then shift it into the top 5-of-13 bits.
2190 const VU16 hi1 = Set(d, 0x8000u);
2191 const VU16 p0 = Xor3(ShiftRight<7>(And(packed8, hi1)),
2192 ShiftRight<6>(And(packed9, hi1)), ShiftRight<5>(And(packedA, hi1)));
2193 const VU16 p1 = Xor3(ShiftRight<4>(And(packedB, hi1)),
2194 ShiftRight<3>(And(packedC, hi1)), downF);
2195 const VU16 rawF = Or(p0, p1);
2196
2197 StoreU(raw0, d, raw + 0 * N);
2198 StoreU(raw1, d, raw + 1 * N);
2199 StoreU(raw2, d, raw + 2 * N);
2200 StoreU(raw3, d, raw + 3 * N);
2201 StoreU(raw4, d, raw + 4 * N);
2202 StoreU(raw5, d, raw + 5 * N);
2203 StoreU(raw6, d, raw + 6 * N);
2204 StoreU(raw7, d, raw + 7 * N);
2205 StoreU(raw8, d, raw + 8 * N);
2206 StoreU(raw9, d, raw + 9 * N);
2207 StoreU(rawA, d, raw + 0xA * N);
2208 StoreU(rawB, d, raw + 0xB * N);
2209 StoreU(rawC, d, raw + 0xC * N);
2210 StoreU(rawD, d, raw + 0xD * N);
2211 StoreU(rawE, d, raw + 0xE * N);
2212 StoreU(rawF, d, raw + 0xF * N);
2213 }
2214}; // Pack16<13>
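As with 11 bits, the per-lane layout of packed8..C is easier to see as a formula: three 5-bit remnants (from raw0..rawE, in steps of five) in ascending 5-bit fields, plus one of rawF's top five bits in the MSB. A scalar sketch; the helper name is illustrative and values are assumed to fit in 13 bits:

// Scalar model of packed8..packedC for the 13-bit case (j selects packed 8+j).
static inline uint16_t Model13Remnant(const uint16_t raw[16], int j) {  // j < 5
  return static_cast<uint16_t>((raw[j] >> 8) | ((raw[5 + j] >> 8) << 5) |
                               ((raw[10 + j] >> 8) << 10) |
                               (((raw[15] >> (8 + j)) & 0x1u) << 15));
}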
2215
2216template <>
2217struct Pack16<14> {
2218 template <class D>
2219 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
2220 uint16_t* HWY_RESTRICT packed_out) const {
2221 using VU16 = Vec<decltype(d)>;
2222 const size_t N = Lanes(d);
2223 const VU16 raw0 = LoadU(d, raw + 0 * N);
2224 const VU16 raw1 = LoadU(d, raw + 1 * N);
2225 const VU16 raw2 = LoadU(d, raw + 2 * N);
2226 const VU16 raw3 = LoadU(d, raw + 3 * N);
2227 const VU16 raw4 = LoadU(d, raw + 4 * N);
2228 const VU16 raw5 = LoadU(d, raw + 5 * N);
2229 const VU16 raw6 = LoadU(d, raw + 6 * N);
2230 const VU16 raw7 = LoadU(d, raw + 7 * N);
2231 const VU16 raw8 = LoadU(d, raw + 8 * N);
2232 const VU16 raw9 = LoadU(d, raw + 9 * N);
2233 const VU16 rawA = LoadU(d, raw + 0xA * N);
2234 const VU16 rawB = LoadU(d, raw + 0xB * N);
2235 const VU16 rawC = LoadU(d, raw + 0xC * N);
2236 const VU16 rawD = LoadU(d, raw + 0xD * N);
2237 const VU16 rawE = LoadU(d, raw + 0xE * N);
2238 const VU16 rawF = LoadU(d, raw + 0xF * N);
2239
2240 // 14 vectors, each with 14+2 bits; two raw vectors are scattered
2241 // across the upper 2 bits.
2242 const VU16 hi2 = Set(d, 0xC000u);
2243 const VU16 packed0 = Or(raw0, ShiftLeft<14>(rawE));
2244 const VU16 packed1 = OrAnd(raw1, ShiftLeft<12>(rawE), hi2);
2245 const VU16 packed2 = OrAnd(raw2, ShiftLeft<10>(rawE), hi2);
2246 const VU16 packed3 = OrAnd(raw3, ShiftLeft<8>(rawE), hi2);
2247 const VU16 packed4 = OrAnd(raw4, ShiftLeft<6>(rawE), hi2);
2248 const VU16 packed5 = OrAnd(raw5, ShiftLeft<4>(rawE), hi2);
2249 const VU16 packed6 = OrAnd(raw6, ShiftLeft<2>(rawE), hi2);
2250 const VU16 packed7 = Or(raw7, ShiftLeft<14>(rawF));
2251 const VU16 packed8 = OrAnd(raw8, ShiftLeft<12>(rawF), hi2);
2252 const VU16 packed9 = OrAnd(raw9, ShiftLeft<10>(rawF), hi2);
2253 const VU16 packedA = OrAnd(rawA, ShiftLeft<8>(rawF), hi2);
2254 const VU16 packedB = OrAnd(rawB, ShiftLeft<6>(rawF), hi2);
2255 const VU16 packedC = OrAnd(rawC, ShiftLeft<4>(rawF), hi2);
2256 const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi2);
2257
2258 StoreU(packed0, d, packed_out + 0 * N);
2259 StoreU(packed1, d, packed_out + 1 * N);
2260 StoreU(packed2, d, packed_out + 2 * N);
2261 StoreU(packed3, d, packed_out + 3 * N);
2262 StoreU(packed4, d, packed_out + 4 * N);
2263 StoreU(packed5, d, packed_out + 5 * N);
2264 StoreU(packed6, d, packed_out + 6 * N);
2265 StoreU(packed7, d, packed_out + 7 * N);
2266 StoreU(packed8, d, packed_out + 8 * N);
2267 StoreU(packed9, d, packed_out + 9 * N);
2268 StoreU(packedA, d, packed_out + 0xA * N);
2269 StoreU(packedB, d, packed_out + 0xB * N);
2270 StoreU(packedC, d, packed_out + 0xC * N);
2271 StoreU(packedD, d, packed_out + 0xD * N);
2272 }
2273
2274 template <class D>
2275 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
2276 uint16_t* HWY_RESTRICT raw) const {
2277 using VU16 = Vec<decltype(d)>;
2278 const size_t N = Lanes(d);
2279
2280 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
2281 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
2282 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
2283 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
2284 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
2285 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
2286 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
2287 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
2288 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
2289 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
2290 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
2291 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
2292 const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
2293 const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
2294
2295 const VU16 mask = Set(d, 0x3FFFu); // Lowest 14 bits
2296
2297 const VU16 raw0 = And(packed0, mask);
2298 StoreU(raw0, d, raw + 0 * N);
2299
2300 const VU16 raw1 = And(packed1, mask);
2301 StoreU(raw1, d, raw + 1 * N);
2302
2303 const VU16 raw2 = And(packed2, mask);
2304 StoreU(raw2, d, raw + 2 * N);
2305
2306 const VU16 raw3 = And(packed3, mask);
2307 StoreU(raw3, d, raw + 3 * N);
2308
2309 const VU16 raw4 = And(packed4, mask);
2310 StoreU(raw4, d, raw + 4 * N);
2311
2312 const VU16 raw5 = And(packed5, mask);
2313 StoreU(raw5, d, raw + 5 * N);
2314
2315 const VU16 raw6 = And(packed6, mask);
2316 StoreU(raw6, d, raw + 6 * N);
2317
2318 const VU16 raw7 = And(packed7, mask);
2319 StoreU(raw7, d, raw + 7 * N);
2320
2321 const VU16 raw8 = And(packed8, mask);
2322 StoreU(raw8, d, raw + 8 * N);
2323
2324 const VU16 raw9 = And(packed9, mask);
2325 StoreU(raw9, d, raw + 9 * N);
2326
2327 const VU16 rawA = And(packedA, mask);
2328 StoreU(rawA, d, raw + 0xA * N);
2329
2330 const VU16 rawB = And(packedB, mask);
2331 StoreU(rawB, d, raw + 0xB * N);
2332
2333 const VU16 rawC = And(packedC, mask);
2334 StoreU(rawC, d, raw + 0xC * N);
2335
2336 const VU16 rawD = And(packedD, mask);
2337 StoreU(rawD, d, raw + 0xD * N);
2338
2339 // rawE is the concatenation of the top two bits in packed0..6.
2340 const VU16 E0 = Xor3(ShiftRight<14>(packed0), //
2341 ShiftRight<12>(AndNot(mask, packed1)),
2342 ShiftRight<10>(AndNot(mask, packed2)));
2343 const VU16 E1 = Xor3(ShiftRight<8>(AndNot(mask, packed3)),
2344 ShiftRight<6>(AndNot(mask, packed4)),
2345 ShiftRight<4>(AndNot(mask, packed5)));
2346 const VU16 rawE = Xor3(ShiftRight<2>(AndNot(mask, packed6)), E0, E1);
2347 const VU16 F0 = Xor3(ShiftRight<14>(AndNot(mask, packed7)),
2348 ShiftRight<12>(AndNot(mask, packed8)),
2349 ShiftRight<10>(AndNot(mask, packed9)));
2350 const VU16 F1 = Xor3(ShiftRight<8>(AndNot(mask, packedA)),
2351 ShiftRight<6>(AndNot(mask, packedB)),
2352 ShiftRight<4>(AndNot(mask, packedC)));
2353 const VU16 rawF = Xor3(ShiftRight<2>(AndNot(mask, packedD)), F0, F1);
2354 StoreU(rawE, d, raw + 0xE * N);
2355 StoreU(rawF, d, raw + 0xF * N);
2356 }
2357}; // Pack16<14>
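Per lane, the 14-bit case stores each of raw0..rawD in place and spreads rawE and rawF two bits at a time across the upper bits of seven packed vectors each. A scalar sketch; the helper name is illustrative and values are assumed to fit in 14 bits:

// Scalar model of packed0..packedD for the 14-bit case.
static inline uint16_t Model14Lane(const uint16_t raw[16], int i) {  // i < 14
  const uint16_t extra = (i < 7) ? raw[14] : raw[15];  // rawE or rawF
  const int k = 2 * (i % 7);                           // which pair of bits
  return static_cast<uint16_t>(raw[i] | (((extra >> k) & 0x3u) << 14));
}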
2358
2359template <>
2360struct Pack16<15> {
2361 template <class D>
2362 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
2363 uint16_t* HWY_RESTRICT packed_out) const {
2364 using VU16 = Vec<decltype(d)>;
2365 const size_t N = Lanes(d);
2366 const VU16 raw0 = LoadU(d, raw + 0 * N);
2367 const VU16 raw1 = LoadU(d, raw + 1 * N);
2368 const VU16 raw2 = LoadU(d, raw + 2 * N);
2369 const VU16 raw3 = LoadU(d, raw + 3 * N);
2370 const VU16 raw4 = LoadU(d, raw + 4 * N);
2371 const VU16 raw5 = LoadU(d, raw + 5 * N);
2372 const VU16 raw6 = LoadU(d, raw + 6 * N);
2373 const VU16 raw7 = LoadU(d, raw + 7 * N);
2374 const VU16 raw8 = LoadU(d, raw + 8 * N);
2375 const VU16 raw9 = LoadU(d, raw + 9 * N);
2376 const VU16 rawA = LoadU(d, raw + 0xA * N);
2377 const VU16 rawB = LoadU(d, raw + 0xB * N);
2378 const VU16 rawC = LoadU(d, raw + 0xC * N);
2379 const VU16 rawD = LoadU(d, raw + 0xD * N);
2380 const VU16 rawE = LoadU(d, raw + 0xE * N);
2381 const VU16 rawF = LoadU(d, raw + 0xF * N);
2382
2383 // 15 vectors, each with 15+1 bits; one raw vector (rawF) is scattered
2384 // across the upper bit.
2385 const VU16 hi1 = Set(d, 0x8000u);
2386 const VU16 packed0 = Or(raw0, ShiftLeft<15>(rawF));
2387 const VU16 packed1 = OrAnd(raw1, ShiftLeft<14>(rawF), hi1);
2388 const VU16 packed2 = OrAnd(raw2, ShiftLeft<13>(rawF), hi1);
2389 const VU16 packed3 = OrAnd(raw3, ShiftLeft<12>(rawF), hi1);
2390 const VU16 packed4 = OrAnd(raw4, ShiftLeft<11>(rawF), hi1);
2391 const VU16 packed5 = OrAnd(raw5, ShiftLeft<10>(rawF), hi1);
2392 const VU16 packed6 = OrAnd(raw6, ShiftLeft<9>(rawF), hi1);
2393 const VU16 packed7 = OrAnd(raw7, ShiftLeft<8>(rawF), hi1);
2394 const VU16 packed8 = OrAnd(raw8, ShiftLeft<7>(rawF), hi1);
2395 const VU16 packed9 = OrAnd(raw9, ShiftLeft<6>(rawF), hi1);
2396 const VU16 packedA = OrAnd(rawA, ShiftLeft<5>(rawF), hi1);
2397 const VU16 packedB = OrAnd(rawB, ShiftLeft<4>(rawF), hi1);
2398 const VU16 packedC = OrAnd(rawC, ShiftLeft<3>(rawF), hi1);
2399 const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi1);
2400 const VU16 packedE = OrAnd(rawE, ShiftLeft<1>(rawF), hi1);
2401
2402 StoreU(packed0, d, packed_out + 0 * N);
2403 StoreU(packed1, d, packed_out + 1 * N);
2404 StoreU(packed2, d, packed_out + 2 * N);
2405 StoreU(packed3, d, packed_out + 3 * N);
2406 StoreU(packed4, d, packed_out + 4 * N);
2407 StoreU(packed5, d, packed_out + 5 * N);
2408 StoreU(packed6, d, packed_out + 6 * N);
2409 StoreU(packed7, d, packed_out + 7 * N);
2410 StoreU(packed8, d, packed_out + 8 * N);
2411 StoreU(packed9, d, packed_out + 9 * N);
2412 StoreU(packedA, d, packed_out + 0xA * N);
2413 StoreU(packedB, d, packed_out + 0xB * N);
2414 StoreU(packedC, d, packed_out + 0xC * N);
2415 StoreU(packedD, d, packed_out + 0xD * N);
2416 StoreU(packedE, d, packed_out + 0xE * N);
2417 }
2418
2419 template <class D>
2420 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
2421 uint16_t* HWY_RESTRICT raw) const {
2422 using VU16 = Vec<decltype(d)>;
2423 const size_t N = Lanes(d);
2424
2425 const VU16 packed0 = BitCast(d, LoadU(d, packed_in + 0 * N));
2426 const VU16 packed1 = BitCast(d, LoadU(d, packed_in + 1 * N));
2427 const VU16 packed2 = BitCast(d, LoadU(d, packed_in + 2 * N));
2428 const VU16 packed3 = BitCast(d, LoadU(d, packed_in + 3 * N));
2429 const VU16 packed4 = BitCast(d, LoadU(d, packed_in + 4 * N));
2430 const VU16 packed5 = BitCast(d, LoadU(d, packed_in + 5 * N));
2431 const VU16 packed6 = BitCast(d, LoadU(d, packed_in + 6 * N));
2432 const VU16 packed7 = BitCast(d, LoadU(d, packed_in + 7 * N));
2433 const VU16 packed8 = BitCast(d, LoadU(d, packed_in + 8 * N));
2434 const VU16 packed9 = BitCast(d, LoadU(d, packed_in + 9 * N));
2435 const VU16 packedA = BitCast(d, LoadU(d, packed_in + 0xA * N));
2436 const VU16 packedB = BitCast(d, LoadU(d, packed_in + 0xB * N));
2437 const VU16 packedC = BitCast(d, LoadU(d, packed_in + 0xC * N));
2438 const VU16 packedD = BitCast(d, LoadU(d, packed_in + 0xD * N));
2439 const VU16 packedE = BitCast(d, LoadU(d, packed_in + 0xE * N));
2440
2441 const VU16 mask = Set(d, 0x7FFFu); // Lowest 15 bits
2442
2443 const VU16 raw0 = And(packed0, mask);
2444 StoreU(raw0, d, raw + 0 * N);
2445
2446 const VU16 raw1 = And(packed1, mask);
2447 StoreU(raw1, d, raw + 1 * N);
2448
2449 const VU16 raw2 = And(packed2, mask);
2450 StoreU(raw2, d, raw + 2 * N);
2451
2452 const VU16 raw3 = And(packed3, mask);
2453 StoreU(raw3, d, raw + 3 * N);
2454
2455 const VU16 raw4 = And(packed4, mask);
2456 StoreU(raw4, d, raw + 4 * N);
2457
2458 const VU16 raw5 = And(packed5, mask);
2459 StoreU(raw5, d, raw + 5 * N);
2460
2461 const VU16 raw6 = And(packed6, mask);
2462 StoreU(raw6, d, raw + 6 * N);
2463
2464 const VU16 raw7 = And(packed7, mask);
2465 StoreU(raw7, d, raw + 7 * N);
2466
2467 const VU16 raw8 = And(packed8, mask);
2468 StoreU(raw8, d, raw + 8 * N);
2469
2470 const VU16 raw9 = And(packed9, mask);
2471 StoreU(raw9, d, raw + 9 * N);
2472
2473 const VU16 rawA = And(packedA, mask);
2474 StoreU(rawA, d, raw + 0xA * N);
2475
2476 const VU16 rawB = And(packedB, mask);
2477 StoreU(rawB, d, raw + 0xB * N);
2478
2479 const VU16 rawC = And(packedC, mask);
2480 StoreU(rawC, d, raw + 0xC * N);
2481
2482 const VU16 rawD = And(packedD, mask);
2483 StoreU(rawD, d, raw + 0xD * N);
2484
2485 const VU16 rawE = And(packedE, mask);
2486 StoreU(rawE, d, raw + 0xE * N);
2487
2488 // rawF is the concatenation of the top bit in packed0..E.
2489 const VU16 F0 = Xor3(ShiftRight<15>(packed0), //
2490 ShiftRight<14>(AndNot(mask, packed1)),
2491 ShiftRight<13>(AndNot(mask, packed2)));
2492 const VU16 F1 = Xor3(ShiftRight<12>(AndNot(mask, packed3)),
2493 ShiftRight<11>(AndNot(mask, packed4)),
2494 ShiftRight<10>(AndNot(mask, packed5)));
2495 const VU16 F2 = Xor3(ShiftRight<9>(AndNot(mask, packed6)),
2496 ShiftRight<8>(AndNot(mask, packed7)),
2497 ShiftRight<7>(AndNot(mask, packed8)));
2498 const VU16 F3 = Xor3(ShiftRight<6>(AndNot(mask, packed9)),
2499 ShiftRight<5>(AndNot(mask, packedA)),
2500 ShiftRight<4>(AndNot(mask, packedB)));
2501 const VU16 F4 = Xor3(ShiftRight<3>(AndNot(mask, packedC)),
2502 ShiftRight<2>(AndNot(mask, packedD)),
2503 ShiftRight<1>(AndNot(mask, packedE)));
2504 const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4));
2505 StoreU(rawF, d, raw + 0xF * N);
2506 }
2507}; // Pack16<15>
2508
2509template <>
2510struct Pack16<16> {
2511 template <class D>
2512 HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
2513 uint16_t* HWY_RESTRICT packed_out) const {
2514 using VU16 = Vec<decltype(d)>;
2515 const size_t N = Lanes(d);
2516 const VU16 raw0 = LoadU(d, raw + 0 * N);
2517 const VU16 raw1 = LoadU(d, raw + 1 * N);
2518 const VU16 raw2 = LoadU(d, raw + 2 * N);
2519 const VU16 raw3 = LoadU(d, raw + 3 * N);
2520 const VU16 raw4 = LoadU(d, raw + 4 * N);
2521 const VU16 raw5 = LoadU(d, raw + 5 * N);
2522 const VU16 raw6 = LoadU(d, raw + 6 * N);
2523 const VU16 raw7 = LoadU(d, raw + 7 * N);
2524 const VU16 raw8 = LoadU(d, raw + 8 * N);
2525 const VU16 raw9 = LoadU(d, raw + 9 * N);
2526 const VU16 rawA = LoadU(d, raw + 0xA * N);
2527 const VU16 rawB = LoadU(d, raw + 0xB * N);
2528 const VU16 rawC = LoadU(d, raw + 0xC * N);
2529 const VU16 rawD = LoadU(d, raw + 0xD * N);
2530 const VU16 rawE = LoadU(d, raw + 0xE * N);
2531 const VU16 rawF = LoadU(d, raw + 0xF * N);
2532
2533 StoreU(raw0, d, packed_out + 0 * N);
2534 StoreU(raw1, d, packed_out + 1 * N);
2535 StoreU(raw2, d, packed_out + 2 * N);
2536 StoreU(raw3, d, packed_out + 3 * N);
2537 StoreU(raw4, d, packed_out + 4 * N);
2538 StoreU(raw5, d, packed_out + 5 * N);
2539 StoreU(raw6, d, packed_out + 6 * N);
2540 StoreU(raw7, d, packed_out + 7 * N);
2541 StoreU(raw8, d, packed_out + 8 * N);
2542 StoreU(raw9, d, packed_out + 9 * N);
2543 StoreU(rawA, d, packed_out + 0xA * N);
2544 StoreU(rawB, d, packed_out + 0xB * N);
2545 StoreU(rawC, d, packed_out + 0xC * N);
2546 StoreU(rawD, d, packed_out + 0xD * N);
2547 StoreU(rawE, d, packed_out + 0xE * N);
2548 StoreU(rawF, d, packed_out + 0xF * N);
2549 }
2550
2551 template <class D>
2552 HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
2553 uint16_t* HWY_RESTRICT raw) const {
2554 using VU16 = Vec<decltype(d)>;
2555 const size_t N = Lanes(d);
2556
2557 const VU16 raw0 = BitCast(d, LoadU(d, packed_in + 0 * N));
2558 const VU16 raw1 = BitCast(d, LoadU(d, packed_in + 1 * N));
2559 const VU16 raw2 = BitCast(d, LoadU(d, packed_in + 2 * N));
2560 const VU16 raw3 = BitCast(d, LoadU(d, packed_in + 3 * N));
2561 const VU16 raw4 = BitCast(d, LoadU(d, packed_in + 4 * N));
2562 const VU16 raw5 = BitCast(d, LoadU(d, packed_in + 5 * N));
2563 const VU16 raw6 = BitCast(d, LoadU(d, packed_in + 6 * N));
2564 const VU16 raw7 = BitCast(d, LoadU(d, packed_in + 7 * N));
2565 const VU16 raw8 = BitCast(d, LoadU(d, packed_in + 8 * N));
2566 const VU16 raw9 = BitCast(d, LoadU(d, packed_in + 9 * N));
2567 const VU16 rawA = BitCast(d, LoadU(d, packed_in + 0xA * N));
2568 const VU16 rawB = BitCast(d, LoadU(d, packed_in + 0xB * N));
2569 const VU16 rawC = BitCast(d, LoadU(d, packed_in + 0xC * N));
2570 const VU16 rawD = BitCast(d, LoadU(d, packed_in + 0xD * N));
2571 const VU16 rawE = BitCast(d, LoadU(d, packed_in + 0xE * N));
2572 const VU16 rawF = BitCast(d, LoadU(d, packed_in + 0xF * N));
2573
2574 StoreU(raw0, d, raw + 0 * N);
2575 StoreU(raw1, d, raw + 1 * N);
2576 StoreU(raw2, d, raw + 2 * N);
2577 StoreU(raw3, d, raw + 3 * N);
2578 StoreU(raw4, d, raw + 4 * N);
2579 StoreU(raw5, d, raw + 5 * N);
2580 StoreU(raw6, d, raw + 6 * N);
2581 StoreU(raw7, d, raw + 7 * N);
2582 StoreU(raw8, d, raw + 8 * N);
2583 StoreU(raw9, d, raw + 9 * N);
2584 StoreU(rawA, d, raw + 0xA * N);
2585 StoreU(rawB, d, raw + 0xB * N);
2586 StoreU(rawC, d, raw + 0xC * N);
2587 StoreU(rawD, d, raw + 0xD * N);
2588 StoreU(rawE, d, raw + 0xE * N);
2589 StoreU(rawF, d, raw + 0xF * N);
2590 }
2591}; // Pack16<16>
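Putting it together, a caller picks the specialization for its bit width and streams blocks of 16 * Lanes(d) values. A minimal round-trip sketch; the function name is illustrative (not part of this header), kBits must be in [1, 16], and the inputs are assumed to fit in kBits bits:

// Round-trips one block: Pack writes kBits * Lanes(d) lanes to packed, and
// Unpack restores 16 * Lanes(d) lanes into out. Illustrative only.
template <size_t kBits, class D>
HWY_INLINE void RoundTripBlock16(D d, const uint16_t* HWY_RESTRICT raw,
                                 uint16_t* HWY_RESTRICT packed,
                                 uint16_t* HWY_RESTRICT out) {
  Pack16<kBits>().Pack(d, raw, packed);
  Pack16<kBits>().Unpack(d, packed, out);
}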
2592
2593// NOLINTNEXTLINE(google-readability-namespace-comments)
2594} // namespace HWY_NAMESPACE
2595} // namespace hwy
2596HWY_AFTER_NAMESPACE();
2597
2598#endif // HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_