// Per-target include guard.
#if defined(HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_) == defined(HWY_TARGET_TOGGLE)
#ifdef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#undef HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#else
#define HIGHWAY_HWY_CONTRIB_BIT_PACK_INL_H_
#endif
#include "hwy/highway.h"

HWY_BEFORE_NAMESPACE();
namespace hwy {
namespace HWY_NAMESPACE {

// Entry points, specialized below for each number of bits.
template <size_t kBits>  // <= 8
struct Pack8 {};

template <size_t kBits>  // <= 16
struct Pack16 {};
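// Usage sketch (illustrative, using the Pack/Unpack member signatures as
// reconstructed below; buffer sizes follow the 8 * Lanes(d8) loads/stores
// used throughout):
//
//   const ScalableTag<uint8_t> d8;
//   Pack8<2>().Pack(d8, raw_bytes, packed_bytes);    // 8 vectors -> 2
//   Pack8<2>().Unpack(d8, packed_bytes, raw_bytes);  // 2 vectors -> 8
//
// raw_bytes holds 8 * Lanes(d8) values, packed_bytes 2 * Lanes(d8) bytes.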
template <>
struct Pack8<1> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    const VU16 packed =
        Xor3(Or(ShiftLeft<7>(raw7), ShiftLeft<6>(raw6)),
             Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3)),
             Xor3(ShiftLeft<2>(raw2), ShiftLeft<1>(raw1), raw0));
    StoreU(BitCast(d8, packed), d8, packed_out);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0101u);  // lowest bit in each byte
    const VU16 packed = BitCast(d16, LoadU(d8, packed_in));

    const VU16 raw0 = And(packed, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    const VU16 raw1 = And(ShiftRight<1>(packed), mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    const VU16 raw2 = And(ShiftRight<2>(packed), mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    const VU16 raw3 = And(ShiftRight<3>(packed), mask);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    const VU16 raw4 = And(ShiftRight<4>(packed), mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    const VU16 raw5 = And(ShiftRight<5>(packed), mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    const VU16 raw6 = And(ShiftRight<6>(packed), mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
    const VU16 raw7 = And(ShiftRight<7>(packed), mask);
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};
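// Note: all packed bit fields are disjoint, so Xor3 and Or are
// interchangeable here; Xor3 maps to a single ternary-logic instruction on
// AVX-512, which is why the combining trees prefer it.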
template <>
struct Pack8<2> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    const VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<4>(raw4),
                              Or(ShiftLeft<2>(raw2), raw0));
    const VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<4>(raw5),
                              Or(ShiftLeft<2>(raw3), raw1));
    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0303u);  // lowest 2 bits per byte
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));

    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
    const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};
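// Pack8<3> below relies on OrAnd(o, a1, a2) == Or(o, And(a1, a2)): the
// masked And isolates one field and the Or merges it into the accumulator
// without disturbing bits already placed.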
template <>
struct Pack8<3> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    // Two 3-bit fields per byte in packed0..2; packed3 is temporary.
    VU16 packed0 = Or(ShiftLeft<3>(raw4), raw0);
    VU16 packed1 = Or(ShiftLeft<3>(raw5), raw1);
    VU16 packed2 = Or(ShiftLeft<3>(raw6), raw2);
    const VU16 packed3 = Or(ShiftLeft<3>(raw7), raw3);

    // Distribute packed3 = raw7:raw3 into the spare top two bits of each.
    const VU16 hi2 = Set(d16, 0xC0C0u);
    packed0 = OrAnd(packed0, ShiftLeft<2>(packed3), hi2);
    packed1 = OrAnd(packed1, ShiftLeft<4>(packed3), hi2);
    packed2 = OrAnd(packed2, ShiftLeft<6>(packed3), hi2);
    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0707u);  // lowest 3 bits per byte
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));

    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    const VU16 raw2 = And(packed2, mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    const VU16 raw4 = And(ShiftRight<3>(packed0), mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    const VU16 raw5 = And(ShiftRight<3>(packed1), mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    const VU16 raw6 = And(ShiftRight<3>(packed2), mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

    // Reassemble raw7:raw3 from the top two bits of packed0..2.
    const VU16 hi2 = Set(d16, 0xC0C0u);
    const VU16 raw73 = Xor3(ShiftRight<6>(And(packed2, hi2)),
                            ShiftRight<4>(And(packed1, hi2)),
                            ShiftRight<2>(And(packed0, hi2)));
    const VU16 raw3 = And(mask, raw73);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    const VU16 raw7 = And(mask, ShiftRight<3>(raw73));
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};
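// 3-bit layout recap: packed0..2 each hold two 3-bit fields (raw0..2 in bits
// 2:0, raw4..6 in bits 5:3). The six combined bits of packed3 = raw7:raw3
// are parceled out two at a time into the spare top bits 7:6 of packed0..2.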
template <>
struct Pack8<4> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    const VU16 packed0 = Or(ShiftLeft<4>(raw2), raw0);
    const VU16 packed1 = Or(ShiftLeft<4>(raw3), raw1);
    const VU16 packed2 = Or(ShiftLeft<4>(raw6), raw4);
    const VU16 packed3 = Or(ShiftLeft<4>(raw7), raw5);
    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
    StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x0F0Fu);  // lowest 4 bits per byte
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));

    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    const VU16 raw4 = And(packed2, mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    const VU16 raw5 = And(packed3, mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    const VU16 raw6 = And(ShiftRight<4>(packed2), mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
    const VU16 raw7 = And(ShiftRight<4>(packed3), mask);
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};
template <>
struct Pack8<5> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    // The upper three bits of raw4..7 ride in the top bits of packed0..3.
    const VU16 hi3 = Set(d16, 0xE0E0u);
    const VU16 packed0 = OrAnd(raw0, ShiftLeft<3>(raw4), hi3);
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<3>(raw5), hi3);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw6), hi3);
    const VU16 packed3 = OrAnd(raw3, ShiftLeft<3>(raw7), hi3);

    // The lower two bits of raw4..7 fill the four 2-bit slots of packed4.
    const VU16 lo2 = Set(d16, 0x0303u);
    const VU16 packed4 = Or(And(raw4, lo2),
                            Xor3(ShiftLeft<2>(And(raw5, lo2)),
                                 ShiftLeft<4>(And(raw6, lo2)),
                                 ShiftLeft<6>(And(raw7, lo2))));
    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
    StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
    StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));

    const VU16 mask = Set(d16, 0x1F1Fu);  // lowest 5 bits per byte

    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    const VU16 raw2 = And(packed2, mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    const VU16 raw3 = And(packed3, mask);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);

    // AndNot clears the low 5 bits, leaving the upper three of raw4..7.
    const VU16 top4 = ShiftRight<3>(AndNot(mask, packed0));
    const VU16 top5 = ShiftRight<3>(AndNot(mask, packed1));
    const VU16 top6 = ShiftRight<3>(AndNot(mask, packed2));
    const VU16 top7 = ShiftRight<3>(AndNot(mask, packed3));

    const VU16 lo2 = Set(d16, 0x0303u);
    const VU16 raw4 = OrAnd(top4, lo2, packed4);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    const VU16 raw5 = OrAnd(top5, lo2, ShiftRight<2>(packed4));
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    const VU16 raw6 = OrAnd(top6, lo2, ShiftRight<4>(packed4));
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);
    const VU16 raw7 = OrAnd(top7, lo2, ShiftRight<6>(packed4));
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};
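// AndNot(m, v) computes (~m) & v: with m = 0x1F1F it isolates each byte's
// top three bits, which hold the upper bits of raw4..raw7.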
template <>
struct Pack8<6> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    // raw3 and raw7 are split into the top two bits of the other six.
    const VU16 hi2 = Set(d16, 0xC0C0u);
    const VU16 packed0 = OrAnd(raw0, ShiftLeft<2>(raw3), hi2);
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<4>(raw3), hi2);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<6>(raw3), hi2);
    const VU16 packed3 = OrAnd(raw4, ShiftLeft<2>(raw7), hi2);
    const VU16 packed4 = OrAnd(raw5, ShiftLeft<4>(raw7), hi2);
    const VU16 packed5 = OrAnd(raw6, ShiftLeft<6>(raw7), hi2);
    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
    StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
    StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
    StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 mask = Set(d16, 0x3F3Fu);  // lowest 6 bits per byte
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
    const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));

    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    const VU16 raw2 = And(packed2, mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    const VU16 raw4 = And(packed3, mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    const VU16 raw5 = And(packed4, mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    const VU16 raw6 = And(packed5, mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

    // Gather raw3 and raw7 from the top two bits of the other vectors.
    const VU16 raw3 = Xor3(ShiftRight<6>(AndNot(mask, packed2)),
                           ShiftRight<4>(AndNot(mask, packed1)),
                           ShiftRight<2>(AndNot(mask, packed0)));
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    const VU16 raw7 = Xor3(ShiftRight<6>(AndNot(mask, packed5)),
                           ShiftRight<4>(AndNot(mask, packed4)),
                           ShiftRight<2>(AndNot(mask, packed3)));
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};
template <>
struct Pack8<7> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 raw0 = BitCast(d16, LoadU(d8, raw + 0 * N8));
    const VU16 raw1 = BitCast(d16, LoadU(d8, raw + 1 * N8));
    const VU16 raw2 = BitCast(d16, LoadU(d8, raw + 2 * N8));
    const VU16 raw3 = BitCast(d16, LoadU(d8, raw + 3 * N8));
    const VU16 raw4 = BitCast(d16, LoadU(d8, raw + 4 * N8));
    const VU16 raw5 = BitCast(d16, LoadU(d8, raw + 5 * N8));
    const VU16 raw6 = BitCast(d16, LoadU(d8, raw + 6 * N8));
    const VU16 raw7 = BitCast(d16, LoadU(d8, raw + 7 * N8));

    // One bit of raw7 goes into the top bit of each of packed0..6.
    const VU16 hi1 = Set(d16, 0x8080u);
    const VU16 packed0 = OrAnd(raw0, Add(raw7, raw7), hi1);
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<2>(raw7), hi1);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<3>(raw7), hi1);
    const VU16 packed3 = OrAnd(raw3, ShiftLeft<4>(raw7), hi1);
    const VU16 packed4 = OrAnd(raw4, ShiftLeft<5>(raw7), hi1);
    const VU16 packed5 = OrAnd(raw5, ShiftLeft<6>(raw7), hi1);
    const VU16 packed6 = OrAnd(raw6, ShiftLeft<7>(raw7), hi1);
    StoreU(BitCast(d8, packed0), d8, packed_out + 0 * N8);
    StoreU(BitCast(d8, packed1), d8, packed_out + 1 * N8);
    StoreU(BitCast(d8, packed2), d8, packed_out + 2 * N8);
    StoreU(BitCast(d8, packed3), d8, packed_out + 3 * N8);
    StoreU(BitCast(d8, packed4), d8, packed_out + 4 * N8);
    StoreU(BitCast(d8, packed5), d8, packed_out + 5 * N8);
    StoreU(BitCast(d8, packed6), d8, packed_out + 6 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    const RepartitionToWide<decltype(d8)> d16;
    using VU16 = Vec<decltype(d16)>;
    const size_t N8 = Lanes(d8);
    const VU16 packed0 = BitCast(d16, LoadU(d8, packed_in + 0 * N8));
    const VU16 packed1 = BitCast(d16, LoadU(d8, packed_in + 1 * N8));
    const VU16 packed2 = BitCast(d16, LoadU(d8, packed_in + 2 * N8));
    const VU16 packed3 = BitCast(d16, LoadU(d8, packed_in + 3 * N8));
    const VU16 packed4 = BitCast(d16, LoadU(d8, packed_in + 4 * N8));
    const VU16 packed5 = BitCast(d16, LoadU(d8, packed_in + 5 * N8));
    const VU16 packed6 = BitCast(d16, LoadU(d8, packed_in + 6 * N8));

    const VU16 mask = Set(d16, 0x7F7Fu);  // lowest 7 bits per byte

    const VU16 raw0 = And(packed0, mask);
    StoreU(BitCast(d8, raw0), d8, raw + 0 * N8);
    const VU16 raw1 = And(packed1, mask);
    StoreU(BitCast(d8, raw1), d8, raw + 1 * N8);
    const VU16 raw2 = And(packed2, mask);
    StoreU(BitCast(d8, raw2), d8, raw + 2 * N8);
    const VU16 raw3 = And(packed3, mask);
    StoreU(BitCast(d8, raw3), d8, raw + 3 * N8);
    const VU16 raw4 = And(packed4, mask);
    StoreU(BitCast(d8, raw4), d8, raw + 4 * N8);
    const VU16 raw5 = And(packed5, mask);
    StoreU(BitCast(d8, raw5), d8, raw + 5 * N8);
    const VU16 raw6 = And(packed6, mask);
    StoreU(BitCast(d8, raw6), d8, raw + 6 * N8);

    // Reassemble raw7 from the top bit of each packed vector.
    const VU16 p0 = Xor3(ShiftRight<7>(AndNot(mask, packed6)),
                         ShiftRight<6>(AndNot(mask, packed5)),
                         ShiftRight<5>(AndNot(mask, packed4)));
    const VU16 p1 = Xor3(ShiftRight<4>(AndNot(mask, packed3)),
                         ShiftRight<3>(AndNot(mask, packed2)),
                         ShiftRight<2>(AndNot(mask, packed1)));
    const VU16 raw7 = Xor3(ShiftRight<1>(AndNot(mask, packed0)), p0, p1);
    StoreU(BitCast(d8, raw7), d8, raw + 7 * N8);
  }
};
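// For kBits == 8, packing is the identity; Pack8<8> below simply copies, so
// callers can use the same interface for every bit width.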
template <>
struct Pack8<8> {
  template <class D8>
  HWY_INLINE void Pack(D8 d8, const uint8_t* HWY_RESTRICT raw,
                       uint8_t* HWY_RESTRICT packed_out) const {
    using VU8 = Vec<decltype(d8)>;
    const size_t N8 = Lanes(d8);
    const VU8 raw0 = LoadU(d8, raw + 0 * N8);
    const VU8 raw1 = LoadU(d8, raw + 1 * N8);
    const VU8 raw2 = LoadU(d8, raw + 2 * N8);
    const VU8 raw3 = LoadU(d8, raw + 3 * N8);
    const VU8 raw4 = LoadU(d8, raw + 4 * N8);
    const VU8 raw5 = LoadU(d8, raw + 5 * N8);
    const VU8 raw6 = LoadU(d8, raw + 6 * N8);
    const VU8 raw7 = LoadU(d8, raw + 7 * N8);

    StoreU(raw0, d8, packed_out + 0 * N8);
    StoreU(raw1, d8, packed_out + 1 * N8);
    StoreU(raw2, d8, packed_out + 2 * N8);
    StoreU(raw3, d8, packed_out + 3 * N8);
    StoreU(raw4, d8, packed_out + 4 * N8);
    StoreU(raw5, d8, packed_out + 5 * N8);
    StoreU(raw6, d8, packed_out + 6 * N8);
    StoreU(raw7, d8, packed_out + 7 * N8);
  }

  template <class D8>
  HWY_INLINE void Unpack(D8 d8, const uint8_t* HWY_RESTRICT packed_in,
                         uint8_t* HWY_RESTRICT raw) const {
    using VU8 = Vec<decltype(d8)>;
    const size_t N8 = Lanes(d8);
    const VU8 raw0 = LoadU(d8, packed_in + 0 * N8);
    const VU8 raw1 = LoadU(d8, packed_in + 1 * N8);
    const VU8 raw2 = LoadU(d8, packed_in + 2 * N8);
    const VU8 raw3 = LoadU(d8, packed_in + 3 * N8);
    const VU8 raw4 = LoadU(d8, packed_in + 4 * N8);
    const VU8 raw5 = LoadU(d8, packed_in + 5 * N8);
    const VU8 raw6 = LoadU(d8, packed_in + 6 * N8);
    const VU8 raw7 = LoadU(d8, packed_in + 7 * N8);

    StoreU(raw0, d8, raw + 0 * N8);
    StoreU(raw1, d8, raw + 1 * N8);
    StoreU(raw2, d8, raw + 2 * N8);
    StoreU(raw3, d8, raw + 3 * N8);
    StoreU(raw4, d8, raw + 4 * N8);
    StoreU(raw5, d8, raw + 5 * N8);
    StoreU(raw6, d8, raw + 6 * N8);
    StoreU(raw7, d8, raw + 7 * N8);
  }
};
template <>
struct Pack16<1> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    const VU16 p0 = Xor3(ShiftLeft<2>(raw2), Add(raw1, raw1), raw0);
    const VU16 p1 =
        Xor3(ShiftLeft<5>(raw5), ShiftLeft<4>(raw4), ShiftLeft<3>(raw3));
    const VU16 p2 =
        Xor3(ShiftLeft<8>(raw8), ShiftLeft<7>(raw7), ShiftLeft<6>(raw6));
    const VU16 p3 =
        Xor3(ShiftLeft<0xB>(rawB), ShiftLeft<0xA>(rawA), ShiftLeft<9>(raw9));
    const VU16 p4 =
        Xor3(ShiftLeft<0xE>(rawE), ShiftLeft<0xD>(rawD), ShiftLeft<0xC>(rawC));
    const VU16 packed =
        Or(Xor3(ShiftLeft<0xF>(rawF), p0, p1), Xor3(p2, p3, p4));
    StoreU(packed, d, packed_out);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 1u);
    const VU16 packed = LoadU(d, packed_in);

    const VU16 raw0 = And(packed, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(ShiftRight<1>(packed), mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(ShiftRight<2>(packed), mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(ShiftRight<3>(packed), mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(ShiftRight<4>(packed), mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(ShiftRight<5>(packed), mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(ShiftRight<6>(packed), mask);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = And(ShiftRight<7>(packed), mask);
    StoreU(raw7, d, raw + 7 * N);
    const VU16 raw8 = And(ShiftRight<8>(packed), mask);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = And(ShiftRight<9>(packed), mask);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = And(ShiftRight<0xA>(packed), mask);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB = And(ShiftRight<0xB>(packed), mask);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC = And(ShiftRight<0xC>(packed), mask);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = And(ShiftRight<0xD>(packed), mask);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = And(ShiftRight<0xE>(packed), mask);
    StoreU(rawE, d, raw + 0xE * N);
    const VU16 rawF = ShiftRight<0xF>(packed);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
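// Sixteen 1-bit vectors compress into a single u16 vector. Add(raw1, raw1)
// above is equivalent to ShiftLeft<1>(raw1).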
template <>
struct Pack16<2> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    VU16 packed0 = Xor3(ShiftLeft<4>(raw4), ShiftLeft<2>(raw2), raw0);
    VU16 packed1 = Xor3(ShiftLeft<4>(raw5), ShiftLeft<2>(raw3), raw1);
    packed0 = Xor3(packed0, ShiftLeft<8>(raw8), ShiftLeft<6>(raw6));
    packed1 = Xor3(packed1, ShiftLeft<8>(raw9), ShiftLeft<6>(raw7));
    packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<10>(rawA));
    packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<10>(rawB));
    packed0 = Or(packed0, ShiftLeft<14>(rawE));
    packed1 = Or(packed1, ShiftLeft<14>(rawF));
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0x3u);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(ShiftRight<2>(packed0), mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(ShiftRight<2>(packed1), mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(ShiftRight<4>(packed0), mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(ShiftRight<4>(packed1), mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(ShiftRight<6>(packed0), mask);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = And(ShiftRight<6>(packed1), mask);
    StoreU(raw7, d, raw + 7 * N);
    const VU16 raw8 = And(ShiftRight<8>(packed0), mask);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = And(ShiftRight<8>(packed1), mask);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = And(ShiftRight<0xA>(packed0), mask);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB = And(ShiftRight<0xA>(packed1), mask);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC = And(ShiftRight<0xC>(packed0), mask);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = And(ShiftRight<0xC>(packed1), mask);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = ShiftRight<0xE>(packed0);
    StoreU(rawE, d, raw + 0xE * N);
    const VU16 rawF = ShiftRight<0xE>(packed1);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
template <>
struct Pack16<3> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // Five 3-bit values per vector, plus one bit of rawF in each MSB.
    VU16 packed0 = Xor3(ShiftLeft<6>(raw6), ShiftLeft<3>(raw3), raw0);
    VU16 packed1 = Xor3(ShiftLeft<6>(raw7), ShiftLeft<3>(raw4), raw1);
    VU16 packed2 = Xor3(ShiftLeft<6>(raw8), ShiftLeft<3>(raw5), raw2);
    packed0 = Xor3(packed0, ShiftLeft<12>(rawC), ShiftLeft<9>(raw9));
    packed1 = Xor3(packed1, ShiftLeft<12>(rawD), ShiftLeft<9>(rawA));
    packed2 = Xor3(packed2, ShiftLeft<12>(rawE), ShiftLeft<9>(rawB));

    const VU16 hi1 = Set(d, 0x8000u);
    packed0 = Or(packed0, ShiftLeft<15>(rawF));
    packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
    packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0x7u);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);

    const VU16 raw0 = And(mask, packed0);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(mask, packed1);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(mask, packed2);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(mask, ShiftRight<3>(packed0));
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(mask, ShiftRight<3>(packed1));
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(mask, ShiftRight<3>(packed2));
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(mask, ShiftRight<6>(packed0));
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = And(mask, ShiftRight<6>(packed1));
    StoreU(raw7, d, raw + 7 * N);
    const VU16 raw8 = And(mask, ShiftRight<6>(packed2));
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = And(mask, ShiftRight<9>(packed0));
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = And(mask, ShiftRight<9>(packed1));
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB = And(mask, ShiftRight<9>(packed2));
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC = And(mask, ShiftRight<12>(packed0));
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = And(mask, ShiftRight<12>(packed1));
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = And(mask, ShiftRight<12>(packed2));
    StoreU(rawE, d, raw + 0xE * N);

    // rawF's three bits are the MSBs of packed0..2.
    const VU16 down0 = ShiftRight<15>(packed0);
    const VU16 down1 = ShiftRight<15>(packed1);
    const VU16 down2 = ShiftRight<15>(packed2);
    const VU16 rawF = Xor3(ShiftLeft<2>(down2), Add(down1, down1), down0);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
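// For widths that do not divide 16, the last value (rawF) no longer fits in
// whole fields; its bits are recovered from the top bits of several packed
// vectors, as in the down0..down2 assembly above.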
template <>
struct Pack16<4> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    VU16 packed0 = Xor3(ShiftLeft<8>(raw4), ShiftLeft<4>(raw2), raw0);
    VU16 packed1 = Xor3(ShiftLeft<8>(raw5), ShiftLeft<4>(raw3), raw1);
    packed0 = Or(packed0, ShiftLeft<12>(raw6));
    packed1 = Or(packed1, ShiftLeft<12>(raw7));
    VU16 packed2 = Xor3(ShiftLeft<8>(rawC), ShiftLeft<4>(rawA), raw8);
    VU16 packed3 = Xor3(ShiftLeft<8>(rawD), ShiftLeft<4>(rawB), raw9);
    packed2 = Or(packed2, ShiftLeft<12>(rawE));
    packed3 = Or(packed3, ShiftLeft<12>(rawF));

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0xFu);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(ShiftRight<4>(packed0), mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(ShiftRight<4>(packed1), mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(ShiftRight<8>(packed0), mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(ShiftRight<8>(packed1), mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = ShiftRight<12>(packed0);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = ShiftRight<12>(packed1);
    StoreU(raw7, d, raw + 7 * N);
    const VU16 raw8 = And(packed2, mask);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = And(packed3, mask);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = And(ShiftRight<4>(packed2), mask);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB = And(ShiftRight<4>(packed3), mask);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC = And(ShiftRight<8>(packed2), mask);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = And(ShiftRight<8>(packed3), mask);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = ShiftRight<12>(packed2);
    StoreU(rawE, d, raw + 0xE * N);
    const VU16 rawF = ShiftRight<12>(packed3);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
template <>
struct Pack16<5> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // Three 5-bit values per vector, plus one bit of rawF in each MSB.
    VU16 packed0 = Xor3(ShiftLeft<10>(rawA), ShiftLeft<5>(raw5), raw0);
    VU16 packed1 = Xor3(ShiftLeft<10>(rawB), ShiftLeft<5>(raw6), raw1);
    VU16 packed2 = Xor3(ShiftLeft<10>(rawC), ShiftLeft<5>(raw7), raw2);
    VU16 packed3 = Xor3(ShiftLeft<10>(rawD), ShiftLeft<5>(raw8), raw3);
    VU16 packed4 = Xor3(ShiftLeft<10>(rawE), ShiftLeft<5>(raw9), raw4);

    const VU16 hi1 = Set(d, 0x8000u);
    packed0 = Or(packed0, ShiftLeft<15>(rawF));
    packed1 = OrAnd(packed1, ShiftLeft<14>(rawF), hi1);
    packed2 = OrAnd(packed2, ShiftLeft<13>(rawF), hi1);
    packed3 = OrAnd(packed3, ShiftLeft<12>(rawF), hi1);
    packed4 = OrAnd(packed4, ShiftLeft<11>(rawF), hi1);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);

    const VU16 mask = Set(d, 0x1Fu);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(ShiftRight<5>(packed0), mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(ShiftRight<5>(packed1), mask);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = And(ShiftRight<5>(packed2), mask);
    StoreU(raw7, d, raw + 7 * N);
    const VU16 raw8 = And(ShiftRight<5>(packed3), mask);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = And(ShiftRight<5>(packed4), mask);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = And(ShiftRight<10>(packed0), mask);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB = And(ShiftRight<10>(packed1), mask);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC = And(ShiftRight<10>(packed2), mask);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = And(ShiftRight<10>(packed3), mask);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = And(ShiftRight<10>(packed4), mask);
    StoreU(rawE, d, raw + 0xE * N);

    // rawF's five bits are the MSBs of packed0..4.
    const VU16 down0 = ShiftRight<15>(packed0);
    const VU16 down1 = ShiftRight<15>(packed1);
    const VU16 hi1 = Set(d, 0x8000u);
    const VU16 p0 = Xor3(ShiftRight<13>(And(packed2, hi1)),
                         Add(down1, down1), down0);
    const VU16 rawF = Xor3(ShiftRight<11>(And(packed4, hi1)),
                           ShiftRight<12>(And(packed3, hi1)), p0);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
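// In the 5-bit case, rawF is spread one bit per vector: bit 15 of
// packed0..packed4. Masking with hi1 = 0x8000 before shifting down avoids
// pulling neighboring fields back in.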
template <>
struct Pack16<6> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // packed3 and packed7 are transient; their bits ride in the high
    // nibbles of the other six vectors.
    const VU16 packed3 = Or(ShiftLeft<6>(raw7), raw3);
    const VU16 packed7 = Or(ShiftLeft<6>(rawF), rawB);

    const VU16 packed0 =
        Xor3(ShiftLeft<12>(packed3), ShiftLeft<6>(raw4), raw0);
    VU16 packed1 = Or(ShiftLeft<6>(raw5), raw1);
    VU16 packed2 = Or(ShiftLeft<6>(raw6), raw2);
    const VU16 packed4 =
        Xor3(ShiftLeft<12>(packed7), ShiftLeft<6>(rawC), raw8);
    VU16 packed5 = Or(ShiftLeft<6>(rawD), raw9);
    VU16 packed6 = Or(ShiftLeft<6>(rawE), rawA);

    const VU16 hi4 = Set(d, 0xF000u);
    packed1 = OrAnd(packed1, ShiftLeft<8>(packed3), hi4);
    packed2 = OrAnd(packed2, ShiftLeft<4>(packed3), hi4);
    packed5 = OrAnd(packed5, ShiftLeft<8>(packed7), hi4);
    packed6 = OrAnd(packed6, ShiftLeft<4>(packed7), hi4);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed4, d, packed_out + 3 * N);
    StoreU(packed5, d, packed_out + 4 * N);
    StoreU(packed6, d, packed_out + 5 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 mask = Set(d, 0x3Fu);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed4 = LoadU(d, packed_in + 3 * N);
    const VU16 packed5 = LoadU(d, packed_in + 4 * N);
    const VU16 packed6 = LoadU(d, packed_in + 5 * N);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw4 = And(ShiftRight<6>(packed0), mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(ShiftRight<6>(packed1), mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(ShiftRight<6>(packed2), mask);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw8 = And(packed4, mask);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = And(packed5, mask);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = And(packed6, mask);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawC = And(ShiftRight<6>(packed4), mask);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = And(ShiftRight<6>(packed5), mask);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = And(ShiftRight<6>(packed6), mask);
    StoreU(rawE, d, raw + 0xE * N);

    // Reassemble the transient packed3/packed7 from the high nibbles.
    const VU16 down0 = ShiftRight<12>(packed0);
    const VU16 down4 = ShiftRight<12>(packed4);
    const VU16 hi4 = Set(d, 0xF000u);
    const VU16 packed3 = Xor3(ShiftRight<4>(And(packed2, hi4)),
                              ShiftRight<8>(And(packed1, hi4)), down0);
    const VU16 packed7 = Xor3(ShiftRight<4>(And(packed6, hi4)),
                              ShiftRight<8>(And(packed5, hi4)), down4);
    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 rawB = And(packed7, mask);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 raw7 = ShiftRight<6>(packed3);
    StoreU(raw7, d, raw + 7 * N);
    const VU16 rawF = ShiftRight<6>(packed7);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
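// 16 x 6 bits = 6 vectors: packed3 and packed7 exist only transiently during
// Pack; their twelve bits travel in the hi4 nibbles of the other six vectors
// and are reassembled during Unpack.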
template <>
struct Pack16<7> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // packed7 is transient; its 14 bits ride in the top two bits of the
    // other seven vectors.
    const VU16 packed7 = Or(ShiftLeft<7>(rawF), raw7);

    const VU16 packed0 =
        Xor3(ShiftLeft<14>(packed7), ShiftLeft<7>(raw8), raw0);
    VU16 packed1 = Or(ShiftLeft<7>(raw9), raw1);
    VU16 packed2 = Or(ShiftLeft<7>(rawA), raw2);
    VU16 packed3 = Or(ShiftLeft<7>(rawB), raw3);
    VU16 packed4 = Or(ShiftLeft<7>(rawC), raw4);
    VU16 packed5 = Or(ShiftLeft<7>(rawD), raw5);
    VU16 packed6 = Or(ShiftLeft<7>(rawE), raw6);

    const VU16 hi2 = Set(d, 0xC000u);
    packed1 = OrAnd(packed1, ShiftLeft<12>(packed7), hi2);
    packed2 = OrAnd(packed2, ShiftLeft<10>(packed7), hi2);
    packed3 = OrAnd(packed3, ShiftLeft<8>(packed7), hi2);
    packed4 = OrAnd(packed4, ShiftLeft<6>(packed7), hi2);
    packed5 = OrAnd(packed5, ShiftLeft<4>(packed7), hi2);
    packed6 = OrAnd(packed6, ShiftLeft<2>(packed7), hi2);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);

    const VU16 mask = Set(d, 0x7Fu);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw8 = And(ShiftRight<7>(packed0), mask);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = And(ShiftRight<7>(packed1), mask);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = And(ShiftRight<7>(packed2), mask);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB = And(ShiftRight<7>(packed3), mask);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC = And(ShiftRight<7>(packed4), mask);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = And(ShiftRight<7>(packed5), mask);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = And(ShiftRight<7>(packed6), mask);
    StoreU(rawE, d, raw + 0xE * N);

    // Reassemble the transient packed7 from the top two bits of each.
    const VU16 down0 = ShiftRight<14>(packed0);
    const VU16 hi2 = Set(d, 0xC000u);
    const VU16 p0 = Xor3(ShiftRight<12>(And(packed1, hi2)),
                         ShiftRight<10>(And(packed2, hi2)), down0);
    const VU16 p1 = Xor3(ShiftRight<8>(And(packed3, hi2)),
                         ShiftRight<6>(And(packed4, hi2)),
                         ShiftRight<4>(And(packed5, hi2)));
    const VU16 packed7 = Xor3(ShiftRight<2>(And(packed6, hi2)), p1, p0);

    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);
    const VU16 rawF = ShiftRight<7>(packed7);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
template <>
struct Pack16<8> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // Each packed vector is simply two raw values: low and high byte.
    const VU16 packed0 = Or(ShiftLeft<8>(raw2), raw0);
    const VU16 packed1 = Or(ShiftLeft<8>(raw3), raw1);
    const VU16 packed2 = Or(ShiftLeft<8>(raw6), raw4);
    const VU16 packed3 = Or(ShiftLeft<8>(raw7), raw5);
    const VU16 packed4 = Or(ShiftLeft<8>(rawA), raw8);
    const VU16 packed5 = Or(ShiftLeft<8>(rawB), raw9);
    const VU16 packed6 = Or(ShiftLeft<8>(rawE), rawC);
    const VU16 packed7 = Or(ShiftLeft<8>(rawF), rawD);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);

    const VU16 mask = Set(d, 0xFFu);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = ShiftRight<8>(packed0);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = ShiftRight<8>(packed1);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(packed2, mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(packed3, mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = ShiftRight<8>(packed2);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = ShiftRight<8>(packed3);
    StoreU(raw7, d, raw + 7 * N);
    const VU16 raw8 = And(packed4, mask);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = And(packed5, mask);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = ShiftRight<8>(packed4);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB = ShiftRight<8>(packed5);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC = And(packed6, mask);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = And(packed7, mask);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = ShiftRight<8>(packed6);
    StoreU(rawE, d, raw + 0xE * N);
    const VU16 rawF = ShiftRight<8>(packed7);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
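// 8-bit fields in 16-bit lanes: each packed vector holds exactly two raw
// values as its low and high byte, so no masks beyond 0xFF are needed.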
template <>
struct Pack16<9> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    const VU16 packed0 = Or(ShiftLeft<9>(raw8), raw0);
    const VU16 packed1 = Or(ShiftLeft<9>(raw9), raw1);
    const VU16 packed2 = Or(ShiftLeft<9>(rawA), raw2);
    const VU16 packed3 = Or(ShiftLeft<9>(rawB), raw3);
    const VU16 packed4 = Or(ShiftLeft<9>(rawC), raw4);
    const VU16 packed5 = Or(ShiftLeft<9>(rawD), raw5);
    const VU16 packed6 = Or(ShiftLeft<9>(rawE), raw6);
    const VU16 packed7 = Or(ShiftLeft<9>(rawF), raw7);

    // The truncated middle two bits (8..7) of raw8..F fill packed8.
    const VU16 mid2 = Set(d, 0x180u);
    const VU16 part8 = ShiftRight<7>(And(raw8, mid2));
    const VU16 part9 = ShiftRight<5>(And(raw9, mid2));
    const VU16 partA = ShiftRight<3>(And(rawA, mid2));
    const VU16 partB = ShiftRight<1>(And(rawB, mid2));
    const VU16 partC = ShiftLeft<1>(And(rawC, mid2));
    const VU16 partD = ShiftLeft<3>(And(rawD, mid2));
    const VU16 partE = ShiftLeft<5>(And(rawE, mid2));
    const VU16 partF = ShiftLeft<7>(And(rawF, mid2));
    const VU16 packed8 = Xor3(Xor3(part8, part9, partA),
                              Xor3(partB, partC, partD), Or(partE, partF));

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);
    const VU16 packed8 = LoadU(d, packed_in + 8 * N);

    const VU16 mask = Set(d, 0x1FFu);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 mid2 = Set(d, 0x180u);
    const VU16 raw8 =
        OrAnd(ShiftRight<9>(packed0), ShiftLeft<7>(packed8), mid2);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 =
        OrAnd(ShiftRight<9>(packed1), ShiftLeft<5>(packed8), mid2);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA =
        OrAnd(ShiftRight<9>(packed2), ShiftLeft<3>(packed8), mid2);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB =
        OrAnd(ShiftRight<9>(packed3), ShiftLeft<1>(packed8), mid2);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC =
        OrAnd(ShiftRight<9>(packed4), ShiftRight<1>(packed8), mid2);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD =
        OrAnd(ShiftRight<9>(packed5), ShiftRight<3>(packed8), mid2);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE =
        OrAnd(ShiftRight<9>(packed6), ShiftRight<5>(packed8), mid2);
    StoreU(rawE, d, raw + 0xE * N);
    const VU16 rawF =
        OrAnd(ShiftRight<9>(packed7), ShiftRight<7>(packed8), mid2);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
template <>
struct Pack16<10> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    const VU16 packed0 = Or(ShiftLeft<10>(raw8), raw0);
    const VU16 packed1 = Or(ShiftLeft<10>(raw9), raw1);
    const VU16 packed2 = Or(ShiftLeft<10>(rawA), raw2);
    const VU16 packed3 = Or(ShiftLeft<10>(rawB), raw3);
    const VU16 packed4 = Or(ShiftLeft<10>(rawC), raw4);
    const VU16 packed5 = Or(ShiftLeft<10>(rawD), raw5);
    const VU16 packed6 = Or(ShiftLeft<10>(rawE), raw6);
    const VU16 packed7 = Or(ShiftLeft<10>(rawF), raw7);

    // The truncated middle four bits (9..6) of raw8..F fill packed8/9.
    const VU16 mid4 = Set(d, 0x3C0u);
    const VU16 part8 = ShiftRight<6>(And(raw8, mid4));
    const VU16 part9 = ShiftRight<2>(And(raw9, mid4));
    const VU16 partA = ShiftLeft<2>(And(rawA, mid4));
    const VU16 partB = ShiftLeft<6>(And(rawB, mid4));
    const VU16 partC = ShiftRight<6>(And(rawC, mid4));
    const VU16 partD = ShiftRight<2>(And(rawD, mid4));
    const VU16 partE = ShiftLeft<2>(And(rawE, mid4));
    const VU16 partF = ShiftLeft<6>(And(rawF, mid4));
    const VU16 packed8 = Or(Xor3(part8, part9, partA), partB);
    const VU16 packed9 = Or(Xor3(partC, partD, partE), partF);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);
    const VU16 packed8 = LoadU(d, packed_in + 8 * N);
    const VU16 packed9 = LoadU(d, packed_in + 9 * N);

    const VU16 mask = Set(d, 0x3FFu);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 mid4 = Set(d, 0x3C0u);
    const VU16 raw8 =
        OrAnd(ShiftRight<10>(packed0), ShiftLeft<6>(packed8), mid4);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 =
        OrAnd(ShiftRight<10>(packed1), ShiftLeft<2>(packed8), mid4);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA =
        OrAnd(ShiftRight<10>(packed2), ShiftRight<2>(packed8), mid4);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB =
        OrAnd(ShiftRight<10>(packed3), ShiftRight<6>(packed8), mid4);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC =
        OrAnd(ShiftRight<10>(packed4), ShiftLeft<6>(packed9), mid4);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD =
        OrAnd(ShiftRight<10>(packed5), ShiftLeft<2>(packed9), mid4);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE =
        OrAnd(ShiftRight<10>(packed6), ShiftRight<2>(packed9), mid4);
    StoreU(rawE, d, raw + 0xE * N);
    const VU16 rawF =
        OrAnd(ShiftRight<10>(packed7), ShiftRight<6>(packed9), mid4);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
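// For 9 and 10 bits, the low bits of pairs go into packed0..7 and the
// leftover middle bits (mid2/mid4) of raw8..rawF collect into the one or two
// extra packed vectors.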
template <>
struct Pack16<11> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // It is easier to special-case the lowest byte of every value: pairs of
    // low bytes fill packed0..7.
    const VU16 lo8 = Set(d, 0xFFu);
    const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
    const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
    const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
    const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
    const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
    const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
    const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
    const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);

    // The upper 3 bits of raw0..2 seed the three remaining vectors.
    const VU16 top0 = ShiftRight<8>(raw0);
    const VU16 top1 = ShiftRight<8>(raw1);
    const VU16 top2 = ShiftRight<8>(raw2);

    // A sliding mask admits each next group of 3-bit tops.
    VU16 next = Set(d, 0x38u);  // bits 5..3
    VU16 packed8 = OrAnd(top0, ShiftRight<5>(raw3), next);
    VU16 packed9 = OrAnd(top1, ShiftRight<5>(raw4), next);
    VU16 packedA = OrAnd(top2, ShiftRight<5>(raw5), next);
    next = ShiftLeft<3>(next);  // = 0x1C0
    packed8 = OrAnd(packed8, ShiftRight<2>(raw6), next);
    packed9 = OrAnd(packed9, ShiftRight<2>(raw7), next);
    packedA = OrAnd(packedA, ShiftRight<2>(raw8), next);
    next = ShiftLeft<3>(next);  // = 0xE00
    packed8 = OrAnd(packed8, Add(raw9, raw9), next);
    packed9 = OrAnd(packed9, Add(rawA, rawA), next);
    packedA = OrAnd(packedA, Add(rawB, rawB), next);
    next = ShiftLeft<3>(next);  // = 0x7000
    packed8 = OrAnd(packed8, ShiftLeft<4>(rawC), next);
    packed9 = OrAnd(packed9, ShiftLeft<4>(rawD), next);
    packedA = OrAnd(packedA, ShiftLeft<4>(rawE), next);
    // One bit of rawF's top goes into each vector's MSB.
    next = ShiftLeft<3>(next);  // = 0x8000
    packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
    packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
    packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);

    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);
    const VU16 packed8 = LoadU(d, packed_in + 8 * N);
    const VU16 packed9 = LoadU(d, packed_in + 9 * N);
    const VU16 packedA = LoadU(d, packed_in + 0xA * N);

    const VU16 mask = Set(d, 0xFFu);  // low byte of each value

    const VU16 down0 = And(packed0, mask);
    const VU16 down1 = ShiftRight<8>(packed0);
    const VU16 down2 = And(packed1, mask);
    const VU16 down3 = ShiftRight<8>(packed1);
    const VU16 down4 = And(packed2, mask);
    const VU16 down5 = ShiftRight<8>(packed2);
    const VU16 down6 = And(packed3, mask);
    const VU16 down7 = ShiftRight<8>(packed3);
    const VU16 down8 = And(packed4, mask);
    const VU16 down9 = ShiftRight<8>(packed4);
    const VU16 downA = And(packed5, mask);
    const VU16 downB = ShiftRight<8>(packed5);
    const VU16 downC = And(packed6, mask);
    const VU16 downD = ShiftRight<8>(packed6);
    const VU16 downE = And(packed7, mask);
    const VU16 downF = ShiftRight<8>(packed7);

    // The 3-bit tops are shifted into place and masked with hi3.
    const VU16 hi3 = Set(d, 0x700u);
    const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi3);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi3);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi3);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = OrAnd(down3, ShiftLeft<5>(packed8), hi3);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = OrAnd(down4, ShiftLeft<5>(packed9), hi3);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = OrAnd(down5, ShiftLeft<5>(packedA), hi3);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = OrAnd(down6, ShiftLeft<2>(packed8), hi3);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = OrAnd(down7, ShiftLeft<2>(packed9), hi3);
    StoreU(raw7, d, raw + 7 * N);
    const VU16 raw8 = OrAnd(down8, ShiftLeft<2>(packedA), hi3);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 = OrAnd(down9, ShiftRight<1>(packed8), hi3);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA = OrAnd(downA, ShiftRight<1>(packed9), hi3);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB = OrAnd(downB, ShiftRight<1>(packedA), hi3);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC = OrAnd(downC, ShiftRight<4>(packed8), hi3);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD = OrAnd(downD, ShiftRight<4>(packed9), hi3);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE = OrAnd(downE, ShiftRight<4>(packedA), hi3);
    StoreU(rawE, d, raw + 0xE * N);

    // rawF's top 3 bits are the MSBs of packed8..A. Masking the MSB before
    // shifting down (as in the 5-bit case) avoids pulling in the
    // neighboring rawC..rawE tops.
    const VU16 hi1 = Set(d, 0x8000u);
    const VU16 rawF = Or(downF, Xor3(ShiftRight<7>(And(packed8, hi1)),
                                     ShiftRight<6>(And(packed9, hi1)),
                                     ShiftRight<5>(And(packedA, hi1))));
    StoreU(rawF, d, raw + 0xF * N);
  }
};
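// The 11-bit scheme splits every value into a low byte (packed0..7) and a
// 3-bit top; a sliding 'next' mask walks those tops through packed8..A in
// 3-bit steps, ending at the single MSB used for rawF.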
template <>
struct Pack16<12> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    const VU16 packed0 = Or(ShiftLeft<12>(raw8), raw0);
    const VU16 packed1 = Or(ShiftLeft<12>(raw9), raw1);
    const VU16 packed2 = Or(ShiftLeft<12>(rawA), raw2);
    const VU16 packed3 = Or(ShiftLeft<12>(rawB), raw3);
    const VU16 packed4 = Or(ShiftLeft<12>(rawC), raw4);
    const VU16 packed5 = Or(ShiftLeft<12>(rawD), raw5);
    const VU16 packed6 = Or(ShiftLeft<12>(rawE), raw6);
    const VU16 packed7 = Or(ShiftLeft<12>(rawF), raw7);

    // The remaining 8 bits of raw8..F pair up in packed8..B.
    const VU16 hi8 = Set(d, 0xFF00u);
    const VU16 packed8 = OrAnd(ShiftRight<4>(raw8), ShiftLeft<4>(raw9), hi8);
    const VU16 packed9 = OrAnd(ShiftRight<4>(rawA), ShiftLeft<4>(rawB), hi8);
    const VU16 packedA = OrAnd(ShiftRight<4>(rawC), ShiftLeft<4>(rawD), hi8);
    const VU16 packedB = OrAnd(ShiftRight<4>(rawE), ShiftLeft<4>(rawF), hi8);
    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
  }

  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);
    const VU16 packed8 = LoadU(d, packed_in + 8 * N);
    const VU16 packed9 = LoadU(d, packed_in + 9 * N);
    const VU16 packedA = LoadU(d, packed_in + 0xA * N);
    const VU16 packedB = LoadU(d, packed_in + 0xB * N);

    const VU16 mask = Set(d, 0xFFFu);

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);
    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);
    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);
    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);
    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);
    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);
    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);
    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 mid8 = Set(d, 0xFF0u);
    const VU16 raw8 =
        OrAnd(ShiftRight<12>(packed0), ShiftLeft<4>(packed8), mid8);
    StoreU(raw8, d, raw + 8 * N);
    const VU16 raw9 =
        OrAnd(ShiftRight<12>(packed1), ShiftRight<4>(packed8), mid8);
    StoreU(raw9, d, raw + 9 * N);
    const VU16 rawA =
        OrAnd(ShiftRight<12>(packed2), ShiftLeft<4>(packed9), mid8);
    StoreU(rawA, d, raw + 0xA * N);
    const VU16 rawB =
        OrAnd(ShiftRight<12>(packed3), ShiftRight<4>(packed9), mid8);
    StoreU(rawB, d, raw + 0xB * N);
    const VU16 rawC =
        OrAnd(ShiftRight<12>(packed4), ShiftLeft<4>(packedA), mid8);
    StoreU(rawC, d, raw + 0xC * N);
    const VU16 rawD =
        OrAnd(ShiftRight<12>(packed5), ShiftRight<4>(packedA), mid8);
    StoreU(rawD, d, raw + 0xD * N);
    const VU16 rawE =
        OrAnd(ShiftRight<12>(packed6), ShiftLeft<4>(packedB), mid8);
    StoreU(rawE, d, raw + 0xE * N);
    const VU16 rawF =
        OrAnd(ShiftRight<12>(packed7), ShiftRight<4>(packedB), mid8);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
template <>  // 13 bits
struct Pack16<13> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // packed0..7 hold the low bytes of two raw values each.
    const VU16 lo8 = Set(d, 0xFFu);
    const VU16 packed0 = OrAnd(ShiftLeft<8>(raw1), raw0, lo8);
    const VU16 packed1 = OrAnd(ShiftLeft<8>(raw3), raw2, lo8);
    const VU16 packed2 = OrAnd(ShiftLeft<8>(raw5), raw4, lo8);
    const VU16 packed3 = OrAnd(ShiftLeft<8>(raw7), raw6, lo8);
    const VU16 packed4 = OrAnd(ShiftLeft<8>(raw9), raw8, lo8);
    const VU16 packed5 = OrAnd(ShiftLeft<8>(rawB), rawA, lo8);
    const VU16 packed6 = OrAnd(ShiftLeft<8>(rawD), rawC, lo8);
    const VU16 packed7 = OrAnd(ShiftLeft<8>(rawF), rawE, lo8);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);

    // Upper five bits of raw0..raw4, starting at bit 0 of packed8..C.
    const VU16 top0 = ShiftRight<8>(raw0);
    const VU16 top1 = ShiftRight<8>(raw1);
    const VU16 top2 = ShiftRight<8>(raw2);
    const VU16 top3 = ShiftRight<8>(raw3);
    const VU16 top4 = ShiftRight<8>(raw4);

    // Upper five bits of raw5..raw9 at bit 5.
    VU16 next = Set(d, 0x3E0u);
    VU16 packed8 = OrAnd(top0, ShiftRight<3>(raw5), next);
    VU16 packed9 = OrAnd(top1, ShiftRight<3>(raw6), next);
    VU16 packedA = OrAnd(top2, ShiftRight<3>(raw7), next);
    VU16 packedB = OrAnd(top3, ShiftRight<3>(raw8), next);
    VU16 packedC = OrAnd(top4, ShiftRight<3>(raw9), next);

    // Upper five bits of rawA..rawE at bit 10.
    next = ShiftLeft<5>(next);  // = 0x7C00
    packed8 = OrAnd(packed8, ShiftLeft<2>(rawA), next);
    packed9 = OrAnd(packed9, ShiftLeft<2>(rawB), next);
    packedA = OrAnd(packedA, ShiftLeft<2>(rawC), next);
    packedB = OrAnd(packedB, ShiftLeft<2>(rawD), next);
    packedC = OrAnd(packedC, ShiftLeft<2>(rawE), next);

    // One of the upper five bits of rawF in the top bit of each vector.
    next = ShiftLeft<5>(next);  // = 0x8000
    packed8 = OrAnd(packed8, ShiftLeft<7>(rawF), next);
    packed9 = OrAnd(packed9, ShiftLeft<6>(rawF), next);
    packedA = OrAnd(packedA, ShiftLeft<5>(rawF), next);
    packedB = OrAnd(packedB, ShiftLeft<4>(rawF), next);
    packedC = OrAnd(packedC, ShiftLeft<3>(rawF), next);

    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
    StoreU(packedC, d, packed_out + 0xC * N);
  }
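  // Layout recap, for reference: bits 0..4 / 5..9 / 10..14 of packed8..C
  // hold the top five bits of raw0..4, raw5..9 and rawA..E respectively,
  // and the top bit of each holds one of the five upper bits of rawF.
  // The low bytes of all sixteen values occupy packed0..7, two per vector.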
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);
    const VU16 packed8 = LoadU(d, packed_in + 8 * N);
    const VU16 packed9 = LoadU(d, packed_in + 9 * N);
    const VU16 packedA = LoadU(d, packed_in + 0xA * N);
    const VU16 packedB = LoadU(d, packed_in + 0xB * N);
    const VU16 packedC = LoadU(d, packed_in + 0xC * N);

    // Low bytes of all sixteen values, from packed0..7.
    const VU16 mask = Set(d, 0xFFu);
    const VU16 down0 = And(packed0, mask);
    const VU16 down1 = ShiftRight<8>(packed0);
    const VU16 down2 = And(packed1, mask);
    const VU16 down3 = ShiftRight<8>(packed1);
    const VU16 down4 = And(packed2, mask);
    const VU16 down5 = ShiftRight<8>(packed2);
    const VU16 down6 = And(packed3, mask);
    const VU16 down7 = ShiftRight<8>(packed3);
    const VU16 down8 = And(packed4, mask);
    const VU16 down9 = ShiftRight<8>(packed4);
    const VU16 downA = And(packed5, mask);
    const VU16 downB = ShiftRight<8>(packed5);
    const VU16 downC = And(packed6, mask);
    const VU16 downD = ShiftRight<8>(packed6);
    const VU16 downE = And(packed7, mask);
    const VU16 downF = ShiftRight<8>(packed7);

    // Upper five bits: raw0..raw4 from bits 0..4 of packed8..C,
    // raw5..raw9 from bits 5..9, rawA..rawE from bits 10..14.
    const VU16 hi5 = Set(d, 0x1F00u);
    const VU16 raw0 = OrAnd(down0, ShiftLeft<8>(packed8), hi5);
    const VU16 raw1 = OrAnd(down1, ShiftLeft<8>(packed9), hi5);
    const VU16 raw2 = OrAnd(down2, ShiftLeft<8>(packedA), hi5);
    const VU16 raw3 = OrAnd(down3, ShiftLeft<8>(packedB), hi5);
    const VU16 raw4 = OrAnd(down4, ShiftLeft<8>(packedC), hi5);

    const VU16 raw5 = OrAnd(down5, ShiftLeft<3>(packed8), hi5);
    const VU16 raw6 = OrAnd(down6, ShiftLeft<3>(packed9), hi5);
    const VU16 raw7 = OrAnd(down7, ShiftLeft<3>(packedA), hi5);
    const VU16 raw8 = OrAnd(down8, ShiftLeft<3>(packedB), hi5);
    const VU16 raw9 = OrAnd(down9, ShiftLeft<3>(packedC), hi5);

    const VU16 rawA = OrAnd(downA, ShiftRight<2>(packed8), hi5);
    const VU16 rawB = OrAnd(downB, ShiftRight<2>(packed9), hi5);
    const VU16 rawC = OrAnd(downC, ShiftRight<2>(packedA), hi5);
    const VU16 rawD = OrAnd(downD, ShiftRight<2>(packedB), hi5);
    const VU16 rawE = OrAnd(downE, ShiftRight<2>(packedC), hi5);

    // Upper five bits of rawF: one from the top bit of each of packed8..C.
    // Xor3 acts as Or here because the fields are disjoint.
    const VU16 hi1 = Set(d, 0x8000u);
    const VU16 p0 = Xor3(ShiftRight<7>(And(packed8, hi1)),
                         ShiftRight<6>(And(packed9, hi1)),
                         ShiftRight<5>(And(packedA, hi1)));
    const VU16 p1 = Xor3(ShiftRight<4>(And(packedB, hi1)),
                         ShiftRight<3>(And(packedC, hi1)), downF);
    const VU16 rawF = Or(p0, p1);

    StoreU(raw0, d, raw + 0 * N);
    StoreU(raw1, d, raw + 1 * N);
    StoreU(raw2, d, raw + 2 * N);
    StoreU(raw3, d, raw + 3 * N);
    StoreU(raw4, d, raw + 4 * N);
    StoreU(raw5, d, raw + 5 * N);
    StoreU(raw6, d, raw + 6 * N);
    StoreU(raw7, d, raw + 7 * N);
    StoreU(raw8, d, raw + 8 * N);
    StoreU(raw9, d, raw + 9 * N);
    StoreU(rawA, d, raw + 0xA * N);
    StoreU(rawB, d, raw + 0xB * N);
    StoreU(rawC, d, raw + 0xC * N);
    StoreU(rawD, d, raw + 0xD * N);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
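// A scalar model of the 16x13-bit layout, for reference. Pack13Scalar and
// Unpack13Scalar are hypothetical helpers, not Highway API; as in the vector
// code, inputs are assumed to fit in 13 bits.
static inline void Pack13Scalar(const uint16_t raw[16], uint16_t packed[13]) {
  for (int k = 0; k < 8; ++k) {
    // Low bytes of two values per word.
    packed[k] = static_cast<uint16_t>((raw[2 * k] & 0xFFu) |
                                      ((raw[2 * k + 1] & 0xFFu) << 8));
  }
  for (int k = 0; k < 5; ++k) {
    // Top five bits of raw[k], raw[5+k], raw[10+k]; one top bit of raw[15].
    packed[8 + k] = static_cast<uint16_t>(
        (raw[k] >> 8) | ((raw[5 + k] >> 8) << 5) |
        ((raw[10 + k] >> 8) << 10) | (((raw[15] >> (8 + k)) & 1u) << 15));
  }
}

static inline void Unpack13Scalar(const uint16_t packed[13],
                                  uint16_t raw[16]) {
  for (int k = 0; k < 8; ++k) {
    raw[2 * k] = static_cast<uint16_t>(packed[k] & 0xFFu);
    raw[2 * k + 1] = static_cast<uint16_t>(packed[k] >> 8);
  }
  for (int k = 0; k < 5; ++k) {
    raw[k] = static_cast<uint16_t>(raw[k] | ((packed[8 + k] & 0x1Fu) << 8));
    raw[5 + k] = static_cast<uint16_t>(
        raw[5 + k] | (((packed[8 + k] >> 5) & 0x1Fu) << 8));
    raw[10 + k] = static_cast<uint16_t>(
        raw[10 + k] | (((packed[8 + k] >> 10) & 0x1Fu) << 8));
    raw[15] = static_cast<uint16_t>(
        raw[15] | (((packed[8 + k] >> 15) & 1u) << (8 + k)));
  }
}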
template <>  // 14 bits
struct Pack16<14> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // 14 vectors: each holds one raw value plus two upper bits of rawE
    // (packed0..6) or rawF (packed7..D).
    const VU16 hi2 = Set(d, 0xC000u);
    const VU16 packed0 = Or(raw0, ShiftLeft<14>(rawE));
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<12>(rawE), hi2);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<10>(rawE), hi2);
    const VU16 packed3 = OrAnd(raw3, ShiftLeft<8>(rawE), hi2);
    const VU16 packed4 = OrAnd(raw4, ShiftLeft<6>(rawE), hi2);
    const VU16 packed5 = OrAnd(raw5, ShiftLeft<4>(rawE), hi2);
    const VU16 packed6 = OrAnd(raw6, ShiftLeft<2>(rawE), hi2);
    const VU16 packed7 = Or(raw7, ShiftLeft<14>(rawF));
    const VU16 packed8 = OrAnd(raw8, ShiftLeft<12>(rawF), hi2);
    const VU16 packed9 = OrAnd(raw9, ShiftLeft<10>(rawF), hi2);
    const VU16 packedA = OrAnd(rawA, ShiftLeft<8>(rawF), hi2);
    const VU16 packedB = OrAnd(rawB, ShiftLeft<6>(rawF), hi2);
    const VU16 packedC = OrAnd(rawC, ShiftLeft<4>(rawF), hi2);
    const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi2);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
    StoreU(packedC, d, packed_out + 0xC * N);
    StoreU(packedD, d, packed_out + 0xD * N);
  }
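  // Layout recap, for reference: packed0..6 carry raw0..6 plus bit pairs
  // (0,1)..(12,13) of rawE in their top two bits; packed7..D do the same
  // for raw7..D and rawF.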
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);
    const VU16 packed8 = LoadU(d, packed_in + 8 * N);
    const VU16 packed9 = LoadU(d, packed_in + 9 * N);
    const VU16 packedA = LoadU(d, packed_in + 0xA * N);
    const VU16 packedB = LoadU(d, packed_in + 0xB * N);
    const VU16 packedC = LoadU(d, packed_in + 0xC * N);
    const VU16 packedD = LoadU(d, packed_in + 0xD * N);

    const VU16 mask = Set(d, 0x3FFFu);  // the low 14 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(packed8, mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(packed9, mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(packedA, mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(packedB, mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(packedC, mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(packedD, mask);
    StoreU(rawD, d, raw + 0xD * N);

    // rawE/rawF: gather two bits from above the mask in each of seven
    // vectors; the fields are disjoint, so Xor3 acts as Or.
    const VU16 E0 = Xor3(ShiftRight<14>(packed0),
                         ShiftRight<12>(AndNot(mask, packed1)),
                         ShiftRight<10>(AndNot(mask, packed2)));
    const VU16 E1 = Xor3(ShiftRight<8>(AndNot(mask, packed3)),
                         ShiftRight<6>(AndNot(mask, packed4)),
                         ShiftRight<4>(AndNot(mask, packed5)));
    const VU16 rawE = Xor3(ShiftRight<2>(AndNot(mask, packed6)), E0, E1);
    const VU16 F0 = Xor3(ShiftRight<14>(AndNot(mask, packed7)),
                         ShiftRight<12>(AndNot(mask, packed8)),
                         ShiftRight<10>(AndNot(mask, packed9)));
    const VU16 F1 = Xor3(ShiftRight<8>(AndNot(mask, packedA)),
                         ShiftRight<6>(AndNot(mask, packedB)),
                         ShiftRight<4>(AndNot(mask, packedC)));
    const VU16 rawF = Xor3(ShiftRight<2>(AndNot(mask, packedD)), F0, F1);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
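// A scalar model of the 16x14-bit layout, for reference (hypothetical
// helpers, not Highway API). Note that AndNot(m, v) in the vector code
// computes ~m & v, i.e. it isolates the two bits above the 14-bit mask,
// and XOR of non-overlapping fields equals OR, which is why Xor3 can
// assemble rawE and rawF from two-bit pieces.
static inline void Pack14Scalar(const uint16_t raw[16], uint16_t packed[14]) {
  for (int k = 0; k < 14; ++k) {
    const uint16_t extra = raw[14 + k / 7];  // rawE for k<7, rawF for k>=7
    const int pair = 2 * (k % 7);            // which two bits of `extra`
    packed[k] = static_cast<uint16_t>((raw[k] & 0x3FFFu) |
                                      (((extra >> pair) & 0x3u) << 14));
  }
}

static inline void Unpack14Scalar(const uint16_t packed[14],
                                  uint16_t raw[16]) {
  raw[14] = 0;
  raw[15] = 0;
  for (int k = 0; k < 14; ++k) {
    raw[k] = static_cast<uint16_t>(packed[k] & 0x3FFFu);
    const int pair = 2 * (k % 7);
    raw[14 + k / 7] = static_cast<uint16_t>(
        raw[14 + k / 7] | (((packed[k] >> 14) & 0x3u) << pair));
  }
}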
template <>  // 15 bits
struct Pack16<15> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // 15 vectors: each holds one raw value plus one bit of rawF on top.
    const VU16 hi1 = Set(d, 0x8000u);
    const VU16 packed0 = Or(raw0, ShiftLeft<15>(rawF));
    const VU16 packed1 = OrAnd(raw1, ShiftLeft<14>(rawF), hi1);
    const VU16 packed2 = OrAnd(raw2, ShiftLeft<13>(rawF), hi1);
    const VU16 packed3 = OrAnd(raw3, ShiftLeft<12>(rawF), hi1);
    const VU16 packed4 = OrAnd(raw4, ShiftLeft<11>(rawF), hi1);
    const VU16 packed5 = OrAnd(raw5, ShiftLeft<10>(rawF), hi1);
    const VU16 packed6 = OrAnd(raw6, ShiftLeft<9>(rawF), hi1);
    const VU16 packed7 = OrAnd(raw7, ShiftLeft<8>(rawF), hi1);
    const VU16 packed8 = OrAnd(raw8, ShiftLeft<7>(rawF), hi1);
    const VU16 packed9 = OrAnd(raw9, ShiftLeft<6>(rawF), hi1);
    const VU16 packedA = OrAnd(rawA, ShiftLeft<5>(rawF), hi1);
    const VU16 packedB = OrAnd(rawB, ShiftLeft<4>(rawF), hi1);
    const VU16 packedC = OrAnd(rawC, ShiftLeft<3>(rawF), hi1);
    const VU16 packedD = OrAnd(rawD, ShiftLeft<2>(rawF), hi1);
    const VU16 packedE = OrAnd(rawE, ShiftLeft<1>(rawF), hi1);

    StoreU(packed0, d, packed_out + 0 * N);
    StoreU(packed1, d, packed_out + 1 * N);
    StoreU(packed2, d, packed_out + 2 * N);
    StoreU(packed3, d, packed_out + 3 * N);
    StoreU(packed4, d, packed_out + 4 * N);
    StoreU(packed5, d, packed_out + 5 * N);
    StoreU(packed6, d, packed_out + 6 * N);
    StoreU(packed7, d, packed_out + 7 * N);
    StoreU(packed8, d, packed_out + 8 * N);
    StoreU(packed9, d, packed_out + 9 * N);
    StoreU(packedA, d, packed_out + 0xA * N);
    StoreU(packedB, d, packed_out + 0xB * N);
    StoreU(packedC, d, packed_out + 0xC * N);
    StoreU(packedD, d, packed_out + 0xD * N);
    StoreU(packedE, d, packed_out + 0xE * N);
  }
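  // Layout recap, for reference: bit k of rawF lands in the top bit of
  // packed<k>, so the sixteenth value costs one bit in each of the fifteen
  // data vectors.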
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 packed0 = LoadU(d, packed_in + 0 * N);
    const VU16 packed1 = LoadU(d, packed_in + 1 * N);
    const VU16 packed2 = LoadU(d, packed_in + 2 * N);
    const VU16 packed3 = LoadU(d, packed_in + 3 * N);
    const VU16 packed4 = LoadU(d, packed_in + 4 * N);
    const VU16 packed5 = LoadU(d, packed_in + 5 * N);
    const VU16 packed6 = LoadU(d, packed_in + 6 * N);
    const VU16 packed7 = LoadU(d, packed_in + 7 * N);
    const VU16 packed8 = LoadU(d, packed_in + 8 * N);
    const VU16 packed9 = LoadU(d, packed_in + 9 * N);
    const VU16 packedA = LoadU(d, packed_in + 0xA * N);
    const VU16 packedB = LoadU(d, packed_in + 0xB * N);
    const VU16 packedC = LoadU(d, packed_in + 0xC * N);
    const VU16 packedD = LoadU(d, packed_in + 0xD * N);
    const VU16 packedE = LoadU(d, packed_in + 0xE * N);

    const VU16 mask = Set(d, 0x7FFFu);  // the low 15 bits

    const VU16 raw0 = And(packed0, mask);
    StoreU(raw0, d, raw + 0 * N);

    const VU16 raw1 = And(packed1, mask);
    StoreU(raw1, d, raw + 1 * N);

    const VU16 raw2 = And(packed2, mask);
    StoreU(raw2, d, raw + 2 * N);

    const VU16 raw3 = And(packed3, mask);
    StoreU(raw3, d, raw + 3 * N);

    const VU16 raw4 = And(packed4, mask);
    StoreU(raw4, d, raw + 4 * N);

    const VU16 raw5 = And(packed5, mask);
    StoreU(raw5, d, raw + 5 * N);

    const VU16 raw6 = And(packed6, mask);
    StoreU(raw6, d, raw + 6 * N);

    const VU16 raw7 = And(packed7, mask);
    StoreU(raw7, d, raw + 7 * N);

    const VU16 raw8 = And(packed8, mask);
    StoreU(raw8, d, raw + 8 * N);

    const VU16 raw9 = And(packed9, mask);
    StoreU(raw9, d, raw + 9 * N);

    const VU16 rawA = And(packedA, mask);
    StoreU(rawA, d, raw + 0xA * N);

    const VU16 rawB = And(packedB, mask);
    StoreU(rawB, d, raw + 0xB * N);

    const VU16 rawC = And(packedC, mask);
    StoreU(rawC, d, raw + 0xC * N);

    const VU16 rawD = And(packedD, mask);
    StoreU(rawD, d, raw + 0xD * N);

    const VU16 rawE = And(packedE, mask);
    StoreU(rawE, d, raw + 0xE * N);

    // rawF: one bit from the top of each of packed0..E; the fields are
    // disjoint, so Xor3 acts as Or.
    const VU16 F0 = Xor3(ShiftRight<15>(packed0),
                         ShiftRight<14>(AndNot(mask, packed1)),
                         ShiftRight<13>(AndNot(mask, packed2)));
    const VU16 F1 = Xor3(ShiftRight<12>(AndNot(mask, packed3)),
                         ShiftRight<11>(AndNot(mask, packed4)),
                         ShiftRight<10>(AndNot(mask, packed5)));
    const VU16 F2 = Xor3(ShiftRight<9>(AndNot(mask, packed6)),
                         ShiftRight<8>(AndNot(mask, packed7)),
                         ShiftRight<7>(AndNot(mask, packed8)));
    const VU16 F3 = Xor3(ShiftRight<6>(AndNot(mask, packed9)),
                         ShiftRight<5>(AndNot(mask, packedA)),
                         ShiftRight<4>(AndNot(mask, packedB)));
    const VU16 F4 = Xor3(ShiftRight<3>(AndNot(mask, packedC)),
                         ShiftRight<2>(AndNot(mask, packedD)),
                         ShiftRight<1>(AndNot(mask, packedE)));
    const VU16 rawF = Xor3(F0, F1, Xor3(F2, F3, F4));
    StoreU(rawF, d, raw + 0xF * N);
  }
};
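// A scalar model of the 16x15-bit layout plus a round-trip check, for
// reference (hypothetical helpers, not Highway API). Every pack/unpack pair
// must satisfy Unpack(Pack(x)) == x for inputs that fit in the bit width;
// the vector code can be verified the same way, lane by lane.
static inline void Pack15Scalar(const uint16_t raw[16], uint16_t packed[15]) {
  for (int k = 0; k < 15; ++k) {
    // raw[k]'s 15 bits, plus bit k of raw[15] in the top bit.
    packed[k] = static_cast<uint16_t>((raw[k] & 0x7FFFu) |
                                      (((raw[15] >> k) & 1u) << 15));
  }
}

static inline void Unpack15Scalar(const uint16_t packed[15],
                                  uint16_t raw[16]) {
  raw[15] = 0;
  for (int k = 0; k < 15; ++k) {
    raw[k] = static_cast<uint16_t>(packed[k] & 0x7FFFu);
    raw[15] =
        static_cast<uint16_t>(raw[15] | (((packed[k] >> 15) & 1u) << k));
  }
}

static inline bool RoundTrip15(const uint16_t raw[16]) {
  uint16_t packed[15];
  uint16_t out[16];
  Pack15Scalar(raw, packed);
  Unpack15Scalar(packed, out);
  for (int i = 0; i < 16; ++i) {
    if (out[i] != (raw[i] & 0x7FFFu)) return false;
  }
  return true;
}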
template <>  // 16 bits
struct Pack16<16> {
  template <class D>
  HWY_INLINE void Pack(D d, const uint16_t* HWY_RESTRICT raw,
                       uint16_t* HWY_RESTRICT packed_out) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, raw + 0 * N);
    const VU16 raw1 = LoadU(d, raw + 1 * N);
    const VU16 raw2 = LoadU(d, raw + 2 * N);
    const VU16 raw3 = LoadU(d, raw + 3 * N);
    const VU16 raw4 = LoadU(d, raw + 4 * N);
    const VU16 raw5 = LoadU(d, raw + 5 * N);
    const VU16 raw6 = LoadU(d, raw + 6 * N);
    const VU16 raw7 = LoadU(d, raw + 7 * N);
    const VU16 raw8 = LoadU(d, raw + 8 * N);
    const VU16 raw9 = LoadU(d, raw + 9 * N);
    const VU16 rawA = LoadU(d, raw + 0xA * N);
    const VU16 rawB = LoadU(d, raw + 0xB * N);
    const VU16 rawC = LoadU(d, raw + 0xC * N);
    const VU16 rawD = LoadU(d, raw + 0xD * N);
    const VU16 rawE = LoadU(d, raw + 0xE * N);
    const VU16 rawF = LoadU(d, raw + 0xF * N);

    // Packing 16-bit values into 16 bits is the identity: copy through.
    StoreU(raw0, d, packed_out + 0 * N);
    StoreU(raw1, d, packed_out + 1 * N);
    StoreU(raw2, d, packed_out + 2 * N);
    StoreU(raw3, d, packed_out + 3 * N);
    StoreU(raw4, d, packed_out + 4 * N);
    StoreU(raw5, d, packed_out + 5 * N);
    StoreU(raw6, d, packed_out + 6 * N);
    StoreU(raw7, d, packed_out + 7 * N);
    StoreU(raw8, d, packed_out + 8 * N);
    StoreU(raw9, d, packed_out + 9 * N);
    StoreU(rawA, d, packed_out + 0xA * N);
    StoreU(rawB, d, packed_out + 0xB * N);
    StoreU(rawC, d, packed_out + 0xC * N);
    StoreU(rawD, d, packed_out + 0xD * N);
    StoreU(rawE, d, packed_out + 0xE * N);
    StoreU(rawF, d, packed_out + 0xF * N);
  }
  template <class D>
  HWY_INLINE void Unpack(D d, const uint16_t* HWY_RESTRICT packed_in,
                         uint16_t* HWY_RESTRICT raw) const {
    using VU16 = Vec<decltype(d)>;
    const size_t N = Lanes(d);
    const VU16 raw0 = LoadU(d, packed_in + 0 * N);
    const VU16 raw1 = LoadU(d, packed_in + 1 * N);
    const VU16 raw2 = LoadU(d, packed_in + 2 * N);
    const VU16 raw3 = LoadU(d, packed_in + 3 * N);
    const VU16 raw4 = LoadU(d, packed_in + 4 * N);
    const VU16 raw5 = LoadU(d, packed_in + 5 * N);
    const VU16 raw6 = LoadU(d, packed_in + 6 * N);
    const VU16 raw7 = LoadU(d, packed_in + 7 * N);
    const VU16 raw8 = LoadU(d, packed_in + 8 * N);
    const VU16 raw9 = LoadU(d, packed_in + 9 * N);
    const VU16 rawA = LoadU(d, packed_in + 0xA * N);
    const VU16 rawB = LoadU(d, packed_in + 0xB * N);
    const VU16 rawC = LoadU(d, packed_in + 0xC * N);
    const VU16 rawD = LoadU(d, packed_in + 0xD * N);
    const VU16 rawE = LoadU(d, packed_in + 0xE * N);
    const VU16 rawF = LoadU(d, packed_in + 0xF * N);

    StoreU(raw0, d, raw + 0 * N);
    StoreU(raw1, d, raw + 1 * N);
    StoreU(raw2, d, raw + 2 * N);
    StoreU(raw3, d, raw + 3 * N);
    StoreU(raw4, d, raw + 4 * N);
    StoreU(raw5, d, raw + 5 * N);
    StoreU(raw6, d, raw + 6 * N);
    StoreU(raw7, d, raw + 7 * N);
    StoreU(raw8, d, raw + 8 * N);
    StoreU(raw9, d, raw + 9 * N);
    StoreU(rawA, d, raw + 0xA * N);
    StoreU(rawB, d, raw + 0xB * N);
    StoreU(rawC, d, raw + 0xC * N);
    StoreU(rawD, d, raw + 0xD * N);
    StoreU(rawE, d, raw + 0xE * N);
    StoreU(rawF, d, raw + 0xF * N);
  }
};
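// For kBits == 16 the packed form is the identity, so a scalar equivalent is
// a plain copy; the specialization exists so generic callers can use the
// same Pack16<kBits> interface for every bit width. (Copy16Scalar is a
// hypothetical illustration, not part of the Highway API.)
static inline void Copy16Scalar(const uint16_t raw[16], uint16_t packed[16]) {
  for (int i = 0; i < 16; ++i) packed[i] = raw[i];
}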