Grok 12.0.1
copy-inl.h
Go to the documentation of this file.
1// Copyright 2022 Google LLC
2// SPDX-License-Identifier: Apache-2.0
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8// http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15
16// Per-target include guard
17#if defined(HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_) == \
18 defined(HWY_TARGET_TOGGLE) // NOLINT
19#ifdef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
20#undef HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
21#else
22#define HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
23#endif
24
25#include <stddef.h>
26#include <stdint.h>
27
28#include "hwy/highway.h"
29
31namespace hwy {
32namespace HWY_NAMESPACE {
33
34// These functions avoid having to write a loop plus remainder handling in the
35// (unfortunately still common) case where arrays are not aligned/padded. If the
36// inputs are known to be aligned/padded, it is more efficient to write a single
37// loop using Load(). We do not provide a CopyAlignedPadded because it
38// would be more verbose than such a loop.
39
40// Fills `to`[0, `count`) with `value`.
41template <class D, typename T = TFromD<D>>
42void Fill(D d, T value, size_t count, T* HWY_RESTRICT to) {
43 const size_t N = Lanes(d);
44 const Vec<D> v = Set(d, value);
45
46 size_t idx = 0;
47 if (count >= N) {
48 for (; idx <= count - N; idx += N) {
49 StoreU(v, d, to + idx);
50 }
51 }
52
53 // `count` was a multiple of the vector length `N`: already done.
54 if (HWY_UNLIKELY(idx == count)) return;
55
56 const size_t remaining = count - idx;
57 HWY_DASSERT(0 != remaining && remaining < N);
58 SafeFillN(remaining, value, d, to + idx);
59}
60
61// Copies `from`[0, `count`) to `to`, which must not overlap `from`.
62template <class D, typename T = TFromD<D>>
63void Copy(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to) {
64 const size_t N = Lanes(d);
65
66 size_t idx = 0;
67 if (count >= N) {
68 for (; idx <= count - N; idx += N) {
69 const Vec<D> v = LoadU(d, from + idx);
70 StoreU(v, d, to + idx);
71 }
72 }
73
74 // `count` was a multiple of the vector length `N`: already done.
75 if (HWY_UNLIKELY(idx == count)) return;
76
77 const size_t remaining = count - idx;
78 HWY_DASSERT(0 != remaining && remaining < N);
79 SafeCopyN(remaining, d, from + idx, to + idx);
80}
81
82// For idx in [0, count) in ascending order, appends `from[idx]` to `to` if the
83// corresponding mask element of `func(d, v)` is true. Returns the STL-style end
84// of the newly written elements in `to`.
85//
86// `func` is either a functor with a templated operator()(d, v) returning a
87// mask, or a generic lambda if using C++14. Due to apparent limitations of
88// Clang on Windows, it is currently necessary to add HWY_ATTR before the
89// opening { of the lambda to avoid errors about "function .. requires target".
90//
91// NOTE: this is only supported for 16-, 32- or 64-bit types.
92// NOTE: Func may be called a second time for elements it has already seen, but
93// these elements will not be written to `to` again.
94template <class D, class Func, typename T = TFromD<D>>
95T* CopyIf(D d, const T* HWY_RESTRICT from, size_t count, T* HWY_RESTRICT to,
96 const Func& func) {
97 const size_t N = Lanes(d);
98
99 size_t idx = 0;
100 if (count >= N) {
101 for (; idx <= count - N; idx += N) {
102 const Vec<D> v = LoadU(d, from + idx);
103 to += CompressBlendedStore(v, func(d, v), d, to);
104 }
105 }
106
107 // `count` was a multiple of the vector length `N`: already done.
108 if (HWY_UNLIKELY(idx == count)) return to;
109
110#if HWY_MEM_OPS_MIGHT_FAULT
111 // Proceed one by one.
112 const CappedTag<T, 1> d1;
113 for (; idx < count; ++idx) {
114 using V1 = Vec<decltype(d1)>;
115 // Workaround for -Waggressive-loop-optimizations on GCC 8
116 // (iteration 2305843009213693951 invokes undefined behavior for T=i64)
117 const uintptr_t addr = reinterpret_cast<uintptr_t>(from);
118 const T* HWY_RESTRICT from_idx =
119 reinterpret_cast<const T * HWY_RESTRICT>(addr + (idx * sizeof(T)));
120 const V1 v = LoadU(d1, from_idx);
121 // Avoid storing to `to` unless we know it should be kept - otherwise, we
122 // might overrun the end if it was allocated for the exact count.
123 if (CountTrue(d1, func(d1, v)) == 0) continue;
124 StoreU(v, d1, to);
125 to += 1;
126 }
127#else
128 // Start index of the last unaligned whole vector, ending at the array end.
129 const size_t last = count - N;
130 // Number of elements before `from` or already written.
131 const size_t invalid = idx - last;
132 HWY_DASSERT(0 != invalid && invalid < N);
133 const Mask<D> mask = Not(FirstN(d, invalid));
134 const Vec<D> v = MaskedLoad(mask, d, from + last);
135 to += CompressBlendedStore(v, And(mask, func(d, v)), d, to);
136#endif
137 return to;
138}
139
140// NOLINTNEXTLINE(google-readability-namespace-comments)
141} // namespace HWY_NAMESPACE
142} // namespace hwy
144
145#endif // HIGHWAY_HWY_CONTRIB_ALGO_COPY_INL_H_
#define HWY_RESTRICT
Definition base.h:95
#define HWY_DASSERT(condition)
Definition base.h:290
#define HWY_UNLIKELY(expr)
Definition base.h:107
HWY_AFTER_NAMESPACE()
HWY_BEFORE_NAMESPACE()
T * CopyIf(D d, const T *HWY_RESTRICT from, size_t count, T *HWY_RESTRICT to, const Func &func)
Definition copy-inl.h:95
D d
Definition arm_sve-inl.h:1915
HWY_API size_t CountTrue(D, Mask128< T > mask)
Definition arm_neon-inl.h:8358
HWY_API Vec128< T, N > And(const Vec128< T, N > a, const Vec128< T, N > b)
Definition arm_neon-inl.h:2690
HWY_API Vec128< uint8_t > LoadU(D, const uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3442
HWY_API Vec128< T > Not(const Vec128< T > v)
Definition arm_neon-inl.h:2672
HWY_API void StoreU(Vec128< uint8_t > v, D, uint8_t *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:3689
typename detail::CappedTagChecker< T, kLimit, kPow2 >::type CappedTag
Definition ops/shared-inl.h:379
decltype(MaskFromVec(Zero(D()))) Mask
Definition generic_ops-inl.h:52
HWY_INLINE Vec128< TFromD< D > > Set(D, T t)
Definition arm_neon-inl.h:931
void Fill(D d, T value, size_t count, T *HWY_RESTRICT to)
Definition copy-inl.h:42
void Copy(D d, const T *HWY_RESTRICT from, size_t count, T *HWY_RESTRICT to)
Definition copy-inl.h:63
HWY_API void SafeFillN(const size_t num, const T value, D d, T *HWY_RESTRICT to)
Definition generic_ops-inl.h:172
HWY_API void SafeCopyN(const size_t num, D d, const T *HWY_RESTRICT from, T *HWY_RESTRICT to)
Definition generic_ops-inl.h:187
decltype(Zero(D())) Vec
Definition generic_ops-inl.h:46
HWY_API size_t Lanes(D)
Definition rvv-inl.h:598
HWY_API MFromD< D > FirstN(D d, size_t num)
Definition arm_neon-inl.h:3232
HWY_API size_t CompressBlendedStore(VFromD< D > v, MFromD< D > m, D d, TFromD< D > *HWY_RESTRICT unaligned)
Definition arm_neon-inl.h:8955
HWY_API VFromD< D > MaskedLoad(MFromD< D > m, D d, const TFromD< D > *HWY_RESTRICT aligned)
Definition arm_neon-inl.h:3669
Definition abort.h:8
FuncOutput(*)(const void *, FuncInput) Func
Definition nanobenchmark.h:87
#define HWY_NAMESPACE
Definition set_macros-inl.h:166